diff --git a/README.md b/README.md index d998a7a..660508c 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,8 @@ This project is a chatbot for Mattermost that integrates with the Anthropic API - **Extracts text content from links** shared in the messages. Also supports **FlareSolverr** to bypass Javascript/CAPTCHA restrictions - Supports the **Vision API** for describing images provided as URLs within the chat message -- **Gets transcripts of YouTube videos** for easy tl;dw summarizations +- **Gets transcripts of YouTube videos** for easy tl;dw summarizations. Title, description and uploader are also + provided - Maintains context of the conversation within a thread - Sends typing indicators to show that the chatbot is processing the message - Utilizes a thread pool to handle multiple requests concurrently (due to `mattermostdriver-asyncio` being outdated) @@ -26,7 +27,7 @@ This project is a chatbot for Mattermost that integrates with the Anthropic API ## Prerequisites -- Python 3.11 or just a server with [Docker](https://docs.docker.com/get-started/). _(you can get away with using 3.8 if +- Python 3.11 or just a server with [Docker](https://docs.docker.com/get-started/) _(you can get away with using 3.8 if you use datetime.datetime.utcnow() instead of datetime.datetime.now(datetime.UTC))_ - Anthropic API key - Mattermost server with API access @@ -49,7 +50,7 @@ This project is a chatbot for Mattermost that integrates with the Anthropic API ``` _or alternatively:_ ```bash - python3.12 -m pip install anthropic mattermostdriver certifi beautifulsoup4 pillow httpx youtube-transcript-api + python3.12 -m pip install anthropic mattermostdriver certifi beautifulsoup4 pillow httpx youtube-transcript-api yt-dlp ``` 3. Set the following environment variables with your own values (most are optional): @@ -137,4 +138,5 @@ This project is licensed under the MIT License. 
- [Mattermost](https://mattermost.com/) for the messaging platform - [mattermostdriver](https://github.com/Vaelor/python-mattermost-driver) for the Mattermost API client library - [chatgpt-mattermost-bot](https://github.com/yGuy/chatgpt-mattermost-bot) for inspiring me to write this python code -- [youtube-transcript-api](https://pypi.org/project/youtube-transcript-api/) for the YouTube Transcript Fetch library \ No newline at end of file +- [youtube-transcript-api](https://pypi.org/project/youtube-transcript-api/) for the YouTube Transcript Fetch library +- [yt-dlp](https://pypi.org/project/yt-dlp/) for the YouTube API that allows us to fetch details \ No newline at end of file diff --git a/chatbot.py b/chatbot.py index 0dc54a9..28e5f8b 100644 --- a/chatbot.py +++ b/chatbot.py @@ -16,6 +16,7 @@ from mattermostdriver.driver import Driver from bs4 import BeautifulSoup from youtube_transcript_api import YouTubeTranscriptApi +from yt_dlp import YoutubeDL from anthropic import Anthropic logging.basicConfig(level=logging.INFO) @@ -428,8 +429,18 @@ async def message_handler(event): continue try: if yt_is_valid_url(link): - transcript_text = yt_get_transcript(link) - extracted_text += transcript_text + title, description, uploader = yt_get_video_info( + link + ) + transcript = yt_get_transcript(link) + extracted_text += f""" + + {title} + {description} + {uploader} + {transcript} + + """ continue with client.stream( @@ -465,7 +476,7 @@ async def message_handler(event): image_data += chunk total_size += len(chunk) if total_size > max_response_size: - extracted_text += "*WEBSITE SIZE EXCEEDED THE MAXIMUM LIMIT FOR THE CHATBOT, WARN THE CHATBOT USER*" + extracted_text += "website size exceeded the maximum limit for the chatbot, warn the chatbot user" raise Exception( "Response size exceeds the maximum limit at image processing" ) @@ -549,11 +560,8 @@ async def message_handler(event): # Handle text content try: if flaresolverr_endpoint: - extracted_text += ( - 
def yt_get_video_info(url):
    """Fetch the title, description and uploader of a video via yt-dlp.

    Only metadata is requested (``download=False``); no media is downloaded.

    Args:
        url: URL of the video to look up.

    Returns:
        A ``(title, description, uploader)`` tuple. A field that yt-dlp does
        not report (absent from the info dict, or ``None``) is returned as an
        empty string instead of raising ``KeyError``, so one missing field
        does not throw away the metadata that is available.

    Raises:
        Any exception raised by ``YoutubeDL.extract_info`` when the
        extraction itself fails is propagated unchanged to the caller.
        NOTE(review): presumably ``yt_dlp.utils.DownloadError`` - confirm.
    """
    ydl_opts = {"quiet": True}

    with YoutubeDL(ydl_opts) as ydl:
        # Guard against a ``None`` result so the lookups below cannot fail
        # with a TypeError.
        info = ydl.extract_info(url, download=False) or {}

    # The result is a plain dict, so it stays usable after the YoutubeDL
    # context has been closed. ``or ""`` also normalises explicit ``None``
    # values, which would otherwise be rendered as the text "None".
    title = info.get("title") or ""
    description = info.get("description") or ""
    uploader = info.get("uploader") or ""

    return title, description, uploader