YouTube: also provide video title, description, uploader; stylistic changes
Elehiggle · Apr 17, 2024 · fb88b1d · fb88b1d
1 parent 3327958
commit fb88b1d
Show file tree

Hide file tree

Showing 3 changed files with 46 additions and 17 deletions.
diff --git a/README.md b/README.md
@@ -18,15 +18,16 @@ This project is a chatbot for Mattermost that integrates with the Anthropic API
 - **Extracts text content from links** shared in the messages. Also supports **FlareSolverr** to bypass
   Javascript/CAPTCHA restrictions
 - Supports the **Vision API** for describing images provided as URLs within the chat message
-- **Gets transcripts of YouTube videos** for easy tl;dw summarizations
+- **Gets transcripts of YouTube videos** for easy tl;dw summarizations. Title, description and uploader are also
+  provided
 - Maintains context of the conversation within a thread
 - Sends typing indicators to show that the chatbot is processing the message
 - Utilizes a thread pool to handle multiple requests concurrently (due to `mattermostdriver-asyncio` being outdated)
 - Offers **Docker support** for easy deployment
 
 ## Prerequisites
 
-- Python 3.11 or just a server with [Docker](https://docs.docker.com/get-started/). _(you can get away with using 3.8 if
+- Python 3.11 or just a server with [Docker](https://docs.docker.com/get-started/) _(you can get away with using 3.8 if
   you use datetime.datetime.utcnow() instead of datetime.datetime.now(datetime.UTC))_
 - Anthropic API key
 - Mattermost server with API access
@@ -49,7 +50,7 @@ This project is a chatbot for Mattermost that integrates with the Anthropic API
     ```
    _or alternatively:_
     ```bash
-    python3.12 -m pip install anthropic mattermostdriver certifi beautifulsoup4 pillow httpx youtube-transcript-api
+    python3.12 -m pip install anthropic mattermostdriver certifi beautifulsoup4 pillow httpx youtube-transcript-api yt-dlp
     ```
 
 3. Set the following environment variables with your own values (most are optional):
@@ -137,4 +138,5 @@ This project is licensed under the MIT License.
 - [Mattermost](https://mattermost.com/) for the messaging platform
 - [mattermostdriver](https://github.com/Vaelor/python-mattermost-driver) for the Mattermost API client library
 - [chatgpt-mattermost-bot](https://github.com/yGuy/chatgpt-mattermost-bot) for inspiring me to write this python code
-- [youtube-transcript-api](https://pypi.org/project/youtube-transcript-api/) for the YouTube Transcript Fetch library
+- [youtube-transcript-api](https://pypi.org/project/youtube-transcript-api/) for the YouTube Transcript Fetch library
+- [yt-dlp](https://pypi.org/project/yt-dlp/) for fetching YouTube video details (title, description, uploader)
diff --git a/chatbot.py b/chatbot.py
@@ -16,6 +16,7 @@
 from mattermostdriver.driver import Driver
 from bs4 import BeautifulSoup
 from youtube_transcript_api import YouTubeTranscriptApi
+from yt_dlp import YoutubeDL
 from anthropic import Anthropic
 
 logging.basicConfig(level=logging.INFO)
@@ -428,8 +429,18 @@ async def message_handler(event):
                                 continue
                             try:
                                 if yt_is_valid_url(link):
-                                    transcript_text = yt_get_transcript(link)
-                                    extracted_text += transcript_text
+                                    title, description, uploader = yt_get_video_info(
+                                        link
+                                    )
+                                    transcript = yt_get_transcript(link)
+                                    extracted_text += f"""
+                                    <youtube_video_details>
+                                        <title>{title}</title>
+                                        <description>{description}</description>
+                                        <uploader>{uploader}</uploader>
+                                        <transcript>{transcript}</transcript>
+                                    </youtube_video_details>
+                                    """
                                     continue
 
                                 with client.stream(
@@ -465,7 +476,7 @@ async def message_handler(event):
                                             image_data += chunk
                                             total_size += len(chunk)
                                             if total_size > max_response_size:
-                                                extracted_text += "*WEBSITE SIZE EXCEEDED THE MAXIMUM LIMIT FOR THE CHATBOT, WARN THE CHATBOT USER*"
+                                                extracted_text += "<chatbot_error>website size exceeded the maximum limit for the chatbot, warn the chatbot user</chatbot_error>"
                                                 raise Exception(
                                                     "Response size exceeds the maximum limit at image processing"
                                                 )
@@ -549,11 +560,8 @@ async def message_handler(event):
                                         # Handle text content
                                         try:
                                             if flaresolverr_endpoint:
-                                                extracted_text += (
-                                                    extract_content_with_flaresolverr(
-                                                        link
-                                                    )
-                                                )
+                                                website_text = extract_content_with_flaresolverr(link)
+                                                extracted_text += f"<website_extracted_text_content>{website_text}</website_extracted_text_content>"
                                             else:
                                                 raise Exception(
                                                     "FlareSolverr endpoint not available"
@@ -568,15 +576,16 @@ async def message_handler(event):
                                             content_chunks.append(chunk)
                                             total_size += len(chunk)
                                             if total_size > max_response_size:
-                                                extracted_text += "*WEBSITE SIZE EXCEEDED THE MAXIMUM LIMIT FOR THE CHATBOT, WARN THE CHATBOT USER*"
+                                                extracted_text += "<chatbot_error>website size exceeded the maximum limit for the chatbot, warn the chatbot user</chatbot_error>"
                                                 raise Exception(
                                                     "Response size exceeds the maximum limit"
                                                 )
                                         content = b"".join(content_chunks)
                                         soup = BeautifulSoup(content, "html.parser")
-                                        extracted_text += soup.get_text(
+                                        website_text = soup.get_text(
                                             " | ", strip=True
                                         )
+                                        extracted_text += f"<website_extracted_text_content>{website_text}</website_extracted_text_content>"
                             except Exception as e:
                                 logging.error(
                                     f"Error extracting content from link {link}: {str(e)} {traceback.format_exc()}"
@@ -671,9 +680,26 @@ def yt_get_transcript(url):
     except Exception as e:
         logging.info(f"YouTube Transcript Exception: {str(e)}")
 
-    return (
-        "*COULD NOT FETCH THE VIDEO TRANSCRIPT FOR THE CHATBOT, WARN THE CHATBOT USER*"
-    )
+    return "<chatbot_error>could not fetch the video transcript for the chatbot, warn the chatbot user</chatbot_error>"
+
+
+def yt_get_video_info(url):
+    """Fetch the title, description and uploader of a video via yt-dlp.
+
+    Only metadata is extracted (``download=False``); no media is downloaded.
+
+    Args:
+        url: URL of the video to look up.
+
+    Returns:
+        A ``(title, description, uploader)`` tuple. ``description`` and
+        ``uploader`` are empty strings when the extracted info does not
+        contain them (or contains ``None``).
+
+    Raises:
+        KeyError: If the extracted info has no ``title`` entry.
+        Any exception raised by ``YoutubeDL.extract_info`` propagates to the
+        caller unchanged.
+    """
+    ydl_opts = {
+        "quiet": True,
+    }
+
+    with YoutubeDL(ydl_opts) as ydl:
+        info = ydl.extract_info(url, download=False)
+
+    title = info["title"]
+    # Description and uploader are optional in yt-dlp's info dict; fall back
+    # to an empty string instead of raising KeyError (or formatting "None"
+    # into the prompt), so one missing field does not make the caller drop
+    # the whole video block, transcript included.
+    description = info.get("description") or ""
+    uploader = info.get("uploader") or ""
+
+    return title, description, uploader
 
 
 def yt_is_valid_url(url):

diff --git a/requirements.txt b/requirements.txt
@@ -4,4 +4,5 @@ beautifulsoup4
 pillow
 httpx
 youtube-transcript-api
+yt-dlp
 anthropic
-Original file line number
+Diff line change
@@ Expand Up / @@ -4,4 +4,5 @@ beautifulsoup4 @@
     pillow
     httpx
     youtube-transcript-api
+    yt-dlp
     anthropic