From 5aa754fa1d3a056c3047af32a8cf3ef145a14333 Mon Sep 17 00:00:00 2001 From: Elehiggle Date: Fri, 24 May 2024 16:45:45 +0200 Subject: [PATCH] UP2DATE with GPT repo: Add .pylintrc rules file; add live data by function calling for ECB exchange rates, cryptocurrency data and stock data; use defusedxml, add timed lru cache; switch from local RAG helper file to pymupdf4llm pypi, work around some rare FlareSolverr issue --- .github/workflows/docker-publish.yml | 9 +- .pylintrc | 15 ++ README.md | 14 +- chatbot.py | 333 +++++++++++++++++++++++--- helpers/pymupdf_rag.py | 337 --------------------------- requirements.txt | 21 +- 6 files changed, 350 insertions(+), 379 deletions(-) create mode 100644 .pylintrc delete mode 100644 helpers/pymupdf_rag.py diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml index e554926..b4ec37d 100644 --- a/.github/workflows/docker-publish.yml +++ b/.github/workflows/docker-publish.yml @@ -5,6 +5,10 @@ on: paths-ignore: - 'README.md' - '**.png' + pull_request: + paths-ignore: + - 'README.md' + - '**.png' jobs: build-and-push: @@ -52,9 +56,10 @@ jobs: uses: docker/build-push-action@v5 with: context: . 
- push: true + file: ./Dockerfile + push: ${{ github.event_name != 'pull_request' }} provenance: false - platforms: linux/amd64,linux/arm64 + platforms: linux/amd64,linux/arm64/v8 labels: ${{ steps.meta.outputs.labels }} tags: ${{ steps.meta.outputs.tags }} cache-from: type=gha diff --git a/.pylintrc b/.pylintrc new file mode 100644 index 0000000..75c32a0 --- /dev/null +++ b/.pylintrc @@ -0,0 +1,15 @@ +[MASTER] +disable= + C0114, # missing-module-docstring + C0116, # missing-function-docstring + W0703, # broad-exception-caught + W1203, # logging-fstring-interpolation + W0613, # unused-argument + W0719, # broad-exception-raised + W0603, # global-statement + C0301, # line-too-long + R0915, # too-many-statements + R0912, # too-many-branches + R0914, # too-many-locals + R1702, # too-many-nested-blocks + C0302, # too-many-lines \ No newline at end of file diff --git a/README.md b/README.md index fa3c2e8..89edde3 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,10 @@ This project is a chatbot for Mattermost that integrates with the Anthropic API - Supports the **Vision API** for describing images. Images from PDFs will also be sent here. - **Gets transcripts of YouTube videos** for easy tl;dw summarizations. Title, description and uploader are also provided +- Accesses additional live information via function calling. Currently supported: **stock data** (via Yahoo Finance, eg. 
+ ask about AAPL), **cryptocurrency data** ( + via [Coingecko](https://api.coingecko.com/api/v3/coins/markets?vs_currency=usd&order=market_cap_desc&per_page=500&page=1&sparkline=false&price_change_percentage=24h%2C7d)), + **fiat currency exchange rates** (via [ECB](https://www.ecb.europa.eu/stats/eurofxref/eurofxref-daily.xml)) - Maintains context of the conversation within a thread - Sends typing indicators to show that the chatbot is processing the message - Utilizes a thread pool to handle multiple requests concurrently (due to `mattermostdriver-asyncio` being outdated) @@ -55,7 +59,7 @@ This project is a chatbot for Mattermost that integrates with the Anthropic API ``` _or alternatively:_ ```bash - python3.12 -m pip install anthropic mattermostdriver certifi beautifulsoup4 pillow httpx youtube-transcript-api yt-dlp PyMuPDF + python3 -m pip install anthropic mattermostdriver certifi beautifulsoup4 pillow httpx youtube-transcript-api yt-dlp PyMuPDF defusedxml yfinance pymupdf4llm ``` 4. Set the following environment variables with your own values: @@ -74,19 +78,19 @@ This project is a chatbot for Mattermost that integrates with the Anthropic API | Parameter | Description | |-------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `AI_SYSTEM_PROMPT` | The system prompt/instructions. Default: [click](https://github.com/Elehiggle/Claude3MattermostChatbot/blob/f709433ac05996992a7cb13a9c4a77472161772e/chatbot.py#L64) (Subject to change. current_time and chatbot_username variables inside the prompt will be auto-formatted and substituted. | +| `AI_SYSTEM_PROMPT` | The system prompt/instructions. 
Default: [click](https://github.com/Elehiggle/Claude3MattermostChatbot/blob/f709433ac05996992a7cb13a9c4a77472161772e/chatbot.py#L64) (Subject to change. current_time and CHATBOT_USERNAME variables inside the prompt will be auto-formatted and substituted. | | `AI_TIMEOUT` | The timeout for the AI API call in seconds. Default: "120" | | `MAX_TOKENS` | The maximum number of tokens to generate in the response. Default: "4096" (max) | | `TEMPERATURE` | The temperature value for controlling the randomness of the generated responses (0.0 = analytical, 1.0 = fully random). Default: "0.15" | | `MAX_RESPONSE_SIZE_MB` | The maximum size of the website or file content to extract (in megabytes, per URL/file). Default: "100" | -| `FLARESOLVERR_ENDPOINT` | Endpoint URL to your [FlareSolverr](https://github.com/FlareSolverr/FlareSolverr) instance (eg. "http://192.168.1.55:8191/v1"). If you use this, MAX_RESPONSE_SIZE_MB won't be honored since it can't stream content. For most effectiveness, use a residential IP endpoint | +| `FLARESOLVERR_ENDPOINT` | Endpoint URL to your [FlareSolverr](https://github.com/FlareSolverr/FlareSolverr) instance (eg. "http://192.168.1.55:8191/v1"). If you use this, MAX_RESPONSE_SIZE_MB won't be honored since it can't stream content. For most effectiveness, use a residential IP endpoint | | `KEEP_ALL_URL_CONTENT` | Whether to feed the AI all URL content from the whole conversation thread. The website result is cached in memory. If you only want it to know about the current message's URL content (due to context size or cost), set to "FALSE". Default: "TRUE" | | `MATTERMOST_IGNORE_SENDER_ID` | The user ID of a user to ignore (optional, useful if you have multiple chatbots that are not real bot accounts to prevent endless loops). Supports multiple, separated by comma | | `MATTERMOST_PORT` | The port of your Mattermost server. Default: "443" | | `MATTERMOST_SCHEME` | The scheme of the connection. Default: "https" | | `MATTERMOST_BASEPATH` | The basepath of your Mattermost server. 
Default: "/api/v4" | | `MATTERMOST_CERT_VERIFY` | Cert verification. Default: True (also: string path to your certificate file) | -| `AI_API_BASEURL` | AI API Base URL. Default: None (which will use "https://api.anthropic.com"). Useful if you want to use a different AI with Anthropic compatible endpoint | +| `AI_API_BASEURL` | AI API Base URL. Default: None (which will use "https://api.anthropic.com"). Useful if you want to use a different AI with Anthropic compatible endpoint | | `LOG_LEVEL` | The log level. Default: "INFO" | | `LOG_LEVEL_ROOT` | The root log level (for other modules than this chatbot). Default: "INFO" | @@ -138,7 +142,7 @@ Also there is: While the official plugin certainly will improve over time, this bot here will too and there will be certain features that will absolutely never make it into the official plugin, due to it primarily focusing on features for developers -like function calling and retrieving GitHub issues, for example. +like function calling to retrieve GitHub issues, for example. 
## Known Issues diff --git a/chatbot.py b/chatbot.py index fcca360..2202834 100644 --- a/chatbot.py +++ b/chatbot.py @@ -9,12 +9,16 @@ import logging import concurrent.futures import base64 -from functools import lru_cache +from time import monotonic_ns +from functools import lru_cache, wraps from io import BytesIO +from defusedxml import ElementTree +import yfinance import certifi # noinspection PyPackageRequirements import fitz +import pymupdf4llm import httpx from PIL import Image from mattermostdriver.driver import Driver @@ -22,7 +26,6 @@ from youtube_transcript_api import YouTubeTranscriptApi from yt_dlp import YoutubeDL from anthropic import Anthropic -from helpers.pymupdf_rag import to_markdown log_level_root = os.getenv("LOG_LEVEL_ROOT", "INFO").upper() logging.basicConfig(level=log_level_root) @@ -51,6 +54,40 @@ def cdc(*args, **kwargs): # monkey patching ssl.create_default_context to fix SSL error ssl.create_default_context = cdc + +def timed_lru_cache(_func=None, *, seconds: int = 600, maxsize: int = 128, typed: bool = False): + """Extension of functools lru_cache with a timeout + + Parameters: + seconds (int): Timeout in seconds to clear the WHOLE cache, default = 10 minutes + maxsize (int): Maximum Size of the Cache + typed (bool): Same value of different type will be a different entry + + """ + + def wrapper_cache(f): + f = lru_cache(maxsize=maxsize, typed=typed)(f) + f.delta = seconds * 10 ** 9 + f.expiration = monotonic_ns() + f.delta + + @wraps(f) + def wrapped_f(*args, **kwargs): + if monotonic_ns() >= f.expiration: + f.cache_clear() + f.expiration = monotonic_ns() + f.delta + return f(*args, **kwargs) + + wrapped_f.cache_info = f.cache_info + wrapped_f.cache_clear = f.cache_clear + return wrapped_f + + # To allow decorator to be used without arguments + if _func is None: + return wrapper_cache + + return wrapper_cache(_func) + + # AI parameters api_key = os.environ["AI_API_KEY"] model = os.getenv("AI_MODEL", "claude-3-opus-20240229") @@ -77,6 
+114,61 @@ def cdc(*args, **kwargs): If an error occurs, provide the information from the tag to the user along with your answer.""", ) +tools = [ + { + "name": "get_exchange_rates", + "description": "Retrieve the latest exchange rates from the ECB, base currency: EUR", + "input_schema": { + "type": "object", + "properties": {}, + } + }, + { + "name": "get_cryptocurrency_data_by_id", + "description": "Fetches cryptocurrency data by ID (ex. ethereum) or symbol (ex. BTC)", + "input_schema": { + "type": "object", + "properties": { + "crypto_id": { + "type": "string", + "description": "The identifier or symbol of the cryptocurrency" + } + }, + "required": ["crypto_id"] + } + }, + { + "name": "get_cryptocurrency_data_by_market_cap", + "description": "Fetches cryptocurrency data for the top N currencies by market cap", + "input_schema": { + "type": "object", + "properties": { + "num_currencies": { + "type": "integer", + "description": "The number of top cryptocurrencies to retrieve. Optional", + "default": 15, + "max": 20 + } + }, + "required": [] + } + }, + { + "name": "get_stock_ticker_data", + "description": "Retrieves information about a specified company from the stock market", + "input_schema": { + "type": "object", + "properties": { + "ticker_symbol": { + "type": "string", + "description": "The stock ticker symbol of the company (ex. 
AAPL)" + } + }, + "required": ["ticker_symbol"] + } + } +] + # Mattermost server details mattermost_url = os.environ["MATTERMOST_URL"] mattermost_scheme = os.getenv("MATTERMOST_SCHEME", "https") @@ -97,8 +189,8 @@ def cdc(*args, **kwargs): keep_all_url_content = os.getenv("KEEP_ALL_URL_CONTENT", "TRUE").upper() == "TRUE" # For filtering local links -regex_local_links = ( - r"(?:127\.|192\.168\.|10\.|172\.(1[6-9]|2[0-9]|3[0-1])\.|::1|(?= 15: # Limit the number of function calls + raise Exception("Maximum amount of function calls reached") + + if call.name == "get_stock_ticker_data": + arguments = call.input + ticket_symbol = arguments["ticker_symbol"] + stock_data = get_stock_ticker_data(ticket_symbol) + func_response = { + "type": "tool_result", + "tool_use_id": call.id, + "content": json.dumps(stock_data), + } + + tool_messages.append(func_response) + elif call.name == "get_cryptocurrency_data_by_market_cap": + arguments = call.input + num_currencies = arguments["num_currencies"] if "num_currencies" in arguments else 15 + crypto_data = get_cryptocurrency_data_by_market_cap(num_currencies) + func_response = { + "type": "tool_result", + "tool_use_id": call.id, + "content": json.dumps(crypto_data), + } + + tool_messages.append(func_response) + elif call.name == "get_cryptocurrency_data_by_id": + arguments = call.input + crypto_id = arguments["crypto_id"] + crypto_data = get_cryptocurrency_data_by_id(crypto_id) + func_response = { + "type": "tool_result", + "tool_use_id": call.id, + "content": json.dumps(crypto_data), + } + + tool_messages.append(func_response) + elif call.name == "get_exchange_rates": + exchange_rates = get_exchange_rates() + func_response = { + "type": "tool_result", + "tool_use_id": call.id, + "content": json.dumps(exchange_rates), + } + + tool_messages.append(func_response) + else: + func_response = { + "tool_call_id": call.id, + "role": "tool", + "name": call.function.name, + "content": "You hallucinated this function call, it does not exist", + 
} + + tool_messages.append(func_response) + + # Requery in case there are new messages from function calls + if tool_messages: + # Add the initial response to the messages array as it contains infos about tool calls + messages.append({"role": "assistant", "content": initial_message_response}) + + # Construct the final func_response using the accumulated result blocks + messages.append({"role": "user", "content": tool_messages}) + + response = ai_client.beta.tools.messages.create( + model=model, + max_tokens=max_tokens, + system=get_system_instructions(), + messages=messages, + timeout=timeout, + temperature=temperature, + tools=tools, + tool_choice={"type": "auto"}, # Set to none if and when they support it + ) + + text_block_exists = any( + content_block.type == "text" + for content_block in response.content + ) + + if not text_block_exists: + raise Exception("Empty AI response, likely API error or mishandling") + response_text = "" - for block in content_blocks: - if block.type == "text": - response_text += block.text + + for content_block in response.content: + if content_block.type == "text": + response_text += content_block.text # Failsafe in case the response contains the username XML tag - response_text = re.sub(r"^.*?", "", response_text).strip() + response_text = re.sub(r".*?", "", response_text, flags=re.DOTALL).strip() + + # Remove Chain-of-Thought XML tags added by the model due to tools usage, pray they change this one day + response_text = re.sub(r".*?", "", response_text, flags=re.DOTALL).strip() # Split the response into multiple messages if necessary response_parts = split_message(response_text) @@ -318,6 +502,95 @@ def handle_generation(current_message, messages, channel_id, root_id): driver.posts.create_post({"channel_id": channel_id, "message": f"Error occurred: {str(e)}", "root_id": root_id}) +@timed_lru_cache(seconds=300, maxsize=100) +def get_stock_ticker_data(ticker_symbol): + stock = yfinance.Ticker(ticker_symbol) + + stock_data = { + 
"info": str(stock.info), + "calendar": str(stock.calendar), + "news": str(stock.news), + "dividends": str(stock.dividends), + "splits": str(stock.splits), + "quarterly_financials": str(stock.quarterly_financials), + "financials": str(stock.financials), + "cashflow": str(stock.cashflow), + } + + return json.dumps(stock_data) + + +@timed_lru_cache(seconds=7200, maxsize=100) +def get_exchange_rates(): + ecb_url = "https://www.ecb.europa.eu/stats/eurofxref/eurofxref-daily.xml" + + with httpx.Client() as client: + response = client.get(ecb_url, timeout=4) + response.raise_for_status() + + root = ElementTree.fromstring(response.content) + namespace = { + "gesmes": "http://www.gesmes.org/xml/2002-08-01", + "ecb": "http://www.ecb.int/vocabulary/2002-08-01/eurofxref", + } + + rates = root.find(".//ecb:Cube/ecb:Cube", namespaces=namespace) + exchange_rates = {"base_currency": "EUR"} + for rate in rates.findall("ecb:Cube", namespaces=namespace): + exchange_rates[rate.get("currency")] = rate.get("rate") + + return exchange_rates + + +@timed_lru_cache(seconds=180, maxsize=100) +def get_cryptocurrency_data_by_market_cap(num_currencies): + num_currencies = min(num_currencies, 20) # Limit to 20 + + url = "https://api.coingecko.com/api/v3/coins/markets" # possible alternatives: coincap.io, mobula.io + params = { + "vs_currency": "usd", + "order": "market_cap_desc", + "per_page": num_currencies, + "page": 1, + "sparkline": "false", + "price_change_percentage": "24h,7d", + } + + with httpx.Client() as client: + response = client.get(url, timeout=15, params=params) + response.raise_for_status() + + data = response.json() + return data + + +@timed_lru_cache(seconds=180, maxsize=100) +def get_cryptocurrency_data_by_id(crypto_id): + crypto_id = crypto_id.lower() + + url = "https://api.coingecko.com/api/v3/coins/markets" + params = { + "vs_currency": "usd", + "order": "market_cap_desc", + "per_page": 500, + "page": 1, + "sparkline": "false", + "price_change_percentage": "24h,7d", + } + + 
with httpx.Client() as client: + response = client.get(url, timeout=15, params=params) + response.raise_for_status() + + data = response.json() + # Filter data to find the cryptocurrency with the matching id or symbol + matched_crypto = next((item for item in data if crypto_id in (item["id"], item["symbol"])), None) + if matched_crypto: + return matched_crypto + + return {"error": "No data found for the specified cryptocurrency ID/symbol."} + + def process_message(event_data): post = json.loads(event_data["data"]["post"]) if should_ignore_post(post): @@ -369,7 +642,7 @@ def process_message(event_data): image_messages = [] for link in links: - if re.search(regex_local_links, link): + if re.search(REGEX_LOCAL_LINKS, link): logger.info(f"Skipping local URL: {link}") continue @@ -430,7 +703,7 @@ def should_ignore_post(post): def extract_post_data(post, event_data): # Remove the "@chatbot" mention from the message - message = post["message"].replace(chatbot_username_at, "").strip() + message = post["message"].replace(CHATBOT_USERNAME_AT, "").strip() channel_id = post["channel_id"] sender_name = sanitize_username(event_data["data"]["sender_name"]) root_id = post["root_id"] @@ -480,7 +753,7 @@ def get_thread_posts(root_id, post_id): sorted_posts = sorted(thread["posts"].values(), key=lambda x: x["create_at"]) for thread_post in sorted_posts: thread_sender_name = get_username_from_user_id(thread_post["user_id"]) - thread_message = thread_post["message"].replace(chatbot_username_at, "").strip() + thread_message = thread_post["message"].replace(CHATBOT_USERNAME_AT, "").strip() role = "assistant" if thread_post["user_id"] == driver.client.userid else "user" messages.append((thread_post, thread_sender_name, role, thread_message)) if thread_post["id"] == post_id: @@ -491,7 +764,7 @@ def get_thread_posts(root_id, post_id): def is_chatbot_invoked(post, post_id, root_id, channel_display_name): # We directly access the message here as we filter the mention earlier - if 
chatbot_username_at in post["message"]: + if CHATBOT_USERNAME_AT in post["message"]: return True # It is a direct message @@ -506,7 +779,7 @@ def is_chatbot_invoked(post, post_id, root_id, channel_display_name): return True # Needed when you mention the chatbot and send a fast message afterward - if chatbot_username_at in thread_post["message"]: + if CHATBOT_USERNAME_AT in thread_post["message"]: return True return False @@ -542,7 +815,7 @@ def extract_pdf_content(stream): image_messages = [] with fitz.open(None, stream, "pdf") as pdf: - pdf_text_content += to_markdown(pdf).strip() + pdf_text_content += pymupdf4llm.to_markdown(pdf).strip() for page in pdf: # Extract images @@ -701,7 +974,7 @@ def request_flaresolverr(link): "url": link, "maxTimeout": 30000, } - response = httpx.post(flaresolverr_endpoint, json=payload) + response = httpx.post(flaresolverr_endpoint, json=payload, timeout=30.0) response.raise_for_status() data = response.json() @@ -739,6 +1012,14 @@ def request_link_text_content(link, prev_response): soup = BeautifulSoup(raw_content, "html.parser") website_content = soup.get_text(" | ", strip=True) + if website_content == "New Tab": + logger.debug( + "Website content is 'New Tab', retrying with HTTPX." 
+ ) # FlareSolverr issue I haven't figured out yet, happens with direct .CSV files for example + raw_content = request_httpx(prev_response) + soup = BeautifulSoup(raw_content, "html.parser") + website_content = soup.get_text(" | ", strip=True) + if not website_content: raise Exception("No text content found on website") @@ -764,7 +1045,7 @@ def request_link_image_content(prev_response, content_type): return [construct_image_content_message(content_type, image_data_base64)] -@lru_cache(maxsize=100) +@timed_lru_cache(seconds=1800, maxsize=100) def request_link_content(link): if yt_is_valid_url(link): return yt_get_content(link), [] @@ -774,7 +1055,7 @@ def request_link_content(link): with client.stream("GET", link, timeout=4, follow_redirects=True) as response: final_url = str(response.url) - if re.search(regex_local_links, final_url): + if re.search(REGEX_LOCAL_LINKS, final_url): logger.info(f"Skipping local URL after redirection: {final_url}") raise Exception("Local URL is disallowed") @@ -873,9 +1154,9 @@ def main(): try: # Log in to the Mattermost server driver.login() - global chatbot_username, chatbot_username_at - chatbot_username = driver.client.username - chatbot_username_at = f"@{chatbot_username}" + global CHATBOT_USERNAME, CHATBOT_USERNAME_AT + CHATBOT_USERNAME = driver.client.username + CHATBOT_USERNAME_AT = f"@{CHATBOT_USERNAME}" logger.debug(f"SYSTEM PROMPT: {get_system_instructions()}") diff --git a/helpers/pymupdf_rag.py b/helpers/pymupdf_rag.py deleted file mode 100644 index 397b187..0000000 --- a/helpers/pymupdf_rag.py +++ /dev/null @@ -1,337 +0,0 @@ -""" -This script accepts a PDF document filename and converts it to a text file -in Markdown format, compatible with the GitHub standard. - -It must be invoked with the filename like this: - -python pymupdf_rag.py input.pdf [-pages PAGES] - -The "PAGES" parameter is a string (containing no spaces) of comma-separated -page numbers to consider. 
Each item is either a single page number or a -number range "m-n". Use "N" to address the document's last page number. -Example: "-pages 2-15,40,43-N" - -It will produce a markdown text file called "input.md". - -Text will be sorted in Western reading order. Any table will be included in -the text in markdwn format as well. - -Use in some other script -------------------------- -import fitz -from to_markdown import to_markdown - -doc = fitz.open("input.pdf") -page_list = [ list of 0-based page numbers ] -md_text = to_markdown(doc, pages=page_list) - -Dependencies -------------- -PyMuPDF v1.24.0 or later - -Copyright and License ----------------------- -Copyright 2024 Artifex Software, Inc. -License GNU Affero GPL 3.0 -""" - -import string - -import fitz - -if fitz.pymupdf_version_tuple < (1, 24, 0): - raise NotImplementedError("PyMuPDF version 1.24.0 or later is needed.") - - -def to_markdown(doc: fitz.Document, pages: list = None) -> str: - """Process the document and return the text of its selected pages.""" - SPACES = set(string.whitespace) # used to check relevance of text pieces - if not pages: # use all pages if argument not given - pages = range(doc.page_count) - - class IdentifyHeaders: - """Compute data for identifying header text.""" - - def __init__(self, doc, pages: list = None, body_limit: float = None): - """Read all text and make a dictionary of fontsizes. 
- - Args: - pages: optional list of pages to consider - body_limit: consider text with larger font size as some header - """ - if pages is None: # use all pages if omitted - pages = range(doc.page_count) - fontsizes = {} - for pno in pages: - page = doc[pno] - blocks = page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"] - for span in [ # look at all non-empty horizontal spans - s for b in blocks for l in b["lines"] for s in l["spans"] if not SPACES.issuperset(s["text"]) - ]: - fontsz = round(span["size"]) - count = fontsizes.get(fontsz, 0) + len(span["text"].strip()) - fontsizes[fontsz] = count - - # maps a fontsize to a string of multiple # header tag characters - self.header_id = {} - if body_limit is None: # body text fontsize if not provided - body_limit = sorted( - [(k, v) for k, v in fontsizes.items()], - key=lambda i: i[1], - reverse=True, - )[ - 0 - ][0] - - sizes = sorted([f for f in fontsizes.keys() if f > body_limit], reverse=True) - - # make the header tag dictionary - for i, size in enumerate(sizes): - self.header_id[size] = "#" * (i + 1) + " " - - def get_header_id(self, span): - """Return appropriate markdown header prefix. - - Given a text span from a "dict"/"radict" extraction, determine the - markdown header prefix string of 0 to many concatenated '#' characters. - """ - fontsize = round(span["size"]) # compute fontsize - hdr_id = self.header_id.get(fontsize, "") - return hdr_id - - def resolve_links(links, span): - """Accept a span bbox and return a markdown link string.""" - bbox = fitz.Rect(span["bbox"]) # span bbox - # a link should overlap at least 70% of the span - bbox_area = 0.7 * abs(bbox) - for link in links: - hot = link["from"] # the hot area of the link - if not abs(hot & bbox) >= bbox_area: - continue # does not touch the bbox - text = f'[{span["text"].strip()}]({link["uri"]})' - return text - - def write_text(page, clip, hdr_prefix): - """Output the text found inside the given clip. 
- - This is an alternative for plain text in that it outputs - text enriched with markdown styling. - The logic is capable of recognizing headers, body text, code blocks, - inline code, bold, italic and bold-italic styling. - There is also some effort for list supported (ordered / unordered) in - that typical characters are replaced by respective markdown characters. - """ - out_string = "" - code = False # mode indicator: outputting code - - # extract URL type links on page - links = [l for l in page.get_links() if l["kind"] == 2] - - blocks = page.get_text( - "dict", - clip=clip, - flags=fitz.TEXTFLAGS_TEXT, - sort=True, - )["blocks"] - - for block in blocks: # iterate textblocks - previous_y = 0 - for line in block["lines"]: # iterate lines in block - if line["dir"][1] != 0: # only consider horizontal lines - continue - spans = [s for s in line["spans"]] - - this_y = line["bbox"][3] # current bottom coord - - # check for still being on same line - same_line = abs(this_y - previous_y) <= 3 and previous_y > 0 - - if same_line and out_string.endswith("\n"): - out_string = out_string[:-1] - - # are all spans in line in a mono-spaced font? - all_mono = all([s["flags"] & 8 for s in spans]) - - # compute text of the line - text = "".join([s["text"] for s in spans]) - if not same_line: - previous_y = this_y - if not out_string.endswith("\n"): - out_string += "\n" - - if all_mono: - # compute approx. distance from left - assuming a width - # of 0.5*fontsize. 
- delta = int((spans[0]["bbox"][0] - block["bbox"][0]) / (spans[0]["size"] * 0.5)) - if not code: # if not already in code output mode: - out_string += "```" # switch on "code" mode - code = True - if not same_line: # new code line with left indentation - out_string += "\n" + " " * delta + text + " " - previous_y = this_y - else: # same line, simply append - out_string += text + " " - continue # done with this line - - for i, s in enumerate(spans): # iterate spans of the line - # this line is not all-mono, so switch off "code" mode - if code: # still in code output mode? - out_string += "```\n" # switch of code mode - code = False - # decode font properties - mono = s["flags"] & 8 - bold = s["flags"] & 16 - italic = s["flags"] & 2 - - if mono: - # this is text in some monospaced font - out_string += f"`{s['text'].strip()}` " - else: # not a mono text - # for first span, get header prefix string if present - if i == 0: - hdr_string = hdr_prefix.get_header_id(s) - else: - hdr_string = "" - prefix = "" - suffix = "" - if hdr_string == "": - if bold: - prefix = "**" - suffix += "**" - if italic: - prefix += "_" - suffix = "_" + suffix - - ltext = resolve_links(links, s) - if ltext: - text = f"{hdr_string}{prefix}{ltext}{suffix} " - else: - text = f"{hdr_string}{prefix}{s['text'].strip()}{suffix} " - text = ( - text.replace("<", "<") - .replace(">", ">") - .replace(chr(0xF0B7), "-") - .replace(chr(0xB7), "-") - .replace(chr(8226), "-") - .replace(chr(9679), "-") - ) - out_string += text - previous_y = this_y - if not code: - out_string += "\n" - out_string += "\n" - if code: - out_string += "```\n" # switch of code mode - code = False - return out_string.replace(" \n", "\n") - - hdr_prefix = IdentifyHeaders(doc, pages=pages) - md_string = "" - - for pno in pages: - page = doc[pno] - # 1. first locate all tables on page - tabs = page.find_tables() - - # 2. make a list of table boundary boxes, sort by top-left corner. 
- # Must include the header bbox, which may be external. - tab_rects = sorted( - [(fitz.Rect(t.bbox) | fitz.Rect(t.header.bbox), i) for i, t in enumerate(tabs.tables)], - key=lambda r: (r[0].y0, r[0].x0), - ) - - # 3. final list of all text and table rectangles - text_rects = [] - # compute rectangles outside tables and fill final rect list - for i, (r, idx) in enumerate(tab_rects): - if i == 0: # compute rect above all tables - tr = page.rect - tr.y1 = r.y0 - if not tr.is_empty: - text_rects.append(("text", tr, 0)) - text_rects.append(("table", r, idx)) - continue - # read previous rectangle in final list: always a table! - _, r0, idx0 = text_rects[-1] - - # check if a non-empty text rect is fitting in between tables - tr = page.rect - tr.y0 = r0.y1 - tr.y1 = r.y0 - if not tr.is_empty: # empty if two tables overlap vertically! - text_rects.append(("text", tr, 0)) - - text_rects.append(("table", r, idx)) - - # there may also be text below all tables - if i == len(tab_rects) - 1: - tr = page.rect - tr.y0 = r.y1 - if not tr.is_empty: - text_rects.append(("text", tr, 0)) - - if not text_rects: # this will happen for table-free pages - text_rects.append(("text", page.rect, 0)) - else: - rtype, r, idx = text_rects[-1] - if rtype == "table": - tr = page.rect - tr.y0 = r.y1 - if not tr.is_empty: - text_rects.append(("text", tr, 0)) - - # we have all rectangles and can start outputting their contents - for rtype, r, idx in text_rects: - if rtype == "text": # a text rectangle - md_string += write_text(page, r, hdr_prefix) # write MD content - md_string += "\n" - else: # a table rect - md_string += tabs[idx].to_markdown(clean=False) - - md_string += "\n-----\n\n" - - return md_string - - -if __name__ == "__main__": - import os - import sys - import time - - try: - filename = sys.argv[1] - except IndexError: - print(f"Usage:\npython {os.path.basename(__file__)} input.pdf") - sys.exit() - - t0 = time.perf_counter() # start a time - - doc = fitz.open(filename) # open input file 
- parms = sys.argv[2:] # contains ["-pages", "PAGES"] or empty list - pages = range(doc.page_count) # default page range - if len(parms) == 2 and parms[0] == "-pages": # page sub-selection given - pages = [] # list of desired page numbers - - # replace any variable "N" by page count - pages_spec = parms[1].replace("N", f"{doc.page_count}") - for spec in pages_spec.split(","): - if "-" in spec: - start, end = map(int, spec.split("-")) - pages.extend(range(start - 1, end)) - else: - pages.append(int(spec) - 1) - - # make a set of invalid page numbers - wrong_pages = set([n + 1 for n in pages if n >= doc.page_count][:4]) - if wrong_pages != set(): # if any invalid numbers given, exit. - sys.exit(f"Page number(s) {wrong_pages} not in '{doc}'.") - - # get the markdown string - md_string = to_markdown(doc, pages=pages) - - # output to a text file with extension ".md" - out = open(doc.name.replace(".pdf", ".md"), "w") - out.write(md_string) - out.close() - t1 = time.perf_counter() # stop timer - print(f"Markdown creation time for {doc.name=} {round(t1-t0,2)} sec.") diff --git a/requirements.txt b/requirements.txt index fd856ab..e411c4e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,12 @@ -mattermostdriver -certifi -beautifulsoup4 -pillow -httpx -youtube-transcript-api -yt-dlp -anthropic -PyMuPDF \ No newline at end of file +mattermostdriver>=7.3.2 +certifi>=2024.2.2 +beautifulsoup4>=4.12.3 +pillow>=10.3.0 +httpx>=0.27.0 +youtube-transcript-api>=0.6.2 +yt-dlp>=2024.4.9 +anthropic>=0.26.1 +PyMuPDF>=1.24.4 +yfinance>=0.2.40 +pymupdf4llm>=0.0.3 +defusedxml>=0.7.1 \ No newline at end of file