diff --git a/CHANGELOG.md b/CHANGELOG.md
index a4af7e8..4cade4a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,3 +6,5 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

 ## Unreleased
+
+- Fixed git checks
\ No newline at end of file
diff --git a/README.md b/README.md
index 4c6162f..c1661c3 100644
--- a/README.md
+++ b/README.md
@@ -1,12 +1,12 @@
 # olmOCR

-Toolkit for training language models to work with PDF documents in the wild.
+A toolkit for training language models to work with PDF documents in the wild.

 olmOCR Logo

-Online demo: [https://olmocr.allen.ai/](https://olmocr.allen.ai/)
+Try the online demo: [https://olmocr.allen.ai/](https://olmocr.allen.ai/)

 What is included:
 - A prompting strategy to get really good natural text parsing using ChatGPT 4o - [buildsilver.py](https://github.com/allenai/olmocr/blob/main/olmocr/data/buildsilver.py)
@@ -22,15 +22,15 @@ Requirements:
 - Recent NVIDIA GPU (tested on RTX 4090, L40S, A100, H100)
 - 30GB of free disk space

-You will need to install poppler-utils and some additional fonts as a prerequisite. olmOCR uses poppler to render its PDF images.
+You will need to install poppler-utils and additional fonts for rendering PDF images.

-Linux Ubuntu/Debian
+Install dependencies (Ubuntu/Debian)
 ```bash
 sudo apt-get update
 sudo apt-get install poppler-utils ttf-mscorefonts-installer msttcorefonts fonts-crosextra-caladea fonts-crosextra-carlito gsfonts lcdf-typetools
 ```

-Set up a conda environment, then clone and install the olmocr package
+Set up a conda environment and install olmocr
 ```bash
 conda create -n olmocr python=3.11
 conda activate olmocr
@@ -40,7 +40,7 @@ cd olmocr
 pip install -e .
 ```

-Finally, make sure you have sglang with [flashinfer](https://github.com/flashinfer-ai/flashinfer) installed if you want to run inference on your own GPU.
+Install sglang with [flashinfer](https://github.com/flashinfer-ai/flashinfer) if you want to run inference on GPU.
 ```bash
 pip install sgl-kernel==0.0.3.post1 --force-reinstall --no-deps
 pip install "sglang[all]==0.4.2" --find-links https://flashinfer.ai/whl/cu124/torch2.4/flashinfer/
@@ -48,37 +48,32 @@ pip install "sglang[all]==0.4.2" --find-links https://flashinfer.ai/whl/cu124/to

 **BETA TESTER NOTE:**

-If you are a beta tester, you will need to login using the hugging-face CLI
-to make sure you have access to https://huggingface.co/allenai/olmocr-preview
-
-`huggingface-cli login`
-
+If you're a beta tester, log in with the Hugging Face CLI to access the [olmOCR preview model](https://huggingface.co/allenai/olmocr-preview):
+```bash
+huggingface-cli login
+```
 ### Local Usage Example

-The easiest way to try out olmOCR on one or two PDFs is to check out the [web demo](https://olmocr.allen.ai/).
-
-Once you are ready to run locally, a local GPU is required, as inference is powered by [sglang](https://github.com/sgl-project/sglang)
-under the hood.
-
-This command will convert one PDF into a directory called `localworkspace`:
+For quick testing, try the [web demo](https://olmocr.allen.ai/). To run locally, a GPU is required, as inference is powered by [sglang](https://github.com/sgl-project/sglang) under the hood.

+Convert a Single PDF:
 ```bash
-python -m olmocr.pipeline ./localworkspace --pdfs tests/gnarly_pdfs/horribleocr.pdf
+python -m olmocr.pipeline ./localworkspace --pdfs tests/gnarly_pdfs/horribleocr.pdf  # converts one PDF into a directory called `localworkspace`
 ```

-You can also bulk convert many PDFS with a glob pattern:
+Convert Multiple PDFs:
 ```bash
 python -m olmocr.pipeline ./localworkspace --pdfs tests/gnarly_pdfs/*.pdf
 ```

 #### Viewing Results

-Once that finishes, output is stored as [Dolma](https://github.com/allenai/dolma)-style JSONL inside of the `./localworkspace/results` directory.
+Extracted text is stored as [Dolma](https://github.com/allenai/dolma)-style JSONL inside of the `./localworkspace/results` directory.

 ```bash
 cat localworkspace/results/output_*.jsonl
 ```

-You can view your documents side-by-side with the original PDF renders using the `dolmaviewer` command.
+View results side-by-side with the original PDFs (uses the `dolmaviewer` command):

 ```bash
 python -m olmocr.viewer.dolmaviewer localworkspace/results/output_*.jsonl
@@ -106,7 +101,7 @@ Now on any subsequent nodes, just run this and they will start grabbing items fr
 python -m olmocr.pipeline s3://my_s3_bucket/pdfworkspaces/exampleworkspace
 ```

-If you are at AI2 and want to linearize millions of PDFs efficiently using [beaker](https://www.beaker.org), just add the `--beaker`
+If you are at Ai2 and want to linearize millions of PDFs efficiently using [beaker](https://www.beaker.org), just add the `--beaker`
 flag. This will prepare the workspace on your local machine, and then launch N GPU workers in the cluster to start converting PDFs.

diff --git a/olmocr/data/buildsilverdatasummary.py b/olmocr/data/buildsilverdatasummary.py
index 04633f4..3607a2e 100644
--- a/olmocr/data/buildsilverdatasummary.py
+++ b/olmocr/data/buildsilverdatasummary.py
@@ -1,18 +1,19 @@
 import argparse
-import collections
 import csv
 import json
 import os
 import random
 import re
 import sqlite3
+from collections import Counter
 from concurrent.futures import ProcessPoolExecutor, as_completed
+from typing import Optional
 from urllib.parse import urlparse

 from tqdm import tqdm


-def parse_pdf_hash(pretty_pdf_path: str) -> str:
+def parse_pdf_hash(pretty_pdf_path: str) -> Optional[str]:
     pattern = r"s3://ai2-s2-pdfs/([a-f0-9]{4})/([a-f0-9]+)\.pdf-\d+"
     match = re.match(pattern, pretty_pdf_path)
     if match:
@@ -58,7 +59,7 @@ def cache_athena_csv_to_db(athena_csv_path: str) -> str:
     return db_path


-def get_uri_from_db(db_path: str, pdf_hash: str) -> str:
+def get_uri_from_db(db_path: str, pdf_hash: str) -> Optional[str]:
     conn = sqlite3.connect(db_path)
     cursor = conn.cursor()
     cursor.execute("SELECT uri FROM pdf_mapping WHERE pdf_hash = ?", (pdf_hash,))
@@ -154,7 +155,7 @@ def main():
         for cid, uri, domain in all_rows:
             writer.writerow([cid, uri if uri else "", domain if domain else ""])

-    domain_counter = collections.Counter()
+    domain_counter: Counter[str] = Counter()
     for _, _, domain in all_rows:
         if domain:
             domain_counter[domain] += 1
diff --git a/olmocr/data/renderpdf.py b/olmocr/data/renderpdf.py
index d3db16b..ecdad56 100644
--- a/olmocr/data/renderpdf.py
+++ b/olmocr/data/renderpdf.py
@@ -1,6 +1,7 @@
 import base64
 import io
 import subprocess
+from typing import List

 from PIL import Image

@@ -25,12 +26,11 @@ def get_pdf_media_box_width_height(local_pdf_path: str, page_num: int) -> tuple[

     # Parse the output to find MediaBox
     output = result.stdout
-    media_box = None

     for line in output.splitlines():
         if "MediaBox" in line:
-            media_box = line.split(":")[1].strip().split()
-            media_box = [float(x) for x in media_box]
+            media_box_str: List[str] = line.split(":")[1].strip().split()
+            media_box: List[float] = [float(x) for x in media_box_str]
             return abs(media_box[0] - media_box[2]), abs(media_box[3] - media_box[1])

     raise ValueError("MediaBox not found in the PDF info.")
diff --git a/olmocr/data/runopenaibatch.py b/olmocr/data/runopenaibatch.py
index 735febc..7090a03 100644
--- a/olmocr/data/runopenaibatch.py
+++ b/olmocr/data/runopenaibatch.py
@@ -144,8 +144,8 @@ def get_estimated_space_usage(folder_path):


 def get_next_work_item(folder_path):
-    all_states = get_state(folder_path)
-    all_states = [s for s in all_states.values() if s["state"] not in FINISHED_STATES]
+    all_states = list(get_state(folder_path).values())
+    all_states = [s for s in all_states if s["state"] not in FINISHED_STATES]
     all_states.sort(key=lambda s: s["last_checked"])

     return all_states[0] if len(all_states) > 0 else None
diff --git a/olmocr/eval/buildelo.py b/olmocr/eval/buildelo.py
index f3eea20..2b94cc7 100644
--- a/olmocr/eval/buildelo.py
+++ b/olmocr/eval/buildelo.py
@@ -27,11 +27,17 @@ class Comparison:

     @property
     def comparison_a_method(self):
-        return re.search(r"page[0-9]+_(\w+)\.md$", self.comparison_a_path).group(1)
+        match = re.search(r"page[0-9]+_(\w+)\.md$", self.comparison_a_path)
+        if match:
+            return match.group(1)
+        raise ValueError(f"No match found in path: {self.comparison_a_path}")

     @property
     def comparison_b_method(self):
-        return re.search(r"page[0-9]+_(\w+)\.md$", self.comparison_b_path).group(1)
+        match = re.search(r"page[0-9]+_(\w+)\.md$", self.comparison_b_path)
+        if match:
+            return match.group(1)
+        raise ValueError(f"No match found in path: {self.comparison_b_path}")


 def process_single_pdf(pdf_path, all_mds, comparisons, segmenter_name="spacy"):
diff --git a/olmocr/eval/runeval.py b/olmocr/eval/runeval.py
index 994cf09..969e65a 100644
--- a/olmocr/eval/runeval.py
+++ b/olmocr/eval/runeval.py
@@ -230,8 +230,8 @@ def list_jsonl_files(path: str) -> list:
 # Returns the average Levenshtein distance match between the data
 def process_jsonl_file(jsonl_file, gold_data, comparer):
     page_data = {}
-    total_alignment_score = 0
-    char_weighted_alignment_score = 0
+    total_alignment_score: float = 0.0
+    char_weighted_alignment_score: float = 0.0
     total_pages = 0
     total_chars = 0
     total_errors = 0
diff --git a/olmocr/eval/scoreelo.py b/olmocr/eval/scoreelo.py
index 6d510e0..49393cb 100644
--- a/olmocr/eval/scoreelo.py
+++ b/olmocr/eval/scoreelo.py
@@ -1,9 +1,10 @@
 import csv
 import re
 from collections import defaultdict
+from typing import Any, DefaultDict
 from urllib.parse import parse_qs, urlencode, urlsplit, urlunsplit

-import requests
+import requests  # type: ignore


 def fetch_review_page_html(url):
@@ -108,7 +109,7 @@ def build_comparison_report(entries_dict, datastore):
     comparisons[(A, B)] = [A_wins, B_wins], where A < B lexicographically
     in that tuple.
""" - comparisons = defaultdict(lambda: [0, 0]) + comparisons: DefaultDict[Any, list[int]] = defaultdict(lambda: [0, 0]) for entry_id, vote in datastore.items(): if entry_id not in entries_dict: diff --git a/olmocr/filter/filter.py b/olmocr/filter/filter.py index 470c81b..d017ab4 100644 --- a/olmocr/filter/filter.py +++ b/olmocr/filter/filter.py @@ -2,6 +2,7 @@ import re import subprocess from collections import Counter +from typing import List from lingua import Language, LanguageDetectorBuilder from pypdf import PdfReader @@ -142,7 +143,7 @@ def process_pdf(s3_path): # Load the list of S3 paths with a progress bar with open("/home/ubuntu/s2pdf_paths_1M.txt", "r") as f: - s3_work_paths = list(filter(None, (line.strip() for line in tqdm(f, desc="Loading paths")))) + s3_work_paths: List[str] = list(filter(None, (line.strip() for line in tqdm(f, desc="Loading paths")))) # Initialize the PDF filter filter = PdfFilter( @@ -173,7 +174,7 @@ def process_pdf(s3_path): while pending_futures: # Wait for the next future to complete - done, _ = wait( + done, _ = wait( # type: ignore pending_futures.keys(), timeout=0.1, return_when=FIRST_COMPLETED, diff --git a/olmocr/metrics.py b/olmocr/metrics.py index 29d6c68..6795fd2 100644 --- a/olmocr/metrics.py +++ b/olmocr/metrics.py @@ -1,7 +1,7 @@ import asyncio import time from collections import defaultdict, deque -from typing import Dict +from typing import Any, Deque, Dict, List, Set class MetricsKeeper: @@ -15,7 +15,7 @@ def __init__(self, window=60 * 5): self.window = window # Time window in seconds self.start_time = time.time() # Timestamp when MetricsKeeper was created self.total_metrics = defaultdict(int) # Cumulative metrics since start - self.window_metrics = deque() # Deque to store (timestamp, metrics_dict) + self.window_metrics: Deque[Any] = deque() # Deque to store (timestamp, metrics_dict) self.window_sum = defaultdict(int) # Sum of metrics within the window def add_metrics(self, **kwargs): @@ -108,16 +108,16 @@ async def get_status_table(self) -> str: """ async with self.lock: # Determine all unique states across all workers - all_states = set() + all_states: Set[str] = set() for states in self.worker_status.values(): all_states.update(states.keys()) - all_states = sorted(all_states) + sorted_states: List[str] = sorted(all_states) - headers = ["Worker ID"] + all_states + headers = ["Worker ID"] + sorted_states # type: ignore rows = [] for worker_id, states in sorted(self.worker_status.items()): row = [str(worker_id)] - for state in all_states: + for state in sorted_states: count = states.get(state, 0) row.append(str(count)) rows.append(row) diff --git a/olmocr/pipeline.py b/olmocr/pipeline.py index 77eb654..b64597e 100644 --- a/olmocr/pipeline.py +++ b/olmocr/pipeline.py @@ -115,7 +115,7 @@ async def build_page_query(local_pdf_path: str, page: int, target_longest_image_ process_pool, partial(get_anchor_text, pdf_engine="pdfreport", target_length=target_anchor_text_len), local_pdf_path, page ) - image_base64, anchor_text = await asyncio.gather(image_base64, anchor_text) + image_base64, anchor_text = await asyncio.gather(image_base64, anchor_text) # type: ignore if image_rotation != 0: image_bytes = base64.b64decode(image_base64) with Image.open(BytesIO(image_bytes)) as img: @@ -659,7 +659,7 @@ async def metrics_reporter(work_queue): def submit_beaker_job(args): - from beaker import ( + from beaker import ( # type: ignore Beaker, Constraints, EnvVar, diff --git a/olmocr/prompts/anchor.py b/olmocr/prompts/anchor.py index e920986..278ea60 100644 
--- a/olmocr/prompts/anchor.py
+++ b/olmocr/prompts/anchor.py
@@ -35,7 +35,7 @@ def get_anchor_text(

         scores = {label: get_document_coherency(text) for label, text in options.items()}

-        best_option_label = max(scores, key=scores.get)
+        best_option_label = max(scores, key=scores.get)  # type: ignore
         best_option = options[best_option_label]

         print(f"topcoherency chosen: {best_option_label}")
@@ -194,7 +194,7 @@ def bboxes_overlap(b1: BoundingBox, b2: BoundingBox, tolerance: float) -> bool:
             union(i, j)

     # Group images by their root parent
-    groups = {}
+    groups: dict[int, list[int]] = {}
     for i in range(n):
         root = find(i)
         groups.setdefault(root, []).append(i)
@@ -268,21 +268,21 @@ def _linearize_pdf_report(report: PageReport, max_length: int = 4000) -> str:

     # Process text elements
     text_strings = []
-    for element in report.text_elements:
-        if len(element.text.strip()) == 0:
+    for element in report.text_elements:  # type: ignore
+        if len(element.text.strip()) == 0:  # type: ignore
             continue

-        element_text = _cleanup_element_text(element.text)
-        text_str = f"[{element.x:.0f}x{element.y:.0f}]{element_text}\n"
+        element_text = _cleanup_element_text(element.text)  # type: ignore
+        text_str = f"[{element.x:.0f}x{element.y:.0f}]{element_text}\n"  # type: ignore
         text_strings.append((element, text_str))

     # Combine all elements with their positions for sorting
-    all_elements = []
+    all_elements: list[tuple[str, ImageElement, str, tuple[float, float]]] = []
     for elem, s in image_strings:
         position = (elem.bbox.x0, elem.bbox.y0)
         all_elements.append(("image", elem, s, position))
     for elem, s in text_strings:
-        position = (elem.x, elem.y)
+        position = (elem.x, elem.y)  # type: ignore
         all_elements.append(("text", elem, s, position))

     # Calculate total length
@@ -311,7 +311,7 @@ def _linearize_pdf_report(report: PageReport, max_length: int = 4000) -> str:
         max_x_text = max(text_elements, key=lambda e: e.x)
         min_y_text = min(text_elements, key=lambda e: e.y)
         max_y_text = max(text_elements, key=lambda e: e.y)
-        edge_elements.update([min_x_text, max_x_text, min_y_text, max_y_text])
+        edge_elements.update([min_x_text, max_x_text, min_y_text, max_y_text])  # type: ignore

     # Keep track of element IDs to prevent duplication
     selected_element_ids = set()
diff --git a/olmocr/s3_utils.py b/olmocr/s3_utils.py
index 3c4e9bd..4bef7ae 100644
--- a/olmocr/s3_utils.py
+++ b/olmocr/s3_utils.py
@@ -12,7 +12,7 @@
 from urllib.parse import urlparse

 import boto3
-import requests
+import requests  # type: ignore
 import zstandard as zstd
 from boto3.s3.transfer import TransferConfig
 from botocore.config import Config
@@ -58,7 +58,7 @@ def expand_s3_glob(s3_client, s3_glob: str) -> dict[str, str]:
     for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
         for obj in page.get("Contents", []):
             key = obj["Key"]
-            if glob.fnmatch.fnmatch(key, posixpath.join(prefix, pattern)):
+            if glob.fnmatch.fnmatch(key, posixpath.join(prefix, pattern)):  # type: ignore
                 matched[f"s3://{bucket}/{key}"] = obj["ETag"].strip('"')

     return matched
diff --git a/olmocr/train/core/cli.py b/olmocr/train/core/cli.py
index 0de1176..4be8366 100644
--- a/olmocr/train/core/cli.py
+++ b/olmocr/train/core/cli.py
@@ -33,7 +33,7 @@
 from omegaconf.errors import OmegaConfBaseException
 from rich.console import Console
 from rich.syntax import Syntax
-from yaml import safe_load
+from yaml import safe_load  # type: ignore

 from .errors import DolmaRefineError

@@ -116,7 +116,7 @@ def _make_parser(parser: A, config: Type[DataClass], prefix: Optional[str] = Non
         # here's where we check if T is a dataclass
         if is_dataclass(typ_):
             # recursively add subparsers
-            _make_parser(parser, typ_, prefix=field_name)
+            _make_parser(parser, typ_, prefix=field_name)  # type: ignore
             continue

         if typ_ is bool:
diff --git a/olmocr/train/dataloader.py b/olmocr/train/dataloader.py
index 3dde420..7304399 100644
--- a/olmocr/train/dataloader.py
+++ b/olmocr/train/dataloader.py
@@ -52,7 +52,7 @@ def list_dataset_files(s3_glob_path: str):
         return glob.glob(s3_glob_path)


-def load_jsonl_into_ds(s3_glob_path: str, first_n_files: int = None) -> Dataset:
+def load_jsonl_into_ds(s3_glob_path: str, first_n_files: Optional[int] = None) -> Dataset:
     """
     Loads JSONL files from the specified S3 path into a Hugging Face Dataset.
     """
diff --git a/olmocr/train/molmo/modeling_molmo.py b/olmocr/train/molmo/modeling_molmo.py
index 0002ddd..e4d8460 100644
--- a/olmocr/train/molmo/modeling_molmo.py
+++ b/olmocr/train/molmo/modeling_molmo.py
@@ -1,3 +1,4 @@
+# type: ignore
 import logging
 import math
 from copy import deepcopy
diff --git a/olmocr/work_queue.py b/olmocr/work_queue.py
index 353f66a..8d6be16 100644
--- a/olmocr/work_queue.py
+++ b/olmocr/work_queue.py
@@ -5,8 +5,9 @@
 import logging
 import os
 import random
+from asyncio import Queue
 from dataclasses import dataclass
-from typing import List, Optional
+from typing import Any, List, Optional

 from olmocr.s3_utils import (
     download_zstd_csv,
@@ -196,7 +197,7 @@ def __init__(self, workspace_path: str):
         os.makedirs(self._locks_dir, exist_ok=True)

         # Internal queue
-        self._queue = asyncio.Queue()
+        self._queue: Queue[Any] = Queue()

     async def populate_queue(self, work_paths: List[str], items_per_group: int) -> None:
         """
@@ -401,7 +402,7 @@ def __init__(self, s3_client, workspace_path: str):
         self._index_path = os.path.join(self.workspace_path, "work_index_list.csv.zstd")
         self._output_glob = os.path.join(self.workspace_path, "results", "*.jsonl")

-        self._queue = asyncio.Queue()
+        self._queue: Queue[Any] = Queue()

     async def populate_queue(self, work_paths: List[str], items_per_group: int) -> None:
         """
diff --git a/tests/test_sglang.py b/tests/test_sglang.py
index 1668b22..806ec41 100644
--- a/tests/test_sglang.py
+++ b/tests/test_sglang.py
@@ -35,6 +35,7 @@
     "s3://ai2-oe-data/jakep/experiments/qwen2vl-pdf/v1/models/jakep/Qwen_Qwen2-VL-7B-Instruct-e4ecf8-01JAH8GMWHTJ376S2N7ETXRXH4/checkpoint-9500/bf16/"
 )

+
 @unittest.skip("Skip these tests when running CI, they are mostly for experimentation")
 class TestSglangServer(unittest.IsolatedAsyncioTestCase):
     async def asyncSetUp(self):