Skip to content

Commit

Permalink
adding the omnidocbench benchmarkl
Browse files Browse the repository at this point in the history
Signed-off-by: Peter Staar <[email protected]>
  • Loading branch information
PeterStaar-IBM committed Dec 29, 2024
1 parent 80bf928 commit b214077
Show file tree
Hide file tree
Showing 5 changed files with 447 additions and 0 deletions.
Empty file.
325 changes: 325 additions & 0 deletions docling_eval/benchmarks/omnidocbench/create.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,325 @@
import argparse
import copy
import json
import logging
import os
from pathlib import Path
from typing import Dict, List
import glob

import pypdfium2 as pdfium
from tqdm import tqdm # type: ignore

from docling_core.types.doc.labels import DocItemLabel

from docling_core.types.doc.document import (
DoclingDocument,
ImageRef,
PageItem,
PictureItem,
ProvenanceItem,
TableCell,
TableData,
TableItem, ImageRefMode
)

from docling_eval.benchmarks.constants import BenchMarkColumns
from docling_eval.benchmarks.utils import write_datasets_info
from docling_eval.docling.conversion import create_converter

from docling_eval.docling.utils import (
crop_bounding_box,
docling_version,
extract_images,
get_binary,
save_shard_to_disk,
)


# Configure logging
logging.basicConfig(
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)

def get_filenames(omnidocbench_dir: Path):

page_images = sorted(glob.glob(str(omnidocbench_dir / "images/*.jpg")))
page_pdfs = sorted(glob.glob(str(omnidocbench_dir / "ori_pdfs/*.pdf")))

assert len(page_images)==len(page_pdfs), f"len(page_images)!=len(page_pdfs) => {len(page_images)}!={len(page_pdfs)}"

return list(zip(page_images, page_pdfs))


def update_gt_into_map(gt):

result = {}

for item in gt:
path = item["page_info"]["image_path"]
result[path] = item

return result

def create_true_doc(jpg_path, pdf_path, gt):

true_doc = DoclingDocument(name=f"ground-truth {os.path.basename(jpg_path)}")

for item in gt["layout_dets"]:

#print(json.dumps(item, indent=2))

label = item["category_type"]
#text = item["text"]

if label=="title":
pass

elif label=="text_block":
pass

elif label=="text_mask":
pass

elif label=="table":
pass

elif label=="table_caption":
pass

elif label=="table_footnote":
pass

elif label=="table_mask":
pass

elif label=="figure":
pass

elif label=="figure_caption":
pass

elif label=="figure_footnote":
pass

elif label=="equation_isolated":
pass

elif label=="equation_caption":
pass

elif label=="code_txt":
pass

elif label=="abandon":
pass

elif label=="need_mask":
pass

elif label=="header":
pass

elif label=="footer":
pass

elif label=="reference":
pass

elif label=="page_footnote":
pass

elif label=="page_number":
pass

else:
logging.error(f"label {label} is not assigned!")
exit(-1)


return true_doc


def create_omnidocbench_e2e_dataset(
omnidocbench_dir: Path, output_dir: Path, image_scale: float = 1.0
):

# Create Converter
doc_converter = create_converter(
artifacts_path=output_dir / "artifacts", page_image_scale=image_scale
)

# load the groundtruth
with open(omnidocbench_dir / f"OmniDocBench.json", "r") as fr:
gt = json.load(fr)

gt = update_gt_into_map(gt)

viz_dir = output_dir / "vizualisations"
os.makedirs(viz_dir, exist_ok=True)

records = []

page_tuples = get_filenames(omnidocbench_dir)

cnt = 0

for page_tuple in tqdm(page_tuples, total=len(page_tuples), ncols=128, desc="Processing files for OmniDocBench with end-to-end"):

jpg_path = page_tuple[0]
pdf_path = page_tuple[1]

logging.info(f"file: {pdf_path}")

if not os.path.basename(jpg_path) in gt:
logging.error(f"did not find ground-truth for {os.path.basename(jpg_path)}")
continue

true_doc = create_true_doc(jpg_path, pdf_path, gt[os.path.basename(jpg_path)])



"""
conv_results = doc_converter.convert(source=pdf_path, raises_on_error=True)
conv_results.document.save_as_html(filename = viz_dir / f"{os.path.basename(pdf_path)}.html",
image_mode = ImageRefMode.EMBEDDED)
pred_doc, pictures, page_images = extract_images(
conv_results.document,
pictures_column=BenchMarkColumns.PICTURES.value, # pictures_column,
page_images_column=BenchMarkColumns.PAGE_IMAGES.value, # page_images_column,
)
record = {
BenchMarkColumns.DOCLING_VERSION: docling_version(),
BenchMarkColumns.STATUS: "SUCCESS",
BenchMarkColumns.DOC_ID: str(os.path.basename(pdf_path)),
BenchMarkColumns.GROUNDTRUTH: "", #json.dumps(true_doc.export_to_dict()),
BenchMarkColumns.PREDICTION: json.dumps(pred_doc.export_to_dict()),
BenchMarkColumns.ORIGINAL: get_binary(pdf_path),
BenchMarkColumns.MIMETYPE: "application/pdf",
BenchMarkColumns.PAGE_IMAGES: page_images,
BenchMarkColumns.PICTURES: pictures,
}
records.append(record)
if cnt>=3:
break
else:
cnt += 1
"""

test_dir = output_dir / "test"
os.makedirs(test_dir, exist_ok=True)

save_shard_to_disk(items=records, dataset_path=test_dir)

write_datasets_info(
name="DPBench: end-to-end",
output_dir=output_dir,
num_train_rows=0,
num_test_rows=len(records),
)


def create_omnidocbench_layout_dataset(
omnidocbench_dir: Path, output_dir: Path, image_scale: float = 1.0
):
create_omnidocbench_e2e_dataset(
omnidocbench_dir=omnidocbench_dir, output_dir=output_dir, image_scale=image_scale
)


def create_omnidocbench_tableformer_dataset(
omnidocbench_dir: Path, output_dir: Path, image_scale: float = 1.0
):
logger.error("Not implemented")
exit(-1)


def parse_arguments():
"""Parse arguments for DP-Bench parsing."""

parser = argparse.ArgumentParser(
description="Process DP-Bench benchmark from directory into HF dataset."
)
parser.add_argument(
"-i",
"--omnidocbench-directory",
help="input directory with documents",
required=True,
)
parser.add_argument(
"-o",
"--output-directory",
help="output directory with shards",
required=False,
default="./benchmarks/omnidocbench",
)
parser.add_argument(
"-s",
"--image-scale",
help="image-scale of the pages",
required=False,
default=1.0,
)
parser.add_argument(
"-m",
"--mode",
help="mode of dataset",
required=False,
choices=["end-2-end", "table", "formula", "all"],
)
args = parser.parse_args()

return (
Path(args.omnidocbench_directory),
Path(args.output_directory),
float(args.image_scale),
args.mode,
)



def main():

omnidocbench_dir, output_dir, image_scale, mode = parse_arguments()

# Create the directory if it does not exist
os.makedirs(output_dir, exist_ok=True)

odir_e2e = Path(output_dir) / "end_to_end"
odir_tab = Path(output_dir) / "tables"
odir_eqn = Path(output_dir) / "formulas"

os.makedirs(odir_e2e, exist_ok=True)
os.makedirs(odir_tab, exist_ok=True)
# os.makedirs(odir_eqn, exist_ok=True)

for _ in ["test", "train"]:
os.makedirs(odir_e2e / _, exist_ok=True)
os.makedirs(odir_tab / _, exist_ok=True)

if mode == "end-2-end":
create_omnidocbench_e2e_dataset(
omnidocbench_dir=omnidocbench_dir, output_dir=odir_e2e, image_scale=image_scale
)

elif mode == "table":
create_omnidocbench_tableformer_dataset(
omnidocbench_dir=omnidocbench_dir, output_dir=odir_tab, image_scale=image_scale
)

elif mode == "all":
create_omnidocbench_e2e_dataset(
omnidocbench_dir=omnidocbench_dir, output_dir=odir_e2e, image_scale=image_scale
)

create_omnidocbench_tableformer_dataset(
omnidocbench_dir=omnidocbench_dir, output_dir=odir_tab, image_scale=image_scale
)


if __name__ == "__main__":
main()
56 changes: 56 additions & 0 deletions docling_eval/utils/repository.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import json
import logging
import os
import subprocess
from pathlib import Path

# Configure logging
logging.basicConfig(
level=logging.INFO, # Set the logging level
format="%(asctime)s - %(levelname)s - %(message)s",
)

logger = logging.getLogger(__name__)


def is_git_lfs_installed():
"""
Check if Git LFS is installed.
"""
try:
result = subprocess.run(
["git", "lfs", "version"],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
)
if result.returncode == 0:
logger.info("Git LFS is installed.")
return True
else:
logger.warning("Git LFS is not installed.")
return False
except FileNotFoundError:
logger.error("Git is not installed.")
return False


def clone_repository(repo_url:str, target_directory:Path):
"""
Clone a Git repository to the specified target directory.
"""
if os.path.exists(target_directory):
logger.warning(
f"Target directory '{target_directory}' already exists. Skipping clone."
)
return

try:
subprocess.run(
["git", "clone", repo_url, target_directory],
check=True,
)
logger.info(f"Repository cloned into '{target_directory}'.")
except subprocess.CalledProcessError as e:
logger.error(f"Failed to clone repository: {e}")
raise
Empty file added docs/examples/__init__.py
Empty file.
Loading

0 comments on commit b214077

Please sign in to comment.