diff --git a/docling_eval/benchmarks/dpbench/create.py b/docling_eval/benchmarks/dpbench/create.py
index 1a04919..9216221 100644
--- a/docling_eval/benchmarks/dpbench/create.py
+++ b/docling_eval/benchmarks/dpbench/create.py
@@ -5,9 +5,9 @@ import os
 from pathlib import Path
 from typing import Dict, List
 
-from tqdm import tqdm
 
 import pypdfium2 as pdfium
+from tqdm import tqdm  # type: ignore
 
 # Configure logging
 logging.basicConfig(
@@ -235,7 +235,9 @@ def update(doc: DoclingDocument, annots: Dict, page_width: float, page_height: f
 
     return
 
-def create_dpbench_e2e_dataset(dpbench_dir: Path, output_dir: Path, image_scale: float = 1.0):
+def create_dpbench_e2e_dataset(
+    dpbench_dir: Path, output_dir: Path, image_scale: float = 1.0
+):
 
     # Create Converter
     doc_converter = create_converter(
@@ -248,7 +250,12 @@ def create_dpbench_e2e_dataset(dpbench_dir: Path, output_dir: Path, image_scale:
 
     records = []
 
-    for filename, annots in tqdm(gt.items(), desc="Processing files for DP-Bench with end-to-end", total=len(gt), ncols=128):
+    for filename, annots in tqdm(
+        gt.items(),
+        desc="Processing files for DP-Bench with end-to-end",
+        total=len(gt),
+        ncols=128,
+    ):
 
         pdf_path = dpbench_dir / f"dataset/pdfs/{filename}"
         # logging.info(f"\n\n===============================\n\nfile: {pdf_path}\n\n")
@@ -287,7 +294,7 @@ def create_dpbench_e2e_dataset(dpbench_dir: Path, output_dir: Path, image_scale:
 
     test_dir = output_dir / "test"
     os.makedirs(test_dir, exist_ok=True)
-    
+
     save_shard_to_disk(items=records, dataset_path=test_dir)
 
     write_datasets_info(
@@ -298,7 +305,9 @@ def create_dpbench_e2e_dataset(dpbench_dir: Path, output_dir: Path, image_scale:
     )
 
 
-def create_dpbench_tableformer_dataset(dpbench_dir: Path, output_dir: Path, image_scale: float = 1.0):
+def create_dpbench_tableformer_dataset(
+    dpbench_dir: Path, output_dir: Path, image_scale: float = 1.0
+):
 
     tf_config = init_tf_model()
 
@@ -308,7 +317,12 @@ def create_dpbench_tableformer_dataset(dpbench_dir: Path, output_dir: Path, imag
 
     records = []
 
-    for filename, annots in tqdm(gt.items(), desc="Processing files for DP-Bench with TableFormer", total=len(gt), ncols=128):
+    for filename, annots in tqdm(
+        gt.items(),
+        desc="Processing files for DP-Bench with TableFormer",
+        total=len(gt),
+        ncols=128,
+    ):
 
         pdf_path = dpbench_dir / f"dataset/pdfs/{filename}"
         # logging.info(f"\n\n===============================\n\nfile: {pdf_path}\n\n")
@@ -453,7 +467,7 @@ def create_dpbench_tableformer_dataset(dpbench_dir: Path, output_dir: Path, imag
 
     test_dir = output_dir / "test"
     os.makedirs(test_dir, exist_ok=True)
-    
+
     save_shard_to_disk(items=records, dataset_path=test_dir)
 
     write_datasets_info(
diff --git a/docling_eval/benchmarks/utils.py b/docling_eval/benchmarks/utils.py
index db63243..12ee1e0 100644
--- a/docling_eval/benchmarks/utils.py
+++ b/docling_eval/benchmarks/utils.py
@@ -1,9 +1,7 @@
 import json
 from pathlib import Path
 
-from docling_eval.benchmarks.constants import BenchMarkNames
-
-from docling_eval.benchmarks.constants import BenchMarkColumns
+from docling_eval.benchmarks.constants import BenchMarkColumns, BenchMarkNames
 
 
 def write_datasets_info(
@@ -37,5 +35,3 @@ def write_datasets_info(
 
     with open(output_dir / f"dataset_infos.json", "w") as fw:
         fw.write(json.dumps(dataset_infos, indent=2))
-
-
diff --git a/docling_eval/cli/main.py b/docling_eval/cli/main.py
index 153165a..5c3cd75 100644
--- a/docling_eval/cli/main.py
+++ b/docling_eval/cli/main.py
@@ -1,20 +1,26 @@
-import os
 import json
 import logging
+import os
 from enum import Enum, auto
 from pathlib import Path
 from typing import Annotated, Optional
 
+import matplotlib.pyplot as plt
 import typer
 
 from docling_eval.benchmarks.constants import BenchMarkNames
-
-from docling_eval.benchmarks.dpbench.create import create_dpbench_e2e_dataset, create_dpbench_tableformer_dataset
-
-from docling_eval.evaluators.layout_evaluator import LayoutEvaluator, DatasetLayoutEvaluation
-from docling_eval.evaluators.table_evaluator import TableEvaluator, DatasetTableEvaluation
-
-import matplotlib.pyplot as plt
+from docling_eval.benchmarks.dpbench.create import (
+    create_dpbench_e2e_dataset,
+    create_dpbench_tableformer_dataset,
+)
+from docling_eval.evaluators.layout_evaluator import (
+    DatasetLayoutEvaluation,
+    LayoutEvaluator,
+)
+from docling_eval.evaluators.table_evaluator import (
+    DatasetTableEvaluation,
+    TableEvaluator,
+)
 
 # Configure logging
 logging.basicConfig(
@@ -35,6 +41,7 @@ class EvaluationTask(str, Enum):
     EVALUATE = "evaluate"
     VISUALIZE = "visualize"
 
+
 class EvaluationModality(str, Enum):
     END2END = "end-to-end"
     LAYOUT = "layout"
@@ -42,7 +49,13 @@ class EvaluationModality(str, Enum):
     CODEFORMER = "codeformer"
 
 
-def create(modality:EvaluationModality, benchmark:BenchMarkNames, idir:Path, odir:Path=None, image_scale:float=1.0):
+def create(
+    modality: EvaluationModality,
+    benchmark: BenchMarkNames,
+    idir: Path,
+    odir: Path,
+    image_scale: float = 1.0,
+):
     r""""""
     if not os.path.exists(idir):
         log.error(f"Benchmark directory not found: {idir}")
@@ -50,97 +63,116 @@ def create(modality:EvaluationModality, benchmark:BenchMarkNames, idir:Path, odi
 
     if odir is None:
         odir = Path("./benchmarks") / benchmark.value / modality.value
-    
-    match benchmark:
-        case BenchMarkNames.DPBENCH:
-            if(modality==EvaluationModality.END2END or
-               modality==EvaluationModality.LAYOUT):
-                create_dpbench_e2e_dataset(dpbench_dir=idir, output_dir=odir, image_scale=image_scale)
+    if benchmark == BenchMarkNames.DPBENCH:
 
-            elif(modality==EvaluationModality.TABLEFORMER):
-                create_dpbench_tableformer_dataset(dpbench_dir=idir, output_dir=odir, image_scale=image_scale)
+        if (
+            modality == EvaluationModality.END2END
+            or modality == EvaluationModality.LAYOUT
+        ):
+            create_dpbench_e2e_dataset(
+                dpbench_dir=idir, output_dir=odir, image_scale=image_scale
+            )
 
-            else:
-                log.error(f"{modality} is not yet implemented for {benchmark}")
+        elif modality == EvaluationModality.TABLEFORMER:
+            create_dpbench_tableformer_dataset(
+                dpbench_dir=idir, output_dir=odir, image_scale=image_scale
+            )
 
-        case _:
-            log.error(f"{benchmark} is not yet implemented")
+        else:
+            log.error(f"{modality} is not yet implemented for {benchmark}")
 
+    else:
+        log.error(f"{benchmark} is not yet implemented")
 
-def evaluate(modality:EvaluationModality, benchmark:BenchMarkNames, idir:Path, odir:Path):
+
+def evaluate(
+    modality: EvaluationModality, benchmark: BenchMarkNames, idir: Path, odir: Path
+):
     r""""""
     if not os.path.exists(idir):
         log.error(f"Benchmark directory not found: {idir}")
-    
-    match modality:
-        case EvaluationModality.END2END:
-            pass
-
-        case EvaluationModality.LAYOUT:
-            layout_evaluator = LayoutEvaluator()
-            ds_evaluation = layout_evaluator(idir, split="test")
-
-        case EvaluationModality.TABLEFORMER:
-            table_evaluator = TableEvaluator()
-            ds_evaluation = table_evaluator(idir, split="test")
-
-        case EvaluationModality.CODEFORMER:
-            pass
 
     # Save the evaluation
     save_fn = odir / f"evaluation_{benchmark.value}_{modality.value}.json"
-    with open(save_fn, "w") as fd:
-        json.dump(ds_evaluation.model_dump(), fd, indent=2, sort_keys=True)
+
+    if modality == EvaluationModality.END2END:
+        logging.error("not supported")
+
+    elif modality == EvaluationModality.LAYOUT:
+        layout_evaluator = LayoutEvaluator()
+        layout_evaluation = layout_evaluator(idir, split="test")
+
+        with open(save_fn, "w") as fd:
+            json.dump(layout_evaluation.model_dump(), fd, indent=2, sort_keys=True)
+
+    elif modality == EvaluationModality.TABLEFORMER:
+        table_evaluator = TableEvaluator()
+        table_evaluation = table_evaluator(idir, split="test")
+
+        with open(save_fn, "w") as fd:
+            json.dump(table_evaluation.model_dump(), fd, indent=2, sort_keys=True)
+
+    elif modality == EvaluationModality.CODEFORMER:
+        pass
+
     log.info("The evaluation has been saved in '%s'", save_fn)
 
-def visualise(modality:EvaluationModality, benchmark:BenchMarkNames, idir:Path, odir:Path):
+
+def visualise(
+    modality: EvaluationModality, benchmark: BenchMarkNames, idir: Path, odir: Path
+):
     filename = odir / f"evaluation_{benchmark.value}_{modality.value}.json"
-    
-    match modality:
-        case EvaluationModality.END2END:
-            pass
-        
-        case EvaluationModality.LAYOUT:
-            pass
-        
-        case EvaluationModality.TABLEFORMER:
-
-            with open(filename, "r") as fd:
-                evaluation = DatasetTableEvaluation.parse_file(filename)
-
-            # Calculate bin widths
-            bin_widths = [evaluation.TEDS.bins[i + 1] - evaluation.TEDS.bins[i] for i in range(len(evaluation.TEDS.bins) - 1)]
-            bin_middle = [(evaluation.TEDS.bins[i + 1] + evaluation.TEDS.bins[i])/2.0 for i in range(len(evaluation.TEDS.bins) - 1)]
-
-            for i in range(len(evaluation.TEDS.bins)-1):
-                logging.info(f"{i:02} [{evaluation.TEDS.bins[i]:.3f}, {evaluation.TEDS.bins[i+1]:.3f}]: {evaluation.TEDS.hist[i]}")
-
-            # Plot histogram
-            plt.bar(bin_middle, evaluation.TEDS.hist, width=bin_widths, edgecolor="black")
-            #width=(evaluation.TEDS.bins[1] - evaluation.TEDS.bins[0]),
-
-            plt.xlabel("TEDS")
-            plt.ylabel("Frequency")
-            plt.title(f"benchmark: {benchmark.value}, modality: {modality.value}")
-
-            figname = odir / f"evaluation_{benchmark.value}_{modality.value}.png"
-            logging.info(f"saving figure to {figname}")
-            plt.savefig(figname)
-
-        case EvaluationModality.CODEFORMER:
-            pass
-
-        case _:
-            pass
-    
+
+    if modality == EvaluationModality.END2END:
+        pass
+
+    elif modality == EvaluationModality.LAYOUT:
+        pass
+
+    elif modality == EvaluationModality.TABLEFORMER:
+
+        with open(filename, "r") as fd:
+            evaluation = DatasetTableEvaluation.parse_file(filename)
+
+        # Calculate bin widths
+        bin_widths = [
+            evaluation.TEDS.bins[i + 1] - evaluation.TEDS.bins[i]
+            for i in range(len(evaluation.TEDS.bins) - 1)
+        ]
+        bin_middle = [
+            (evaluation.TEDS.bins[i + 1] + evaluation.TEDS.bins[i]) / 2.0
+            for i in range(len(evaluation.TEDS.bins) - 1)
+        ]
+
+        for i in range(len(evaluation.TEDS.bins) - 1):
+            logging.info(
+                f"{i:02} [{evaluation.TEDS.bins[i]:.3f}, {evaluation.TEDS.bins[i+1]:.3f}]: {evaluation.TEDS.hist[i]}"
+            )
+
+        # Plot histogram
+        plt.bar(bin_middle, evaluation.TEDS.hist, width=bin_widths, edgecolor="black")
+        # width=(evaluation.TEDS.bins[1] - evaluation.TEDS.bins[0]),
+
+        plt.xlabel("TEDS")
+        plt.ylabel("Frequency")
+        plt.title(f"benchmark: {benchmark.value}, modality: {modality.value}")
+
+        figname = odir / f"evaluation_{benchmark.value}_{modality.value}.png"
+        logging.info(f"saving figure to {figname}")
+        plt.savefig(figname)
+
+    elif modality == EvaluationModality.CODEFORMER:
+        pass
+
+
 @app.command(no_args_is_help=True)
 def main(
     task: Annotated[
         EvaluationTask,
         typer.Option(
-            ..., #EvaluationTask.CREATE,
+            ...,  # EvaluationTask.CREATE,
             "-t",  # Short name
             "--task",  # Long name
             help="Evaluation task",
@@ -149,21 +181,21 @@ def main(
     modality: Annotated[
         EvaluationModality,
         typer.Option(
-            ..., #EvaluationModality.TABLEFORMER,
+            ...,  # EvaluationModality.TABLEFORMER,
            "-m",  # Short name
            "--modality",  # Long name
            help="Evaluation modality",
        ),
-    ],  
+    ],
     benchmark: Annotated[
         BenchMarkNames,
         typer.Option(
-            ..., #BenchMarkNames.DPBENCH,
+            ...,  # BenchMarkNames.DPBENCH,
            "-b",  # Short name
            "--benchmark",  # Long name
            help="Benchmark name",
        ),
-    ],  
+    ],
     idir: Annotated[
         Path,
         typer.Option(
@@ -181,7 +213,7 @@ def main(
            "--output-dir",  # Long name
            help="Output directory",
        ),
-    ],  
+    ],
 ):
 
     # Dispatch the command
     if task == EvaluationTask.CREATE:
@@ -191,10 +223,10 @@ def main(
         evaluate(modality, benchmark, idir, odir)
 
     elif task == EvaluationTask.VISUALIZE:
-        visualise(modality, benchmark, idir, odir)  
+        visualise(modality, benchmark, idir, odir)
 
     else:
-        _log.error("Unsupported command: '%s'", command)
+        log.error("Unsupported command: '%s'", task.value)
 
 
 if __name__ == "__main__":
diff --git a/docling_eval/evaluators/layout_evaluator.py b/docling_eval/evaluators/layout_evaluator.py
index ddbf11d..18073bc 100644
--- a/docling_eval/evaluators/layout_evaluator.py
+++ b/docling_eval/evaluators/layout_evaluator.py
@@ -1,97 +1,108 @@
+import glob
 import logging
 import os
-import glob
-
-from tqdm import tqdm
-
-from typing import Optional, Tuple, List, Dict
 from pathlib import Path
+from typing import Dict, List, Optional, Tuple
 
 import torch
-from torchmetrics.detection.mean_ap import MeanAveragePrecision
-
-from pydantic import BaseModel
-
+from datasets import Dataset, load_dataset
+from docling_core.types.doc.document import (
+    DEFAULT_EXPORT_LABELS,
+    DocItem,
+    DoclingDocument,
+)
 from docling_core.types.doc.labels import DocItemLabel
-from docling_core.types.doc.document import DoclingDocument, DocItem, DEFAULT_EXPORT_LABELS
 from pydantic import BaseModel
-
-from datasets import Dataset
-from datasets import load_dataset
+from torchmetrics.detection.mean_ap import MeanAveragePrecision
+from tqdm import tqdm  # type: ignore
 
 from docling_eval.benchmarks.constants import BenchMarkColumns
 
 
 class LayoutEvaluation(BaseModel):
     name: str
-    label: str = None
+    label: str
     value: float
 
+
 class DatasetLayoutEvaluation(BaseModel):
     true_labels: Dict[str, int]
     pred_labels: Dict[str, int]
 
     intersecting_labels: List[str]
-    
+
     evaluations: List[LayoutEvaluation]
-    
+
 
 class LayoutEvaluator:
 
     def __init__(self) -> None:
-        self.filter_labels=[]
-        self.label_names={}
+        self.filter_labels = []
+        self.label_names = {}
 
-        for i,_ in enumerate(DEFAULT_EXPORT_LABELS):
+        for i, _ in enumerate(DEFAULT_EXPORT_LABELS):
             self.filter_labels.append(_)
             self.label_names[i] = _
 
-    def __call__(self, ds_path: Path, split: str="test") -> DatasetLayoutEvaluation:
-        
+    def __call__(self, ds_path: Path, split: str = "test") -> DatasetLayoutEvaluation:
+
         test_path = str(ds_path / "test" / "*.parquet")
         train_path = str(ds_path / "train" / "*.parquet")
-        
+
         test_files = glob.glob(test_path)
         train_files = glob.glob(train_path)
         logging.info(f"test-files: {test_files}, train-files: {train_files}")
-        
+
         # Load all files into the `test`-`train` split
         ds = None
-        if len(test_files)>0 and len(train_files)>0:
-            ds = load_dataset("parquet", data_files={"test": test_files, "train": train_files})
-        elif len(test_files)>0 and len(train_files)==0:
+        if len(test_files) > 0 and len(train_files) > 0:
+            ds = load_dataset(
+                "parquet", data_files={"test": test_files, "train": train_files}
+            )
+        elif len(test_files) > 0 and len(train_files) == 0:
             ds = load_dataset("parquet", data_files={"test": test_files})
-        
+
         logging.info(f"oveview of dataset: {ds}")
 
-        ds = ds[split]
+        if ds is not None:
+            ds_selection = ds[split]
+
+        true_labels, pred_labels, intersection_labels = self._find_intersecting_labels(
+            ds_selection
+        )
+        logging.info(f"Intersection labels: {intersection_labels}")
 
-        true_labels, pred_labels, intersection_labels = self._find_intersecting_labels(ds)
-        logging.info(f"Intersection labels: {intersection_labels}")
-        
         ground_truths = []
         predictions = []
-        
-        for i, data in tqdm(enumerate(ds), desc="Layout evaluations", ncols=120, total=len(ds)):
+
+        for i, data in tqdm(
+            enumerate(ds_selection),
+            desc="Layout evaluations",
+            ncols=120,
+            total=len(ds_selection),
+        ):
 
             true_doc_dict = data[BenchMarkColumns.GROUNDTRUTH]
             true_doc = DoclingDocument.model_validate_json(true_doc_dict)
 
             pred_doc_dict = data[BenchMarkColumns.PREDICTION]
             pred_doc = DoclingDocument.model_validate_json(pred_doc_dict)
-            
-            gts, preds = self._evaluate_layouts_in_documents(doc_id=data[BenchMarkColumns.DOC_ID],
-                                                             true_doc=true_doc,
-                                                             pred_doc=pred_doc,
-                                                             filter_labels=intersection_labels)
-            
+
+            gts, preds = self._evaluate_layouts_in_documents(
+                doc_id=data[BenchMarkColumns.DOC_ID],
+                true_doc=true_doc,
+                pred_doc=pred_doc,
+                filter_labels=intersection_labels,
+            )
+
             ground_truths.extend(gts)
             predictions.extend(preds)
 
-        assert len(ground_truths)==len(predictions), "len(ground_truths)==len(predictions)"
-        
+        assert len(ground_truths) == len(
+            predictions
+        ), "len(ground_truths)==len(predictions)"
+
         # Initialize Mean Average Precision metric
         metric = MeanAveragePrecision(iou_type="bbox", class_metrics=True)
-        
+
         # Update metric with predictions and ground truths
         metric.update(predictions, ground_truths)
@@ -99,14 +110,19 @@ def __call__(self, ds_path: Path, split: str="test") -> DatasetLayoutEvaluation:
         result = metric.compute()
 
         evaluations: List[LayoutEvaluation] = []
-        for key,value in result.items():
+        for key, value in result.items():
             if isinstance(value, float):
-                evaluations.append(LayoutEvaluation(name=key, value=value))
+                evaluations.append(LayoutEvaluation(name=key, value=value, label=None))
 
         if "map_per_class" in result:
             for label_idx, class_map in enumerate(result["map_per_class"]):
-                evaluations.append(LayoutEvaluation(name="Class mAP[0.5:0.95]", label=intersection_labels[label_idx].value, value=class_map))
-                
+                evaluations.append(
+                    LayoutEvaluation(
+                        name="Class mAP[0.5:0.95]",
+                        label=intersection_labels[label_idx].value,
+                        value=class_map,
+                    )
+                )
 
         # Print results
         print("Results:")
@@ -115,25 +131,33 @@ def __call__(self, ds_path: Path, split: str="test") -> DatasetLayoutEvaluation:
                 print(f"{key}: {value:.3f}")
             except:
                 print(f"{key}: {value}")
-        
+
         # Overall mAP
         print(f"Overall mAP[0.5:0.95]: {result['map'].item():.3f}")
 
         print("\nPer-Class mAP[0.5:0.95]:")
         if "map_per_class" in result:
             for label_idx, class_map in enumerate(result["map_per_class"]):
-                #label_name = self.label_names.get(label_idx, f"Class {label_idx}")  # Use label name or default
-                print(f" => {label_idx} {intersection_labels[label_idx].value}: {class_map:.3f}")
-
-
-        return DatasetLayoutEvaluation(evaluations=evaluations, true_labels=true_labels, pred_labels=pred_labels, intersecting_labels=[_.value for _ in intersection_labels])
+                # label_name = self.label_names.get(label_idx, f"Class {label_idx}")  # Use label name or default
+                print(
+                    f" => {label_idx} {intersection_labels[label_idx].value}: {class_map:.3f}"
+                )
+
+        return DatasetLayoutEvaluation(
+            evaluations=evaluations,
+            true_labels=true_labels,
+            pred_labels=pred_labels,
+            intersecting_labels=[_.value for _ in intersection_labels],
+        )
 
     def _find_intersecting_labels(self, ds: Dataset):
 
-        true_labels = {}
-        pred_labels = {}
-        
-        for i, data in tqdm(enumerate(ds), desc="Layout evaluations", ncols=120, total=len(ds)):
+        true_labels: Dict[str, int] = {}
+        pred_labels: Dict[str, int] = {}
+
+        for i, data in tqdm(
+            enumerate(ds), desc="Layout evaluations", ncols=120, total=len(ds)
+        ):
 
             true_doc_dict = data[BenchMarkColumns.GROUNDTRUTH]
             true_doc = DoclingDocument.model_validate_json(true_doc_dict)
@@ -141,7 +165,7 @@ def _find_intersecting_labels(self, ds: Dataset):
             pred_doc = DoclingDocument.model_validate_json(pred_doc_dict)
 
             for item, level in true_doc.iterate_items():
-                if isinstance(item, DocItem): # and item.label in filter_labels:
+                if isinstance(item, DocItem):  # and item.label in filter_labels:
                     for prov in item.prov:
                         if item.label in true_labels:
                             true_labels[item.label] += 1
@@ -149,41 +173,44 @@ def _find_intersecting_labels(self, ds: Dataset):
                             true_labels[item.label] = 1
 
             for item, level in pred_doc.iterate_items():
-                if isinstance(item, DocItem): # and item.label in filter_labels:
+                if isinstance(item, DocItem):  # and item.label in filter_labels:
                     for prov in item.prov:
                         if item.label in pred_labels:
                             pred_labels[item.label] += 1
                         else:
-                            pred_labels[item.label] = 1                            
+                            pred_labels[item.label] = 1
 
         logging.info(f"True labels:")
-        for label,count in true_labels.items():
+        for label, count in true_labels.items():
             logging.info(f" => {label}: {count}")
 
-        logging.info(f"Pred labels:")        
-        for label,count in pred_labels.items():
+        logging.info(f"Pred labels:")
+        for label, count in pred_labels.items():
             logging.info(f" => {label}: {count}")
-        
-        intersection_labels = []
-        intersection_map = {}
-        for label,count in true_labels.items():
+
+        intersection_labels: List[str] = []
+        for label, count in true_labels.items():
             if label in pred_labels:
-                intersection_map[len(intersection_labels)] = label
                 intersection_labels.append(label)
 
         return true_labels, pred_labels, intersection_labels
 
-    
-    def _evaluate_layouts_in_documents(self, doc_id:str,
-                                       true_doc: DoclingDocument,
-                                       pred_doc: DoclingDocument,
-                                       filter_labels: List[DocItemLabel]):
-        # logging.info(f"#-true-tables: {len(true_tables)}, #-pred-tables: {len(pred_tables)}")
-        assert len(true_doc.pages)==len(pred_doc.pages), "len(true_doc.pages)==len(pred_doc.pages)"
+    def _evaluate_layouts_in_documents(
+        self,
+        doc_id: str,
+        true_doc: DoclingDocument,
+        pred_doc: DoclingDocument,
+        filter_labels: List[DocItemLabel],
+    ):
+
+        # logging.info(f"#-true-tables: {len(true_tables)}, #-pred-tables: {len(pred_tables)}")
+        assert len(true_doc.pages) == len(
+            pred_doc.pages
+        ), "len(true_doc.pages)==len(pred_doc.pages)"
 
         true_pages_to_objects: Dict[int, List[DocItem]] = {}
         pred_pages_to_objects: Dict[int, List[DocItem]] = {}
-        
+
         for item, level in true_doc.iterate_items():
             if isinstance(item, DocItem) and item.label in filter_labels:
                 for prov in item.prov:
@@ -198,20 +225,20 @@ def _evaluate_layouts_in_documents(self, doc_id:str,
                     if prov.page_no not in pred_pages_to_objects:
                         pred_pages_to_objects[prov.page_no] = [item]
                     else:
-                        pred_pages_to_objects[prov.page_no].append(item)                        
+                        pred_pages_to_objects[prov.page_no].append(item)
 
         ground_truths = []
         predictions = []
 
         # logging.info("\n\n ============================================ \n\n")
-        
+
         for page_no, items in true_pages_to_objects.items():
 
             page_size = true_doc.pages[page_no].size
-            
+
             page_height = page_size.height
             page_width = page_size.width
-            
+
             bboxes = []
             labels = []
             for item in items:
@@ -222,22 +249,24 @@ def _evaluate_layouts_in_documents(self, doc_id:str,
                 bbox = bbox.scaled(100.0)
 
                 # logging.info(f"ground-truth {page_no}: {page_width, page_height} -> {item.label}, {bbox.coord_origin}: [{bbox.l}, {bbox.t}, {bbox.r}, {bbox.b}]")
-                
+
                 bboxes.append([bbox.l, bbox.t, bbox.r, bbox.b])
                 labels.append(filter_labels.index(item.label))
-            
-            ground_truths.append({
-                "boxes": torch.tensor(bboxes),
-                "labels": torch.tensor(labels),
-            })
+
+            ground_truths.append(
+                {
+                    "boxes": torch.tensor(bboxes),
+                    "labels": torch.tensor(labels),
+                }
+            )
 
         for page_no, items in pred_pages_to_objects.items():
 
             page_size = pred_doc.pages[page_no].size
-            
+
             page_height = page_size.height
             page_width = page_size.width
-            
+
             bboxes = []
             labels = []
             scores = []
@@ -247,19 +276,23 @@ def _evaluate_layouts_in_documents(self, doc_id:str,
                 bbox = prov.bbox.to_bottom_left_origin(page_height=page_height)
                 bbox = bbox.normalized(page_size)
                 bbox = bbox.scaled(100.0)
-                
+
                 # logging.info(f"prediction {page_no}: {page_width, page_height} -> {item.label}, {bbox.coord_origin}: [{bbox.l}, {bbox.b}, {bbox.r}, {bbox.t}]")
-                
+
                 bboxes.append([bbox.l, bbox.b, bbox.r, bbox.t])
                 labels.append(filter_labels.index(item.label))
-                scores.append(1.0) # FIXME
-                
-            predictions.append({
-                "boxes": torch.tensor(bboxes),
-                "labels": torch.tensor(labels),
-                "scores": torch.tensor(scores),
-            })
-
-        assert len(ground_truths)==len(predictions), "len(ground_truths)==len(predictions)"
-        
+                scores.append(1.0)  # FIXME
+
+            predictions.append(
+                {
+                    "boxes": torch.tensor(bboxes),
+                    "labels": torch.tensor(labels),
+                    "scores": torch.tensor(scores),
+                }
+            )
+
+        assert len(ground_truths) == len(
+            predictions
+        ), "len(ground_truths)==len(predictions)"
+
         return ground_truths, predictions
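Note on the layout metric (not part of the patch): after this change, LayoutEvaluator still builds one dict of xyxy boxes and label indices per page and feeds them into torchmetrics' MeanAveragePrecision. A minimal, self-contained sketch of that pattern follows; the box coordinates are invented for illustration only.

import torch
from torchmetrics.detection.mean_ap import MeanAveragePrecision

# One entry per page; coordinates are made up for illustration.
ground_truths = [
    {"boxes": torch.tensor([[10.0, 20.0, 50.0, 80.0]]), "labels": torch.tensor([0])}
]
predictions = [
    {
        "boxes": torch.tensor([[12.0, 22.0, 49.0, 79.0]]),
        "labels": torch.tensor([0]),
        "scores": torch.tensor([1.0]),  # LayoutEvaluator also hard-codes 1.0 (see the FIXME above)
    }
]

metric = MeanAveragePrecision(iou_type="bbox", class_metrics=True)
metric.update(predictions, ground_truths)
result = metric.compute()
print(result["map"], result["map_per_class"])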
diff --git a/docling_eval/evaluators/table_evaluator.py b/docling_eval/evaluators/table_evaluator.py
index 08c2205..4df23fc 100644
--- a/docling_eval/evaluators/table_evaluator.py
+++ b/docling_eval/evaluators/table_evaluator.py
@@ -1,36 +1,32 @@
+import glob
 import logging
 import os
-import glob
 import statistics
 import time
 from pathlib import Path
-from typing import Optional, Tuple, List
-from tqdm import tqdm
-
-import numpy as np
+from typing import List, Optional, Tuple
 
 import datasets
-
+import numpy as np
+from datasets import Dataset, load_dataset
 from docling_core.types.doc.document import DoclingDocument, TableItem
 from lxml import html
 from pydantic import BaseModel
-
-from docling_eval.utils.teds import TEDScorer
-
-from datasets import Dataset
-from datasets import load_dataset
+from tqdm import tqdm  # type: ignore
 
 from docling_eval.benchmarks.constants import BenchMarkColumns
+from docling_eval.utils.teds import TEDScorer
 
 _log = logging.getLogger(__name__)
 
 
 class TableEvaluation(BaseModel):
-    filename: str = None
-    table_id: int = -1 
+    filename: str = ""
+    table_id: int = -1
     TEDS: float
     is_complex: bool = False
 
+
 class DatasetStatistics(BaseModel):
     total: int
 
@@ -38,17 +34,54 @@ class DatasetStatistics(BaseModel):
     median: float
     std: float
 
-    bins: Tuple[float, float, float, float, float,
-                float, float, float, float, float,
-                float, float, float, float, float,
-                float, float, float, float, float,
-                float]
-    
-    hist: Tuple[float, float, float, float, float,
-                float, float, float, float, float,
-                float, float, float, float, float,
-                float, float, float, float, float]
-    
+    bins: Tuple[
+        float,
+        float,
+        float,
+        float,
+        float,
+        float,
+        float,
+        float,
+        float,
+        float,
+        float,
+        float,
+        float,
+        float,
+        float,
+        float,
+        float,
+        float,
+        float,
+        float,
+        float,
+    ]
+
+    hist: Tuple[
+        float,
+        float,
+        float,
+        float,
+        float,
+        float,
+        float,
+        float,
+        float,
+        float,
+        float,
+        float,
+        float,
+        float,
+        float,
+        float,
+        float,
+        float,
+        float,
+        float,
+    ]
+
+
 class DatasetTableEvaluation(BaseModel):
     evaluations: list[TableEvaluation]
 
@@ -56,20 +89,24 @@ class DatasetTableEvaluation(BaseModel):
     TEDS_simple: DatasetStatistics
     TEDS_complex: DatasetStatistics
 
+
 def compute_stats(values: List[float]) -> DatasetStatistics:
-    total:int = len(values)
+    total: int = len(values)
 
-    mean:float = statistics.mean(values) if len(values) > 0 else None
-    median:float = statistics.median(values) if len(values) > 0 else None
-    std:float = statistics.stdev(values) if len(values) > 0 else None
+    mean: float = statistics.mean(values) if len(values) > 0 else -1
+    median: float = statistics.median(values) if len(values) > 0 else -1
+    std: float = statistics.stdev(values) if len(values) > 0 else -1
     logging.info(f"total: {total}, mean: {mean}, median: {median}, std: {std}")
-    
+
     # Compute the histogram with 20 bins between 0 and 1
     hist, bins = np.histogram(values, bins=20, range=(0, 1))
     logging.info(f"#-hist: {len(hist)}, #-bins: {len(bins)}")
 
-    return DatasetStatistics(total=total, mean=mean, median=median, std=std, hist=hist, bins=bins)
-    
+    return DatasetStatistics(
+        total=total, mean=mean, median=median, std=std, hist=hist, bins=bins
+    )
+
+
 def is_complex_table(table: TableItem) -> bool:
     r"""
     Implement the logic to check if table is complex
@@ -89,7 +126,7 @@ def __init__(self) -> None:
         self._teds_scorer = TEDScorer()
         self._stopwords = ["", "", "", "", "", ""]
 
-    def __call__(self, ds_path: Path, split: str="test") -> DatasetTableEvaluation:
+    def __call__(self, ds_path: Path, split: str = "test") -> DatasetTableEvaluation:
         r"""
         Load a dataset in HF format.
         Expected columns with DoclingDocuments "GTDoclingDocument"
@@ -98,40 +135,49 @@ def __call__(self, ds_path: Path, split: str="test") -> DatasetTableEvaluation:
         logging.info(f"loading from: {ds_path}")
 
         # Load the Parquet file
-        #dataset = Dataset.from_parquet("benchmarks/dpbench-tableformer/test/shard_000000_000000.parquet")
-        #dataset.save_to_disk("benchmarks/dpbench-tableformer-dataset")
+        # dataset = Dataset.from_parquet("benchmarks/dpbench-tableformer/test/shard_000000_000000.parquet")
+        # dataset.save_to_disk("benchmarks/dpbench-tableformer-dataset")
 
         test_path = str(ds_path / "test" / "*.parquet")
         train_path = str(ds_path / "train" / "*.parquet")
-        
+
         test_files = glob.glob(test_path)
         train_files = glob.glob(train_path)
         logging.info(f"test-files: {test_files}, train-files: {train_files}")
-        
+
         # Load all files into the `test`-`train` split
         ds = None
-        if len(test_files)>0 and len(train_files)>0:
-            ds = load_dataset("parquet", data_files={"test": test_files, "train": train_files})
-        elif len(test_files)>0 and len(train_files)==0:
+        if len(test_files) > 0 and len(train_files) > 0:
+            ds = load_dataset(
+                "parquet", data_files={"test": test_files, "train": train_files}
+            )
+        elif len(test_files) > 0 and len(train_files) == 0:
             ds = load_dataset("parquet", data_files={"test": test_files})
-        
+
         logging.info(f"oveview of dataset: {ds}")
-        
+
         table_evaluations = []
-        #ds = datasets.load_from_disk(ds_path)
-        ds = ds[split]
-        for i, data in tqdm(enumerate(ds), desc="Table evaluations", ncols=120, total=len(ds)):
-            #gt_doc_dict = data["GroundTruthDoclingDocument"]
+        # ds = datasets.load_from_disk(ds_path)
+        if ds is not None:
+            ds_selection: Dataset = ds[split]
+
+        for i, data in tqdm(
+            enumerate(ds_selection),
+            desc="Table evaluations",
+            ncols=120,
+            total=len(ds_selection),
+        ):
+            # gt_doc_dict = data["GroundTruthDoclingDocument"]
             gt_doc_dict = data[BenchMarkColumns.GROUNDTRUTH]
             gt_doc = DoclingDocument.model_validate_json(gt_doc_dict)
-            #pred_doc_dict = data["PredictedDoclingDocument"]
+            # pred_doc_dict = data["PredictedDoclingDocument"]
             pred_doc_dict = data[BenchMarkColumns.PREDICTION]
             pred_doc = DoclingDocument.model_validate_json(pred_doc_dict)
 
-            results = self._evaluate_tables_in_documents(doc_id=data[BenchMarkColumns.DOC_ID],
-                                                         gt_doc=gt_doc,
-                                                         pred_doc=pred_doc)
-            
+            results = self._evaluate_tables_in_documents(
+                doc_id=data[BenchMarkColumns.DOC_ID], gt_doc=gt_doc, pred_doc=pred_doc
+            )
+
             table_evaluations.extend(results)
 
         # Compute TED statistics for the entire dataset
@@ -140,17 +186,17 @@ def __call__(self, ds_path: Path, split: str="test") -> DatasetTableEvaluation:
         teds_all = []
         for te in table_evaluations:
             teds_all.append(te.TEDS)
-            
+
             if te.is_complex:
                 teds_complex.append(te.TEDS)
             else:
                 teds_simple.append(te.TEDS)
 
         dataset_evaluation = DatasetTableEvaluation(
-            evaluations = table_evaluations,
-            TEDS = compute_stats(teds_all),
-            TEDS_simple = compute_stats(teds_simple),
-            TEDS_complex = compute_stats(teds_complex),
+            evaluations=table_evaluations,
+            TEDS=compute_stats(teds_all),
+            TEDS_simple=compute_stats(teds_simple),
+            TEDS_complex=compute_stats(teds_complex),
         )
 
         return dataset_evaluation
@@ -166,10 +212,10 @@ def _evaluate_tables_in_documents(
         gt_tables = gt_doc.tables
         pred_tables = pred_doc.tables
 
-        # logging.info(f"#-true-tables: {len(gt_tables)}, #-pred-tables: {len(pred_tables)}")
-        assert len(gt_tables)==len(pred_tables), "len(gt_tables)!=len(pred_tables)"
-        
-        for table_id in range(len(gt_tables)):#, len(pred_tables)):
+        # logging.info(f"#-true-tables: {len(gt_tables)}, #-pred-tables: {len(pred_tables)}")
+        assert len(gt_tables) == len(pred_tables), "len(gt_tables)!=len(pred_tables)"
+
+        for table_id in range(len(gt_tables)):  # , len(pred_tables)):
             try:
                 gt_table = gt_tables[table_id]
 
@@ -185,15 +231,21 @@ def _evaluate_tables_in_documents(
                 gt_html_obj = html.fromstring(gt_html)
                 predicted_html_obj = html.fromstring(predicted_html)
 
-                teds = self._teds_scorer(gt_html_obj, predicted_html_obj, structure_only)
-                #logging.info(f"teds: {teds}")
-                
+                teds = self._teds_scorer(
+                    gt_html_obj, predicted_html_obj, structure_only
+                )
+                # logging.info(f"teds: {teds}")
+
                 teds = round(teds, 3)
-                table_evaluation = TableEvaluation(TEDS=teds, is_complex=is_complex, filename=doc_id, table_id=table_id)
+                table_evaluation = TableEvaluation(
+                    TEDS=teds, is_complex=is_complex, filename=doc_id, table_id=table_id
+                )
                 table_evaluations.append(table_evaluation)
             except Exception as exc:
-                logging.error(f"Table {table_id} from document {doc_id} could not be compared!")
-                
+                logging.error(
+                    f"Table {table_id} from document {doc_id} could not be compared!"
+                )
+
         return table_evaluations
 
     # def _dump_full_table_html(self, image_filename: str, full_table_html: str):
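Hypothetical usage sketch (not part of the patch): driving the two refactored evaluators directly, the same way evaluate() in docling_eval/cli/main.py does after this change. The dataset path and the output file names below are assumptions for illustration; point idir at a benchmark produced with the create_dpbench_* helpers.

import json
from pathlib import Path

from docling_eval.evaluators.layout_evaluator import LayoutEvaluator
from docling_eval.evaluators.table_evaluator import TableEvaluator

idir = Path("./benchmarks/dpbench/layout")  # assumed location of the parquet shards

layout_evaluation = LayoutEvaluator()(idir, split="test")
table_evaluation = TableEvaluator()(idir, split="test")

# Both results are pydantic models, so they serialize the same way the CLI does it.
with open("evaluation_dpbench_layout.json", "w") as fd:
    json.dump(layout_evaluation.model_dump(), fd, indent=2, sort_keys=True)
with open("evaluation_dpbench_tableformer.json", "w") as fd:
    json.dump(table_evaluation.model_dump(), fd, indent=2, sort_keys=True)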