diff --git a/docling_eval/benchmarks/dpbench/create.py b/docling_eval/benchmarks/dpbench/create.py
index 62445b8..f441ada 100644
--- a/docling_eval/benchmarks/dpbench/create.py
+++ b/docling_eval/benchmarks/dpbench/create.py
@@ -33,6 +33,9 @@
     write_datasets_info,
 )
 from docling_eval.docling.conversion import create_converter
+from docling_eval.docling.models.reading_order.reading_order_updater import (
+    ReadingOrderUpdater,
+)
 from docling_eval.docling.models.tableformer.tf_model_prediction import (
     TableFormerUpdater,
 )
@@ -424,28 +427,36 @@ def create_dpbench_tableformer_dataset(
     )
 
 
-def create_dpbench_readingorder_dataset(
-    dpbench_dir: Path, output_dir: Path, image_scale: float = 1.0
+def create_dpbench_reading_order_dataset(
+    dpbench_dir: Path, output_dir: Path, image_scale: float = 1.0, do_viz: bool = True
 ):
-    # Init the TableFormer model
-    tf_updater = TableFormerUpdater()
-
+    r"""
+    Steps:
+    1. Initialize the ReadingOrderUpdater.
+    2. Create the ground-truth instance of DoclingDocument.
+    3. Pass the original PDF and the true_doc to the updater to generate the predicted doc.
+    """
     # load the groundtruth
     with open(dpbench_dir / f"dataset/reference.json", "r") as fr:
         gt = json.load(fr)
 
-    viz_dir = output_dir / "vizualisations"
-    os.makedirs(viz_dir, exist_ok=True)
+    # Ensure output dirs
+    os.makedirs(output_dir, exist_ok=True)
+    if do_viz:
+        viz_dir = output_dir / "vizualisations"
+        os.makedirs(viz_dir, exist_ok=True)
 
     records = []
 
+    # Init the ReadingOrderUpdater
+    reading_order_updater = ReadingOrderUpdater()
+
     for filename, annots in tqdm(
         gt.items(),
-        desc="Processing files for DP-Bench with TableFormer",
+        desc="Processing files for DP-Bench with the GLM model",
         total=len(gt),
         ncols=128,
     ):
-
         pdf_path = dpbench_dir / f"dataset/pdfs/{filename}"
 
         # Create the groundtruth Document
@@ -469,35 +480,32 @@ def create_dpbench_readingorder_dataset(
             page_height=page_height,
         )
 
-        # Create the updated Document
-        updated, pred_doc = tf_updater.replace_tabledata(
-            pdf_path=pdf_path, true_doc=true_doc
-        )
-
-        if updated:
+        pred_doc = reading_order_updater(pdf_path, true_doc)
+        if not pred_doc:
+            continue
 
-            if True:
-                save_comparison_html(
-                    filename=viz_dir / f"{os.path.basename(pdf_path)}-comp.html",
-                    true_doc=true_doc,
-                    pred_doc=pred_doc,
-                    page_image=true_page_images[0],
-                    true_labels=TRUE_HTML_EXPORT_LABELS,
-                    pred_labels=PRED_HTML_EXPORT_LABELS,
-                )
+        if do_viz:
+            save_comparison_html(
+                filename=viz_dir / f"{os.path.basename(pdf_path)}-comp.html",
+                true_doc=true_doc,
+                pred_doc=pred_doc,
+                page_image=true_page_images[0],
+                true_labels=TRUE_HTML_EXPORT_LABELS,
+                pred_labels=PRED_HTML_EXPORT_LABELS,
+            )
 
-            record = {
-                BenchMarkColumns.DOCLING_VERSION: docling_version(),
-                BenchMarkColumns.STATUS: "SUCCESS",
-                BenchMarkColumns.DOC_ID: str(os.path.basename(pdf_path)),
-                BenchMarkColumns.GROUNDTRUTH: json.dumps(true_doc.export_to_dict()),
-                BenchMarkColumns.PREDICTION: json.dumps(pred_doc.export_to_dict()),
-                BenchMarkColumns.ORIGINAL: get_binary(pdf_path),
-                BenchMarkColumns.MIMETYPE: "application/pdf",
-                BenchMarkColumns.PAGE_IMAGES: true_page_images,
-                BenchMarkColumns.PICTURES: [],  # pred_pictures,
-            }
-            records.append(record)
+        record = {
+            BenchMarkColumns.DOCLING_VERSION: docling_version(),
+            BenchMarkColumns.STATUS: "SUCCESS",
+            BenchMarkColumns.DOC_ID: str(os.path.basename(pdf_path)),
+            BenchMarkColumns.GROUNDTRUTH: json.dumps(true_doc.export_to_dict()),
+            BenchMarkColumns.PREDICTION: json.dumps(pred_doc.export_to_dict()),
+            BenchMarkColumns.ORIGINAL: get_binary(pdf_path),
+            BenchMarkColumns.MIMETYPE: "application/pdf",
+            BenchMarkColumns.PAGE_IMAGES: true_page_images,
+            BenchMarkColumns.PICTURES: [],  # pred_pictures,
+        }
+        records.append(record)
 
     test_dir = output_dir / "test"
     os.makedirs(test_dir, exist_ok=True)
@@ -505,7 +513,7 @@ def create_dpbench_readingorder_dataset(
     save_shard_to_disk(items=records, dataset_path=test_dir)
 
     write_datasets_info(
-        name="DPBench: readingorder",
+        name="DPBench: reading_order",
         output_dir=output_dir,
         num_train_rows=0,
         num_test_rows=len(records),
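For reference, the new builder mirrors the e2e and TableFormer variants and can be driven directly. A minimal sketch of calling it, using the signature introduced above; the paths are placeholders:

```python
from pathlib import Path

from docling_eval.benchmarks.dpbench.create import create_dpbench_reading_order_dataset

# Builds the reading-order test records for DP-Bench; with do_viz=True it
# also writes side-by-side HTML comparisons under <output_dir>/vizualisations.
create_dpbench_reading_order_dataset(
    dpbench_dir=Path("./dpbench"),         # placeholder: DP-Bench checkout
    output_dir=Path("./results/dpbench"),  # placeholder: output location
    image_scale=1.0,
    do_viz=True,
)
```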
diff --git a/docling_eval/benchmarks/omnidocbench/create.py b/docling_eval/benchmarks/omnidocbench/create.py
index 6df393b..b27fcbd 100644
--- a/docling_eval/benchmarks/omnidocbench/create.py
+++ b/docling_eval/benchmarks/omnidocbench/create.py
@@ -472,9 +472,9 @@ def create_omnidocbench_reading_order_dataset(
     TODO: Most of the code is similar to create_omnidocbench_tableformer_dataset.
 
     Steps:
-    1. Initialiaze the LayoutModelUpdater
+    1. Initialize the ReadingOrderUpdater.
     2. Create the ground-truth instance of DoclingDocument.
-    3. Pass the original pdf and the true_doc to the layout updater to generate the predicted doc.
+    3. Pass the original PDF and the true_doc to the updater to generate the predicted doc.
     """
     # load the groundtruth
     with open(omnidocbench_dir / f"OmniDocBench.json", "r") as fr:
@@ -491,14 +491,14 @@ def create_omnidocbench_reading_order_dataset(
     records = []
     page_tuples = get_filenames(omnidocbench_dir)
 
-    # Init the LayoutModelUpdater
+    # Init the ReadingOrderUpdater
     reading_order_updater = ReadingOrderUpdater()
 
     for page_tuple in tqdm(
         page_tuples,
         total=len(page_tuples),
         ncols=128,
-        desc="Processing files for OmniDocBench with LayoutModel",
+        desc="Processing files for OmniDocBench with the GLM model",
     ):
         jpg_path = page_tuple[0]
         pdf_path = Path(page_tuple[1])
diff --git a/docling_eval/cli/main.py b/docling_eval/cli/main.py
index 36256e8..3683fa5 100644
--- a/docling_eval/cli/main.py
+++ b/docling_eval/cli/main.py
@@ -11,6 +11,7 @@
 from docling_eval.benchmarks.constants import BenchMarkNames, EvaluationModality
 from docling_eval.benchmarks.dpbench.create import (
     create_dpbench_e2e_dataset,
+    create_dpbench_reading_order_dataset,
     create_dpbench_tableformer_dataset,
 )
 from docling_eval.benchmarks.omnidocbench.create import (
@@ -79,6 +80,10 @@ def create(
         create_dpbench_tableformer_dataset(
             dpbench_dir=idir, output_dir=odir, image_scale=image_scale
         )
+    elif modality == EvaluationModality.READING_ORDER:
+        create_dpbench_reading_order_dataset(
+            dpbench_dir=idir, output_dir=odir, image_scale=image_scale
+        )
     else:
         log.error(f"{modality} is not yet implemented for {benchmark}")
 
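With the CLI wired up, the reading-order modality is reachable through the same `create` entrypoint as the other DP-Bench datasets. A sketch of the equivalent programmatic call; `modality`, `benchmark`, `idir`, and `odir` appear in the hunk above, while `BenchMarkNames.DPBENCH` is an assumed enum member name:

```python
from pathlib import Path

from docling_eval.benchmarks.constants import BenchMarkNames, EvaluationModality
from docling_eval.cli.main import create

# Dispatches to create_dpbench_reading_order_dataset(); unsupported
# benchmark/modality combinations still log "not yet implemented".
create(
    modality=EvaluationModality.READING_ORDER,
    benchmark=BenchMarkNames.DPBENCH,  # assumed enum member for DP-Bench
    idir=Path("./dpbench"),            # placeholder input directory
    odir=Path("./results/dpbench"),    # placeholder output directory
)
```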
diff --git a/docling_eval/docling/models/reading_order/reading_order_updater.py b/docling_eval/docling/models/reading_order/reading_order_updater.py
index 6e47f87..92216c8 100644
--- a/docling_eval/docling/models/reading_order/reading_order_updater.py
+++ b/docling_eval/docling/models/reading_order/reading_order_updater.py
@@ -30,12 +30,13 @@ def __call__(
         self, pdf_path: Path, true_doc: DoclingDocument
     ) -> Optional[DoclingDocument]:
         r""" """
+        print(true_doc.name)
         # deep copy of the true-document
         pred_doc = copy.deepcopy(true_doc)
         pred_doc_legacy = docling_document_to_legacy(pred_doc)
         ds_doc_dict = pred_doc_legacy.model_dump(by_alias=True, exclude_none=True)
         try:
-            # TODO: Make a concrete check against missing bboxes in the legacy document
+            # TODO: Understand why some documents fail here
             glm_doc = self._nlp_model.apply_on_doc(ds_doc_dict)
         except RuntimeError as ex:
             # print("nlp_model.apply_on_doc()")
@@ -44,6 +45,11 @@ def __call__(
         # Map from value to key.value before calling to_docling_document
         for page_element in glm_doc["page-elements"]:
             page_element["name"] = self._labels_forward_mapping[page_element["name"]]
+
+        # When true_doc.name == "ground-truth 01030000000016.pdf":
+        # pydantic_core._pydantic_core.ValidationError: 1 validation error for TextItem label
+        #   Input should be one of the DocItemLabel literals accepted by TextItem
+        #   [type=literal_error, input_value=<a DocItemLabel not valid for TextItem>, input_type=DocItemLabel]
         pred_doc = to_docling_document(glm_doc)
 
         return pred_doc
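Given the ValidationError noted in the comment, one possible hardening (not part of this patch) is to treat a pydantic failure in `to_docling_document` the same way as the `RuntimeError` from `apply_on_doc`, so one bad document is skipped instead of aborting the whole run. A sketch of how the tail of `__call__` could look under that assumption:

```python
# At module top:
from pydantic import ValidationError

# ...inside ReadingOrderUpdater.__call__, replacing the bare call:
try:
    pred_doc = to_docling_document(glm_doc)
except ValidationError as ex:
    # A remapped page-element name that TextItem's label literal does not
    # accept fails validation; returning None lets callers skip the file,
    # as they already do with "if not pred_doc: continue".
    print(f"to_docling_document() failed for {true_doc.name}: {ex}")
    return None
return pred_doc
```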