WIP: Working to fix exception of ReadingOrderUpdater for DPBench create
Signed-off-by: Nikos Livathinos <[email protected]>
nikos-livathinos committed Jan 8, 2025
1 parent 83209ff commit ba8e98a
Showing 4 changed files with 62 additions and 42 deletions.
82 changes: 45 additions & 37 deletions docling_eval/benchmarks/dpbench/create.py
@@ -33,6 +33,9 @@
write_datasets_info,
)
from docling_eval.docling.conversion import create_converter
from docling_eval.docling.models.reading_order.reading_order_updater import (
ReadingOrderUpdater,
)
from docling_eval.docling.models.tableformer.tf_model_prediction import (
TableFormerUpdater,
)
@@ -424,28 +427,36 @@ def create_dpbench_tableformer_dataset(
)


def create_dpbench_readingorder_dataset(
dpbench_dir: Path, output_dir: Path, image_scale: float = 1.0
def create_dpbench_reading_order_dataset(
dpbench_dir: Path, output_dir: Path, image_scale: float = 1.0, do_viz: bool = True
):
# Init the TableFormer model
tf_updater = TableFormerUpdater()

r"""
Steps:
1. Initialiaze the ReadingOrderUpdater
2. Create the ground-truth instance of DoclingDocument.
3. Pass the original pdf and the true_doc to the updater to generate the predicted doc.
"""
# load the groundtruth
with open(dpbench_dir / f"dataset/reference.json", "r") as fr:
gt = json.load(fr)

viz_dir = output_dir / "vizualisations"
os.makedirs(viz_dir, exist_ok=True)
# Ensure output dirs
os.makedirs(output_dir, exist_ok=True)
if do_viz:
viz_dir = output_dir / "vizualisations"
os.makedirs(viz_dir, exist_ok=True)

records = []

# Init the ReadingOrderUpdater
reading_order_updater = ReadingOrderUpdater()

for filename, annots in tqdm(
gt.items(),
desc="Processing files for DP-Bench with TableFormer",
desc="Processing files for DP-Bench with the GLM model",
total=len(gt),
ncols=128,
):

pdf_path = dpbench_dir / f"dataset/pdfs/{filename}"

# Create the groundtruth Document
@@ -469,43 +480,40 @@ def create_dpbench_readingorder_dataset(
page_height=page_height,
)

# Create the updated Document
updated, pred_doc = tf_updater.replace_tabledata(
pdf_path=pdf_path, true_doc=true_doc
)

if updated:
pred_doc = reading_order_updater(pdf_path, true_doc)
if not pred_doc:
continue

if True:
save_comparison_html(
filename=viz_dir / f"{os.path.basename(pdf_path)}-comp.html",
true_doc=true_doc,
pred_doc=pred_doc,
page_image=true_page_images[0],
true_labels=TRUE_HTML_EXPORT_LABELS,
pred_labels=PRED_HTML_EXPORT_LABELS,
)
if do_viz:
save_comparison_html(
filename=viz_dir / f"{os.path.basename(pdf_path)}-comp.html",
true_doc=true_doc,
pred_doc=pred_doc,
page_image=true_page_images[0],
true_labels=TRUE_HTML_EXPORT_LABELS,
pred_labels=PRED_HTML_EXPORT_LABELS,
)

record = {
BenchMarkColumns.DOCLING_VERSION: docling_version(),
BenchMarkColumns.STATUS: "SUCCESS",
BenchMarkColumns.DOC_ID: str(os.path.basename(pdf_path)),
BenchMarkColumns.GROUNDTRUTH: json.dumps(true_doc.export_to_dict()),
BenchMarkColumns.PREDICTION: json.dumps(pred_doc.export_to_dict()),
BenchMarkColumns.ORIGINAL: get_binary(pdf_path),
BenchMarkColumns.MIMETYPE: "application/pdf",
BenchMarkColumns.PAGE_IMAGES: true_page_images,
BenchMarkColumns.PICTURES: [], # pred_pictures,
}
records.append(record)
record = {
BenchMarkColumns.DOCLING_VERSION: docling_version(),
BenchMarkColumns.STATUS: "SUCCESS",
BenchMarkColumns.DOC_ID: str(os.path.basename(pdf_path)),
BenchMarkColumns.GROUNDTRUTH: json.dumps(true_doc.export_to_dict()),
BenchMarkColumns.PREDICTION: json.dumps(pred_doc.export_to_dict()),
BenchMarkColumns.ORIGINAL: get_binary(pdf_path),
BenchMarkColumns.MIMETYPE: "application/pdf",
BenchMarkColumns.PAGE_IMAGES: true_page_images,
BenchMarkColumns.PICTURES: [], # pred_pictures,
}
records.append(record)

test_dir = output_dir / "test"
os.makedirs(test_dir, exist_ok=True)

save_shard_to_disk(items=records, dataset_path=test_dir)

write_datasets_info(
name="DPBench: readingorder",
name="DPBench: reading_order",
output_dir=output_dir,
num_train_rows=0,
num_test_rows=len(records),
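For reference, here is a minimal sketch of driving the new builder directly (the paths are placeholders; the import matches the one added to docling_eval/cli/main.py further down):

    from pathlib import Path

    from docling_eval.benchmarks.dpbench.create import (
        create_dpbench_reading_order_dataset,
    )

    # Placeholder paths: point these at a local DP-Bench checkout and an
    # output directory of your choice.
    create_dpbench_reading_order_dataset(
        dpbench_dir=Path("./dpbench"),
        output_dir=Path("./out/dpbench-reading-order"),
        image_scale=1.0,
        do_viz=True,  # also writes the side-by-side HTML comparisons
    )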
8 changes: 4 additions & 4 deletions docling_eval/benchmarks/omnidocbench/create.py
@@ -472,9 +472,9 @@ def create_omnidocbench_reading_order_dataset(
TODO: Most of the code is similar to create_omnidocbench_tableformer_dataset.
Steps:
1. Initialize the LayoutModelUpdater
1. Initialize the ReadingOrderUpdater
2. Create the ground-truth instance of DoclingDocument.
3. Pass the original pdf and the true_doc to the layout updater to generate the predicted doc.
3. Pass the original pdf and the true_doc to the updater to generate the predicted doc.
"""
# load the groundtruth
with open(omnidocbench_dir / f"OmniDocBench.json", "r") as fr:
@@ -491,14 +491,14 @@
records = []
page_tuples = get_filenames(omnidocbench_dir)

# Init the LayoutModelUpdater
# Init the ReadingOrderUpdater
reading_order_updater = ReadingOrderUpdater()

for page_tuple in tqdm(
page_tuples,
total=len(page_tuples),
ncols=128,
desc="Processing files for OmniDocBench with LayoutModel",
desc="Processing files for OmniDocBench with GLM model",
):
jpg_path = page_tuple[0]
pdf_path = Path(page_tuple[1])
5 changes: 5 additions & 0 deletions docling_eval/cli/main.py
@@ -11,6 +11,7 @@
from docling_eval.benchmarks.constants import BenchMarkNames, EvaluationModality
from docling_eval.benchmarks.dpbench.create import (
create_dpbench_e2e_dataset,
create_dpbench_reading_order_dataset,
create_dpbench_tableformer_dataset,
)
from docling_eval.benchmarks.omnidocbench.create import (
@@ -79,6 +80,10 @@ def create(
create_dpbench_tableformer_dataset(
dpbench_dir=idir, output_dir=odir, image_scale=image_scale
)
elif modality == EvaluationModality.READING_ORDER:
create_dpbench_reading_order_dataset(
dpbench_dir=idir, output_dir=odir, image_scale=image_scale
)

else:
log.error(f"{modality} is not yet implemented for {benchmark}")
9 changes: 8 additions & 1 deletion docling_eval/docling/models/reading_order/reading_order_updater.py
@@ -30,12 +30,13 @@ def __call__(
self, pdf_path: Path, true_doc: DoclingDocument
) -> Optional[DoclingDocument]:
r""" """
print(true_doc.name)
# deep copy of the true-document
pred_doc = copy.deepcopy(true_doc)
pred_doc_legacy = docling_document_to_legacy(pred_doc)
ds_doc_dict = pred_doc_legacy.model_dump(by_alias=True, exclude_none=True)
try:
# TODO: Make a concrete check against missing bboxes in the legacy document
# TODO: Understand why some documents fail here
glm_doc = self._nlp_model.apply_on_doc(ds_doc_dict)
except RuntimeError as ex:
# print("nlp_model.apply_on_doc()")
@@ -44,6 +45,12 @@
# Map from value to key.value before calling to_docling_document
for page_element in glm_doc["page-elements"]:
page_element["name"] = self._labels_forward_mapping[page_element["name"]]

# When true_doc.name == "ground-truth 01030000000016.pdf"
# pydantic_core._pydantic_core.ValidationError: 1 validation error for TextItem
# label
# Input should be <DocItemLabel.CAPTION: 'caption'>, <DocItemLabel.CHECKBOX_SELECTED: 'checkbox_selected'>, <DocItemLabel.CHECKBOX_UNSELECTED: 'checkbox_unselected'>, <DocItemLabel.CODE: 'code'>, <DocItemLabel.FOOTNOTE: 'footnote'>, <DocItemLabel.FORMULA: 'formula'>, <DocItemLabel.PAGE_FOOTER: 'page_footer'>, <DocItemLabel.PAGE_HEADER: 'page_header'>, <DocItemLabel.PARAGRAPH: 'paragraph'>, <DocItemLabel.REFERENCE: 'reference'>, <DocItemLabel.TEXT: 'text'> or <DocItemLabel.TITLE: 'title'> [type=literal_error, input_value=<DocItemLabel.DOCUMENT_INDEX: 'document_index'>, input_type=DocItemLabel]

pred_doc = to_docling_document(glm_doc)

return pred_doc

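The ValidationError quoted in the new comment block is the exception this WIP commit is chasing: the GLM output can carry a label (here DocItemLabel.DOCUMENT_INDEX) that TextItem does not accept. One possible guard, sketched as an editorial suggestion rather than the fix this commit settles on, is to normalize the labels after the forward-mapping loop and before to_docling_document; sanitize_page_element_labels is a hypothetical helper name:

    from typing import Any, Dict

    from docling_core.types.doc import DocItemLabel

    # Labels that TextItem accepts, per the validation error quoted above.
    _TEXT_ITEM_LABELS = {
        DocItemLabel.CAPTION, DocItemLabel.CHECKBOX_SELECTED,
        DocItemLabel.CHECKBOX_UNSELECTED, DocItemLabel.CODE,
        DocItemLabel.FOOTNOTE, DocItemLabel.FORMULA,
        DocItemLabel.PAGE_FOOTER, DocItemLabel.PAGE_HEADER,
        DocItemLabel.PARAGRAPH, DocItemLabel.REFERENCE,
        DocItemLabel.TEXT, DocItemLabel.TITLE,
    }

    def sanitize_page_element_labels(glm_doc: Dict[str, Any]) -> None:
        """Remap page-element labels that TextItem would reject."""
        for page_element in glm_doc["page-elements"]:
            # DocItemLabel is a str-valued enum, so this accepts either a
            # member or its string value.
            label = DocItemLabel(page_element["name"])
            if label not in _TEXT_ITEM_LABELS:
                # Assumption: degrade to plain text (e.g. DOCUMENT_INDEX
                # becomes TEXT) rather than raising; dropping the element
                # entirely would be the other option.
                label = DocItemLabel.TEXT
            page_element["name"] = label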