feat: ReadingOrderEvaluator: Full implementation with Average Relative Distance metric

Signed-off-by: Nikos Livathinos <[email protected]>
nikos-livathinos committed Jan 13, 2025
1 parent 3cbf954 commit 586ae01
Showing 7 changed files with 245 additions and 91 deletions.
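Note: 7 changed files are listed, but only 6 diffs rendered below; the ReadingOrderEvaluator module itself (imported in docling_eval/cli/main.py as docling_eval.evaluators.readingorder_evaluator) is not shown. For orientation, here is a minimal sketch of how an Average Relative Distance over reading order can be computed; the function name and the normalization are assumptions for illustration, not the repository's actual code.

```python
# Illustrative sketch only: the ReadingOrderEvaluator diff is not rendered on
# this page, so the name and the normalization below are assumptions.
from typing import Dict, List


def average_relative_distance(true_order: List[str], pred_order: List[str]) -> float:
    """Mean absolute rank displacement between ground-truth and predicted
    reading order, scaled by the element count (0.0 means identical order)."""
    assert sorted(true_order) == sorted(pred_order), "orders must cover the same elements"
    n = len(true_order)
    if n == 0:
        return 0.0
    true_rank: Dict[str, int] = {elem: i for i, elem in enumerate(true_order)}
    pred_rank: Dict[str, int] = {elem: i for i, elem in enumerate(pred_order)}
    # Average the per-element displacement |rank_true - rank_pred| / n over all n elements.
    return sum(abs(true_rank[e] - pred_rank[e]) for e in true_rank) / (n * n)
```

For example, average_relative_distance(["a", "b", "c"], ["b", "a", "c"]) returns 2/9 in this sketch: only "a" and "b" are displaced, each by one position.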
1 change: 1 addition & 0 deletions docling_eval/benchmarks/constants.py
@@ -23,6 +23,7 @@ class EvaluationModality(str, Enum):
     LAYOUT = "layout"
     TABLEFORMER = "tableformer"
     CODEFORMER = "codeformer"
+    READING_ORDER = "reading_order"
 
 
 class BenchMarkNames(str, Enum):
91 changes: 3 additions & 88 deletions docling_eval/benchmarks/dpbench/create.py
@@ -33,6 +33,9 @@
     write_datasets_info,
 )
 from docling_eval.docling.conversion import create_converter
+from docling_eval.docling.models.reading_order.reading_order_updater import (
+    ReadingOrderUpdater,
+)
 from docling_eval.docling.models.tableformer.tf_model_prediction import (
     TableFormerUpdater,
 )
@@ -424,94 +427,6 @@ def create_dpbench_tableformer_dataset(
     )
 
 
-def create_dpbench_readingorder_dataset(
-    dpbench_dir: Path, output_dir: Path, image_scale: float = 1.0
-):
-    # Init the TableFormer model
-    tf_updater = TableFormerUpdater()
-
-    # load the groundtruth
-    with open(dpbench_dir / f"dataset/reference.json", "r") as fr:
-        gt = json.load(fr)
-
-    viz_dir = output_dir / "vizualisations"
-    os.makedirs(viz_dir, exist_ok=True)
-
-    records = []
-
-    for filename, annots in tqdm(
-        gt.items(),
-        desc="Processing files for DP-Bench with TableFormer",
-        total=len(gt),
-        ncols=128,
-    ):
-
-        pdf_path = dpbench_dir / f"dataset/pdfs/{filename}"
-
-        # Create the groundtruth Document
-        true_doc = DoclingDocument(name=f"ground-truth {os.path.basename(pdf_path)}")
-        true_doc, true_page_images = add_pages_to_true_doc(
-            pdf_path=pdf_path, true_doc=true_doc, image_scale=image_scale
-        )
-
-        assert len(true_page_images) == 1, "len(true_page_images)==1"
-
-        page_width = true_doc.pages[1].size.width
-        page_height = true_doc.pages[1].size.height
-
-        for elem in annots["elements"]:
-            update(
-                true_doc,
-                elem,
-                page=true_doc.pages[1],
-                page_image=true_page_images[0],
-                page_width=page_width,
-                page_height=page_height,
-            )
-
-        # Create the updated Document
-        updated, pred_doc = tf_updater.replace_tabledata(
-            pdf_path=pdf_path, true_doc=true_doc
-        )
-
-        if updated:
-
-            if True:
-                save_comparison_html(
-                    filename=viz_dir / f"{os.path.basename(pdf_path)}-comp.html",
-                    true_doc=true_doc,
-                    pred_doc=pred_doc,
-                    page_image=true_page_images[0],
-                    true_labels=TRUE_HTML_EXPORT_LABELS,
-                    pred_labels=PRED_HTML_EXPORT_LABELS,
-                )
-
-            record = {
-                BenchMarkColumns.DOCLING_VERSION: docling_version(),
-                BenchMarkColumns.STATUS: "SUCCESS",
-                BenchMarkColumns.DOC_ID: str(os.path.basename(pdf_path)),
-                BenchMarkColumns.GROUNDTRUTH: json.dumps(true_doc.export_to_dict()),
-                BenchMarkColumns.PREDICTION: json.dumps(pred_doc.export_to_dict()),
-                BenchMarkColumns.ORIGINAL: get_binary(pdf_path),
-                BenchMarkColumns.MIMETYPE: "application/pdf",
-                BenchMarkColumns.PAGE_IMAGES: true_page_images,
-                BenchMarkColumns.PICTURES: [],  # pred_pictures,
-            }
-            records.append(record)
-
-    test_dir = output_dir / "test"
-    os.makedirs(test_dir, exist_ok=True)
-
-    save_shard_to_disk(items=records, dataset_path=test_dir)
-
-    write_datasets_info(
-        name="DPBench: readingorder",
-        output_dir=output_dir,
-        num_train_rows=0,
-        num_test_rows=len(records),
-    )
-
-
 def parse_arguments():
     """Parse arguments for DP-Bench parsing."""
 
4 changes: 4 additions & 0 deletions docling_eval/benchmarks/omnidocbench/create.py
@@ -21,6 +21,9 @@
     write_datasets_info,
 )
 from docling_eval.docling.conversion import create_converter
+from docling_eval.docling.models.reading_order.reading_order_updater import (
+    ReadingOrderUpdater,
+)
 from docling_eval.docling.models.tableformer.tf_model_prediction import (
     TableFormerUpdater,
 )
@@ -295,6 +298,7 @@ def create_omnidocbench_e2e_dataset(
 
     assert len(true_page_images) == 1, "len(true_page_images)==1"
 
+    # true_doc.pages is a dict keyed by page number, starting at 1
     page_width = true_doc.pages[1].size.width
     page_height = true_doc.pages[1].size.height
 
4 changes: 2 additions & 2 deletions docling_eval/benchmarks/utils.py
@@ -64,7 +64,7 @@ def write_datasets_info(
         fw.write(json.dumps(dataset_infos, indent=2))
 
 
-def get_input_document(file: Path):
+def get_input_document(file: Path) -> InputDocument:
     return InputDocument(
         path_or_stream=file,
         format=InputFormat.PDF,  # type: ignore[arg-type]
@@ -85,7 +85,7 @@ def add_pages_to_true_doc(
 
     for page_no in range(0, in_doc.page_count):
         page = Page(page_no=page_no)
-        page._backend = in_doc._backend.load_page(page.page_no)
+        page._backend = in_doc._backend.load_page(page.page_no)  # type: ignore[attr-defined]
 
         if page._backend is not None and page._backend.is_valid():
             page.size = page._backend.get_size()
11 changes: 10 additions & 1 deletion docling_eval/cli/main.py
@@ -24,6 +24,7 @@
     DatasetLayoutEvaluation,
     LayoutEvaluator,
 )
+from docling_eval.evaluators.readingorder_evaluator import ReadingOrderEvaluator
 from docling_eval.evaluators.table_evaluator import (
     DatasetTableEvaluation,
     TableEvaluator,
@@ -65,7 +66,6 @@ def create(
     odir = Path("./benchmarks") / benchmark.value / modality.value
 
     if benchmark == BenchMarkNames.DPBENCH:
-
         if (
             modality == EvaluationModality.END2END
             or modality == EvaluationModality.LAYOUT
@@ -128,6 +128,15 @@ def evaluate(
         with open(save_fn, "w") as fd:
             json.dump(table_evaluation.model_dump(), fd, indent=2, sort_keys=True)
 
+    elif modality == EvaluationModality.READING_ORDER:
+        readingorder_evaluator = ReadingOrderEvaluator()
+        readingorder_evaluation = readingorder_evaluator(idir, split="test")
+
+        with open(save_fn, "w") as fd:
+            json.dump(
+                readingorder_evaluation.model_dump(), fd, indent=2, sort_keys=True
+            )
+
     elif modality == EvaluationModality.CODEFORMER:
         pass
 
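The new READING_ORDER branch follows the same pattern as the TableFormer one: instantiate the evaluator, call it on the dataset directory with a split, and serialize the returned pydantic model. A programmatic sketch of the same flow (the input path and output filename are placeholders):

```python
# Programmatic sketch mirroring the new CLI branch above; the input path is a
# placeholder for a dataset directory produced by the `create` command.
import json
from pathlib import Path

from docling_eval.evaluators.readingorder_evaluator import ReadingOrderEvaluator

evaluator = ReadingOrderEvaluator()
evaluation = evaluator(Path("./benchmarks/dpbench/reading_order"), split="test")

with open("evaluation_reading_order.json", "w") as fd:
    json.dump(evaluation.model_dump(), fd, indent=2, sort_keys=True)
```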
59 changes: 59 additions & 0 deletions docling_eval/docling/models/reading_order/reading_order_updater.py
@@ -0,0 +1,59 @@
+import copy
+import json
+import logging
+from pathlib import Path
+from typing import Optional
+
+from deepsearch_glm.andromeda_nlp import nlp_model  # type: ignore
+from docling.utils.glm_utils import to_docling_document
+from docling_core.types.doc.document import DoclingDocument
+from docling_core.types.doc.labels import DocItemLabel
+from docling_core.utils.legacy import (
+    doc_item_label_to_legacy_name,
+    docling_document_to_legacy,
+)
+
+from docling_eval.benchmarks.utils import get_input_document
+
+_log = logging.getLogger(__name__)
+
+
+class ReadingOrderUpdater:
+    def __init__(self):
+        r"""Initialize the GLM NLP model with text ordering enabled."""
+        self._nlp_model = nlp_model(loglevel="error", text_ordering=True)
+        self._labels_forward_mapping = {
+            doc_item_label_to_legacy_name(v): v.value for v in DocItemLabel
+        }
+
+    def __call__(
+        self, pdf_path: Path, true_doc: DoclingDocument
+    ) -> Optional[DoclingDocument]:
+        r"""Predict the reading order of true_doc; return None if the model fails."""
+        _log.debug("Updating reading order for: %s", true_doc.name)
+        # Deep copy of the true-document
+        pred_doc = copy.deepcopy(true_doc)
+        pred_doc_legacy = docling_document_to_legacy(pred_doc)
+        ds_doc_dict = pred_doc_legacy.model_dump(by_alias=True, exclude_none=True)
+        try:
+            # TODO: Understand why some documents fail here
+            glm_doc = self._nlp_model.apply_on_doc(ds_doc_dict)
+        except RuntimeError as ex:
+            _log.warning("nlp_model.apply_on_doc() failed: %s", ex)
+            return None
+
+        # Map legacy label names to DocItemLabel values before calling to_docling_document
+        for page_element in glm_doc["page-elements"]:
+            page_element["name"] = self._labels_forward_mapping[page_element["name"]]
+
+        # When true_doc.name == "ground-truth 01030000000016.pdf":
+        # pydantic_core._pydantic_core.ValidationError: 1 validation error for TextItem label
+        # Input should be <DocItemLabel.CAPTION: 'caption'>, <DocItemLabel.CHECKBOX_SELECTED: 'checkbox_selected'>,
+        # <DocItemLabel.CHECKBOX_UNSELECTED: 'checkbox_unselected'>, <DocItemLabel.CODE: 'code'>,
+        # <DocItemLabel.FOOTNOTE: 'footnote'>, <DocItemLabel.FORMULA: 'formula'>, <DocItemLabel.PAGE_FOOTER: 'page_footer'>,
+        # <DocItemLabel.PAGE_HEADER: 'page_header'>, <DocItemLabel.PARAGRAPH: 'paragraph'>, <DocItemLabel.REFERENCE: 'reference'>,
+        # <DocItemLabel.TEXT: 'text'> or <DocItemLabel.TITLE: 'title'>
+        # [type=literal_error, input_value=<DocItemLabel.DOCUMENT_INDEX: 'document_index'>, input_type=DocItemLabel]
+        pred_doc = to_docling_document(glm_doc)
+
+        return pred_doc
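For orientation, a minimal usage sketch of the new updater; the PDF path is a placeholder, and in the benchmark scripts the ground-truth document would be populated via add_pages_to_true_doc() and annotation updates before this call.

```python
# Minimal usage sketch: "sample.pdf" is a placeholder, and a real ground-truth
# document would carry the pages and items built by the benchmark scripts.
from pathlib import Path

from docling_core.types.doc.document import DoclingDocument

from docling_eval.docling.models.reading_order.reading_order_updater import (
    ReadingOrderUpdater,
)

updater = ReadingOrderUpdater()
true_doc = DoclingDocument(name="ground-truth sample.pdf")  # placeholder document
pred_doc = updater(pdf_path=Path("sample.pdf"), true_doc=true_doc)
if pred_doc is None:
    print("GLM reading-order prediction failed for this document")
```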