Skip to content

Commit

Permalink
chore: WIP: Fixing syntax errors, code styling.
Browse files Browse the repository at this point in the history
Signed-off-by: Nikos Livathinos <[email protected]>
  • Loading branch information
nikos-livathinos committed Jan 13, 2025
1 parent aa6be39 commit 72b895b
Show file tree
Hide file tree
Showing 13 changed files with 1,365 additions and 958 deletions.
777 changes: 466 additions & 311 deletions docling_eval/benchmarks/annotation_formats/create.py

Large diffs are not rendered by default.

965 changes: 589 additions & 376 deletions docling_eval/benchmarks/annotation_formats/cvat.py

Large diffs are not rendered by default.

395 changes: 219 additions & 176 deletions docling_eval/benchmarks/annotation_formats/preannotate.py

Large diffs are not rendered by default.

10 changes: 4 additions & 6 deletions docling_eval/benchmarks/annotation_formats/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ class DocLinkLabel(str, Enum):
TO_CAPTION = "to_caption"
TO_FOOTNOTE = "to_footnote"
TO_VALUE = "to_value"

MERGE = "merge"
GROUP = "group"

Expand All @@ -25,16 +25,15 @@ def get_color(label: "DocLinkLabel") -> Tuple[int, int, int]:
"""Return the RGB color associated with a given label."""
color_map = {
DocLinkLabel.READING_ORDER: (255, 0, 0),

DocLinkLabel.TO_CAPTION: (0, 255, 0),
DocLinkLabel.TO_FOOTNOTE: (0, 255, 0),
DocLinkLabel.TO_VALUE: (0, 255, 0),

DocLinkLabel.MERGE: (255, 0, 255),
DocLinkLabel.GROUP: (255, 255, 0),
}
return color_map[label]



class TableComponentLabel(str, Enum):
"""TableComponentLabel."""

Expand All @@ -56,5 +55,4 @@ def get_color(label: "TableComponentLabel") -> Tuple[int, int, int]:
TableComponentLabel.TABLE_COL: (0, 255, 0),
TableComponentLabel.TABLE_GROUP: (0, 0, 255),
}
return color_map[label]

return color_map[label]
2 changes: 1 addition & 1 deletion docling_eval/benchmarks/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ class BenchMarkColumns(str, Enum):

GROUNDTRUTH_PAGE_IMAGES = "GroundTruthPageImages"
GROUNDTRUTH_PICTURES = "GroundTruthPictures"

PREDICTION_PAGE_IMAGES = "PredictionPageImages"
PREDICTION_PICTURES = "PredictionPictures"

Expand Down
15 changes: 4 additions & 11 deletions docling_eval/benchmarks/dpbench/create.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,7 @@ def create_dpbench_e2e_dataset(
true_doc, true_page_images = add_pages_to_true_doc(
pdf_path=pdf_path, true_doc=true_doc, image_scale=image_scale
)

assert len(true_page_images) == 1, "len(true_page_images)==1"

page_width = true_doc.pages[1].size.width
Expand Down Expand Up @@ -309,7 +309,7 @@ def create_dpbench_e2e_dataset(
pictures_column=BenchMarkColumns.GROUNDTRUTH_PICTURES.value, # pictures_column,
page_images_column=BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES.value, # page_images_column,
)

pred_doc, pred_pictures, pred_page_images = extract_images(
document=pred_doc,
pictures_column=BenchMarkColumns.PREDICTION_PICTURES.value, # pictures_column,
Expand All @@ -320,15 +320,12 @@ def create_dpbench_e2e_dataset(
BenchMarkColumns.DOCLING_VERSION: docling_version(),
BenchMarkColumns.STATUS: str(conv_results.status),
BenchMarkColumns.DOC_ID: str(filename),

BenchMarkColumns.GROUNDTRUTH: json.dumps(true_doc.export_to_dict()),
BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES: true_page_images,
BenchMarkColumns.GROUNDTRUTH_PICTURES: true_pictures,

BenchMarkColumns.PREDICTION: json.dumps(pred_doc.export_to_dict()),
BenchMarkColumns.PREDICTION_PAGE_IMAGES: pred_page_images,
BenchMarkColumns.PREDICTION_PICTURES: pred_pictures,

BenchMarkColumns.ORIGINAL: get_binary(pdf_path),
BenchMarkColumns.MIMETYPE: "application/pdf",
}
Expand Down Expand Up @@ -414,27 +411,23 @@ def create_dpbench_tableformer_dataset(
pictures_column=BenchMarkColumns.GROUNDTRUTH_PICTURES.value, # pictures_column,
page_images_column=BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES.value, # page_images_column,
)

pred_doc, pred_pictures, pred_page_images = extract_images(
document=pred_doc,
pictures_column=BenchMarkColumns.PREDICTION_PICTURES.value, # pictures_column,
page_images_column=BenchMarkColumns.PREDICTION_PAGE_IMAGES.value, # page_images_column,
)

record = {
BenchMarkColumns.DOCLING_VERSION: docling_version(),
BenchMarkColumns.STATUS: "SUCCESS",
BenchMarkColumns.DOC_ID: str(os.path.basename(pdf_path)),

BenchMarkColumns.GROUNDTRUTH: json.dumps(true_doc.export_to_dict()),
BenchMarkColumns.PREDICTION: json.dumps(pred_doc.export_to_dict()),

BenchMarkColumns.ORIGINAL: get_binary(pdf_path),
BenchMarkColumns.MIMETYPE: "application/pdf",

BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES: true_page_images,
BenchMarkColumns.GROUNDTRUTH_PICTURES: true_pictures,

BenchMarkColumns.PREDICTION_PAGE_IMAGES: pred_page_images,
BenchMarkColumns.PREDICTION_PICTURES: pred_pictures,
}
Expand Down
8 changes: 4 additions & 4 deletions docling_eval/benchmarks/omnidocbench/create.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,7 +333,7 @@ def create_omnidocbench_e2e_dataset(
pictures_column=BenchMarkColumns.GROUNDTRUTH_PICTURES, # pictures_column,
page_images_column=BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES, # page_images_column,
)

pred_doc, pred_pictures, pred_page_images = extract_images(
pred_doc, # conv_results.document,
pictures_column=BenchMarkColumns.PREDICTION_PICTURES, # pictures_column,
Expand Down Expand Up @@ -440,19 +440,19 @@ def create_omnidocbench_tableformer_dataset(
true_labels=TRUE_HTML_EXPORT_LABELS,
pred_labels=PRED_HTML_EXPORT_LABELS,
)

true_doc, true_pictures, true_page_images = extract_images(
true_doc, # conv_results.document,
pictures_column=BenchMarkColumns.GROUNDTRUTH_PICTURES, # pictures_column,
page_images_column=BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES, # page_images_column,
)

pred_doc, pred_pictures, pred_page_images = extract_images(
pred_doc, # conv_results.document,
pictures_column=BenchMarkColumns.PREDICTION_PICTURES, # pictures_column,
page_images_column=BenchMarkColumns.PREDICTION_PAGE_IMAGES, # page_images_column,
)

record = {
BenchMarkColumns.DOCLING_VERSION: docling_version(),
BenchMarkColumns.STATUS: "SUCCESS",
Expand Down
67 changes: 36 additions & 31 deletions docling_eval/benchmarks/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@

import pypdfium2 as pdfium
from bs4 import BeautifulSoup # type: ignore
from datasets import Features
from datasets import Image as Features_Image
from datasets import Sequence, Value
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
from docling.datamodel.base_models import InputFormat, Page
from docling.datamodel.document import InputDocument
Expand All @@ -24,17 +27,13 @@

from docling_eval.benchmarks.constants import BenchMarkColumns
from docling_eval.docling.constants import (
HTML_INSPECTION,
HTML_COMPARISON_PAGE,
HTML_COMPARISON_PAGE_WITH_CLUSTERS,
HTML_DEFAULT_HEAD_FOR_COMP,
HTML_INSPECTION,
)
from docling_eval.docling.utils import from_pil_to_base64, from_pil_to_base64uri

from datasets import Features, Value, Sequence
from datasets import Image as Features_Image


"""
def write_datasets_info(
name: str, output_dir: Path, num_train_rows: int, num_test_rows: int
Expand Down Expand Up @@ -71,24 +70,29 @@ def write_datasets_info(
fw.write(json.dumps(dataset_infos, indent=2))
"""

def write_datasets_info(name: str, output_dir: Path, num_train_rows: int, num_test_rows: int):
features = Features({
BenchMarkColumns.DOCLING_VERSION: Value("string"),
BenchMarkColumns.STATUS: Value("string"),
BenchMarkColumns.DOC_ID: Value("string"),
BenchMarkColumns.GROUNDTRUTH: Value("string"),
BenchMarkColumns.PREDICTION: Value("string"),
BenchMarkColumns.ORIGINAL: Value("string"),
BenchMarkColumns.MIMETYPE: Value("string"),
BenchMarkColumns.PREDICTION_PICTURES: Sequence(Features_Image()),
BenchMarkColumns.PREDICTION_PAGE_IMAGES: Sequence(Features_Image()),
BenchMarkColumns.GROUNDTRUTH_PICTURES: Sequence(Features_Image()),
BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES: Sequence(Features_Image()),
})


def write_datasets_info(
name: str, output_dir: Path, num_train_rows: int, num_test_rows: int
):
features = Features(
{
BenchMarkColumns.DOCLING_VERSION: Value("string"),
BenchMarkColumns.STATUS: Value("string"),
BenchMarkColumns.DOC_ID: Value("string"),
BenchMarkColumns.GROUNDTRUTH: Value("string"),
BenchMarkColumns.PREDICTION: Value("string"),
BenchMarkColumns.ORIGINAL: Value("string"),
BenchMarkColumns.MIMETYPE: Value("string"),
BenchMarkColumns.PREDICTION_PICTURES: Sequence(Features_Image()),
BenchMarkColumns.PREDICTION_PAGE_IMAGES: Sequence(Features_Image()),
BenchMarkColumns.GROUNDTRUTH_PICTURES: Sequence(Features_Image()),
BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES: Sequence(Features_Image()),
}
)

schema = features.to_dict()
print(json.dumps(schema, indent=2))

dataset_infos = {
"train": {
"description": f"Training split of {name}",
Expand All @@ -106,6 +110,7 @@ def write_datasets_info(name: str, output_dir: Path, num_train_rows: int, num_te
with open(output_dir / "dataset_infos.json", "w") as fw:
json.dump(dataset_infos, fw, indent=2)


def get_input_document(file: Path):
return InputDocument(
path_or_stream=file,
Expand Down Expand Up @@ -143,8 +148,8 @@ def add_pages_to_true_doc(
size=Size(
width=float(page_image.width), height=float(page_image.height)
),
#uri=Path(f"{BenchMarkColumns.PAGE_IMAGES}/{page_no}"),
uri=from_pil_to_base64uri(page_image)
# uri=Path(f"{BenchMarkColumns.PAGE_IMAGES}/{page_no}"),
uri=from_pil_to_base64uri(page_image),
)
page_item = PageItem(
page_no=page_no + 1,
Expand All @@ -153,12 +158,12 @@ def add_pages_to_true_doc(
)

true_doc.pages[page_no + 1] = page_item
#page_image.show()
# page_image.show()
else:
logging.warning("did not get image for page `add_pages_to_true_doc`")

page._backend.unload()

return true_doc, page_images


Expand Down Expand Up @@ -303,7 +308,7 @@ def draw_clusters_with_reading_order(
reading_order: bool = True,
):

#img = copy.deepcopy(page_image)
# img = copy.deepcopy(page_image)
img = page_image.copy()
draw = ImageDraw.Draw(img)

Expand Down Expand Up @@ -563,9 +568,9 @@ def draw_clusters(doc: DoclingDocument, labels: Set[DocItemLabel]):
fw.write(comparison_page)


def save_inspection_html(filename: Path,
doc:DoclingDocument,
labels: Set[DocItemLabel]):
def save_inspection_html(
filename: Path, doc: DoclingDocument, labels: Set[DocItemLabel]
):

html_doc = doc.export_to_html(image_mode=ImageRefMode.EMBEDDED, labels=labels)
html_doc = html_doc.replace("'", "&#39;")
Expand All @@ -589,6 +594,6 @@ def save_inspection_html(filename: Path,
html_viz = copy.deepcopy(HTML_INSPECTION)
html_viz = html_viz.replace("PREDDOC", html_doc)
html_viz = html_viz.replace("PAGE_IMAGES", "\n".join(page_images))

with open(str(filename), "w") as fw:
fw.write(html_viz)
5 changes: 3 additions & 2 deletions docling_eval/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,8 +160,9 @@ def visualise(
)

figname = odir / f"evaluation_{benchmark.value}_{modality.value}.png"
layout_evaluation.mAP_stats.save_histogram(figname=figname, name="struct-with-text")

layout_evaluation.mAP_stats.save_histogram(
figname=figname, name="struct-with-text"
)

elif modality == EvaluationModality.TABLEFORMER:

Expand Down
6 changes: 3 additions & 3 deletions docling_eval/docling/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ def extract_images(
# img.show()
page_images.append(page.image.pil_image)
page.image.uri = Path(f"{page_images_column}/{page_no}")

return document, pictures, page_images


Expand All @@ -138,13 +138,13 @@ def insert_images(
# Save page images
for pic_no, picture in enumerate(document.pictures):
if picture.image is not None:
b64 = to_base64(pictures[pic_no-1])
b64 = to_base64(pictures[pic_no - 1])
picture.image.uri = AnyUrl(f"data:image/png;base64,{b64}")

# Save page images
for page_no, page in document.pages.items():
if page.image is not None:
b64 = to_base64(page_images[page_no-1])
b64 = to_base64(page_images[page_no - 1])
page.image.uri = AnyUrl(f"data:image/png;base64,{b64}")

return document
Expand Down
Loading

0 comments on commit 72b895b

Please sign in to comment.