Skip to content

Commit

Permalink
chore: Add CONVERTER_TYPE and MODALITIES columns to all produced datasets
Browse files Browse the repository at this point in the history

Signed-off-by: Nikos Livathinos <[email protected]>
  • Loading branch information
nikos-livathinos committed Feb 24, 2025
1 parent de55da1 commit 2d5ea4e
Show file tree
Hide file tree
Showing 6 changed files with 56 additions and 6 deletions.
7 changes: 6 additions & 1 deletion docling_eval/benchmarks/cvat_annotation/create.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,11 @@
from PIL import Image # as PILImage
from tqdm import tqdm # type: ignore

from docling_eval.benchmarks.constants import BenchMarkColumns, EvaluationModality
from docling_eval.benchmarks.constants import (
BenchMarkColumns,
ConverterTypes,
EvaluationModality,
)
from docling_eval.benchmarks.cvat_annotation.utils import (
AnnotatedImage,
AnnotationOverview,
Expand Down Expand Up @@ -979,6 +983,7 @@ def create_layout_dataset_from_annotations(
)

record = {
BenchMarkColumns.CONVERTER_TYPE: ConverterTypes.DOCLING,
BenchMarkColumns.DOCLING_VERSION: docling_version(),
BenchMarkColumns.STATUS: str(conv_results.status),
BenchMarkColumns.DOC_ID: str(basename),
Expand Down
10 changes: 9 additions & 1 deletion docling_eval/benchmarks/doclaynet_v1/create.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,11 @@
from docling_core.types.io import DocumentStream
from tqdm import tqdm # type: ignore

from docling_eval.benchmarks.constants import BenchMarkColumns, ConverterTypes
from docling_eval.benchmarks.constants import (
BenchMarkColumns,
ConverterTypes,
EvaluationModality,
)
from docling_eval.benchmarks.utils import add_pages_to_true_doc, write_datasets_info
from docling_eval.converters.conversion import (
create_docling_converter,
Expand Down Expand Up @@ -265,6 +269,10 @@ def create_dlnv1_e2e_dataset(
BenchMarkColumns.PREDICTION_PICTURES: pred_pictures,
BenchMarkColumns.ORIGINAL: pdf_stream.getvalue(),
BenchMarkColumns.MIMETYPE: "image/png",
BenchMarkColumns.MODALITIES: [
EvaluationModality.LAYOUT,
EvaluationModality.READING_ORDER,
],
}
pdf_stream.close()
records.append(record)
Expand Down
13 changes: 12 additions & 1 deletion docling_eval/benchmarks/dpbench/create.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,11 @@
from docling_core.types.doc.labels import DocItemLabel
from PIL import Image # as PILImage

from docling_eval.benchmarks.constants import BenchMarkColumns, ConverterTypes
from docling_eval.benchmarks.constants import (
BenchMarkColumns,
ConverterTypes,
EvaluationModality,
)
from docling_eval.benchmarks.utils import (
add_pages_to_true_doc,
convert_html_table_into_docling_tabledata,
Expand Down Expand Up @@ -341,6 +345,10 @@ def create_dpbench_e2e_dataset(
BenchMarkColumns.PREDICTION_PICTURES: pred_pictures,
BenchMarkColumns.ORIGINAL: get_binary(pdf_path),
BenchMarkColumns.MIMETYPE: "application/pdf",
BenchMarkColumns.MODALITIES: [
EvaluationModality.LAYOUT,
EvaluationModality.READING_ORDER,
],
}
records.append(record)

Expand Down Expand Up @@ -448,6 +456,9 @@ def create_dpbench_tableformer_dataset(
BenchMarkColumns.GROUNDTRUTH_PICTURES: true_pictures,
BenchMarkColumns.PREDICTION_PAGE_IMAGES: pred_page_images,
BenchMarkColumns.PREDICTION_PICTURES: pred_pictures,
BenchMarkColumns.MODALITIES: [
EvaluationModality.TABLE_STRUCTURE,
],
}
records.append(record)

Expand Down
11 changes: 10 additions & 1 deletion docling_eval/benchmarks/funsd/create.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,11 @@
from PIL import Image
from tqdm import tqdm # type: ignore

from docling_eval.benchmarks.constants import BenchMarkColumns
from docling_eval.benchmarks.constants import (
BenchMarkColumns,
ConverterTypes,
EvaluationModality,
)
from docling_eval.benchmarks.utils import write_datasets_info
from docling_eval.converters.conversion import create_image_converter
from docling_eval.converters.utils import (
Expand Down Expand Up @@ -511,13 +515,18 @@ def create_funsd_dataset(
)

record = {
BenchMarkColumns.CONVERTER_TYPE: ConverterTypes.DOCLING,
BenchMarkColumns.DOCLING_VERSION: docling_version(),
BenchMarkColumns.DOC_ID: img_path.stem,
BenchMarkColumns.GROUNDTRUTH: json.dumps(true_doc.export_to_dict()),
BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES: true_page_images,
BenchMarkColumns.GROUNDTRUTH_PICTURES: true_pictures,
BenchMarkColumns.ORIGINAL: img_bytes,
BenchMarkColumns.MIMETYPE: "image/png",
BenchMarkColumns.MODALITIES: [
EvaluationModality.LAYOUT,
EvaluationModality.READING_ORDER,
],
}
records.append(record)
count += 1
Expand Down
11 changes: 10 additions & 1 deletion docling_eval/benchmarks/omnidocbench/create.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,11 @@
from PIL import Image # as PILImage
from tqdm import tqdm # type: ignore

from docling_eval.benchmarks.constants import BenchMarkColumns, ConverterTypes
from docling_eval.benchmarks.constants import (
BenchMarkColumns,
ConverterTypes,
EvaluationModality,
)
from docling_eval.benchmarks.utils import (
add_pages_to_true_doc,
convert_html_table_into_docling_tabledata,
Expand Down Expand Up @@ -368,6 +372,10 @@ def create_omnidocbench_e2e_dataset(
BenchMarkColumns.PREDICTION_PICTURES: pred_pictures,
BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES: true_page_images,
BenchMarkColumns.GROUNDTRUTH_PICTURES: true_pictures,
BenchMarkColumns.MODALITIES: [
EvaluationModality.LAYOUT,
EvaluationModality.READING_ORDER,
],
}
records.append(record)

Expand Down Expand Up @@ -485,6 +493,7 @@ def create_omnidocbench_tableformer_dataset(
BenchMarkColumns.PREDICTION_PICTURES: pred_pictures,
BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES: true_page_images,
BenchMarkColumns.GROUNDTRUTH_PICTURES: true_pictures,
BenchMarkColumns.MODALITIES: [EvaluationModality.TABLE_STRUCTURE],
}
records.append(record)

Expand Down
10 changes: 9 additions & 1 deletion docling_eval/benchmarks/tableformer_huggingface_otsl/create.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,11 @@
from docling_core.types.doc.labels import DocItemLabel
from tqdm import tqdm # type: ignore

from docling_eval.benchmarks.constants import BenchMarkColumns
from docling_eval.benchmarks.constants import (
BenchMarkColumns,
ConverterTypes,
EvaluationModality,
)
from docling_eval.benchmarks.utils import convert_html_table_into_docling_tabledata
from docling_eval.converters.models.tableformer.tf_model_prediction import (
PageTokens,
Expand Down Expand Up @@ -221,6 +225,7 @@ def create_huggingface_otsl_tableformer_dataset(
)

record = {
BenchMarkColumns.CONVERTER_TYPE: ConverterTypes.DOCLING,
BenchMarkColumns.DOCLING_VERSION: docling_version(),
BenchMarkColumns.STATUS: str(ConversionStatus.SUCCESS.value),
BenchMarkColumns.DOC_ID: str(os.path.basename(filename)),
Expand All @@ -232,10 +237,12 @@ def create_huggingface_otsl_tableformer_dataset(
BenchMarkColumns.PREDICTION_PICTURES: pred_pictures,
BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES: true_page_images,
BenchMarkColumns.GROUNDTRUTH_PICTURES: true_pictures,
BenchMarkColumns.MODALITIES: [EvaluationModality.TABLE_STRUCTURE],
}
records.append(record)
else:
record = {
BenchMarkColumns.CONVERTER_TYPE: ConverterTypes.DOCLING,
BenchMarkColumns.DOCLING_VERSION: docling_version(),
BenchMarkColumns.STATUS: str(ConversionStatus.FAILURE.value),
BenchMarkColumns.DOC_ID: str(os.path.basename(filename)),
Expand All @@ -247,6 +254,7 @@ def create_huggingface_otsl_tableformer_dataset(
BenchMarkColumns.PREDICTION_PICTURES: pred_pictures,
BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES: true_page_images,
BenchMarkColumns.GROUNDTRUTH_PICTURES: true_pictures,
BenchMarkColumns.MODALITIES: [EvaluationModality.TABLE_STRUCTURE],
}
records.append(record)

Expand Down

0 comments on commit 2d5ea4e

Please sign in to comment.