Skip to content

Commit

Permalink
fix: ocr options, use image converter
Browse files (browse the repository at this point in the history)
Signed-off-by: Yusik Kim <[email protected]>
  • Loading branch information
kmyusk committed Feb 26, 2025
1 parent 6190792 commit 488fe75
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 7 deletions.
10 changes: 4 additions & 6 deletions docling_eval/benchmarks/doclaynet_v2/create.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@
write_datasets_info,
)
from docling_eval.converters.conversion import (
- create_pdf_docling_converter,
+ create_image_docling_converter,
create_smol_docling_converter,
)
from docling_eval.visualisation.visualisations import save_comparison_html_with_clusters
Expand Down Expand Up @@ -416,10 +416,8 @@ def create_dlnv2_e2e_dataset(
max_items: int = -1, # If -1 take the whole split
):
if converter_type == ConverterTypes.DOCLING:
- converter = create_pdf_docling_converter(
- page_image_scale=1.0,
+ converter = create_image_docling_converter(
do_ocr=True,
- ocr_lang=["en", "fr", "es", "de", "jp", "cn"],
)
else:
converter = create_smol_docling_converter()
Expand Down Expand Up @@ -531,12 +529,12 @@ def create_dlnv2_e2e_dataset(
}
records.append(record)
count += 1
- if count > max_items:
- break
if count % SHARD_SIZE == 0:
shard_id = count // SHARD_SIZE - 1
save_shard_to_disk(items=records, dataset_path=test_dir, shard_id=shard_id)
records = []
+ if count > max_items:
+ break

shard_id = count // SHARD_SIZE
save_shard_to_disk(items=records, dataset_path=test_dir, shard_id=shard_id)
Expand Down
2 changes: 1 addition & 1 deletion docling_eval/converters/conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ def create_image_docling_converter(

pipeline_options = PdfPipelineOptions(
do_ocr=do_ocr,
- ocr_options=EasyOcrOptions(force_full_page_ocr=force_ocr),
+ ocr_options=ocr_options,
do_table_structure=True,
)

Expand Down

0 comments on commit 488fe75

Please sign in to comment.