From 291a5940385a53f727fa4e80be995adb7ad24089 Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Mon, 6 Jan 2025 12:59:21 +0100 Subject: [PATCH] reformatted the code Signed-off-by: Peter Staar --- .../tableformer_huggingface_otsl/create.py | 82 +++++++++++++++---- docling_eval/benchmarks/utils.py | 20 +++-- .../benchmark_tableformer_fintabnet.py | 6 +- docs/examples/benchmark_tableformer_p1m.py | 2 +- .../benchmark_tableformer_pubtabnet.py | 6 +- 5 files changed, 86 insertions(+), 30 deletions(-) diff --git a/docling_eval/benchmarks/tableformer_huggingface_otsl/create.py b/docling_eval/benchmarks/tableformer_huggingface_otsl/create.py index e960514..3c1f734 100644 --- a/docling_eval/benchmarks/tableformer_huggingface_otsl/create.py +++ b/docling_eval/benchmarks/tableformer_huggingface_otsl/create.py @@ -98,7 +98,13 @@ def create_page_tokens(data: List[Any], height: float, width: float) -> PageToke def create_huggingface_otsl_tableformer_dataset( - output_dir: Path, image_scale: float = 1.0, max_records: int = 1000, name: str = "ds4sd/FinTabNet_OTSL", split:str = "test", do_viz:bool = False, max_items:int = -1 + output_dir: Path, + image_scale: float = 1.0, + max_records: int = 1000, + name: str = "ds4sd/FinTabNet_OTSL", + split: str = "test", + do_viz: bool = False, + max_items: int = -1, ): # Create the directories @@ -110,7 +116,7 @@ def create_huggingface_otsl_tableformer_dataset( # Use glob to find all .parquet files in the directory parquet_files = glob.glob(os.path.join(str(test_dir), "*.parquet")) - + # Loop through and remove each file for file in parquet_files: try: @@ -118,28 +124,28 @@ def create_huggingface_otsl_tableformer_dataset( print(f"Deleted: {file}") except Exception as e: print(f"Error deleting {file}: {e}") - + # Init the TableFormer model tf_updater = TableFormerUpdater() ds = load_dataset(name, split=split) - if max_items==-1: + if max_items == -1: max_items = len(ds) - + records = [] tid, sid = 0, 0 - for i,item in tqdm( + for i, item in tqdm( enumerate(ds), total=max_items, ncols=128, desc=f"create {name} tableformer dataset", ): - if i>=max_items: + if i >= max_items: break - + filename = item["filename"] table_image = item["image"] @@ -166,7 +172,9 @@ def create_huggingface_otsl_tableformer_dataset( ) html = "" + "".join(item["html"]) + "
" - table_data = convert_html_table_into_docling_tabledata(html, text_cells=item["cells"][0]) + table_data = convert_html_table_into_docling_tabledata( + html, text_cells=item["cells"][0] + ) l = 0.0 b = 0.0 @@ -177,7 +185,7 @@ def create_huggingface_otsl_tableformer_dataset( b = table_image.height - item["table_bbox"][3] r = item["table_bbox"][2] t = table_image.height - item["table_bbox"][1] - + bbox = BoundingBox( l=l, r=r, @@ -258,20 +266,60 @@ def create_huggingface_otsl_tableformer_dataset( sid += 1 records = [] + def create_fintabnet_tableformer_dataset( - output_dir: Path, image_scale: float = 1.0, max_records: int = 1000, do_viz:bool = False, max_items: int = 1000): - create_huggingface_otsl_tableformer_dataset(output_dir=output_dir, image_scale=image_scale, max_records=max_records, name="ds4sd/FinTabNet_OTSL", split="test", do_viz=do_viz, max_items=max_items) + output_dir: Path, + image_scale: float = 1.0, + max_records: int = 1000, + do_viz: bool = False, + max_items: int = 1000, +): + create_huggingface_otsl_tableformer_dataset( + output_dir=output_dir, + image_scale=image_scale, + max_records=max_records, + name="ds4sd/FinTabNet_OTSL", + split="test", + do_viz=do_viz, + max_items=max_items, + ) + def create_pubtabnet_tableformer_dataset( - output_dir: Path, image_scale: float = 1.0, max_records: int = 1000, do_viz:bool = False, max_items: int = 1000 + output_dir: Path, + image_scale: float = 1.0, + max_records: int = 1000, + do_viz: bool = False, + max_items: int = 1000, ): - create_huggingface_otsl_tableformer_dataset(output_dir=output_dir, image_scale=image_scale, max_records=max_records, name="ds4sd/PubTabNet_OTSL", split="val", do_viz=do_viz, max_items=max_items) + create_huggingface_otsl_tableformer_dataset( + output_dir=output_dir, + image_scale=image_scale, + max_records=max_records, + name="ds4sd/PubTabNet_OTSL", + split="val", + do_viz=do_viz, + max_items=max_items, + ) + def create_p1m_tableformer_dataset( - output_dir: Path, image_scale: float = 1.0, max_records: int = 1000, do_viz:bool = True, max_items: int = 1000 + output_dir: Path, + image_scale: float = 1.0, + max_records: int = 1000, + do_viz: bool = True, + max_items: int = 1000, ): - create_huggingface_otsl_tableformer_dataset(output_dir=output_dir, image_scale=image_scale, max_records=max_records, name="ds4sd/PubTables-1M_OTSL", split="test", do_viz=do_viz, max_items=max_items) - + create_huggingface_otsl_tableformer_dataset( + output_dir=output_dir, + image_scale=image_scale, + max_records=max_records, + name="ds4sd/PubTables-1M_OTSL", + split="test", + do_viz=do_viz, + max_items=max_items, + ) + def main(): diff --git a/docling_eval/benchmarks/utils.py b/docling_eval/benchmarks/utils.py index 2d4141c..8e53a88 100644 --- a/docling_eval/benchmarks/utils.py +++ b/docling_eval/benchmarks/utils.py @@ -4,7 +4,7 @@ import json import logging from pathlib import Path -from typing import Dict, List, Set, Optional +from typing import Dict, List, Optional, Set import pypdfium2 as pdfium from bs4 import BeautifulSoup # type: ignore @@ -109,7 +109,9 @@ def add_pages_to_true_doc( return true_doc, page_images -def yield_cells_from_html_table(table_html: str, text_cells: Optional[List[Dict]] = None): +def yield_cells_from_html_table( + table_html: str, text_cells: Optional[List[Dict]] = None +): soup = BeautifulSoup(table_html, "html.parser") table = soup.find("table") or soup # Ensure table context rows = table.find_all("tr") @@ -139,10 +141,10 @@ def yield_cells_from_html_table(table_html: str, text_cells: Optional[List[Dict] # Get text, rowspan, and colspan text = cell.get_text(strip=True) - if len(text)==0 and text_cells is not None: + if len(text) == 0 and text_cells is not None: text_cell = text_cells[text_cell_id] text = "".join(text_cell["tokens"]) - + rowspan = int(cell.get("rowspan", 1)) colspan = int(cell.get("colspan", 1)) @@ -157,9 +159,11 @@ def yield_cells_from_html_table(table_html: str, text_cells: Optional[List[Dict] col_idx += colspan # Move to next column after colspan text_cell_id += 1 - -def convert_html_table_into_docling_tabledata(table_html: str, text_cells: Optional[List] = None) -> TableData: + +def convert_html_table_into_docling_tabledata( + table_html: str, text_cells: Optional[List] = None +) -> TableData: num_rows = -1 num_cols = -1 @@ -168,7 +172,7 @@ def convert_html_table_into_docling_tabledata(table_html: str, text_cells: Optio try: for row_idx, col_idx, rowspan, colspan, text in yield_cells_from_html_table( - table_html=table_html, text_cells=text_cells + table_html=table_html, text_cells=text_cells ): cell = TableCell( row_span=rowspan, @@ -187,7 +191,7 @@ def convert_html_table_into_docling_tabledata(table_html: str, text_cells: Optio except: logging.error("No table-structure identified") exit(-1) - + return TableData(num_rows=num_rows, num_cols=num_cols, table_cells=cells) diff --git a/docs/examples/benchmark_tableformer_fintabnet.py b/docs/examples/benchmark_tableformer_fintabnet.py index 40800ce..c447f48 100644 --- a/docs/examples/benchmark_tableformer_fintabnet.py +++ b/docs/examples/benchmark_tableformer_fintabnet.py @@ -18,7 +18,7 @@ def main(): benchmark = BenchMarkNames.FINTABNET - + odir = Path(f"./benchmarks/{BenchMarkNames.FINTABNET.value}-dataset") odir_tab = Path(odir) / "tableformer" @@ -27,7 +27,9 @@ def main(): os.makedirs(_, exist_ok=True) if True: - create_fintabnet_tableformer_dataset(output_dir=odir_tab, max_items=1000, do_viz=True) + create_fintabnet_tableformer_dataset( + output_dir=odir_tab, max_items=1000, do_viz=True + ) evaluate( modality=EvaluationModality.TABLEFORMER, diff --git a/docs/examples/benchmark_tableformer_p1m.py b/docs/examples/benchmark_tableformer_p1m.py index 36b12f0..884c501 100644 --- a/docs/examples/benchmark_tableformer_p1m.py +++ b/docs/examples/benchmark_tableformer_p1m.py @@ -18,7 +18,7 @@ def main(): benchmark = BenchMarkNames.PUB1M - + odir = Path(f"./benchmarks/{benchmark.value}-dataset") odir_tab = Path(odir) / "tableformer" diff --git a/docs/examples/benchmark_tableformer_pubtabnet.py b/docs/examples/benchmark_tableformer_pubtabnet.py index 9fe277a..7b87037 100644 --- a/docs/examples/benchmark_tableformer_pubtabnet.py +++ b/docs/examples/benchmark_tableformer_pubtabnet.py @@ -18,7 +18,7 @@ def main(): benchmark = BenchMarkNames.PUBTABNET - + odir = Path(f"./benchmarks/{BenchMarkNames.FINTABNET.value}-dataset") odir_tab = Path(odir) / "tableformer" @@ -27,7 +27,9 @@ def main(): os.makedirs(_, exist_ok=True) if True: - create_pubtabnet_tableformer_dataset(output_dir=odir_tab, max_items=1000, do_viz=True) + create_pubtabnet_tableformer_dataset( + output_dir=odir_tab, max_items=1000, do_viz=True + ) evaluate( modality=EvaluationModality.TABLEFORMER,