Skip to content

Commit

Permalink
reformatted the code
Browse files Browse the repository at this point in the history
Signed-off-by: Peter Staar <[email protected]>
  • Loading branch information
PeterStaar-IBM committed Jan 6, 2025
1 parent b166973 commit 291a594
Show file tree
Hide file tree
Showing 5 changed files with 86 additions and 30 deletions.
82 changes: 65 additions & 17 deletions docling_eval/benchmarks/tableformer_huggingface_otsl/create.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,13 @@ def create_page_tokens(data: List[Any], height: float, width: float) -> PageToke


def create_huggingface_otsl_tableformer_dataset(
output_dir: Path, image_scale: float = 1.0, max_records: int = 1000, name: str = "ds4sd/FinTabNet_OTSL", split:str = "test", do_viz:bool = False, max_items:int = -1
output_dir: Path,
image_scale: float = 1.0,
max_records: int = 1000,
name: str = "ds4sd/FinTabNet_OTSL",
split: str = "test",
do_viz: bool = False,
max_items: int = -1,
):

# Create the directories
Expand All @@ -110,36 +116,36 @@ def create_huggingface_otsl_tableformer_dataset(

# Use glob to find all .parquet files in the directory
parquet_files = glob.glob(os.path.join(str(test_dir), "*.parquet"))

# Loop through and remove each file
for file in parquet_files:
try:
os.remove(file)
print(f"Deleted: {file}")
except Exception as e:
print(f"Error deleting {file}: {e}")

# Init the TableFormer model
tf_updater = TableFormerUpdater()

ds = load_dataset(name, split=split)

if max_items==-1:
if max_items == -1:
max_items = len(ds)

records = []
tid, sid = 0, 0

for i,item in tqdm(
for i, item in tqdm(
enumerate(ds),
total=max_items,
ncols=128,
desc=f"create {name} tableformer dataset",
):

if i>=max_items:
if i >= max_items:
break

filename = item["filename"]
table_image = item["image"]

Expand All @@ -166,7 +172,9 @@ def create_huggingface_otsl_tableformer_dataset(
)

html = "<table>" + "".join(item["html"]) + "</table>"
table_data = convert_html_table_into_docling_tabledata(html, text_cells=item["cells"][0])
table_data = convert_html_table_into_docling_tabledata(
html, text_cells=item["cells"][0]
)

l = 0.0
b = 0.0
Expand All @@ -177,7 +185,7 @@ def create_huggingface_otsl_tableformer_dataset(
b = table_image.height - item["table_bbox"][3]
r = item["table_bbox"][2]
t = table_image.height - item["table_bbox"][1]

bbox = BoundingBox(
l=l,
r=r,
Expand Down Expand Up @@ -258,20 +266,60 @@ def create_huggingface_otsl_tableformer_dataset(
sid += 1
records = []


def create_fintabnet_tableformer_dataset(
    output_dir: Path,
    image_scale: float = 1.0,
    max_records: int = 1000,
    do_viz: bool = False,
    max_items: int = 1000,
) -> None:
    """Build the TableFormer dataset from the FinTabNet OTSL corpus.

    Thin wrapper around ``create_huggingface_otsl_tableformer_dataset`` that
    pins the Hugging Face dataset name to ``"ds4sd/FinTabNet_OTSL"`` and the
    split to ``"test"``. Every other argument is forwarded unchanged.

    Args:
        output_dir: Output location handed to the generic builder.
        image_scale: Forwarded as-is to the generic builder.
        max_records: Forwarded as-is (presumably the number of records per
            written shard -- confirm against the generic builder).
        do_viz: Forwarded as-is to the generic builder.
        max_items: Upper bound on the number of dataset items processed; the
            generic builder stops iterating once this many items are seen
            and treats ``-1`` as "use the whole dataset".
    """
    create_huggingface_otsl_tableformer_dataset(
        output_dir=output_dir,
        image_scale=image_scale,
        max_records=max_records,
        name="ds4sd/FinTabNet_OTSL",
        split="test",
        do_viz=do_viz,
        max_items=max_items,
    )


def create_pubtabnet_tableformer_dataset(
    output_dir: Path,
    image_scale: float = 1.0,
    max_records: int = 1000,
    do_viz: bool = False,
    max_items: int = 1000,
) -> None:
    """Build the TableFormer dataset from the PubTabNet OTSL corpus.

    Thin wrapper around ``create_huggingface_otsl_tableformer_dataset`` that
    pins the Hugging Face dataset name to ``"ds4sd/PubTabNet_OTSL"`` and the
    split to ``"val"``. Every other argument is forwarded unchanged.

    Args:
        output_dir: Output location handed to the generic builder.
        image_scale: Forwarded as-is to the generic builder.
        max_records: Forwarded as-is (presumably the number of records per
            written shard -- confirm against the generic builder).
        do_viz: Forwarded as-is to the generic builder.
        max_items: Upper bound on the number of dataset items processed; the
            generic builder stops iterating once this many items are seen
            and treats ``-1`` as "use the whole dataset".
    """
    create_huggingface_otsl_tableformer_dataset(
        output_dir=output_dir,
        image_scale=image_scale,
        max_records=max_records,
        name="ds4sd/PubTabNet_OTSL",
        split="val",  # this wrapper uses "val", unlike the "test" split of its siblings
        do_viz=do_viz,
        max_items=max_items,
    )


def create_p1m_tableformer_dataset(
    output_dir: Path,
    image_scale: float = 1.0,
    max_records: int = 1000,
    # NOTE(review): defaults to True here but to False in the FinTabNet and
    # PubTabNet wrappers; kept as-is so existing callers see no change.
    do_viz: bool = True,
    max_items: int = 1000,
) -> None:
    """Build the TableFormer dataset from the PubTables-1M OTSL corpus.

    Thin wrapper around ``create_huggingface_otsl_tableformer_dataset`` that
    pins the Hugging Face dataset name to ``"ds4sd/PubTables-1M_OTSL"`` and
    the split to ``"test"``. Every other argument is forwarded unchanged.

    Args:
        output_dir: Output location handed to the generic builder.
        image_scale: Forwarded as-is to the generic builder.
        max_records: Forwarded as-is (presumably the number of records per
            written shard -- confirm against the generic builder).
        do_viz: Forwarded as-is to the generic builder.
        max_items: Upper bound on the number of dataset items processed; the
            generic builder stops iterating once this many items are seen
            and treats ``-1`` as "use the whole dataset".
    """
    create_huggingface_otsl_tableformer_dataset(
        output_dir=output_dir,
        image_scale=image_scale,
        max_records=max_records,
        name="ds4sd/PubTables-1M_OTSL",
        split="test",
        do_viz=do_viz,
        max_items=max_items,
    )


def main():

Expand Down
20 changes: 12 additions & 8 deletions docling_eval/benchmarks/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import json
import logging
from pathlib import Path
from typing import Dict, List, Set, Optional
from typing import Dict, List, Optional, Set

import pypdfium2 as pdfium
from bs4 import BeautifulSoup # type: ignore
Expand Down Expand Up @@ -109,7 +109,9 @@ def add_pages_to_true_doc(
return true_doc, page_images


def yield_cells_from_html_table(table_html: str, text_cells: Optional[List[Dict]] = None):
def yield_cells_from_html_table(
table_html: str, text_cells: Optional[List[Dict]] = None
):
soup = BeautifulSoup(table_html, "html.parser")
table = soup.find("table") or soup # Ensure table context
rows = table.find_all("tr")
Expand Down Expand Up @@ -139,10 +141,10 @@ def yield_cells_from_html_table(table_html: str, text_cells: Optional[List[Dict]
# Get text, rowspan, and colspan
text = cell.get_text(strip=True)

if len(text)==0 and text_cells is not None:
if len(text) == 0 and text_cells is not None:
text_cell = text_cells[text_cell_id]
text = "".join(text_cell["tokens"])

rowspan = int(cell.get("rowspan", 1))
colspan = int(cell.get("colspan", 1))

Expand All @@ -157,9 +159,11 @@ def yield_cells_from_html_table(table_html: str, text_cells: Optional[List[Dict]
col_idx += colspan # Move to next column after colspan

text_cell_id += 1


def convert_html_table_into_docling_tabledata(table_html: str, text_cells: Optional[List] = None) -> TableData:

def convert_html_table_into_docling_tabledata(
table_html: str, text_cells: Optional[List] = None
) -> TableData:

num_rows = -1
num_cols = -1
Expand All @@ -168,7 +172,7 @@ def convert_html_table_into_docling_tabledata(table_html: str, text_cells: Optio

try:
for row_idx, col_idx, rowspan, colspan, text in yield_cells_from_html_table(
table_html=table_html, text_cells=text_cells
table_html=table_html, text_cells=text_cells
):
cell = TableCell(
row_span=rowspan,
Expand All @@ -187,7 +191,7 @@ def convert_html_table_into_docling_tabledata(table_html: str, text_cells: Optio
except:
logging.error("No table-structure identified")
exit(-1)

return TableData(num_rows=num_rows, num_cols=num_cols, table_cells=cells)


Expand Down
6 changes: 4 additions & 2 deletions docs/examples/benchmark_tableformer_fintabnet.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
def main():

benchmark = BenchMarkNames.FINTABNET

odir = Path(f"./benchmarks/{BenchMarkNames.FINTABNET.value}-dataset")

odir_tab = Path(odir) / "tableformer"
Expand All @@ -27,7 +27,9 @@ def main():
os.makedirs(_, exist_ok=True)

if True:
create_fintabnet_tableformer_dataset(output_dir=odir_tab, max_items=1000, do_viz=True)
create_fintabnet_tableformer_dataset(
output_dir=odir_tab, max_items=1000, do_viz=True
)

evaluate(
modality=EvaluationModality.TABLEFORMER,
Expand Down
2 changes: 1 addition & 1 deletion docs/examples/benchmark_tableformer_p1m.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
def main():

benchmark = BenchMarkNames.PUB1M

odir = Path(f"./benchmarks/{benchmark.value}-dataset")

odir_tab = Path(odir) / "tableformer"
Expand Down
6 changes: 4 additions & 2 deletions docs/examples/benchmark_tableformer_pubtabnet.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
def main():

benchmark = BenchMarkNames.PUBTABNET

odir = Path(f"./benchmarks/{BenchMarkNames.FINTABNET.value}-dataset")

odir_tab = Path(odir) / "tableformer"
Expand All @@ -27,7 +27,9 @@ def main():
os.makedirs(_, exist_ok=True)

if True:
create_pubtabnet_tableformer_dataset(output_dir=odir_tab, max_items=1000, do_viz=True)
create_pubtabnet_tableformer_dataset(
output_dir=odir_tab, max_items=1000, do_viz=True
)

evaluate(
modality=EvaluationModality.TABLEFORMER,
Expand Down

0 comments on commit 291a594

Please sign in to comment.