Skip to content

Commit

Permalink
added the table-parsing in omnidocbench
Browse files Browse the repository at this point in the history
Signed-off-by: Peter Staar <[email protected]>
  • Loading branch information
PeterStaar-IBM committed Dec 29, 2024
1 parent b214077 commit 668b662
Showing 1 changed file with 125 additions and 23 deletions.
148 changes: 125 additions & 23 deletions docling_eval/benchmarks/omnidocbench/create.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
import pypdfium2 as pdfium
from tqdm import tqdm # type: ignore

from bs4 import BeautifulSoup # type: ignore

from docling_core.types.doc.labels import DocItemLabel

from docling_core.types.doc.document import (
Expand All @@ -27,6 +29,8 @@
from docling_eval.benchmarks.utils import write_datasets_info
from docling_eval.docling.conversion import create_converter

from docling_core.types.doc.base import BoundingBox, CoordOrigin, Size

from docling_eval.docling.utils import (
crop_bounding_box,
docling_version,
Expand Down Expand Up @@ -61,29 +65,122 @@ def update_gt_into_map(gt):

return result

def create_true_doc(jpg_path, pdf_path, gt):
def _parse_span(cell, attr):
    """Return a cell's ``rowspan``/``colspan`` as a positive int, defaulting to 1."""
    try:
        span = int(cell.get(attr, 1))
    except (TypeError, ValueError):
        # Malformed attribute (e.g. "", "2x"): fall back to a single cell.
        return 1
    # A span below 1 would leave the column cursor stuck on the same position.
    return max(span, 1)


def parse_html_table(table_html: str):
    """Iterate over the cells of an HTML table with their grid positions.

    The markup is parsed with BeautifulSoup's ``html.parser``. The first
    ``<table>`` element is used; if there is none, the whole fragment is
    searched for ``<tr>`` rows instead.

    Args:
        table_html: HTML markup containing the table rows.

    Yields:
        Tuples ``(row_idx, col_idx, rowspan, colspan, text)`` where
        ``row_idx``/``col_idx`` are the 0-based grid coordinates of the
        cell's top-left corner (columns already covered by a span of an
        earlier cell are skipped), ``rowspan``/``colspan`` are the cell's
        extents (at least 1), and ``text`` is the whitespace-stripped cell
        text.
    """
    soup = BeautifulSoup(table_html, "html.parser")
    table = soup.find("table") or soup  # Ensure table context

    # Grid positions already covered by a cell. A set (rather than a
    # pre-sized 2D list) places no limit on how far a span may reach, so
    # large rowspan/colspan values cannot raise IndexError.
    occupied = set()

    for row_idx, row in enumerate(table.find_all("tr")):
        col_idx = 0  # Start from first column
        for cell in row.find_all(["td", "th"]):
            # Skip positions claimed by earlier rowspans/colspans.
            while (row_idx, col_idx) in occupied:
                col_idx += 1

            text = cell.get_text(strip=True)
            rowspan = _parse_span(cell, "rowspan")
            colspan = _parse_span(cell, "colspan")

            # Claim every grid position this cell covers.
            occupied.update(
                (row_idx + r, col_idx + c)
                for r in range(rowspan)
                for c in range(colspan)
            )

            yield row_idx, col_idx, rowspan, colspan, text

            col_idx += colspan  # Move to next column after colspan

def update_doc_with_gt(gt, true_doc, page_width:float, page_height:float):

print(json.dumps(gt, indent=2))

true_doc = DoclingDocument(name=f"ground-truth {os.path.basename(jpg_path)}")
gt_width = float(gt["page_info"]["width"])
gt_height = float(gt["page_info"]["height"])

for item in gt["layout_dets"]:

#print(json.dumps(item, indent=2))
print(json.dumps(item, indent=2))

label = item["category_type"]
#text = item["text"]

text = ""
if "text" in item:
text = item["text"]

min_x = item["poly"][0]
max_x = item["poly"][0]

min_y = item["poly"][1]
max_y = item["poly"][1]

for i in range(0,4):
min_x = min(min_x, item["poly"][2*i])
max_x = max(max_x, item["poly"][2*i])

min_y = min(min_y, item["poly"][2*i+1])
max_y = max(max_y, item["poly"][2*i+1])

bbox = BoundingBox(
l=min_x * page_width / gt_width,
r=max_x * page_width / gt_width,
b=min_y * page_height / gt_height,
t=max_y * page_height / gt_height,
coord_origin=CoordOrigin.TOPLEFT,
)

prov = ProvenanceItem(page_no=1, bbox=bbox, charspan=(0, len(text)))

if label=="title":
pass
true_doc.add_heading(text=text, orig=text, level=1, prov=prov)

elif label=="text_block":
pass
true_doc.add_text(label=DocItemLabel.TEXT, text=text, orig=text, prov=prov)

elif label=="text_mask":
pass

elif label=="table":
pass


num_rows = -1
num_cols = -1

cells = []

if "html" in item:

for row_idx, col_idx, rowspan, colspan, text in parse_html_table(
table_html=item["html"]
):
cell = TableCell(
row_span=rowspan,
col_span=colspan,
start_row_offset_idx=row_idx,
end_row_offset_idx=row_idx + rowspan,
start_col_offset_idx=col_idx,
end_col_offset_idx=col_idx + colspan,
text=text,
)
cells.append(cell)

num_rows = max(row_idx + rowspan, num_rows)
num_cols = max(col_idx + colspan, num_cols)

else:
logging.error("No table-structure identified")

table_data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=cells)
true_doc.add_table(data=table_data, caption=None, prov=prov)

elif label=="table_caption":
pass

Expand All @@ -94,7 +191,7 @@ def create_true_doc(jpg_path, pdf_path, gt):
pass

elif label=="figure":
pass
true_doc.add_picture(prov=prov)

elif label=="figure_caption":
pass
Expand All @@ -103,7 +200,7 @@ def create_true_doc(jpg_path, pdf_path, gt):
pass

elif label=="equation_isolated":
pass
true_doc.add_text(label=DocItemLabel.FORMULA, text=text, orig=text, prov=prov)

elif label=="equation_caption":
pass
Expand All @@ -118,10 +215,10 @@ def create_true_doc(jpg_path, pdf_path, gt):
pass

elif label=="header":
pass
true_doc.add_text(label=DocItemLabel.PAGE_HEADER, text=text, orig=text, prov=prov)

elif label=="footer":
pass
true_doc.add_text(label=DocItemLabel.PAGE_FOOTER, text=text, orig=text, prov=prov)

elif label=="reference":
pass
Expand All @@ -134,8 +231,6 @@ def create_true_doc(jpg_path, pdf_path, gt):

else:
logging.error(f"label {label} is not assigned!")
exit(-1)


return true_doc

Expand Down Expand Up @@ -175,27 +270,35 @@ def create_omnidocbench_e2e_dataset(
logging.error(f"did not find ground-truth for {os.path.basename(jpg_path)}")
continue

true_doc = create_true_doc(jpg_path, pdf_path, gt[os.path.basename(jpg_path)])
gt_doc = gt[os.path.basename(jpg_path)]



"""
conv_results = doc_converter.convert(source=pdf_path, raises_on_error=True)

conv_results.document.save_as_html(filename = viz_dir / f"{os.path.basename(pdf_path)}.html",
conv_results.document.save_as_html(filename = viz_dir / f"{os.path.basename(pdf_path)}-pred.html",
image_mode = ImageRefMode.EMBEDDED)

pred_doc, pictures, page_images = extract_images(
conv_results.document,
pictures_column=BenchMarkColumns.PICTURES.value, # pictures_column,
page_images_column=BenchMarkColumns.PAGE_IMAGES.value, # page_images_column,
)

true_doc = DoclingDocument(name=f"ground-truth {os.path.basename(jpg_path)}")
true_doc.pages = pred_doc.pages

page_width = pred_doc.pages[1].size.width
page_height = pred_doc.pages[1].size.height

true_doc = update_doc_with_gt(gt_doc, true_doc, page_width=page_width, page_height=page_height)

true_doc.save_as_html(filename = viz_dir / f"{os.path.basename(pdf_path)}-true.html",
image_mode = ImageRefMode.PLACEHOLDER)

record = {
BenchMarkColumns.DOCLING_VERSION: docling_version(),
BenchMarkColumns.STATUS: "SUCCESS",
BenchMarkColumns.DOC_ID: str(os.path.basename(pdf_path)),
BenchMarkColumns.GROUNDTRUTH: "", #json.dumps(true_doc.export_to_dict()),
BenchMarkColumns.DOC_ID: str(os.path.basename(jpg_path)),
BenchMarkColumns.GROUNDTRUTH: json.dumps(true_doc.export_to_dict()),
BenchMarkColumns.PREDICTION: json.dumps(pred_doc.export_to_dict()),
BenchMarkColumns.ORIGINAL: get_binary(pdf_path),
BenchMarkColumns.MIMETYPE: "application/pdf",
Expand All @@ -208,7 +311,6 @@ def create_omnidocbench_e2e_dataset(
break
else:
cnt += 1
"""

test_dir = output_dir / "test"
os.makedirs(test_dir, exist_ok=True)
Expand Down

0 comments on commit 668b662

Please sign in to comment.