Added table parsing to the OmniDocBench benchmark

Signed-off-by: Peter Staar <[email protected]>
DS4SD · Dec 29, 2024 · 668b662 · 668b662
1 parent b214077
commit 668b662
Showing 1 changed file with 125 additions and 23 deletions.
diff --git a/docling_eval/benchmarks/omnidocbench/create.py b/docling_eval/benchmarks/omnidocbench/create.py
@@ -10,6 +10,8 @@
 import pypdfium2 as pdfium
 from tqdm import tqdm  # type: ignore
 
+from bs4 import BeautifulSoup  # type: ignore
+
 from docling_core.types.doc.labels import DocItemLabel
 
 from docling_core.types.doc.document import (
@@ -27,6 +29,8 @@
 from docling_eval.benchmarks.utils import write_datasets_info
 from docling_eval.docling.conversion import create_converter
 
+from docling_core.types.doc.base import BoundingBox, CoordOrigin, Size
+
 from docling_eval.docling.utils import (
     crop_bounding_box,
     docling_version,
@@ -61,29 +65,122 @@ def update_gt_into_map(gt):
 
     return result
 
-def create_true_doc(jpg_path, pdf_path, gt):
+def parse_html_table(table_html):
+    soup = BeautifulSoup(table_html, "html.parser")
+    table = soup.find("table") or soup  # Ensure table context
+    rows = table.find_all("tr")
+
+    max_cols = 0
+    for row in rows:
+        cols = row.find_all(["td", "th"])
+        max_cols = max(max_cols, len(cols))  # Determine maximum columns
+
+    # Create grid to track cell positions
+    grid = [[None for _ in range(max_cols * 2)] for _ in range(len(rows) * 2)]
+
+    for row_idx, row in enumerate(rows):
+        col_idx = 0  # Start from first column
+        for cell in row.find_all(["td", "th"]):
+            # Skip over filled grid positions (handle previous rowspan/colspan)
+            while grid[row_idx][col_idx] is not None:
+                col_idx += 1
+
+            # Get text, rowspan, and colspan
+            text = cell.get_text(strip=True)
+            rowspan = int(cell.get("rowspan", 1))
+            colspan = int(cell.get("colspan", 1))
+
+            # Fill grid positions and yield (row, column, text)
+            for r in range(rowspan):
+                for c in range(colspan):
+                    grid[row_idx + r][col_idx + c] = text
+
+            # print(f"Row: {row_idx + 1}, Col: {col_idx + 1}, Text: {text}")
+            yield row_idx, col_idx, rowspan, colspan, text
+
+            col_idx += colspan  # Move to next column after colspan
+
+def update_doc_with_gt(gt, true_doc, page_width:float, page_height:float):   
+
+    print(json.dumps(gt, indent=2))
 
-    true_doc = DoclingDocument(name=f"ground-truth {os.path.basename(jpg_path)}")
+    gt_width = float(gt["page_info"]["width"])
+    gt_height = float(gt["page_info"]["height"])
 
     for item in gt["layout_dets"]:
 
-        #print(json.dumps(item, indent=2))
+        print(json.dumps(item, indent=2))
 
         label = item["category_type"]
-        #text = item["text"]
 
+        text = ""
+        if "text" in item:
+            text = item["text"]
+
+        min_x = item["poly"][0]
+        max_x = item["poly"][0]
+
+        min_y = item["poly"][1]
+        max_y = item["poly"][1]
+
+        for i in range(0,4):
+            min_x = min(min_x, item["poly"][2*i])
+            max_x = max(max_x, item["poly"][2*i])
+
+            min_y = min(min_y, item["poly"][2*i+1])
+            max_y = max(max_y, item["poly"][2*i+1])
+
+        bbox = BoundingBox(
+            l=min_x * page_width / gt_width,
+            r=max_x * page_width / gt_width,
+            b=min_y * page_height / gt_height,
+            t=max_y * page_height / gt_height,
+            coord_origin=CoordOrigin.TOPLEFT,
+        )
+
+        prov = ProvenanceItem(page_no=1, bbox=bbox, charspan=(0, len(text)))
+
         if label=="title":
-            pass
+            true_doc.add_heading(text=text, orig=text, level=1, prov=prov)
 
         elif label=="text_block":
-            pass
+            true_doc.add_text(label=DocItemLabel.TEXT, text=text, orig=text, prov=prov)
 
         elif label=="text_mask":
             pass
 
         elif label=="table":
-            pass
-
+
+            num_rows = -1
+            num_cols = -1
+
+            cells = []
+
+            if "html" in item:
+
+                for row_idx, col_idx, rowspan, colspan, text in parse_html_table(
+                        table_html=item["html"]
+                ):
+                    cell = TableCell(
+                        row_span=rowspan,
+                        col_span=colspan,
+                        start_row_offset_idx=row_idx,
+                        end_row_offset_idx=row_idx + rowspan,
+                        start_col_offset_idx=col_idx,
+                        end_col_offset_idx=col_idx + colspan,
+                        text=text,
+                    )
+                    cells.append(cell)
+
+                    num_rows = max(row_idx + rowspan, num_rows)
+                    num_cols = max(col_idx + colspan, num_cols)
+
+            else:
+                logging.error("No table-structure identified")
+
+            table_data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=cells)
+            true_doc.add_table(data=table_data, caption=None, prov=prov)
+
         elif label=="table_caption":
             pass
 
@@ -94,7 +191,7 @@ def create_true_doc(jpg_path, pdf_path, gt):
             pass
 
         elif label=="figure":
-            pass
+            true_doc.add_picture(prov=prov)
 
         elif label=="figure_caption":
             pass
@@ -103,7 +200,7 @@ def create_true_doc(jpg_path, pdf_path, gt):
             pass
 
         elif label=="equation_isolated":
-            pass
+            true_doc.add_text(label=DocItemLabel.FORMULA, text=text, orig=text, prov=prov)
 
         elif label=="equation_caption":
             pass
@@ -118,10 +215,10 @@ def create_true_doc(jpg_path, pdf_path, gt):
             pass                
 
         elif label=="header":
-            pass
+            true_doc.add_text(label=DocItemLabel.PAGE_HEADER, text=text, orig=text, prov=prov)
 
         elif label=="footer":
-            pass
+            true_doc.add_text(label=DocItemLabel.PAGE_FOOTER, text=text, orig=text, prov=prov)
 
         elif label=="reference":
             pass
@@ -134,8 +231,6 @@ def create_true_doc(jpg_path, pdf_path, gt):
 
         else:
             logging.error(f"label {label} is not assigned!")
-            exit(-1)
-
 
     return true_doc
 
@@ -175,27 +270,35 @@ def create_omnidocbench_e2e_dataset(
             logging.error(f"did not find ground-truth for {os.path.basename(jpg_path)}")
             continue
 
-        true_doc = create_true_doc(jpg_path, pdf_path, gt[os.path.basename(jpg_path)])
+        gt_doc = gt[os.path.basename(jpg_path)]
 
-
-
-        """
         conv_results = doc_converter.convert(source=pdf_path, raises_on_error=True)
 
-        conv_results.document.save_as_html(filename = viz_dir / f"{os.path.basename(pdf_path)}.html",
+        conv_results.document.save_as_html(filename = viz_dir / f"{os.path.basename(pdf_path)}-pred.html",
                                            image_mode = ImageRefMode.EMBEDDED)
-        
+
         pred_doc, pictures, page_images = extract_images(
             conv_results.document,
             pictures_column=BenchMarkColumns.PICTURES.value,  # pictures_column,
             page_images_column=BenchMarkColumns.PAGE_IMAGES.value,  # page_images_column,
         )
+
+        true_doc = DoclingDocument(name=f"ground-truth {os.path.basename(jpg_path)}")
+        true_doc.pages = pred_doc.pages
+
+        page_width = pred_doc.pages[1].size.width
+        page_height = pred_doc.pages[1].size.height
 
+        true_doc = update_doc_with_gt(gt_doc, true_doc, page_width=page_width, page_height=page_height)
+
+        true_doc.save_as_html(filename = viz_dir / f"{os.path.basename(pdf_path)}-true.html",
+                              image_mode = ImageRefMode.PLACEHOLDER)
+
         record = {
             BenchMarkColumns.DOCLING_VERSION: docling_version(),
             BenchMarkColumns.STATUS: "SUCCESS",
-            BenchMarkColumns.DOC_ID: str(os.path.basename(pdf_path)),
-            BenchMarkColumns.GROUNDTRUTH: "", #json.dumps(true_doc.export_to_dict()),
+            BenchMarkColumns.DOC_ID: str(os.path.basename(jpg_path)),
+            BenchMarkColumns.GROUNDTRUTH: json.dumps(true_doc.export_to_dict()),
             BenchMarkColumns.PREDICTION: json.dumps(pred_doc.export_to_dict()),
             BenchMarkColumns.ORIGINAL: get_binary(pdf_path),
             BenchMarkColumns.MIMETYPE: "application/pdf",
@@ -208,7 +311,6 @@ def create_omnidocbench_e2e_dataset(
             break
         else:
             cnt += 1
-        """
 
     test_dir = output_dir / "test"
     os.makedirs(test_dir, exist_ok=True)