Updated code and tests; still need to decide what to commit as test cases

Signed-off-by: Peter Staar <[email protected]>
DS4SD · Sep 19, 2024 · 5266bab · 5266bab
1 parent 2bc04eb
commit 5266bab
Show file tree

Hide file tree

Showing 4 changed files with 346 additions and 36 deletions.
diff --git a/docling_core/types/doc/base.py b/docling_core/types/doc/base.py
@@ -253,40 +253,105 @@ def export_to_html(self) -> str:
     def export_to_document_tokens(
         self,
         new_line: str = "\n",
-        page_w: float = None,
-        page_h: float = None,
+        page_w: float = 0.0,
+        page_h: float = 0.0,
         xsize: int = 100,
         ysize: int = 100,
+        add_caption: bool = True,
         add_table_location: bool = True,
-        add_cell_location: bool = False,
-        add_cell_label: bool = False,
+        add_cell_location: bool = True,
+        add_cell_label: bool = True,
         add_cell_text: bool = True,
+        page_tagging: bool = True,
     ):
         """Export table to document tokens format."""
-        body = ""
-
-        loc_str = ""
-        if page_w is not None and page_h is not None:
+        body = f"{DocumentToken.BEG_TABLE.value}{new_line}"
+
+        if (
+            add_table_location
+            and page_tagging
+            and self.prov is not None
+            and len(self.prov) > 0
+        ):
             loc_str = DocumentToken.get_location(
-                self.bbox, page_w, page_h, xsize, ysize, self.page
+                bbox=self.prov[0].bbox,
+                page_w=page_w,
+                page_h=page_h,
+                xsize=xsize,
+                ysize=ysize,
+                page_i=self.prov[0].page,
             )
+            body += f"{loc_str}{new_line}"
+
+        elif (
+            add_table_location
+            and not page_tagging
+            and self.prov is not None
+            and len(self.prov) > 0
+        ):
+            loc_str = DocumentToken.get_location(
+                bbox=self.prov[0].bbox,
+                page_w=page_w,
+                page_h=page_h,
+                xsize=xsize,
+                ysize=ysize,
+                page_i=-1,
+            )
+            body += f"{loc_str}{new_line}"
 
-        body += f"{DocumentToken.BEG_TABLE.value}{loc_str}"
-
-        if self.text is not None and len(self.text) > 0:
-            body += f"{DocumentToken.BEG_CAPTION.value}"
-            body += f"{self.text}{DocumentToken.END_CAPTION.value}{new_line}"
+        if add_caption and self.text is not None and len(self.text) > 0:
+            body += f"{DocumentToken.BEG_CAPTION.value}{self.text.strip()}{DocumentToken.END_CAPTION.value}{new_line}"
 
         if self.data is not None and len(self.data) > 0:
             for i, row in enumerate(self.data):
                 body += f"<row_{i}>"
                 for j, col in enumerate(row):
-                    text = col.text
-                    body += f"<col_{j}>{text}</col_{j}>"
+
+                    text = ""
+                    if add_cell_text:
+                        text = col.text.strip()
+
+                    cell_loc = ""
+                    if (
+                        col.bbox is not None
+                        and add_cell_location
+                        and page_tagging
+                        and self.prov is not None
+                        and len(self.prov) > 0
+                    ):
+                        cell_loc = DocumentToken.get_location(
+                            bbox=col.bbox,
+                            page_w=page_w,
+                            page_h=page_h,
+                            xsize=xsize,
+                            ysize=ysize,
+                            page_i=self.prov[0].page,
+                        )
+                    elif (
+                        col.bbox is not None and add_cell_location and not page_tagging
+                    ):
+                        cell_loc = DocumentToken.get_location(
+                            bbox=col.bbox,
+                            page_w=page_w,
+                            page_h=page_h,
+                            xsize=xsize,
+                            ysize=ysize,
+                            page_i=-1,
+                        )
+
+                    cell_label = ""
+                    if (
+                        add_cell_label
+                        and col.obj_type is not None
+                        and len(col.obj_type) > 0
+                    ):
+                        cell_label = f"<{col.obj_type}>"
+
+                    body += f"<col_{j}>{cell_loc}{cell_label}{text}</col_{j}>"
 
                 body += f"</row_{i}>{new_line}"
 
-        body += f"{DocumentToken.BEG_TABLE.value}{new_line}"
+        body += f"{DocumentToken.END_TABLE.value}{new_line}"
 
         return body
 
@@ -295,6 +360,59 @@ def export_to_document_tokens(
 class Figure(BaseCell):
     """Figure."""
 
+    def export_to_document_tokens(
+        self,
+        new_line: str = "\n",
+        page_w: float = 0.0,
+        page_h: float = 0.0,
+        xsize: int = 100,
+        ysize: int = 100,
+        add_caption: bool = True,
+        add_figure_location: bool = True,
+        page_tagging: bool = True,
+    ):
+        """Serialize this figure as document tokens.
+
+        Emits the begin token, an optional location token, an optional
+        caption and the end token, each followed by ``new_line``.
+
+        Args:
+            new_line: Separator appended after every emitted part.
+            page_w: Page width forwarded to ``DocumentToken.get_location``.
+            page_h: Page height forwarded to ``DocumentToken.get_location``.
+            xsize: Forwarded to ``DocumentToken.get_location`` as ``xsize``.
+            ysize: Forwarded to ``DocumentToken.get_location`` as ``ysize``.
+            add_caption: Emit the stripped ``self.text`` as caption if set.
+            add_figure_location: Emit a location if ``self.prov`` is non-empty.
+            page_tagging: Pass the provenance page to the location token;
+                when False, ``-1`` is passed instead.
+
+        Returns:
+            str: The serialized figure.
+        """
+        parts = [DocumentToken.BEG_FIGURE.value]
+
+        if add_figure_location and self.prov is not None and len(self.prov) > 0:
+            first = self.prov[0]  # only the first provenance entry is used
+            parts.append(
+                DocumentToken.get_location(
+                    bbox=first.bbox,
+                    page_w=page_w,
+                    page_h=page_h,
+                    xsize=xsize,
+                    ysize=ysize,
+                    page_i=first.page if page_tagging else -1,
+                )
+            )
+
+        if add_caption and self.text is not None and len(self.text) > 0:
+            caption = f"{DocumentToken.BEG_CAPTION.value}{self.text.strip()}"
+            parts.append(f"{caption}{DocumentToken.END_CAPTION.value}")
+
+        parts.append(DocumentToken.END_FIGURE.value)
+        # Every part, including the last one, is terminated by ``new_line``.
+        return "".join(f"{part}{new_line}" for part in parts)
+
 
 class BaseText(AliasModel):
     """Base model for text objects."""
@@ -311,6 +429,61 @@ class BaseText(AliasModel):
     font: Optional[str] = None
     prov: Optional[list[Prov]] = None
 
+    def export_to_document_tokens(
+        self,
+        new_line: str = "\n",
+        page_w: float = 0.0,
+        page_h: float = 0.0,
+        xsize: int = 100,
+        ysize: int = 100,
+        add_location: bool = True,
+        page_tagging: bool = True,
+    ):
+        """Export text element to document tokens format.
+
+        The element is emitted as ``<obj_type>{location}{text}</obj_type>``
+        followed by ``new_line``; the text is stripped of surrounding
+        whitespace.
+
+        Args:
+            new_line: String appended after the closing tag.
+            page_w: Page width forwarded to ``DocumentToken.get_location``.
+            page_h: Page height forwarded to ``DocumentToken.get_location``.
+            xsize: Forwarded to ``DocumentToken.get_location`` as ``xsize``.
+            ysize: Forwarded to ``DocumentToken.get_location`` as ``ysize``.
+            add_location: Emit a location token when ``self.prov`` is non-empty.
+            page_tagging: Pass the provenance page number to the location
+                token; when False, ``-1`` is passed instead.
+
+        Returns:
+            str: The serialized element.
+
+        Raises:
+            AssertionError: If ``self.obj_type`` fails ``is_known_token``.
+        """
+        # ``obj_type`` and ``text`` are attributes of this element, so they
+        # are read from ``self`` (they are not parameters or module globals).
+        obj_type = self.obj_type
+
+        assert DocumentToken.is_known_token(
+            obj_type
+        ), f"failed DocumentToken.is_known_token({obj_type})"
+
+        body = f"<{obj_type}>"
+
+        # Only the first provenance entry is used for the location token.
+        if add_location and self.prov is not None and len(self.prov) > 0:
+            body += DocumentToken.get_location(
+                bbox=self.prov[0].bbox,
+                page_w=page_w,
+                page_h=page_h,
+                xsize=xsize,
+                ysize=ysize,
+                page_i=self.prov[0].page if page_tagging else -1,
+            )
+
+        return f"{body}{self.text.strip()}</{obj_type}>{new_line}"
+
 
 class ListItem(BaseText):
     """List item."""

diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
@@ -424,6 +424,16 @@ def _resolve_ref(self, item: Ref) -> Optional[Union[BaseCell, BaseText]]:
 
         return result
 
+    def get_map_to_page_dimensions(self):
+        """Map page numbers to their ``[width, height]`` dimensions.
+
+        Returns:
+            dict: ``page -> [width, height]`` built from
+                ``self.page_dimensions``; empty when that attribute is None.
+        """
+        dims = [] if self.page_dimensions is None else self.page_dimensions
+        return {dim.page: [dim.width, dim.height] for dim in dims}
+
     def export_to_markdown(
         self,
         delim: str = "\n\n",
@@ -584,12 +594,14 @@ def export_to_document_tokens(
         Returns:
             str: The content of the document formatted as an XML string.
         """
-        xml_str = DocumentToken.BEG_DOCUMENT.value
-
         new_line = ""
         if add_new_line:
             new_line = "\n"
 
+        xml_str = f"{DocumentToken.BEG_DOCUMENT.value}{new_line}"
+
+        # pagedims = self.get_map_to_page_dimensions()
+
         if self.main_text is not None:
             for orig_item in self.main_text[main_text_start:main_text_stop]:
 
@@ -605,19 +617,23 @@ def export_to_document_tokens(
                 prov = item.prov
 
                 loc_str = ""  # default is zero
+                page_w = 0.0
+                page_h = 0.0
+
                 if (
                     location_tagging
                     and self.page_dimensions is not None
                     and prov is not None
                     and len(prov) > 0
                 ):
 
-                    page = prov[0].page
+                    prov[0].page
                     page_dim = self.page_dimensions[page - 1]
 
                     page_w = float(page_dim.width)
                     page_h = float(page_dim.height)
 
+                    """
                     x0 = float(prov[0].bbox[0]) / float(page_w)
                     y0 = float(prov[0].bbox[1]) / float(page_h)
                     x1 = float(prov[0].bbox[2]) / float(page_w)
@@ -645,15 +661,41 @@ def export_to_document_tokens(
                     loc_str += f"{page_tok}"
                     loc_str += f"{x0_tok}{y0_tok}{x1_tok}{y1_tok}"
                     loc_str += f"{DocumentToken.END_LOCATION.value}"
+                    """
 
                 item_type = item.obj_type
                 if isinstance(item, BaseText) and (item_type in main_text_labels):
-                    text = item.text
 
+                    xml_str += item.export_to_document_tokens(
+                        new_line=new_line,
+                        page_w=page_w,
+                        page_h=page_h,
+                        xsize=location_dimensions[0],
+                        ysize=location_dimensions[1],
+                    )
+
+                    """
+                    text = item.text
                     xml_str += f"<{item_type}>{loc_str}{text}</{item_type}>{new_line}"
+                    """
 
                 elif isinstance(item, Table) and (item_type in main_text_labels):
 
+                    xml_str += item.export_to_document_tokens(
+                        new_line=new_line,
+                        page_w=page_w,
+                        page_h=page_h,
+                        xsize=location_dimensions[0],
+                        ysize=location_dimensions[1],
+                        add_caption=True,
+                        add_table_location=True,
+                        add_cell_location=False,
+                        add_cell_label=True,
+                        add_cell_text=True,
+                        page_tagging=page_tagging,
+                    )
+
+                    """
                     xml_str += f"<{item_type}>{loc_str}"
 
                     if item.text is not None and len(item.text) > 0:
@@ -672,9 +714,22 @@ def export_to_document_tokens(
                             xml_str += f"</row_{i}>{new_line}"
 
                     xml_str += f"</{item_type}>{new_line}"
+                    """
 
                 elif isinstance(item, Figure) and (item_type in main_text_labels):
 
+                    xml_str += item.export_to_document_tokens(
+                        new_line=new_line,
+                        page_w=page_w,
+                        page_h=page_h,
+                        xsize=location_dimensions[0],
+                        ysize=location_dimensions[1],
+                        add_caption=True,
+                        add_figure_location=True,
+                        page_tagging=page_tagging,
+                    )
+
+                    """
                     xml_str += f"<{item_type}>{loc_str}"
 
                     if item.text is not None and len(item.text) > 0:
@@ -684,6 +739,7 @@ def export_to_document_tokens(
                         )
 
                     xml_str += f"</{item_type}>{new_line}"
+                    """
 
         xml_str += DocumentToken.END_DOCUMENT.value