extend nesting logic to list groups

Signed-off-by: Panos Vagenas <[email protected]>
DS4SD · Feb 21, 2025 · 1953b73 · 1953b73
1 parent f2af200
commit 1953b73
Show file tree

Hide file tree

Showing 34 changed files with 1,225 additions and 331 deletions.
diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
@@ -2358,9 +2358,8 @@ def export_to_markdown(  # noqa: C901
         :returns: The exported Markdown representation.
         :rtype: str
         """
-        return self._export_to_markdown(
+        comps = self._get_markdown_components(
             node=self.body,
-            delim=delim,
             from_element=from_element,
             to_element=to_element,
             labels=labels,
@@ -2372,14 +2371,15 @@ def export_to_markdown(  # noqa: C901
             text_width=text_width,
             page_no=page_no,
             included_content_layers=included_content_layers,
+            list_level=0,
             is_inline_scope=False,
             visited=set(),
         )
+        return delim.join(comps)
 
-    def _export_to_markdown(  # noqa: C901
+    def _get_markdown_components(  # noqa: C901
         self,
         node: NodeItem,
-        delim: str,
         from_element: int,
         to_element: int,
         labels: set[DocItemLabel],
@@ -2391,14 +2391,11 @@ def _export_to_markdown(  # noqa: C901
         text_width: int,
         page_no: Optional[int],
         included_content_layers: set[ContentLayer],
+        list_level: int,
         is_inline_scope: bool,
         visited: set[str],  # refs of visited items
-    ) -> str:
+    ) -> list[str]:
         components: list[str] = []  # components to concatenate
-        list_item_texts: list[str] = []
-        list_nesting_level = 0  # Track the current list nesting level
-        previous_level = 0  # Track the previous item's level
-        in_list = False  # Track if we're currently processing list items
 
         # Our export markdown doesn't contain any emphasis styling:
         # Bold, Italic, or Bold-Italic
@@ -2438,9 +2435,7 @@ def _ingest_text(text: str, do_escape_html=True, do_escape_underscores=True):
                 text = _escape_underscores(text)
             if do_escape_html:
                 text = html.escape(text, quote=False)
-            if in_list:
-                list_item_texts.append(text)
-            else:
+            if text:
                 components.append(text)
 
         for ix, (item, level) in enumerate(
@@ -2456,66 +2451,74 @@ def _ingest_text(text: str, do_escape_html=True, do_escape_underscores=True):
             else:
                 visited.add(item.self_ref)
 
-            # If we've moved to a lower level, we're exiting one or more groups
-            if level < previous_level:
-                # Calculate how many levels we've exited
-                level_difference = previous_level - level
-                # Decrement list_nesting_level for each list group we've exited
-                list_nesting_level = max(0, list_nesting_level - level_difference)
-
-            previous_level = level  # Update previous_level for next iteration
-
             if ix < from_element or to_element <= ix:
                 continue  # skip as many items as you want
 
-            if (isinstance(item, DocItem)) and (item.label not in labels):
+            elif (isinstance(item, DocItem)) and (item.label not in labels):
                 continue  # skip any label that is not whitelisted
 
-            in_list = (
-                isinstance(item, GroupItem)
-                and item.label
-                in [
+            elif isinstance(item, GroupItem):
+                if item.label in [
                     GroupLabel.LIST,
                     GroupLabel.ORDERED_LIST,
-                ]
-            ) or isinstance(item, ListItem)
-
-            # dump list in case just completed
-            if len(list_item_texts) > 0 and not in_list:
-                components.append("\n".join(list_item_texts))
-                list_item_texts.clear()
-
-            if isinstance(item, GroupItem) and item.label in [
-                GroupLabel.LIST,
-                GroupLabel.ORDERED_LIST,
-            ]:
-                # Increment list nesting level when entering a new list
-                list_nesting_level += 1
-                continue
-
-            if isinstance(item, GroupItem) and item.label == GroupLabel.INLINE:
-                inline_text = self._export_to_markdown(
-                    node=item,
-                    delim=" ",
-                    from_element=from_element,
-                    to_element=to_element,
-                    labels=labels,
-                    strict_text=strict_text,
-                    escaping_underscores=escaping_underscores,
-                    image_placeholder=image_placeholder,
-                    image_mode=image_mode,
-                    indent=indent,
-                    text_width=text_width,
-                    page_no=page_no,
-                    included_content_layers=included_content_layers,
-                    is_inline_scope=True,
-                    visited=visited,
-                )
-                if inline_text:
-                    _ingest_text(inline_text)
-
-            elif isinstance(item, GroupItem):
-                continue
+                ]:
+                    comps = self._get_markdown_components(
+                        node=item,
+                        from_element=from_element,
+                        to_element=to_element,
+                        labels=labels,
+                        strict_text=strict_text,
+                        escaping_underscores=escaping_underscores,
+                        image_placeholder=image_placeholder,
+                        image_mode=image_mode,
+                        indent=indent,
+                        text_width=text_width,
+                        page_no=page_no,
+                        included_content_layers=included_content_layers,
+                        list_level=list_level + 1,
+                        is_inline_scope=is_inline_scope,
+                        visited=visited,
+                    )
+                    # NOTE: assumes unordered (flag & marker currently in ListItem)
+                    indent_str = f"{(list_level if components else 0) * indent * ' '}"
+                    text = "\n".join(
+                        [
+                            (
+                                # if starting with sublist, promote to top-level
+                                cpt.lstrip()
+                                if not components
+                                else (
+                                    # avoid additional marker on already evaled sublists
+                                    cpt
+                                    if cpt and cpt[0] == " "
+                                    else f"{indent_str}- {cpt}"
+                                )
+                            )
+                            for cpt in comps
+                        ]
+                    )
+                    _ingest_text(text=text)
+                elif item.label == GroupLabel.INLINE:
+                    comps = self._get_markdown_components(
+                        node=item,
+                        from_element=from_element,
+                        to_element=to_element,
+                        labels=labels,
+                        strict_text=strict_text,
+                        escaping_underscores=escaping_underscores,
+                        image_placeholder=image_placeholder,
+                        image_mode=image_mode,
+                        indent=indent,
+                        text_width=text_width,
+                        page_no=page_no,
+                        included_content_layers=included_content_layers,
+                        list_level=list_level,
+                        is_inline_scope=True,
+                        visited=visited,
+                    )
+                    _ingest_text(" ".join(comps))
+                else:
+                    continue
 
             elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]:
                 marker = "" if strict_text else "#"
@@ -2538,22 +2541,6 @@ def _ingest_text(text: str, do_escape_html=True, do_escape_underscores=True):
                 text = f"`{item.text}`" if is_inline_scope else f"```\n{item.text}\n```"
                 _ingest_text(text, do_escape_underscores=False, do_escape_html=False)
 
-            elif isinstance(item, ListItem) and item.label in [DocItemLabel.LIST_ITEM]:
-                # Calculate indent based on list_nesting_level
-                # -1 because level 1 needs no indent
-                list_indent = " " * (indent * (list_nesting_level - 1))
-
-                marker = ""
-                if strict_text:
-                    marker = ""
-                elif item.enumerated:
-                    marker = item.marker
-                else:
-                    marker = "-"  # Markdown needs only dash as item marker.
-
-                text = f"{list_indent}{marker} {item.text}"
-                _ingest_text(text)
-
             elif isinstance(item, TextItem) and item.label in [DocItemLabel.FORMULA]:
                 if item.text != "":
                     _ingest_text(
@@ -2597,12 +2584,7 @@ def _ingest_text(text: str, do_escape_html=True, do_escape_underscores=True):
                 text = "<!-- missing-text -->"
                 _ingest_text(text, do_escape_html=False, do_escape_underscores=False)
 
-        # dump any trailing list
-        if len(list_item_texts) > 0:
-            components.append("\n".join(list_item_texts))
-
-        mdtext = (delim.join(components)).strip()
-        return mdtext
+        return components
 
     def export_to_text(  # noqa: C901
         self,

diff --git a/test/data/doc/2206.01062.yaml.dt b/test/data/doc/2206.01062.yaml.dt
@@ -134,4 +134,4 @@
 <list_item><loc_259><loc_332><loc_457><loc_342>[20] Shoubin Li, Xuyan Ma, Shuaiqun Pan, Jun Hu, Lin Shi, and Qing Wang. Vtlayout: Fusion of visual and text features for document layout analysis, 2021.</list_item>
 <list_item><loc_260><loc_358><loc_457><loc_377>[22] Peter W J Staar, Michele Dolfi, Christoph Auer, and Costas Bekas. Corpus conversion service: A machine learning platform to ingest documents at scale. In Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining , KDD, pages 774-782. ACM, 2018.</list_item>
 <list_item><loc_260><loc_378><loc_457><loc_387>[23] Connor Shorten and Taghi M. Khoshgoftaar. A survey on image data augmentation for deep learning. Journal of Big Data , 6(1):60, 2019.</list_item>
-</doctag>
+</doctag>
diff --git a/test/data/doc/2206.01062.yaml.et b/test/data/doc/2206.01062.yaml.et
@@ -127,4 +127,4 @@
  126: list_item
  127: list_item
  128: list_item
- 129: list_item
+ 129: list_item
diff --git a/test/data/doc/2206.01062.yaml.html b/test/data/doc/2206.01062.yaml.html
@@ -185,4 +185,4 @@ <h2>REFERENCES</h2>
 <li>[20] Shoubin Li, Xuyan Ma, Shuaiqun Pan, Jun Hu, Lin Shi, and Qing Wang. Vtlayout: Fusion of visual and text features for document layout analysis, 2021.</li>
 <li>[22] Peter W J Staar, Michele Dolfi, Christoph Auer, and Costas Bekas. Corpus conversion service: A machine learning platform to ingest documents at scale. In Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining , KDD, pages 774-782. ACM, 2018.</li>
 <li>[23] Connor Shorten and Taghi M. Khoshgoftaar. A survey on image data augmentation for deep learning. Journal of Big Data , 6(1):60, 2019.</li>
-</html>
+</html>