Skip to content

Commit

Permalink
extend nesting logic to list groups
Browse files: browse the repository at this point in the history
Signed-off-by: Panos Vagenas <[email protected]>
  • Loading branch information
vagenas committed Feb 21, 2025
1 parent f2af200 commit 1953b73
Show file tree
Hide file tree
Showing 34 changed files with 1,225 additions and 331 deletions.
156 changes: 69 additions & 87 deletions docling_core/types/doc/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -2358,9 +2358,8 @@ def export_to_markdown( # noqa: C901
:returns: The exported Markdown representation.
:rtype: str
"""
return self._export_to_markdown(
comps = self._get_markdown_components(
node=self.body,
delim=delim,
from_element=from_element,
to_element=to_element,
labels=labels,
Expand All @@ -2372,14 +2371,15 @@ def export_to_markdown( # noqa: C901
text_width=text_width,
page_no=page_no,
included_content_layers=included_content_layers,
list_level=0,
is_inline_scope=False,
visited=set(),
)
return delim.join(comps)

def _export_to_markdown( # noqa: C901
def _get_markdown_components( # noqa: C901
self,
node: NodeItem,
delim: str,
from_element: int,
to_element: int,
labels: set[DocItemLabel],
Expand All @@ -2391,14 +2391,11 @@ def _export_to_markdown( # noqa: C901
text_width: int,
page_no: Optional[int],
included_content_layers: set[ContentLayer],
list_level: int,
is_inline_scope: bool,
visited: set[str], # refs of visited items
) -> str:
) -> list[str]:
components: list[str] = [] # components to concatenate
list_item_texts: list[str] = []
list_nesting_level = 0 # Track the current list nesting level
previous_level = 0 # Track the previous item's level
in_list = False # Track if we're currently processing list items

# Our export markdown doesn't contain any emphasis styling:
# Bold, Italic, or Bold-Italic
Expand Down Expand Up @@ -2438,9 +2435,7 @@ def _ingest_text(text: str, do_escape_html=True, do_escape_underscores=True):
text = _escape_underscores(text)
if do_escape_html:
text = html.escape(text, quote=False)
if in_list:
list_item_texts.append(text)
else:
if text:
components.append(text)

for ix, (item, level) in enumerate(
Expand All @@ -2456,66 +2451,74 @@ def _ingest_text(text: str, do_escape_html=True, do_escape_underscores=True):
else:
visited.add(item.self_ref)

# If we've moved to a lower level, we're exiting one or more groups
if level < previous_level:
# Calculate how many levels we've exited
level_difference = previous_level - level
# Decrement list_nesting_level for each list group we've exited
list_nesting_level = max(0, list_nesting_level - level_difference)

previous_level = level # Update previous_level for next iteration

if ix < from_element or to_element <= ix:
continue # skip as many items as you want

if (isinstance(item, DocItem)) and (item.label not in labels):
elif (isinstance(item, DocItem)) and (item.label not in labels):
continue # skip any label that is not whitelisted

in_list = (
isinstance(item, GroupItem)
and item.label
in [
elif isinstance(item, GroupItem):
if item.label in [
GroupLabel.LIST,
GroupLabel.ORDERED_LIST,
]
) or isinstance(item, ListItem)

# dump list in case just completed
if len(list_item_texts) > 0 and not in_list:
components.append("\n".join(list_item_texts))
list_item_texts.clear()

if isinstance(item, GroupItem) and item.label in [
GroupLabel.LIST,
GroupLabel.ORDERED_LIST,
]:
# Increment list nesting level when entering a new list
list_nesting_level += 1
continue

if isinstance(item, GroupItem) and item.label == GroupLabel.INLINE:
inline_text = self._export_to_markdown(
node=item,
delim=" ",
from_element=from_element,
to_element=to_element,
labels=labels,
strict_text=strict_text,
escaping_underscores=escaping_underscores,
image_placeholder=image_placeholder,
image_mode=image_mode,
indent=indent,
text_width=text_width,
page_no=page_no,
included_content_layers=included_content_layers,
is_inline_scope=True,
visited=visited,
)
if inline_text:
_ingest_text(inline_text)

elif isinstance(item, GroupItem):
continue
]:
comps = self._get_markdown_components(
node=item,
from_element=from_element,
to_element=to_element,
labels=labels,
strict_text=strict_text,
escaping_underscores=escaping_underscores,
image_placeholder=image_placeholder,
image_mode=image_mode,
indent=indent,
text_width=text_width,
page_no=page_no,
included_content_layers=included_content_layers,
list_level=list_level + 1,
is_inline_scope=is_inline_scope,
visited=visited,
)
# NOTE: assumes unordered (flag & marker currently in ListItem)
indent_str = f"{(list_level if components else 0) * indent * ' '}"
text = "\n".join(
[
(
# if starting with sublist, promote to top-level
cpt.lstrip()
if not components
else (
# avoid adding another marker to sublists that were already evaluated
cpt
if cpt and cpt[0] == " "
else f"{indent_str}- {cpt}"
)
)
for cpt in comps
]
)
_ingest_text(text=text)
elif item.label == GroupLabel.INLINE:
comps = self._get_markdown_components(
node=item,
from_element=from_element,
to_element=to_element,
labels=labels,
strict_text=strict_text,
escaping_underscores=escaping_underscores,
image_placeholder=image_placeholder,
image_mode=image_mode,
indent=indent,
text_width=text_width,
page_no=page_no,
included_content_layers=included_content_layers,
list_level=list_level,
is_inline_scope=True,
visited=visited,
)
_ingest_text(" ".join(comps))
else:
continue

elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]:
marker = "" if strict_text else "#"
Expand All @@ -2538,22 +2541,6 @@ def _ingest_text(text: str, do_escape_html=True, do_escape_underscores=True):
text = f"`{item.text}`" if is_inline_scope else f"```\n{item.text}\n```"
_ingest_text(text, do_escape_underscores=False, do_escape_html=False)

elif isinstance(item, ListItem) and item.label in [DocItemLabel.LIST_ITEM]:
# Calculate indent based on list_nesting_level
# -1 because level 1 needs no indent
list_indent = " " * (indent * (list_nesting_level - 1))

marker = ""
if strict_text:
marker = ""
elif item.enumerated:
marker = item.marker
else:
marker = "-" # Markdown needs only dash as item marker.

text = f"{list_indent}{marker} {item.text}"
_ingest_text(text)

elif isinstance(item, TextItem) and item.label in [DocItemLabel.FORMULA]:
if item.text != "":
_ingest_text(
Expand Down Expand Up @@ -2597,12 +2584,7 @@ def _ingest_text(text: str, do_escape_html=True, do_escape_underscores=True):
text = "<!-- missing-text -->"
_ingest_text(text, do_escape_html=False, do_escape_underscores=False)

# dump any trailing list
if len(list_item_texts) > 0:
components.append("\n".join(list_item_texts))

mdtext = (delim.join(components)).strip()
return mdtext
return components

def export_to_text( # noqa: C901
self,
Expand Down
2 changes: 1 addition & 1 deletion test/data/doc/2206.01062.yaml.dt
Original file line number Diff line number Diff line change
Expand Up @@ -134,4 +134,4 @@
<list_item><loc_259><loc_332><loc_457><loc_342>[20] Shoubin Li, Xuyan Ma, Shuaiqun Pan, Jun Hu, Lin Shi, and Qing Wang. Vtlayout: Fusion of visual and text features for document layout analysis, 2021.</list_item>
<list_item><loc_260><loc_358><loc_457><loc_377>[22] Peter W J Staar, Michele Dolfi, Christoph Auer, and Costas Bekas. Corpus conversion service: A machine learning platform to ingest documents at scale. In Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining , KDD, pages 774-782. ACM, 2018.</list_item>
<list_item><loc_260><loc_378><loc_457><loc_387>[23] Connor Shorten and Taghi M. Khoshgoftaar. A survey on image data augmentation for deep learning. Journal of Big Data , 6(1):60, 2019.</list_item>
</doctag>
</doctag>
2 changes: 1 addition & 1 deletion test/data/doc/2206.01062.yaml.et
Original file line number Diff line number Diff line change
Expand Up @@ -127,4 +127,4 @@
126: list_item
127: list_item
128: list_item
129: list_item
129: list_item
2 changes: 1 addition & 1 deletion test/data/doc/2206.01062.yaml.html
Original file line number Diff line number Diff line change
Expand Up @@ -185,4 +185,4 @@ <h2>REFERENCES</h2>
<li>[20] Shoubin Li, Xuyan Ma, Shuaiqun Pan, Jun Hu, Lin Shi, and Qing Wang. Vtlayout: Fusion of visual and text features for document layout analysis, 2021.</li>
<li>[22] Peter W J Staar, Michele Dolfi, Christoph Auer, and Costas Bekas. Corpus conversion service: A machine learning platform to ingest documents at scale. In Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining , KDD, pages 774-782. ACM, 2018.</li>
<li>[23] Connor Shorten and Taghi M. Khoshgoftaar. A survey on image data augmentation for deep learning. Journal of Big Data , 6(1):60, 2019.</li>
</html>
</html>
Loading

0 comments on commit 1953b73

Please sign in to comment.