diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index 2dda5784..7bd2ece0 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -1881,7 +1881,6 @@ def iterate_items( page_no: Optional[int] = None, included_content_layers: set[ContentLayer] = DEFAULT_CONTENT_LAYERS, _level: int = 0, # fixed parameter, carries through the node nesting level - traverse_inline: bool = True, ) -> typing.Iterable[Tuple[NodeItem, int]]: # tuple of node and level """iterate_elements. @@ -1890,7 +1889,6 @@ def iterate_items( :param traverse_pictures: bool: (Default value = False) :param page_no: Optional[int]: (Default value = None) :param _level: (Default value = 0) - :param traverse_inline: bool: (Default value = True) :param # fixed parameter: :param carries through the node nesting level: """ @@ -1915,12 +1913,8 @@ def iterate_items( if should_yield: yield root, _level - # Prevent children traversal when necessary - if (isinstance(root, PictureItem) and not traverse_pictures) or ( - isinstance(root, GroupItem) - and root.label == GroupLabel.INLINE - and not traverse_inline - ): + # Handle picture traversal - only traverse children if requested + if isinstance(root, PictureItem) and not traverse_pictures: return # Traverse children @@ -1933,7 +1927,6 @@ def iterate_items( traverse_pictures=traverse_pictures, page_no=page_no, _level=_level + 1, - traverse_inline=traverse_inline, ) def _clear_picture_pil_cache(self): @@ -2243,7 +2236,7 @@ def export_to_markdown( # noqa: C901 text_width=text_width, page_no=page_no, is_inline_scope=False, - visited_ids=set(), + visited=set(), ) return tmp or "" @@ -2262,17 +2255,13 @@ def _export_to_markdown( # noqa: C901 text_width: int, page_no: Optional[int], is_inline_scope: bool, - visited_ids: set[str], + visited: set[str], # refs of visited items ) -> Optional[str]: paragraphs: list[str] = [] list_item_texts: list[str] = [] list_nesting_level = 0 # Track the current list nesting level previous_level = 0 # Track the previous item's level in_list = False # Track if we're currently processing list items - if node.self_ref in visited_ids: - return None - else: - visited_ids.add(node.self_ref) # Our export markdown doesn't contain any emphasis styling: # Bold, Italic, or Bold-Italic @@ -2322,9 +2311,13 @@ def _ingest_text(text: str, do_escape_html=True, do_escape_underscores=True): node, with_groups=True, page_no=page_no, - traverse_inline=is_inline_scope, ) ): + if item.self_ref in visited: + continue + else: + visited.add(item.self_ref) + # If we've moved to a lower level, we're exiting one or more groups if level < previous_level: # Calculate how many levels we've exited @@ -2377,7 +2370,7 @@ def _ingest_text(text: str, do_escape_html=True, do_escape_underscores=True): text_width=text_width, page_no=page_no, is_inline_scope=True, - visited_ids=visited_ids, + visited=visited, ) if inline_text: _ingest_text(inline_text)