Skip to content

Commit

Permalink
fixed the tests
Browse files Browse the repository at this point in the history
Signed-off-by: Matteo-Omenetti <[email protected]>
  • Loading branch information
Matteo-Omenetti authored and Matteo-Omenetti committed Jan 29, 2025
1 parent da4a835 commit fabf75f
Show file tree
Hide file tree
Showing 7 changed files with 188 additions and 157 deletions.
25 changes: 23 additions & 2 deletions docling_core/types/doc/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -2577,6 +2577,16 @@ def close_lists(

return (in_ordered_list, result)

def add_page_no(result: str, item, previous_page_no, delim: str):
    """Append a page token to ``result`` when ``item`` starts a new page.

    The page of an item is read from its first provenance entry
    (``item.prov[0].page_no``).

    Args:
        result: Token string accumulated so far.
        item: Object exposing a ``prov`` sequence whose entries carry a
            ``page_no`` attribute.
        previous_page_no: Page number carried over from the previous call,
            or ``None`` when no page has been recorded yet.
        delim: Delimiter appended right after the page token.

    Returns:
        A ``(result, page_no)`` tuple: the possibly extended string and the
        page number to carry forward into the next call.
    """
    prov_list = item.prov
    # No provenance (empty or missing) means there is no page to report:
    # hand both values back untouched so the caller's state is preserved.
    if not prov_list:
        return result, previous_page_no

    current_page_no = prov_list[0].page_no
    # Emit the token when no page was recorded yet and on every page change.
    if previous_page_no is None or current_page_no != previous_page_no:
        result += f"<page_{current_page_no}>{delim}"

    return result, current_page_no

if newline:
delim = "\n"
else:
Expand All @@ -2587,11 +2597,10 @@ def close_lists(
in_ordered_list: List[bool] = [] # False

result = f"{DocumentToken.BEG_DOCUMENT.value}{delim}"

previous_page_no = None
for ix, (item, curr_level) in enumerate(
self.iterate_items(self.body, with_groups=True)
):

# If we've moved to a lower level, we're exiting one or more groups
if curr_level < prev_level and len(in_ordered_list) > 0:
# Calculate how many levels we've exited
Expand Down Expand Up @@ -2630,6 +2639,9 @@ def close_lists(
in_ordered_list.append(False)

elif isinstance(item, SectionHeaderItem):
result, previous_page_no = add_page_no(
result, item, previous_page_no, delim
)

result += item.export_to_document_tokens(
doc=self,
Expand All @@ -2653,6 +2665,9 @@ def close_lists(
)

elif isinstance(item, TextItem) and (item.label in labels):
result, previous_page_no = add_page_no(
result, item, previous_page_no, delim
)

result += item.export_to_document_tokens(
doc=self,
Expand All @@ -2665,6 +2680,9 @@ def close_lists(
)

elif isinstance(item, TableItem) and (item.label in labels):
result, previous_page_no = add_page_no(
result, item, previous_page_no, delim
)

result += item.export_to_document_tokens(
doc=self,
Expand All @@ -2681,6 +2699,9 @@ def close_lists(
)

elif isinstance(item, PictureItem) and (item.label in labels):
result, previous_page_no = add_page_no(
result, item, previous_page_no, delim
)

result += item.export_to_document_tokens(
doc=self,
Expand Down
265 changes: 137 additions & 128 deletions test/data/doc/2206.01062.yaml.dt

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions test/data/doc/bad_doc.yaml.dt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
<document>
<doctag>
<title>This is the title</title>
<section_header_level_1>This is the first section</section_header_level_1>
</document>
</doctag>
12 changes: 6 additions & 6 deletions test/data/doc/constructed_doc.dt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
<document>
<doctag>
<title>Title of the Document</title>
<text>Author 1
Affiliation 1</text>
Expand All @@ -23,10 +23,10 @@ Affiliation 2</text>
<row_1><col_0><body>Product</col_0><col_1><body>2016</col_1><col_2><body>2017</col_2></row_1>
<row_2><col_0><body>Apple</col_0><col_1><body>49823</col_1><col_2><body>695944</col_2></row_2>
</table>
<figure>
<picture>
<caption>This is the caption of figure 1.</caption>
</figure>
<figure>
</picture>
<picture>
<caption>This is the caption of figure 2.</caption>
</figure>
</document>
</picture>
</doctag>
12 changes: 6 additions & 6 deletions test/data/doc/constructed_doc.dt.gt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
<document>
<doctag>
<title>Title of the Document</title>
<text>Author 1
Affiliation 1</text>
Expand All @@ -23,10 +23,10 @@ Affiliation 2</text>
<row_1><col_0><body>Product</col_0><col_1><body>2016</col_1><col_2><body>2017</col_2></row_1>
<row_2><col_0><body>Apple</col_0><col_1><body>49823</col_1><col_2><body>695944</col_2></row_2>
</table>
<figure>
<picture>
<caption>This is the caption of figure 1.</caption>
</figure>
<figure>
</picture>
<picture>
<caption>This is the caption of figure 2.</caption>
</figure>
</document>
</picture>
</doctag>
12 changes: 6 additions & 6 deletions test/data/doc/constructed_document.yaml.dt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
<document>
<doctag>
<title>Title of the Document</title>
<text>Author 1
Affiliation 1</text>
Expand All @@ -23,10 +23,10 @@ Affiliation 2</text>
<row_1><col_0><body>Product</col_0><col_1><body>2016</col_1><col_2><body>2017</col_2></row_1>
<row_2><col_0><body>Apple</col_0><col_1><body>49823</col_1><col_2><body>695944</col_2></row_2>
</table>
<figure>
<picture>
<caption>This is the caption of figure 1.</caption>
</figure>
<figure>
</picture>
<picture>
<caption>This is the caption of figure 2.</caption>
</figure>
</document>
</picture>
</doctag>
15 changes: 8 additions & 7 deletions test/data/doc/dummy_doc.yaml.dt
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
<document>
<title><location><page_1><loc_8><loc_91><loc_81><loc_95></location>DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis</title>
<figure>
<location><page_1><loc_59><loc_0><loc_91><loc_75></location>
<doctag>
<page_1>
<title><loc_8><loc_91><loc_81><loc_95>DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis</title>
<picture>
<loc_59><loc_0><loc_91><loc_75>
<caption>Figure 1: Four examples of complex page layouts across different document categories</caption>
</figure>
</picture>
<table>
<location><page_1><loc_42><loc_57><loc_49><loc_61></location>
<loc_42><loc_57><loc_49><loc_61>
</table>
</document>
</doctag>

0 comments on commit fabf75f

Please sign in to comment.