Skip to content

Commit

Permalink
fix: properly handle missing page image case for export_to_html (#166)
Browse files Browse the repository at this point in the history
* fix: properly handle missing image case

Signed-off-by: Yusik Kim <[email protected]>

* chore: pre-commit changes

Signed-off-by: Yusik Kim <[email protected]>

* fix: simplify test and address broken test

Signed-off-by: Yusik Kim <[email protected]>

---------

Signed-off-by: Yusik Kim <[email protected]>
  • Loading branch information
kmyusk authored Feb 25, 2025
1 parent 28bd65c commit 4708f93
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 4 deletions.
15 changes: 11 additions & 4 deletions docling_core/types/doc/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -2764,14 +2764,17 @@ def _image_fallback(item: TextItem):
"</figure>"
)

img_fallback = _image_fallback(item)

# If the formula is not processed correctly, use its image
if (
item.text == ""
and item.orig != ""
and image_mode == ImageRefMode.EMBEDDED
and len(item.prov) > 0
and img_fallback is not None
):
text = _image_fallback(item)
text = img_fallback

# Building a math equation in MathML format
# ref https://www.w3.org/TR/wai-aria-1.1/#math
Expand All @@ -2791,9 +2794,13 @@ def _image_fallback(item: TextItem):
"Malformed formula cannot be rendered. "
f"Error {err.__class__.__name__}, formula={math_formula}"
)
if image_mode == ImageRefMode.EMBEDDED and len(item.prov) > 0:
text = _image_fallback(item)
else:
if (
image_mode == ImageRefMode.EMBEDDED
and len(item.prov) > 0
and img_fallback is not None
):
text = img_fallback
elif len(math_formula) > 0:
text = f"<pre>{math_formula}</pre>"

elif math_formula != "":
Expand Down
19 changes: 19 additions & 0 deletions test/test_docling_doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -775,6 +775,25 @@ def test_formula_mathml():
assert doc_html == gt_html


def test_formula_with_missing_fallback():
    """Check HTML export of an undecoded formula when no image fallback exists.

    The document holds a single FORMULA item with empty ``text`` and a
    non-empty ``orig``, carrying provenance for page 1; the test itself never
    adds a page to the document. Exporting with MathML conversion enabled and
    embedded image mode must produce the "Formula not decoded" placeholder.
    """
    document = DoclingDocument(name="Dummy")
    provenance = ProvenanceItem(
        page_no=1,
        bbox=BoundingBox.from_tuple((1, 2, 3, 4), origin=CoordOrigin.BOTTOMLEFT),
        charspan=(0, 2),
    )
    document.add_text(
        label=DocItemLabel.FORMULA,
        text="",
        orig="(II.24) 2 Imar",
        prov=provenance,
    )

    # Continuation lines of the literal stay at column 0: the comparison
    # below is an exact string match against the exported HTML.
    expected_html = """<!DOCTYPE html>
<html lang="en">
<div class="formula-not-decoded">Formula not decoded</div>
</html>"""

    exported_html = document.export_to_html(
        formula_to_mathml=True,
        html_head="",
        image_mode=ImageRefMode.EMBEDDED,
    )

    assert exported_html == expected_html


def test_docitem_get_image():
# Prepare the document
doc = DoclingDocument(name="Dummy")
Expand Down

0 comments on commit 4708f93

Please sign in to comment.