Skip to content

Commit

Permalink
(MD export) add inline formulas, placeholders for `KeyValueItem`, `FormItem`
Browse files Browse the repository at this point in the history

Signed-off-by: Panos Vagenas <[email protected]>
  • Loading branch information
vagenas committed Feb 24, 2025
1 parent 1953b73 commit d28c100
Show file tree
Hide file tree
Showing 18 changed files with 841 additions and 56 deletions.
20 changes: 16 additions & 4 deletions docling_core/types/doc/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -1397,6 +1397,10 @@ class KeyValueItem(FloatingItem):

graph: GraphData

def _export_to_markdown(self) -> str:
# Placeholder Markdown rendering for a key-value item: returns a fixed HTML
# comment, so the item leaves a marker in the exported Markdown instead of
# its graph content. Takes no arguments besides self and always returns
# the same string.
# TODO add actual implementation
return "<!-- missing-key-value-item -->"


class FormItem(FloatingItem):
"""FormItem."""
Expand All @@ -1405,6 +1409,10 @@ class FormItem(FloatingItem):

graph: GraphData

def _export_to_markdown(self) -> str:
# Placeholder Markdown rendering for a form item: returns a fixed HTML
# comment, so the item leaves a marker in the exported Markdown instead of
# its graph content. Takes no arguments besides self and always returns
# the same string.
# TODO add actual implementation
return "<!-- missing-form-item -->"


ContentItem = Annotated[
Union[
Expand Down Expand Up @@ -2537,14 +2545,14 @@ def _ingest_text(text: str, do_escape_html=True, do_escape_underscores=True):
text = f"{marker} {item.text}"
_ingest_text(text.strip())

elif isinstance(item, CodeItem) and item.label in labels:
elif isinstance(item, CodeItem):
text = f"`{item.text}`" if is_inline_scope else f"```\n{item.text}\n```"
_ingest_text(text, do_escape_underscores=False, do_escape_html=False)

elif isinstance(item, TextItem) and item.label in [DocItemLabel.FORMULA]:
if item.text != "":
_ingest_text(
f"$${item.text}$$",
f"${item.text}$" if is_inline_scope else f"$${item.text}$$",
do_escape_underscores=False,
do_escape_html=False,
)
Expand All @@ -2555,7 +2563,7 @@ def _ingest_text(text: str, do_escape_html=True, do_escape_underscores=True):
do_escape_html=False,
)

elif isinstance(item, TextItem) and item.label in labels:
elif isinstance(item, TextItem):
if len(item.text) and text_width > 0:
text = item.text
wrapped_text = textwrap.fill(text, width=text_width)
Expand All @@ -2580,7 +2588,11 @@ def _ingest_text(text: str, do_escape_html=True, do_escape_underscores=True):

_ingest_text(line, do_escape_html=False, do_escape_underscores=False)

elif isinstance(item, DocItem) and item.label in labels:
elif isinstance(item, (KeyValueItem, FormItem)):
text = item._export_to_markdown()
_ingest_text(text, do_escape_html=False, do_escape_underscores=False)

elif isinstance(item, DocItem):
text = "<!-- missing-text -->"
_ingest_text(text, do_escape_html=False, do_escape_underscores=False)

Expand Down
8 changes: 7 additions & 1 deletion test/data/doc/constructed_doc.dt
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,13 @@ Affiliation 2</text>
<code<_unknown_>print("Hello world")</code
<paragraph>(to be displayed inline)</paragraph>
</unordered_list>
<paragraph>Here a formula:</paragraph>
<formula>E=mc^2</formula>
<paragraph>(to be displayed inline)</paragraph>
</unordered_list>
<paragraph>And a code block:</paragraph>
<paragraph>Here a code block:</paragraph>
<code<_unknown_>print("Hello world")</code
<paragraph>Here a formula block:</paragraph>
<formula>E=mc^2</formula>
<paragraph>The end.</paragraph>
</doctag>
8 changes: 7 additions & 1 deletion test/data/doc/constructed_doc.dt.gt
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,13 @@ Affiliation 2</text>
<code<_unknown_>print("Hello world")</code
<paragraph>(to be displayed inline)</paragraph>
</unordered_list>
<paragraph>Here a formula:</paragraph>
<formula>E=mc^2</formula>
<paragraph>(to be displayed inline)</paragraph>
</unordered_list>
<paragraph>And a code block:</paragraph>
<paragraph>Here a code block:</paragraph>
<code<_unknown_>print("Hello world")</code
<paragraph>Here a formula block:</paragraph>
<formula>E=mc^2</formula>
<paragraph>The end.</paragraph>
</doctag>
8 changes: 7 additions & 1 deletion test/data/doc/constructed_doc.embedded.html.gt
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,13 @@
<pre><code>print("Hello world")</code></pre>
<p>(to be displayed inline)</p>
</ul>
<p>Here a formula:</p>
<div><math xmlns="http://www.w3.org/1998/Math/MathML" display="block"><mrow><mi>E</mi><mo>&#x0003D;</mo><mi>m</mi><msup><mi>c</mi><mn>2</mn></msup></mrow><annotation encoding="TeX">E=mc^2</annotation></math></div>
<p>(to be displayed inline)</p>
</ul>
<p>And a code block:</p>
<p>Here a code block:</p>
<pre><code>print("Hello world")</code></pre>
<p>Here a formula block:</p>
<div><math xmlns="http://www.w3.org/1998/Math/MathML" display="block"><mrow><mi>E</mi><mo>&#x0003D;</mo><mi>m</mi><msup><mi>c</mi><mn>2</mn></msup></mrow><annotation encoding="TeX">E=mc^2</annotation></math></div>
<p>The end.</p>
</html>
208 changes: 201 additions & 7 deletions test/data/doc/constructed_doc.embedded.json.gt
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,25 @@
"$ref": "#/groups/5"
},
{
"$ref": "#/texts/24"
"$ref": "#/texts/27"
},
{
"$ref": "#/texts/25"
"$ref": "#/texts/28"
},
{
"$ref": "#/texts/29"
},
{
"$ref": "#/texts/30"
},
{
"$ref": "#/key_value_items/0"
},
{
"$ref": "#/form_items/0"
},
{
"$ref": "#/texts/31"
}
],
"content_layer": "body",
Expand Down Expand Up @@ -185,6 +200,9 @@
},
{
"$ref": "#/groups/7"
},
{
"$ref": "#/groups/8"
}
],
"content_layer": "body",
Expand All @@ -210,6 +228,26 @@
"content_layer": "body",
"name": "group",
"label": "inline"
},
{
"self_ref": "#/groups/8",
"parent": {
"$ref": "#/groups/6"
},
"children": [
{
"$ref": "#/texts/24"
},
{
"$ref": "#/texts/25"
},
{
"$ref": "#/texts/26"
}
],
"content_layer": "body",
"name": "group",
"label": "inline"
}
],
"texts": [
Expand Down Expand Up @@ -542,17 +580,53 @@
{
"self_ref": "#/texts/24",
"parent": {
"$ref": "#/body"
"$ref": "#/groups/8"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "And a code block:",
"text": "And a code block:"
"orig": "Here a formula:",
"text": "Here a formula:"
},
{
"self_ref": "#/texts/25",
"parent": {
"$ref": "#/groups/8"
},
"children": [],
"content_layer": "body",
"label": "formula",
"prov": [],
"orig": "E=mc^2",
"text": "E=mc^2"
},
{
"self_ref": "#/texts/26",
"parent": {
"$ref": "#/groups/8"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "(to be displayed inline)",
"text": "(to be displayed inline)"
},
{
"self_ref": "#/texts/27",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "Here a code block:",
"text": "Here a code block:"
},
{
"self_ref": "#/texts/28",
"parent": {
"$ref": "#/body"
},
Expand All @@ -566,6 +640,42 @@
"references": [],
"footnotes": [],
"code_language": "unknown"
},
{
"self_ref": "#/texts/29",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "Here a formula block:",
"text": "Here a formula block:"
},
{
"self_ref": "#/texts/30",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "formula",
"prov": [],
"orig": "E=mc^2",
"text": "E=mc^2"
},
{
"self_ref": "#/texts/31",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "The end.",
"text": "The end."
}
],
"pictures": [
Expand Down Expand Up @@ -840,7 +950,91 @@
}
}
],
"key_value_items": [],
"form_items": [],
"key_value_items": [
{
"self_ref": "#/key_value_items/0",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "key_value_region",
"prov": [],
"captions": [],
"references": [],
"footnotes": [],
"graph": {
"cells": [
{
"label": "key",
"cell_id": 0,
"text": "number",
"orig": "#"
},
{
"label": "value",
"cell_id": 1,
"text": "1",
"orig": "1"
}
],
"links": [
{
"label": "to_value",
"source_cell_id": 0,
"target_cell_id": 1
},
{
"label": "to_key",
"source_cell_id": 1,
"target_cell_id": 0
}
]
}
}
],
"form_items": [
{
"self_ref": "#/form_items/0",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "form",
"prov": [],
"captions": [],
"references": [],
"footnotes": [],
"graph": {
"cells": [
{
"label": "key",
"cell_id": 0,
"text": "number",
"orig": "#"
},
{
"label": "value",
"cell_id": 1,
"text": "1",
"orig": "1"
}
],
"links": [
{
"label": "to_value",
"source_cell_id": 0,
"target_cell_id": 1
},
{
"label": "to_key",
"source_cell_id": 1,
"target_cell_id": 0
}
]
}
}
],
"pages": {}
}
11 changes: 9 additions & 2 deletions test/data/doc/constructed_doc.embedded.md.gt
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,16 @@ This is the caption of figure 2.
- item 2 of neighboring list
- item 1 of sub list
- Here a code snippet: `print("Hello world")` (to be displayed inline)
- Here a formula: $E=mc^2$ (to be displayed inline)

And a code block:
Here a code block:

```
print("Hello world")
```
```

Here a formula block:

$$E=mc^2$$

The end.
Loading

0 comments on commit d28c100

Please sign in to comment.