cleaned up the HTML output

Signed-off-by: Peter Staar <[email protected]>
DS4SD · Jun 6, 2024 · 5a724f2 · 5a724f2
1 parent 863626e
commit 5a724f2
Show file tree

Hide file tree

Showing 3 changed files with 138 additions and 13 deletions.
diff --git a/deepsearch/documents/core/export.py b/deepsearch/documents/core/export.py
@@ -204,17 +204,26 @@ def get_body_new(self, data):
         subenumeration = False
         subsubenumeration = False
 
+        opened_list = False
+
         # loop through content
         for item in data["main-text"]:
+
             # if this is a reference, get it
             if "$ref" in item:
                 item = self.get_refs(item["$ref"])
+            if "__ref" in item:
+                item = self.get_refs(item["__ref"])
 
             page = self.get_page(item)
             style = self.get_style(item)
 
             if item["type"] == "subtitle-level-1" or item["type"] == "subtitle":
 
+                if opened_list:
+                    body += "</ul>\n"
+                    opened_list = False
+
                 if subtitle_l6:
                     body += "</section>\n"
                     subtitle_l6 = False
@@ -242,15 +251,19 @@ def get_body_new(self, data):
 
                 body += "<section>\n"
 
-                body += '<p><span class="title" info="subtitle_l1" {page}><span style="{style}">'.format(
+                body += '<h2><span class="title" info="subtitle_l1" {page}><span style="{style}">'.format(
                     page=page, style=style
                 )
                 for bbox, text in self.split_item_in_boxes(item):
                     body += "<bbox {bbox}>{text}</bbox>\n".format(bbox=bbox, text=text)
-                body += "</span></span></p>\n"
+                body += "</span></span></h2>\n"
 
             elif item["type"] == "subtitle-level-2" or item["type"] == "subsubtitle":
 
+                if opened_list:
+                    body += "</ul>\n"
+                    opened_list = False
+
                 if subtitle_l6:
                     body += "</section>\n"
                     subtitle_l6 = False
@@ -283,6 +296,10 @@ def get_body_new(self, data):
 
             elif item["type"] == "subtitle-level-3" or item["type"] == "subsubsubtitle":
 
+                if opened_list:
+                    body += "</ul>\n"
+                    opened_list = False
+
                 if subtitle_l6:
                     body += "</section>\n"
                     subtitle_l6 = False
@@ -311,6 +328,10 @@ def get_body_new(self, data):
 
             elif item["type"] == "subtitle-level-4":
 
+                if opened_list:
+                    body += "</ul>\n"
+                    opened_list = False
+
                 if subtitle_l6:
                     body += "</section>\n"
                     subtitle_l6 = False
@@ -335,6 +356,10 @@ def get_body_new(self, data):
 
             elif item["type"] == "subtitle-level-5":
 
+                if opened_list:
+                    body += "</ul>\n"
+                    opened_list = False
+
                 if subtitle_l6:
                     body += "</section>\n"
                     subtitle_l6 = False
@@ -355,6 +380,10 @@ def get_body_new(self, data):
 
             elif item["type"] == "subtitle-level-6":
 
+                if opened_list:
+                    body += "</ul>\n"
+                    opened_list = False
+
                 if subtitle_l6:
                     body += "</section>\n"
 
@@ -369,19 +398,63 @@ def get_body_new(self, data):
                     body += "<bbox {bbox}>{text}</bbox>\n".format(bbox=bbox, text=text)
                 body += "</span></span></p>\n"
 
-            elif item["type"] in ["paragraph", "enumeration"] or item[
-                "type"
-            ].startswith("list-item-level-"):
+            elif item["type"] in ["paragraph"]:
 
-                body += '<p><span info="paragraph" style="{style}" {page}>\n'.format(
-                    page=page, style=style
-                )
-                for bbox, text in self.split_item_in_boxes(item):
-                    body += "<bbox {bbox}>{text}</bbox>\n".format(bbox=bbox, text=text)
-                body += "</span></p>\n"
+                if (
+                    "name" in item
+                    and item["name"].startswith("list-item")
+                    and opened_list
+                ):
+
+                    body += (
+                        '<li><span info="paragraph" style="{style}" {page}>\n'.format(
+                            page=page, style=style
+                        )
+                    )
+                    for bbox, text in self.split_item_in_boxes(item):
+                        body += "<bbox {bbox}>{text}</bbox>\n".format(
+                            bbox=bbox, text=text
+                        )
+                    body += "</span></li>\n"
+
+                elif "name" in item and item["name"].startswith("list-item"):
+
+                    body += "<ul>\n"
+                    opened_list = True
+
+                    body += (
+                        '<li><span info="paragraph" style="{style}" {page}>\n'.format(
+                            page=page, style=style
+                        )
+                    )
+                    for bbox, text in self.split_item_in_boxes(item):
+                        body += "<bbox {bbox}>{text}</bbox>\n".format(
+                            bbox=bbox, text=text
+                        )
+                    body += "</span></li>\n"
+
+                else:
+                    if opened_list:
+                        body += "</ul>\n"
+                        opened_list = False
+
+                    body += (
+                        '<p><span info="paragraph" style="{style}" {page}>\n'.format(
+                            page=page, style=style
+                        )
+                    )
+                    for bbox, text in self.split_item_in_boxes(item):
+                        body += "<bbox {bbox}>{text}</bbox>\n".format(
+                            bbox=bbox, text=text
+                        )
+                    body += "</span></p>\n"
 
             elif item["type"] == "caption":
 
+                if opened_list:
+                    body += "</ul>\n"
+                    opened_list = False
+
                 body += '<p><span info="caption" style="{style}" {page}>\n'.format(
                     page=page, style=style
                 )
@@ -391,13 +464,43 @@ def get_body_new(self, data):
 
             elif item["type"] == "footnote":
 
+                if opened_list:
+                    body += "</ul>\n"
+                    opened_list = False
+
                 body += '<p class="footnote" style="{style}" {page}>\n'.format(
                     page=page, style=style
                 )
                 for bbox, text in self.split_item_in_boxes(item):
                     body += "<bbox {bbox}>{text}</bbox>\n".format(bbox=bbox, text=text)
                 body += "</p>\n"
 
+            elif item["type"] == "page-footer":
+
+                if opened_list:
+                    body += "</ul>\n"
+                    opened_list = False
+
+                body += '<p class="page-footer" style="{style}" {page}>\n'.format(
+                    page=page, style=style
+                )
+                for bbox, text in self.split_item_in_boxes(item):
+                    body += "<bbox {bbox}>{text}</bbox>\n".format(bbox=bbox, text=text)
+                body += "</p>\n"
+
+            elif item["type"] == "page-header":
+
+                if opened_list:
+                    body += "</ul>\n"
+                    opened_list = False
+
+                body += '<p class="page-header" style="{style}" {page}>\n'.format(
+                    page=page, style=style
+                )
+                for bbox, text in self.split_item_in_boxes(item):
+                    body += "<bbox {bbox}>{text}</bbox>\n".format(bbox=bbox, text=text)
+                body += "</p>\n"
+
             # elif(item["type"].startswith("list-item-level-")):
             #
             #     body += self.write_enum(item)
@@ -408,11 +511,19 @@ def get_body_new(self, data):
 
             elif item["type"] == "table":
 
+                if opened_list:
+                    body += "</ul>\n"
+                    opened_list = False
+
                 body += self.write_table(item)
 
             ## By default dump everything containing text
             elif "text" in item:
 
+                if opened_list:
+                    body += "</ul>\n"
+                    opened_list = False
+
                 body += '<p><span info="{type}" style="{style}" {page}>\n'.format(
                     page=page, style=style, type=item["type"]
                 )

diff --git a/test/inputs/export_to_markdown.json → test/inputs/test_document.json b/test/inputs/export_to_markdown.json → test/inputs/test_document.json
diff --git a/test/test_documents_core_export.py b/test/test_documents_core_export.py
@@ -2,15 +2,29 @@
 import os
 from typing import Any
 
-from deepsearch.documents.core.export import export_to_markdown
+from deepsearch.documents.core.export import export_to_markdown, export_to_html
 
 def test_export_to_markdown():
     doc: dict[str, Any] | None = None
     root = os.path.dirname(os.path.abspath(__file__))
-    with open(os.path.join(root, "inputs", "export_to_markdown.json")) as file_doc:
+    with open(os.path.join(root, "inputs", "test_document.json")) as file_doc:
         doc = json.load(file_doc)
     assert doc
     assert isinstance(doc, dict)
 
     md = export_to_markdown(doc)
     assert md
+
+    """
+    with open(os.path.join(root, "inputs", "test_document.md"), "w") as fw:
+        fw.write(md)
+    """
+
+    html = export_to_html(doc)
+    assert html
+
+    """
+    with open(os.path.join(root, "inputs", "test_document.html"), "w") as fw:
+        fw.write(html)
+    """
+