Skip to content

Commit

Permalink
cleaned up the HTML output
Browse files Browse the repository at this point in the history
Signed-off-by: Peter Staar <[email protected]>
  • Loading branch information
PeterStaar-IBM committed Jun 6, 2024
1 parent 863626e commit 5a724f2
Show file tree
Hide file tree
Showing 3 changed files with 138 additions and 13 deletions.
133 changes: 122 additions & 11 deletions deepsearch/documents/core/export.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,17 +204,26 @@ def get_body_new(self, data):
subenumeration = False
subsubenumeration = False

opened_list = False

# loop through content
for item in data["main-text"]:

# if this is a reference, get it
if "$ref" in item:
item = self.get_refs(item["$ref"])
if "__ref" in item:
item = self.get_refs(item["__ref"])

page = self.get_page(item)
style = self.get_style(item)

if item["type"] == "subtitle-level-1" or item["type"] == "subtitle":

if opened_list:
body += "</ul>\n"
opened_list = False

if subtitle_l6:
body += "</section>\n"
subtitle_l6 = False
Expand Down Expand Up @@ -242,15 +251,19 @@ def get_body_new(self, data):

body += "<section>\n"

body += '<p><span class="title" info="subtitle_l1" {page}><span style="{style}">'.format(
body += '<h2><span class="title" info="subtitle_l1" {page}><span style="{style}">'.format(
page=page, style=style
)
for bbox, text in self.split_item_in_boxes(item):
body += "<bbox {bbox}>{text}</bbox>\n".format(bbox=bbox, text=text)
body += "</span></span></p>\n"
body += "</span></span></h2>\n"

elif item["type"] == "subtitle-level-2" or item["type"] == "subsubtitle":

if opened_list:
body += "</ul>\n"
opened_list = False

if subtitle_l6:
body += "</section>\n"
subtitle_l6 = False
Expand Down Expand Up @@ -283,6 +296,10 @@ def get_body_new(self, data):

elif item["type"] == "subtitle-level-3" or item["type"] == "subsubsubtitle":

if opened_list:
body += "</ul>\n"
opened_list = False

if subtitle_l6:
body += "</section>\n"
subtitle_l6 = False
Expand Down Expand Up @@ -311,6 +328,10 @@ def get_body_new(self, data):

elif item["type"] == "subtitle-level-4":

if opened_list:
body += "</ul>\n"
opened_list = False

if subtitle_l6:
body += "</section>\n"
subtitle_l6 = False
Expand All @@ -335,6 +356,10 @@ def get_body_new(self, data):

elif item["type"] == "subtitle-level-5":

if opened_list:
body += "</ul>\n"
opened_list = False

if subtitle_l6:
body += "</section>\n"
subtitle_l6 = False
Expand All @@ -355,6 +380,10 @@ def get_body_new(self, data):

elif item["type"] == "subtitle-level-6":

if opened_list:
body += "</ul>\n"
opened_list = False

if subtitle_l6:
body += "</section>\n"

Expand All @@ -369,19 +398,63 @@ def get_body_new(self, data):
body += "<bbox {bbox}>{text}</bbox>\n".format(bbox=bbox, text=text)
body += "</span></span></p>\n"

elif item["type"] in ["paragraph", "enumeration"] or item[
"type"
].startswith("list-item-level-"):
elif item["type"] in ["paragraph"]:

body += '<p><span info="paragraph" style="{style}" {page}>\n'.format(
page=page, style=style
)
for bbox, text in self.split_item_in_boxes(item):
body += "<bbox {bbox}>{text}</bbox>\n".format(bbox=bbox, text=text)
body += "</span></p>\n"
if (
"name" in item
and item["name"].startswith("list-item")
and opened_list
):

body += (
'<li><span info="paragraph" style="{style}" {page}>\n'.format(
page=page, style=style
)
)
for bbox, text in self.split_item_in_boxes(item):
body += "<bbox {bbox}>{text}</bbox>\n".format(
bbox=bbox, text=text
)
body += "</span></li>\n"

elif "name" in item and item["name"].startswith("list-item"):

body += "<ul>\n"
opened_list = True

body += (
'<li><span info="paragraph" style="{style}" {page}>\n'.format(
page=page, style=style
)
)
for bbox, text in self.split_item_in_boxes(item):
body += "<bbox {bbox}>{text}</bbox>\n".format(
bbox=bbox, text=text
)
body += "</span></li>\n"

else:
if opened_list:
body += "</ul>\n"
opened_list = False

body += (
'<p><span info="paragraph" style="{style}" {page}>\n'.format(
page=page, style=style
)
)
for bbox, text in self.split_item_in_boxes(item):
body += "<bbox {bbox}>{text}</bbox>\n".format(
bbox=bbox, text=text
)
body += "</span></p>\n"

elif item["type"] == "caption":

if opened_list:
body += "</ul>\n"
opened_list = False

body += '<p><span info="caption" style="{style}" {page}>\n'.format(
page=page, style=style
)
Expand All @@ -391,13 +464,43 @@ def get_body_new(self, data):

elif item["type"] == "footnote":

if opened_list:
body += "</ul>\n"
opened_list = False

body += '<p class="footnote" style="{style}" {page}>\n'.format(
page=page, style=style
)
for bbox, text in self.split_item_in_boxes(item):
body += "<bbox {bbox}>{text}</bbox>\n".format(bbox=bbox, text=text)
body += "</p>\n"

elif item["type"] == "page-footer":

if opened_list:
body += "</ul>\n"
opened_list = False

body += '<p class="page-footer" style="{style}" {page}>\n'.format(
page=page, style=style
)
for bbox, text in self.split_item_in_boxes(item):
body += "<bbox {bbox}>{text}</bbox>\n".format(bbox=bbox, text=text)
body += "</p>\n"

elif item["type"] == "page-header":

if opened_list:
body += "</ul>\n"
opened_list = False

body += '<p class="page-header" style="{style}" {page}>\n'.format(
page=page, style=style
)
for bbox, text in self.split_item_in_boxes(item):
body += "<bbox {bbox}>{text}</bbox>\n".format(bbox=bbox, text=text)
body += "</p>\n"

# elif(item["type"].startswith("list-item-level-")):
#
# body += self.write_enum(item)
Expand All @@ -408,11 +511,19 @@ def get_body_new(self, data):

elif item["type"] == "table":

if opened_list:
body += "</ul>\n"
opened_list = False

body += self.write_table(item)

## By default dump everything containing text
elif "text" in item:

if opened_list:
body += "</ul>\n"
opened_list = False

body += '<p><span info="{type}" style="{style}" {page}>\n'.format(
page=page, style=style, type=item["type"]
)
Expand Down
File renamed without changes.
18 changes: 16 additions & 2 deletions test/test_documents_core_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,29 @@
import os
from typing import Any

from deepsearch.documents.core.export import export_to_markdown
from deepsearch.documents.core.export import export_to_markdown, export_to_html

def test_export_to_markdown():
    """Smoke-test the Markdown and HTML exporters on the sample document.

    Loads ``inputs/test_document.json`` (located next to this test file),
    checks that it parses to a non-empty ``dict``, and then asserts that
    both ``export_to_markdown`` and ``export_to_html`` return a truthy
    (non-empty) result for it.
    """
    doc: dict[str, Any] | None = None
    root = os.path.dirname(os.path.abspath(__file__))
    fixture = os.path.join(root, "inputs", "test_document.json")

    # JSON is UTF-8 by specification; pin the encoding so the test does not
    # depend on the platform's default locale.
    with open(fixture, encoding="utf-8") as file_doc:
        doc = json.load(file_doc)
    assert doc
    assert isinstance(doc, dict)

    md = export_to_markdown(doc)
    assert md

    html = export_to_html(doc)
    assert html

    # NOTE: to inspect the exporter output by hand, write ``md`` to
    # inputs/test_document.md and ``html`` to inputs/test_document.html.
    # This is intentionally not done during the test run.

0 comments on commit 5a724f2

Please sign in to comment.