Skip to content

Commit

Permalink
updated code and tests, need to decide what to commit as test-cases
Browse files Browse the repository at this point in the history
Signed-off-by: Peter Staar <[email protected]>
  • Loading branch information
PeterStaar-IBM committed Sep 19, 2024
1 parent 2bc04eb commit 5266bab
Show file tree
Hide file tree
Showing 4 changed files with 346 additions and 36 deletions.
207 changes: 190 additions & 17 deletions docling_core/types/doc/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,40 +253,105 @@ def export_to_html(self) -> str:
def export_to_document_tokens(
    self,
    new_line: str = "\n",
    page_w: float = 0.0,
    page_h: float = 0.0,
    xsize: int = 100,
    ysize: int = 100,
    add_caption: bool = True,
    add_table_location: bool = True,
    add_cell_location: bool = True,
    add_cell_label: bool = True,
    add_cell_text: bool = True,
    page_tagging: bool = True,
):
    """Export table to document tokens format.

    The output is the table begin token, an optional location token for the
    table, an optional caption, one ``<row_i>...</row_i>`` group per data
    row (each cell wrapped in ``<col_j>...</col_j>``), and the table end
    token.

    Args:
        new_line: Separator appended after each top-level piece of output.
        page_w: Page width forwarded to ``DocumentToken.get_location``.
        page_h: Page height forwarded to ``DocumentToken.get_location``.
        xsize: Forwarded to ``DocumentToken.get_location``.
        ysize: Forwarded to ``DocumentToken.get_location``.
        add_caption: Emit ``self.text`` as a caption when it is non-empty.
        add_table_location: Emit a location token for the table itself
            (requires a non-empty ``self.prov``).
        add_cell_location: Emit a location token inside each cell that has
            a ``bbox``.
        add_cell_label: Emit ``<{col.obj_type}>`` inside each cell that has
            a non-empty ``obj_type``.
        add_cell_text: Emit the stripped cell text.
        page_tagging: When true, locations carry the page of the first
            provenance entry; otherwise ``page_i=-1`` is passed.

    Returns:
        The serialized table as a single string.
    """

    def locate(bbox, page_i):
        # Single place where the page/grid parameters are forwarded.
        return DocumentToken.get_location(
            bbox=bbox,
            page_w=page_w,
            page_h=page_h,
            xsize=xsize,
            ysize=ysize,
            page_i=page_i,
        )

    has_prov = self.prov is not None and len(self.prov) > 0

    parts = [f"{DocumentToken.BEG_TABLE.value}{new_line}"]

    if add_table_location and has_prov:
        # Only the first provenance entry is used for the table location.
        page_i = self.prov[0].page if page_tagging else -1
        parts.append(f"{locate(self.prov[0].bbox, page_i)}{new_line}")

    if add_caption and self.text is not None and len(self.text) > 0:
        parts.append(
            f"{DocumentToken.BEG_CAPTION.value}{self.text.strip()}"
            f"{DocumentToken.END_CAPTION.value}{new_line}"
        )

    if self.data is not None and len(self.data) > 0:
        for i, row in enumerate(self.data):
            parts.append(f"<row_{i}>")
            for j, col in enumerate(row):
                text = col.text.strip() if add_cell_text else ""

                cell_loc = ""
                if col.bbox is not None and add_cell_location:
                    if not page_tagging:
                        cell_loc = locate(col.bbox, -1)
                    elif has_prov:
                        # A page-tagged cell location needs the table's
                        # provenance to know which page it is on.
                        cell_loc = locate(col.bbox, self.prov[0].page)

                cell_label = ""
                if (
                    add_cell_label
                    and col.obj_type is not None
                    and len(col.obj_type) > 0
                ):
                    cell_label = f"<{col.obj_type}>"

                parts.append(
                    f"<col_{j}>{cell_loc}{cell_label}{text}</col_{j}>"
                )

            parts.append(f"</row_{i}>{new_line}")

    # Close with the END token (not a second BEG token).
    parts.append(f"{DocumentToken.END_TABLE.value}{new_line}")

    return "".join(parts)

Expand All @@ -295,6 +360,59 @@ def export_to_document_tokens(
class Figure(BaseCell):
    """Figure."""

    def export_to_document_tokens(
        self,
        new_line: str = "\n",
        page_w: float = 0.0,
        page_h: float = 0.0,
        xsize: int = 100,
        ysize: int = 100,
        add_caption: bool = True,
        add_figure_location: bool = True,
        page_tagging: bool = True,
    ):
        """Export figure to document tokens format.

        Emits the figure begin token, an optional location token built from
        the first provenance entry, an optional caption taken from
        ``self.text``, and the figure end token. Each piece is followed by
        ``new_line``.

        Args:
            new_line: Separator appended after each emitted piece.
            page_w: Page width forwarded to ``DocumentToken.get_location``.
            page_h: Page height forwarded to ``DocumentToken.get_location``.
            xsize: Forwarded to ``DocumentToken.get_location``.
            ysize: Forwarded to ``DocumentToken.get_location``.
            add_caption: Emit the stripped ``self.text`` as a caption when
                it is non-empty.
            add_figure_location: Emit a location token when ``self.prov``
                is non-empty.
            page_tagging: When true the page of the first provenance entry
                is passed as ``page_i``; otherwise ``-1`` is passed.

        Returns:
            The serialized figure as a single string.
        """
        pieces = [f"{DocumentToken.BEG_FIGURE.value}{new_line}"]

        if (
            add_figure_location
            and self.prov is not None
            and len(self.prov) > 0
        ):
            first_prov = self.prov[0]
            location = DocumentToken.get_location(
                bbox=first_prov.bbox,
                page_w=page_w,
                page_h=page_h,
                xsize=xsize,
                ysize=ysize,
                # The page is only read when page tagging is requested.
                page_i=first_prov.page if page_tagging else -1,
            )
            pieces.append(f"{location}{new_line}")

        caption = self.text
        if add_caption and caption is not None and len(caption) > 0:
            pieces.append(
                f"{DocumentToken.BEG_CAPTION.value}{caption.strip()}{DocumentToken.END_CAPTION.value}{new_line}"
            )

        pieces.append(f"{DocumentToken.END_FIGURE.value}{new_line}")

        return "".join(pieces)


class BaseText(AliasModel):
"""Base model for text objects."""
Expand All @@ -311,6 +429,61 @@ class BaseText(AliasModel):
font: Optional[str] = None
prov: Optional[list[Prov]] = None

def export_to_document_tokens(
    self,
    new_line: str = "\n",
    page_w: float = 0.0,
    page_h: float = 0.0,
    xsize: int = 100,
    ysize: int = 100,
    add_location: bool = True,
    page_tagging: bool = True,
):
    """Export text element to document tokens format.

    The output has the form ``<obj_type>{location}{text}</obj_type>``
    followed by ``new_line``, where the location part is optional.

    Args:
        new_line: Separator appended after the closing tag.
        page_w: Page width forwarded to ``DocumentToken.get_location``.
        page_h: Page height forwarded to ``DocumentToken.get_location``.
        xsize: Forwarded to ``DocumentToken.get_location``.
        ysize: Forwarded to ``DocumentToken.get_location``.
        add_location: Emit a location token when ``self.prov`` is
            non-empty.
        page_tagging: When true the page of the first provenance entry is
            passed as ``page_i``; otherwise ``-1`` is passed.

    Returns:
        The serialized text element as a single string.

    Raises:
        AssertionError: If ``DocumentToken.is_known_token`` rejects the
            element's ``obj_type``.
    """
    # The element type and text live on the instance; bare `obj_type` /
    # `text` are not defined in the method scope and raise NameError.
    obj_type = self.obj_type

    assert DocumentToken.is_known_token(
        obj_type
    ), f"failed DocumentToken.is_known_token({obj_type})"

    body = f"<{obj_type}>"

    if add_location and self.prov is not None and len(self.prov) > 0:
        # Only the first provenance entry is used for the location.
        first_prov = self.prov[0]
        loc_str = DocumentToken.get_location(
            bbox=first_prov.bbox,
            page_w=page_w,
            page_h=page_h,
            xsize=xsize,
            ysize=ysize,
            page_i=first_prov.page if page_tagging else -1,
        )
        body += f"{loc_str}"

    body += self.text.strip()

    body += f"</{obj_type}>{new_line}"

    return body


class ListItem(BaseText):
"""List item."""
Expand Down
64 changes: 60 additions & 4 deletions docling_core/types/doc/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -424,6 +424,16 @@ def _resolve_ref(self, item: Ref) -> Optional[Union[BaseCell, BaseText]]:

return result

def get_map_to_page_dimensions(self) -> dict:
    """Return a mapping from page identifier to ``[width, height]``.

    Each entry of ``self.page_dimensions`` contributes one key (its
    ``page`` attribute) whose value is a two-element list of its ``width``
    and ``height``. If several entries share the same ``page``, the last
    one wins.

    Returns:
        The page-to-dimensions dict; empty when ``self.page_dimensions``
        is ``None`` or has no entries.
    """
    if self.page_dimensions is None:
        return {}

    return {dim.page: [dim.width, dim.height] for dim in self.page_dimensions}

def export_to_markdown(
self,
delim: str = "\n\n",
Expand Down Expand Up @@ -584,12 +594,14 @@ def export_to_document_tokens(
Returns:
str: The content of the document formatted as an XML string.
"""
xml_str = DocumentToken.BEG_DOCUMENT.value

new_line = ""
if add_new_line:
new_line = "\n"

xml_str = f"{DocumentToken.BEG_DOCUMENT.value}{new_line}"

# pagedims = self.get_map_to_page_dimensions()

if self.main_text is not None:
for orig_item in self.main_text[main_text_start:main_text_stop]:

Expand All @@ -605,19 +617,23 @@ def export_to_document_tokens(
prov = item.prov

loc_str = "" # default is zero
page_w = 0.0
page_h = 0.0

if (
location_tagging
and self.page_dimensions is not None
and prov is not None
and len(prov) > 0
):

page = prov[0].page
prov[0].page
page_dim = self.page_dimensions[page - 1]

page_w = float(page_dim.width)
page_h = float(page_dim.height)

"""
x0 = float(prov[0].bbox[0]) / float(page_w)
y0 = float(prov[0].bbox[1]) / float(page_h)
x1 = float(prov[0].bbox[2]) / float(page_w)
Expand Down Expand Up @@ -645,15 +661,41 @@ def export_to_document_tokens(
loc_str += f"{page_tok}"
loc_str += f"{x0_tok}{y0_tok}{x1_tok}{y1_tok}"
loc_str += f"{DocumentToken.END_LOCATION.value}"
"""

item_type = item.obj_type
if isinstance(item, BaseText) and (item_type in main_text_labels):
text = item.text

xml_str += item.export_to_document_tokens(
new_line=new_line,
page_w=page_w,
page_h=page_h,
xsize=location_dimensions[0],
ysize=location_dimensions[1],
)

"""
text = item.text
xml_str += f"<{item_type}>{loc_str}{text}</{item_type}>{new_line}"
"""

elif isinstance(item, Table) and (item_type in main_text_labels):

xml_str += item.export_to_document_tokens(
new_line=new_line,
page_w=page_w,
page_h=page_h,
xsize=location_dimensions[0],
ysize=location_dimensions[1],
add_caption=True,
add_table_location=True,
add_cell_location=False,
add_cell_label=True,
add_cell_text=True,
page_tagging=page_tagging,
)

"""
xml_str += f"<{item_type}>{loc_str}"
if item.text is not None and len(item.text) > 0:
Expand All @@ -672,9 +714,22 @@ def export_to_document_tokens(
xml_str += f"</row_{i}>{new_line}"
xml_str += f"</{item_type}>{new_line}"
"""

elif isinstance(item, Figure) and (item_type in main_text_labels):

xml_str += item.export_to_document_tokens(
new_line=new_line,
page_w=page_w,
page_h=page_h,
xsize=location_dimensions[0],
ysize=location_dimensions[1],
add_caption=True,
add_figure_location=True,
page_tagging=page_tagging,
)

"""
xml_str += f"<{item_type}>{loc_str}"
if item.text is not None and len(item.text) > 0:
Expand All @@ -684,6 +739,7 @@ def export_to_document_tokens(
)
xml_str += f"</{item_type}>{new_line}"
"""

xml_str += DocumentToken.END_DOCUMENT.value

Expand Down
Loading

0 comments on commit 5266bab

Please sign in to comment.