Skip to content

Commit

Permalink
reformatted and fixed bugs
Browse files Browse the repository at this point in the history
Signed-off-by: Peter Staar <[email protected]>
  • Loading branch information
PeterStaar-IBM committed Sep 18, 2024
1 parent 17f54bf commit 5237758
Show file tree
Hide file tree
Showing 4 changed files with 12 additions and 17 deletions.
15 changes: 7 additions & 8 deletions docling_core/types/doc/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from pydantic import BaseModel, Field, PositiveInt, StrictStr

from docling_core.search.mapping import es_field
from docling_core.types.doc.tokens import DocumentToken
from docling_core.utils.alias import AliasModel

CellData = tuple[float, float, float, float, str, str]
Expand Down Expand Up @@ -249,31 +250,29 @@ def export_to_html(self) -> str:

return body

def export_to_document_tokens(
    self, new_line: str = "\n", loc_str: str = ""
) -> str:
    """Export table to document tokens format.

    Args:
        new_line: Separator appended after the caption, after every row and
            after the closing table token.
        loc_str: Pre-rendered location string placed directly after the
            opening table token.

    Returns:
        The table serialised as one string: opening table token, optional
        caption, one ``<row_i>`` element per row containing ``<col_j>``
        elements with each cell's text, and the closing table token.
    """
    parts = [f"{DocumentToken.BEG_TABLE.value}{loc_str}"]

    # The caption is optional; emit it exactly once and only when non-empty.
    if self.text:
        parts.append(
            f"{DocumentToken.BEG_CAPTION.value}"
            f"{self.text}{DocumentToken.END_CAPTION.value}{new_line}"
        )

    if self.data is not None:
        for i, row in enumerate(self.data):
            cells = "".join(
                f"<col_{j}>{col.text}</col_{j}>" for j, col in enumerate(row)
            )
            parts.append(f"<row_{i}>{cells}</row_{i}>{new_line}")

    # Close the table with the END token; the previous code emitted
    # BEG_TABLE a second time, leaving the table element unterminated.
    # NOTE(review): assumes DocumentToken defines END_TABLE alongside
    # BEG_TABLE (mirroring BEG_CAPTION/END_CAPTION) - confirm in tokens.py.
    parts.append(f"{DocumentToken.END_TABLE.value}{new_line}")

    return "".join(parts)


# FIXME: let's add some figure specific data-types later
class Figure(BaseCell):
Expand Down
7 changes: 1 addition & 6 deletions docling_core/types/doc/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,8 @@
"""Models for the Docling Document data type."""

from datetime import datetime
from enum import Enum
from typing import Generic, Optional, Tuple, Union



from pydantic import (
AnyHttpUrl,
BaseModel,
Expand All @@ -34,9 +31,6 @@
LanguageT,
Log,
)

from docling_core.types.doc.tokens import DocumentToken

from docling_core.types.doc.base import (
BaseCell,
BaseText,
Expand All @@ -48,6 +42,7 @@
S3Data,
Table,
)
from docling_core.types.doc.tokens import DocumentToken
from docling_core.utils.alias import AliasModel


Expand Down
6 changes: 3 additions & 3 deletions docling_core/types/doc/tokens.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
"""Tokens used in the docling document model."""

from enum import Enum
from typing import Tuple


class DocumentToken(Enum):
"""Class to represent an LLM friendly representation of a Document."""
Expand Down Expand Up @@ -104,7 +106,7 @@ def get_col_token(col: int, beg=bool) -> str:
return f"<col_{col}>"
else:
return f"</col_{col}>"

@staticmethod
def get_page_token(page: int):
"""Function to get page tokens."""
Expand All @@ -122,5 +124,3 @@ def get_location_token(val: float, rnorm: int = 100):
return f"<loc_{rnorm}>"

return f"<loc_{val_}>"


1 change: 1 addition & 0 deletions docling_core/types/rec/statement.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from docling_core.types.rec.attribute import Attribute
from docling_core.types.rec.subject import Subject


class StatementToken(Enum):
"""Class to represent an LLM friendly representation of statements."""

Expand Down

0 comments on commit 5237758

Please sign in to comment.