Skip to content

Commit

Permalink
Updates for document construction API and format
Browse files Browse the repository at this point in the history
Signed-off-by: Christoph Auer <[email protected]>
  • Loading branch information
cau-git committed Sep 20, 2024
1 parent bdbd93e commit 9264b1b
Show file tree
Hide file tree
Showing 3 changed files with 182 additions and 48 deletions.
155 changes: 138 additions & 17 deletions docling_core/types/experimental/document.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import hashlib
from typing import Any, Dict, List, Optional, Tuple, Union

from pydantic import AnyUrl, BaseModel, Field
from pydantic import AnyUrl, BaseModel, Field, computed_field, conint, ConfigDict

from docling_core.types.experimental.base import BoundingBox, Size

# Constrained integer type covering the unsigned 64-bit range (0 .. 2**64 - 1).
Uint64 = conint(ge=0, le=(2**64 - 1))

# Placeholder payload model for figures; no fields are defined yet.
class FigureData(BaseModel):  # TBD
    pass
Expand All @@ -12,10 +14,16 @@ class FigureData(BaseModel): # TBD
# Placeholder payload model for tables; no fields are defined yet.
class TableData(BaseModel):  # TBD
    pass

# File-level metadata attached to a document.
class FileInfo(BaseModel):
    # Hash string of the source document; the add_* helpers on the document
    # prepend it to every item's `dloc`.
    document_hash: str

class RefItem(BaseModel):
cref: str = Field(alias="$ref")

model_config = ConfigDict(
populate_by_name=True,
)

def resolve(self, doc: "DoclingDocument"):
_, path, index_str = self.cref.split("/")
index = int(index_str)
Expand All @@ -35,34 +43,51 @@ class ProvenanceItem(BaseModel):
bbox: BoundingBox
charspan: Tuple[int, int]


class DocItem(BaseModel):
class NodeItem(BaseModel):
dloc: str # format spec ({document_hash}{json-path})
hash: int
parent: Optional[RefItem] = None
children: List[RefItem] = []
@computed_field
@property
def hash(self) -> Uint64:  # TODO align with hasher on deepsearch-glm
    # Unsigned 64-bit hash derived from `dloc`.
    # Returns 0 for an empty `dloc`; otherwise the low 64 bits of the
    # SHA-256 digest of the UTF-8 encoded `dloc`.
    if not self.dloc:
        return 0

    digest = hashlib.sha256(self.dloc.encode("utf-8")).digest()

    # The last 8 bytes of the big-endian digest are exactly the low 64 bits
    # of the full digest integer, so no explicit mask is required. A plain
    # int is returned: `Uint64` is a constrained-type annotation, not a
    # constructor.
    return int.from_bytes(digest[-8:], "big")

class DocItem(NodeItem):
label: str
parent: Optional[RefItem]
children: List[RefItem]
prov: List[ProvenanceItem]
prov: List[ProvenanceItem] = []


# A named node that groups other items through its `children` references.
class GroupItem(NodeItem):
    # Group name; the document's built-in furniture/body roots use "_root_".
    name: str


# Document item that carries a text string in two forms.
class TextItem(DocItem):
    orig: str  # untreated representation
    text: str  # sanitized representation

# Text item with a heading level.
class Section(TextItem):
    # Heading level, constrained to 1..100; defaults to 1.
    level: conint(ge=1, le=100) = 1

class FloatingItem(DocItem):
caption: Optional[Union[RefItem, TextItem]]
references: List[Union[RefItem, TextItem]]
footnotes: List[Union[RefItem, TextItem]]
data: Any
image: Optional[ImageRef]
caption: Optional[RefItem] = None
references: List[RefItem] = []
footnotes: List[RefItem] = []
image: Optional[ImageRef] = None


class FigureItem(DocItem):
class FigureItem(FloatingItem):
data: FigureData


class TableItem(DocItem):
class TableItem(FloatingItem):
data: TableData


Expand All @@ -73,8 +98,9 @@ class KeyValueItem(DocItem):


class DocumentContent(BaseModel):
furniture: List[RefItem] = []
body: List[RefItem] = []
furniture: GroupItem = GroupItem(name="_root_", dloc="#/furniture") # List[RefItem] = []
body: GroupItem = GroupItem(name="_root_", dloc="#/body") # List[RefItem] = []
groups: List[GroupItem] = []
texts: List[TextItem] = []
figures: List[FigureItem] = []
tables: List[TableItem] = []
Expand All @@ -89,5 +115,100 @@ class PageItem(DocumentContent):

class DoclingDocument(DocumentContent):
description: Any
file_info: Any
file_info: FileInfo
pages: Dict[int, PageItem] = {} # empty as default

#def add_furniture_group(self, name: str):
# group = GroupItem(name=name)
# self.furniture.children.append(group)
# return group
def resolve_cref(self, obj):
    """Return the in-document reference (``#/...``) of an item.

    The reference is the part of ``obj.dloc`` starting at the first ``#``,
    e.g. ``"<document_hash>#/groups/0"`` yields ``"#/groups/0"``. The leading
    ``#`` is kept so the result has the same shape as the ``#/body`` and
    ``#/groups/N`` references built elsewhere in this class.

    Args:
        obj: Any object exposing a ``dloc`` string attribute.

    Returns:
        The reference string, including the leading ``#``.

    Raises:
        IndexError: If ``obj.dloc`` contains no ``#``.
    """
    # Split only on the first '#' so the whole fragment is preserved.
    path = obj.dloc.split("#", 1)[1]
    return f"#{path}"

def add_group(self, name: str, parent: Optional[GroupItem] = None) -> GroupItem:
    """Create a group, register it in ``self.groups`` and link it to a parent.

    Args:
        name: Name of the new group.
        parent: Group to attach the new group to; ``self.body`` when omitted.

    Returns:
        The newly created group.
    """
    if parent:
        parent_cref = self.resolve_cref(parent)
    else:
        parent = self.body
        parent_cref = "#/body"

    # The new group's index is the current length of the groups list.
    cref = f"#/groups/{len(self.groups)}"

    new_group = GroupItem(
        name=name,
        dloc=f"{self.file_info.document_hash}{cref}",
        parent=RefItem(cref=parent_cref),
    )
    self.groups.append(new_group)
    parent.children.append(RefItem(cref=cref))

    return new_group

def add_paragraph(
    self,
    label: str,
    text: str,
    orig: Optional[str] = None,
    prov: Optional[ProvenanceItem] = None,
    parent: Optional[GroupItem] = None,
    item_cls=TextItem,
):
    """Create a text item, register it in ``self.texts`` and link it to a parent.

    Args:
        label: Label assigned to the item.
        text: Text stored as the item's ``text``.
        orig: Text stored as the item's ``orig``; falls back to ``text`` when
            falsy.
        prov: Optional provenance entry appended to the item's ``prov`` list.
        parent: Group to attach the item to; ``self.body`` when omitted.
        item_cls: Class used to build the item (``TextItem`` by default).

    Returns:
        The newly created item.
    """
    if parent:
        parent_cref = self.resolve_cref(parent)
    else:
        parent, parent_cref = self.body, "#/body"

    # The new item's index is the current length of the texts list.
    cref = f"#/texts/{len(self.texts)}"

    new_item = item_cls(
        label=label,
        text=text,
        orig=orig or text,
        dloc=f"{self.file_info.document_hash}{cref}",
        parent=RefItem(cref=parent_cref),
    )
    if prov:
        new_item.prov.append(prov)

    self.texts.append(new_item)
    parent.children.append(RefItem(cref=cref))

    return new_item

def add_table(
    self,
    data: TableData,
    caption: Optional[RefItem] = None,
    prov: Optional[ProvenanceItem] = None,
    parent: Optional[GroupItem] = None,
):
    """Create a table item, register it in ``self.tables`` and link it to a parent.

    Args:
        data: Table payload.
        caption: Optional reference stored as the table's caption.
        prov: Optional provenance entry appended to the table's ``prov`` list.
        parent: Group to attach the table to; ``self.body`` when omitted.

    Returns:
        The newly created table item.
    """
    if parent is None:
        parent = self.body
        parent_cref = "#/body"
    else:
        parent_cref = self.resolve_cref(parent)

    table_index = len(self.tables)
    cref = f"#/tables/{table_index}"
    dloc = f"{self.file_info.document_hash}{cref}"

    # The item's `parent` field holds a reference, not the group object, so
    # wrap the parent's cref in a RefItem (as add_group/add_paragraph do).
    tbl_item = TableItem(
        label="table",
        data=data,
        dloc=dloc,
        parent=RefItem(cref=parent_cref),
    )
    if prov:
        tbl_item.prov.append(prov)
    if caption:
        tbl_item.caption = caption

    self.tables.append(tbl_item)
    parent.children.append(RefItem(cref=cref))

    return tbl_item


def add_figure(
    self,
    data: FigureData,
    caption: Optional[RefItem] = None,
    prov: Optional[ProvenanceItem] = None,
    parent: Optional[GroupItem] = None,
):
    """Create a figure item, register it in ``self.figures`` and link it to a parent.

    Args:
        data: Figure payload.
        caption: Optional reference stored as the figure's caption.
        prov: Optional provenance entry appended to the figure's ``prov`` list.
        parent: Group to attach the figure to; ``self.body`` when omitted.

    Returns:
        The newly created figure item.
    """
    if parent is None:
        parent = self.body
        parent_cref = "#/body"
    else:
        parent_cref = self.resolve_cref(parent)

    figure_index = len(self.figures)
    cref = f"#/figures/{figure_index}"
    dloc = f"{self.file_info.document_hash}{cref}"

    # The item's `parent` field holds a reference, not the group object, so
    # wrap the parent's cref in a RefItem (as add_group/add_paragraph do).
    fig_item = FigureItem(
        label="figure",
        data=data,
        dloc=dloc,
        parent=RefItem(cref=parent_cref),
    )
    if prov:
        fig_item.prov.append(prov)
    if caption:
        fig_item.caption = caption

    self.figures.append(fig_item)
    parent.children.append(RefItem(cref=cref))

    return fig_item


def add_heading(
    self,
    label: str,
    text: str,
    orig: Optional[str] = None,
    level: conint(ge=1, le=100) = 1,
    prov: Optional[ProvenanceItem] = None,
    parent: Optional[GroupItem] = None,
):
    """Add a ``Section`` item via ``add_paragraph`` and set its heading level.

    Args:
        label: Label assigned to the item.
        text: Heading text.
        orig: Passed through to ``add_paragraph``.
        level: Heading level assigned to the created item.
        prov: Passed through to ``add_paragraph``.
        parent: Passed through to ``add_paragraph``.

    Returns:
        The newly created ``Section`` item.
    """
    section: Section = self.add_paragraph(
        label=label,
        text=text,
        orig=orig,
        prov=prov,
        parent=parent,
        item_cls=Section,
    )
    # The level is assigned after construction because add_paragraph does not
    # forward extra fields to the item class.
    section.level = level
    return section

38 changes: 26 additions & 12 deletions test/data/newdoc/dummy_doc.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,23 +3,34 @@
description: { } # DescriptionType - TBD
file_info: # FileInfoType - TBD
document_hash: e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5
furniture: # Headers, footers, framing, navigation elements, all other non-body text
- $ref: "/texts/0"

body: # Top-level elements in other arrays, by-reference only, must not have parent.
- $ref: "/texts/1"
- $ref: "/figure/0"
- $ref: "/texts/2"
- $ref: "/texts/3"
- $ref: "/tables/0"
furniture: # Top-level element for any headers, footers, framing, navigation elements, and all other non-body text
name: "_root_"
dloc: "#/furniture"
parent: null
children:
- $ref: "/texts/0"

body: # Top-level element for anything in the document body
name: "_root_"
dloc: "#/body"
parent: null
children:
- $ref: "/texts/1"
- $ref: "/figure/0"
- $ref: "/texts/2"
- $ref: "/tables/0"

groups: [] # Any group that is nested deeper in either body or furniture children

texts: # All elements that have a text-string representation, with actual data
- orig: "arXiv:2206.01062v1 [cs.CV] 2 Jun 2022"
text: "arXiv:2206.01062v1 [cs.CV] 2 Jun 2022"
dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/texts/0"
hash: 132103230
label: "page_header"
parent: null
parent:
$ref: "#/furniture"
children: []
prov:
- page_no: 1
Expand All @@ -34,7 +45,8 @@ texts: # All elements that have a text-string representation, with actual data
dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/texts/1"
hash: 2349732 # uint64 hash of dloc
label: "title"
parent: null
parent:
$ref: "#/body"
children: [ ]
prov: # must exist, can be empty
- page_no: 1
Expand Down Expand Up @@ -83,7 +95,8 @@ tables: # All tables...
- dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/table/0"
hash: 98574
label: "table"
parent: null
parent:
$ref: "#/body"
children: [ ]
caption:
$ref: "/texts/3"
Expand Down Expand Up @@ -117,7 +130,8 @@ figures: # All figures...
- dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/figures/0"
hash: 7782482
label: "figure"
parent: null
parent:
$ref: "#/body"
caption:
$ref: "/texts/2"
references:
Expand Down
37 changes: 18 additions & 19 deletions test/test_docling_doc.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import yaml
import pytest
from docling_core.types import DoclingDocument, BoundingBox
from docling_core.types.experimental.document import ProvenanceItem
from docling_core.types.experimental.document import ProvenanceItem, FileInfo


def test_load_serialize_doc():
Expand All @@ -19,7 +19,7 @@ def test_load_serialize_doc():
text_item.prov[0].page_no

# Objects that are references need explicit resolution for now:
obj = doc.body[2].resolve(doc=doc) # Text item with parent
obj = doc.body.children[2].resolve(doc=doc) # Text item with parent
parent = obj.parent.resolve(doc=doc) # it is a figure

obj2 = parent.children[0].resolve(
Expand All @@ -38,26 +38,25 @@ def test_load_serialize_doc():
assert doc_reload is not doc # can't be identical

def test_construct_doc():
doc = DoclingDocument(description={}, file_info={})

# group, heading, paragraph, table, figure, title, list, provenance
doc.add_title()
doc.add_paragraph(text="Author 1\nAffiliation 1").add_provenance(ProvenanceItem(page_no=1, bbox=BoundingBox(t=12, l=5, r=230, b=40), charspan=(0,22)))
doc.add_paragraph(text="Author 2\nAffiliation 2")

chapter1 = doc.add_group(name="Introduction")
chapter1.add_heading(text="1. Introduction", level=2)
chapter1.add_paragraph(text="This paper introduces the biggest invention ever made. ...")
mylist = chapter1.add_group()
mylist.add_item(text="Cooks your favourite meal before you know you want it.")
mylist.add_item(text="Cleans up all your dishes.")
mylist.add_item(text="Drains your bank account without consent.")
doc = DoclingDocument(description={}, file_info=FileInfo(document_hash="xyz"))

# group, heading, paragraph, table, figure, title, list, provenance
doc.add_paragraph(label="text", text="Author 1\nAffiliation 1")
doc.add_paragraph(label="text", text="Author 2\nAffiliation 2")

chapter1 = doc.add_group(name="Introduction") # can be done if such information is present, or omitted.
doc.add_heading(parent=chapter1, label="section_header", text="1. Introduction", level=1)
doc.add_paragraph(parent=chapter1, label="text", text="This paper introduces the biggest invention ever made. ...")
mylist = doc.add_group(parent=chapter1, name="whateverlist")
doc.add_paragraph(parent=mylist, label="list_item", text="Cooks your favourite meal before you know you want it.")
doc.add_paragraph(parent=mylist, label="list_item", text="Cleans up all your dishes.")
doc.add_paragraph(parent=mylist, label="list_item", text="Drains your bank account without consent.")

sec = doc.add_section(text="1. Introduction")
yaml_dump = yaml.safe_dump(
doc.model_dump(mode="json", by_alias=True))

list = sec.add_child(label="container")
list.add_child()
list.add_child()
print(f"\n\n{yaml_dump}")

restored_doc = DoclingDocument.model_validate(
yaml.safe_load(yaml_dump))

0 comments on commit 9264b1b

Please sign in to comment.