diff --git a/docling_core/types/experimental/document.py b/docling_core/types/experimental/document.py index 6f53007..f426a8c 100644 --- a/docling_core/types/experimental/document.py +++ b/docling_core/types/experimental/document.py @@ -1,9 +1,11 @@ +import hashlib from typing import Any, Dict, List, Optional, Tuple, Union -from pydantic import AnyUrl, BaseModel, Field +from pydantic import AnyUrl, BaseModel, Field, computed_field, conint, ConfigDict from docling_core.types.experimental.base import BoundingBox, Size +Uint64 = conint(ge=0, le=(2**64 - 1)) class FigureData(BaseModel): # TBD pass @@ -12,10 +14,16 @@ class FigureData(BaseModel): # TBD class TableData(BaseModel): # TBD pass +class FileInfo(BaseModel): + document_hash: str class RefItem(BaseModel): cref: str = Field(alias="$ref") + model_config = ConfigDict( + populate_by_name=True, + ) + def resolve(self, doc: "DoclingDocument"): _, path, index_str = self.cref.split("/") index = int(index_str) @@ -35,34 +43,52 @@ class ProvenanceItem(BaseModel): bbox: BoundingBox charspan: Tuple[int, int] +class NodeItem(BaseModel): + parent: Optional[RefItem] = None + children: List[RefItem] = [] + +class GroupItem(NodeItem): + name: str -class DocItem(BaseModel): +class DocItem(NodeItem): dloc: str # format spec ({document_hash}{json-path}) - hash: int label: str - parent: Optional[RefItem] - children: List[RefItem] - prov: List[ProvenanceItem] + prov: List[ProvenanceItem] = [] + + @computed_field + @property + def hash(self) -> Uint64: # TODO align with hasher on deepsearch-glm + if not len(self.dloc): + return 0 + hash_object = hashlib.sha256(self.dloc.encode('utf-8')) + + # Convert the hash to an integer + hash_int = int.from_bytes(hash_object.digest(), 'big') + + # Mask it to fit within 64 bits + return Uint64(hash_int & 0xFFFFFFFFFFFFFFFF) # 64-bit unsigned integer mask + class TextItem(DocItem): orig: str # untreated representation text: str # sanitized representation +class Section(TextItem): + level: 
conint(ge=1, le=100) = 1 class FloatingItem(DocItem): - caption: Optional[Union[RefItem, TextItem]] - references: List[Union[RefItem, TextItem]] - footnotes: List[Union[RefItem, TextItem]] - data: Any - image: Optional[ImageRef] + caption: Optional[RefItem] = None + references: List[RefItem] = [] + footnotes: List[RefItem] = [] + image: Optional[ImageRef] = None -class FigureItem(DocItem): +class FigureItem(FloatingItem): data: FigureData -class TableItem(DocItem): +class TableItem(FloatingItem): data: TableData @@ -73,8 +99,8 @@ class KeyValueItem(DocItem): class DocumentContent(BaseModel): - furniture: List[RefItem] = [] - body: List[RefItem] = [] + furniture: GroupItem = GroupItem(name="_root_") # List[RefItem] = [] + body: GroupItem = GroupItem(name="_root_") # List[RefItem] = [] texts: List[TextItem] = [] figures: List[FigureItem] = [] tables: List[TableItem] = [] @@ -89,5 +115,85 @@ class PageItem(DocumentContent): class DoclingDocument(DocumentContent): description: Any - file_info: Any + file_info: FileInfo pages: Dict[int, PageItem] = {} # empty as default + + #def add_furniture_group(self, name: str): + # group = GroupItem(name=name) + # self.furniture.children.append(group) + # return group + + def add_group(self, name: str, parent: Optional[GroupItem] = None) -> GroupItem: + if not parent: + parent = self.body + + group = GroupItem(name=name) + parent.children.append(group) + return group + + def add_paragraph(self, label: str, text: str, orig: Optional[str] = None, prov: Optional[ProvenanceItem] = None, parent: Optional[GroupItem] = None, + item_cls=TextItem): + if not parent: + parent = self.body + + if not orig: + orig = text + + text_index = len(self.texts) + cref = f"#/texts/{text_index}" + dloc = f"{self.file_info.document_hash}{cref}" + text_item = item_cls(label=label, text=text, orig=orig, dloc=dloc) + if prov: + text_item.prov.append(prov) + + self.texts.append(text_item) + parent.children.append(RefItem(cref=cref)) + + return text_item + + 
def add_table(self, data: TableData, caption: Optional[RefItem] = None, prov: Optional[ProvenanceItem] = None, parent: Optional[GroupItem] = None): + if not parent: + parent = self.body + + table_index = len(self.tables) + cref = f"#/tables/{table_index}" + dloc = f"{self.file_info.document_hash}{cref}" + + tbl_item = TableItem(label="table", data=data, dloc=dloc) + if prov: + tbl_item.prov.append(prov) + if caption: + tbl_item.caption = caption + + self.tables.append(tbl_item) + parent.children.append(RefItem(cref=cref)) + + return tbl_item + + + def add_figure(self, data: FigureData, caption: Optional[RefItem] = None, prov: Optional[ProvenanceItem] = None, parent: Optional[GroupItem] = None): + if not parent: + parent = self.body + + figure_index = len(self.figures) + cref = f"#/figures/{figure_index}" + dloc = f"{self.file_info.document_hash}{cref}" + + fig_item = FigureItem(label="figure", data=data, dloc=dloc) + if prov: + fig_item.prov.append(prov) + if caption: + fig_item.caption = caption + + self.figures.append(fig_item) + parent.children.append(RefItem(cref=cref)) + + return fig_item + + + def add_heading(self, label: str, text: str, orig: Optional[str] = None, level: conint(ge=1, le=100) = 1, + prov: Optional[ProvenanceItem] = None, parent: Optional[GroupItem] = None): + item: Section = self.add_paragraph(label, text, orig, prov, parent, item_cls=Section) + item.level = level + return item + diff --git a/test/data/newdoc/dummy_doc.yaml b/test/data/newdoc/dummy_doc.yaml index 632c164..902df25 100644 --- a/test/data/newdoc/dummy_doc.yaml +++ b/test/data/newdoc/dummy_doc.yaml @@ -3,15 +3,21 @@ description: { } # DescriptionType - TBD file_info: # FileInfoType - TBD document_hash: e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5 + furniture: # Headers, footers, framing, navigation elements, all other non-body text - - $ref: "/texts/0" + name: "_root_" + parent: null + children: + - $ref: "/texts/0" body: # Top-level elements in other arrays, 
by-reference only, must not have parent. - - $ref: "/texts/1" - - $ref: "/figure/0" - - $ref: "/texts/2" - - $ref: "/texts/3" - - $ref: "/tables/0" + name: "_root_" + parent: null + children: + - $ref: "/texts/1" + - $ref: "/figure/0" + - $ref: "/texts/2" + - $ref: "/tables/0" texts: # All elements that have a text-string representation, with actual data - orig: "arXiv:2206.01062v1 [cs.CV] 2 Jun 2022" @@ -19,7 +25,8 @@ texts: # All elements that have a text-string representation, with actual data dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/texts/0" hash: 132103230 label: "page_header" - parent: null + parent: + $ref: "#/furniture" children: [] prov: - page_no: 1 @@ -34,7 +41,8 @@ texts: # All elements that have a text-string representation, with actual data dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/texts/1" hash: 2349732 # uint64 hash of dloc label: "title" - parent: null + parent: + $ref: "#/body" children: [ ] prov: # must exist, can be empty - page_no: 1 @@ -83,7 +91,8 @@ tables: # All tables... - dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/table/0" hash: 98574 label: "table" - parent: null + parent: + $ref: "#/body" children: [ ] caption: $ref: "/texts/3" @@ -117,7 +126,8 @@ figures: # All figures... 
- dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/figures/0" hash: 7782482 label: "figure" - parent: null + parent: + $ref: "#/body" caption: $ref: "/texts/2" references: diff --git a/test/test_docling_doc.py b/test/test_docling_doc.py index 74ca859..42de32e 100644 --- a/test/test_docling_doc.py +++ b/test/test_docling_doc.py @@ -1,7 +1,7 @@ import yaml import pytest from docling_core.types import DoclingDocument, BoundingBox -from docling_core.types.experimental.document import ProvenanceItem +from docling_core.types.experimental.document import ProvenanceItem, FileInfo def test_load_serialize_doc(): @@ -19,7 +19,7 @@ def test_load_serialize_doc(): text_item.prov[0].page_no # Objects that are references need explicit resolution for now: - obj = doc.body[2].resolve(doc=doc) # Text item with parent + obj = doc.body.children[2].resolve(doc=doc) # Text item with parent parent = obj.parent.resolve(doc=doc) # it is a figure obj2 = parent.children[0].resolve( @@ -38,26 +38,20 @@ def test_load_serialize_doc(): assert doc_reload is not doc # can't be identical def test_construct_doc(): - doc = DoclingDocument(description={}, file_info={}) - - # group, heading, paragraph, table, figure, title, list, provenance - doc.add_title() - doc.add_paragraph(text="Author 1\nAffiliation 1").add_provenance(ProvenanceItem(page_no=1, bbox=BoundingBox(t=12, l=5, r=230, b=40), charspan=(0,22))) - doc.add_paragraph(text="Author 2\nAffiliation 2") - - chapter1 = doc.add_group(name="Introduction") - chapter1.add_heading(text="1. Introduction", level=2) - chapter1.add_paragraph(text="This paper introduces the biggest invention ever made. ...") - mylist = chapter1.add_group() - mylist.add_item(text="Cooks your favourite meal before you know you want it.") - mylist.add_item(text="Cleans up all your dishes.") - mylist.add_item(text="Drains your bank account without consent.") - + # This code is purely imaginative. None of the APIs below exist yet. 
+ doc = DoclingDocument(description={}, file_info=FileInfo(document_hash="xyz")) - sec = doc.add_section(text="1. Introduction") - - list = sec.add_child(label="container") - list.add_child() - list.add_child() - + # group, heading, paragraph, table, figure, title, list, provenance + doc.add_paragraph(label="text", text="Author 1\nAffiliation 1") + doc.add_paragraph(label="text", text="Author 2\nAffiliation 2") + + chapter1 = doc.add_group(name="Introduction") # can be done if such information is present, or omitted. + doc.add_heading(parent=chapter1, label="section_header", text="1. Introduction", level=1) + doc.add_paragraph(parent=chapter1, label="text", text="This paper introduces the biggest invention ever made. ...") + mylist = doc.add_group(parent=chapter1, name="whateverlist") + doc.add_paragraph(parent=mylist, label="list_item", text="Cooks your favourite meal before you know you want it.") + doc.add_paragraph(parent=mylist, label="list_item", text="Cleans up all your dishes.") + doc.add_paragraph(parent=mylist, label="list_item", text="Drains your bank account without consent.") + + print(doc.model_dump(mode="json", by_alias=True)) \ No newline at end of file