Updates for document construction API and format

Signed-off-by: Christoph Auer <[email protected]>
DS4SD · Sep 20, 2024 · 9264b1b · 9264b1b
1 parent bdbd93e
commit 9264b1b
Show file tree

Hide file tree

Showing 3 changed files with 182 additions and 48 deletions.
diff --git a/docling_core/types/experimental/document.py b/docling_core/types/experimental/document.py
@@ -1,9 +1,11 @@
+import hashlib
 from typing import Any, Dict, List, Optional, Tuple, Union
 
-from pydantic import AnyUrl, BaseModel, Field
+from pydantic import AnyUrl, BaseModel, Field, computed_field, conint, ConfigDict
 
 from docling_core.types.experimental.base import BoundingBox, Size
 
+Uint64 = conint(ge=0, le=(2**64 - 1))
 
 class FigureData(BaseModel):  # TBD
     pass
@@ -12,10 +14,16 @@ class FigureData(BaseModel):  # TBD
 class TableData(BaseModel):  # TBD
     pass
 
+class FileInfo(BaseModel):
+    document_hash: str
 
 class RefItem(BaseModel):
     cref: str = Field(alias="$ref")
 
+    model_config = ConfigDict(
+        populate_by_name=True,
+    )
+
     def resolve(self, doc: "DoclingDocument"):
         _, path, index_str = self.cref.split("/")
         index = int(index_str)
@@ -35,34 +43,51 @@ class ProvenanceItem(BaseModel):
     bbox: BoundingBox
     charspan: Tuple[int, int]
 
-
-class DocItem(BaseModel):
+class NodeItem(BaseModel):
     dloc: str  # format spec ({document_hash}{json-path})
-    hash: int
+    parent: Optional[RefItem] = None
+    children: List[RefItem] = []
+    @computed_field
+    @property
+    def hash(self) -> Uint64: # TODO align with hasher on deepsearch-glm
+        if not len(self.dloc):
+            return 0
+        hash_object = hashlib.sha256(self.dloc.encode('utf-8'))
+
+        # Convert the hash to an integer
+        hash_int = int.from_bytes(hash_object.digest(), 'big')
+
+        # Mask it to fit within 64 bits
+        return Uint64(hash_int & 0xFFFFFFFFFFFFFFFF)  # 64-bit unsigned integer mask
+
+class DocItem(NodeItem):
     label: str
-    parent: Optional[RefItem]
-    children: List[RefItem]
-    prov: List[ProvenanceItem]
+    prov: List[ProvenanceItem] = []
+
+
+class GroupItem(NodeItem):
+    name: str
 
 
 class TextItem(DocItem):
     orig: str  # untreated representation
     text: str  # sanitized representation
 
+class Section(TextItem):
+    level: conint(ge=1, le=100) = 1
 
 class FloatingItem(DocItem):
-    caption: Optional[Union[RefItem, TextItem]]
-    references: List[Union[RefItem, TextItem]]
-    footnotes: List[Union[RefItem, TextItem]]
-    data: Any
-    image: Optional[ImageRef]
+    caption: Optional[RefItem] = None
+    references: List[RefItem] = []
+    footnotes: List[RefItem] = []
+    image: Optional[ImageRef] = None
 
 
-class FigureItem(DocItem):
+class FigureItem(FloatingItem):
     data: FigureData
 
 
-class TableItem(DocItem):
+class TableItem(FloatingItem):
     data: TableData
 
 
@@ -73,8 +98,9 @@ class KeyValueItem(DocItem):
 
 
 class DocumentContent(BaseModel):
-    furniture: List[RefItem] = []
-    body: List[RefItem] = []
+    furniture: GroupItem = GroupItem(name="_root_", dloc="#/furniture") # List[RefItem] = []
+    body: GroupItem = GroupItem(name="_root_", dloc="#/body") # List[RefItem] = []
+    groups: List[GroupItem] = []
     texts: List[TextItem] = []
     figures: List[FigureItem] = []
     tables: List[TableItem] = []
@@ -89,5 +115,100 @@ class PageItem(DocumentContent):
 
 class DoclingDocument(DocumentContent):
     description: Any
-    file_info: Any
+    file_info: FileInfo
     pages: Dict[int, PageItem] = {}  # empty as default
+
+    #def add_furniture_group(self, name: str):
+    #    group = GroupItem(name=name)
+    #    self.furniture.children.append(group)
+    #    return group
+    def resolve_cref(self, obj):
+        path = obj.dloc.split("#")[1]
+        return path
+
+    def add_group(self, name: str, parent: Optional[GroupItem] = None) -> GroupItem:
+        if not parent:
+            parent = self.body
+            parent_cref = "#/body"
+        else:
+            parent_cref = self.resolve_cref(parent)
+
+        group_index = len(self.groups)
+        cref = f"#/groups/{group_index}"
+        dloc = f"{self.file_info.document_hash}{cref}"
+
+        group = GroupItem(name=name, dloc=dloc, parent=RefItem(cref=parent_cref))
+        self.groups.append(group)
+        parent.children.append(RefItem(cref=cref))
+
+        return group
+
+    def add_paragraph(self, label: str, text: str, orig: Optional[str] = None, prov: Optional[ProvenanceItem] = None, parent: Optional[GroupItem] = None,
+                      item_cls=TextItem):
+        if not parent:
+            parent = self.body
+            parent_cref = "#/body"
+        else:
+            parent_cref = self.resolve_cref(parent)
+
+        if not orig:
+            orig = text
+
+        text_index = len(self.texts)
+        cref = f"#/texts/{text_index}"
+        dloc = f"{self.file_info.document_hash}{cref}"
+        text_item = item_cls(label=label, text=text, orig=orig, dloc=dloc, parent=RefItem(cref=parent_cref))
+        if prov:
+            text_item.prov.append(prov)
+
+        self.texts.append(text_item)
+        parent.children.append(RefItem(cref=cref))
+
+        return text_item
+
+    def add_table(self, data: TableData, caption: Optional[RefItem] = None, prov: Optional[ProvenanceItem] = None, parent: Optional[GroupItem] = None):
+        if not parent:
+            parent = self.body
+
+        table_index = len(self.tables)
+        cref = f"#/tables/{table_index}"
+        dloc = f"{self.file_info.document_hash}{cref}"
+
+        tbl_item = TableItem(label="table", data=data, dloc=dloc, parent=parent)
+        if prov:
+            tbl_item.prov.append(prov)
+        if caption:
+            tbl_item.caption = caption
+
+        self.tables.append(tbl_item)
+        parent.children.append(RefItem(cref=cref))
+
+        return tbl_item
+
+
+    def add_figure(self, data: FigureData, caption: Optional[RefItem] = None, prov: Optional[ProvenanceItem] = None, parent: Optional[GroupItem] = None):
+        if not parent:
+            parent = self.body
+
+        figure_index = len(self.figures)
+        cref = f"#/figures/{figure_index}"
+        dloc = f"{self.file_info.document_hash}{cref}"
+
+        fig_item = FigureItem(label="figure", data=data, dloc=dloc, parent=parent)
+        if prov:
+            fig_item.prov.append(prov)
+        if caption:
+            fig_item.caption = caption
+
+        self.figures.append(fig_item)
+        parent.children.append(RefItem(cref=cref))
+
+        return fig_item
+
+
+    def add_heading(self, label: str, text: str, orig: Optional[str] = None, level: conint(ge=1, le=100) = 1,
+                    prov: Optional[ProvenanceItem] = None, parent: Optional[GroupItem] = None):
+        item: Section = self.add_paragraph(label, text, orig, prov, parent, item_cls=Section)
+        item.level = level
+        return item
+
diff --git a/test/data/newdoc/dummy_doc.yaml b/test/data/newdoc/dummy_doc.yaml
@@ -3,23 +3,34 @@
 description: { } # DescriptionType - TBD
 file_info: # FileInfoType - TBD
   document_hash: e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5
-furniture: # Headers, footers, framing, navigation elements, all other non-body text
-  - $ref: "/texts/0"
 
-body: # Top-level elements in other arrays, by-reference only, must not have parent.
-  - $ref: "/texts/1"
-  - $ref: "/figure/0"
-  - $ref: "/texts/2"
-  - $ref: "/texts/3"
-  - $ref: "/tables/0"
+furniture: # Top-level element for any headers, footers, framing, navigation elements, all other non-body text
+  name: "_root_"
+  dloc: "#/furniture"
+  parent: null
+  children:
+    - $ref: "/texts/0"
+
+body: # Top-level element for anything in the document body
+  name: "_root_"
+  dloc: "#/body"
+  parent: null
+  children:
+    - $ref: "/texts/1"
+    - $ref: "/figure/0"
+    - $ref: "/texts/2"
+    - $ref: "/tables/0"
+
+groups: [] # Any group that is nested deeper in either body or furniture children
 
 texts: # All elements that have a text-string representation, with actual data
   - orig: "arXiv:2206.01062v1 [cs.CV] 2 Jun 2022"
     text: "arXiv:2206.01062v1 [cs.CV] 2 Jun 2022"
     dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/texts/0"
     hash: 132103230
     label: "page_header"
-    parent: null
+    parent:
+      $ref: "#/furniture"
     children: []
     prov:
       - page_no: 1
@@ -34,7 +45,8 @@ texts: # All elements that have a text-string representation, with actual data
     dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/texts/1"
     hash: 2349732 # uint64 hash of dloc
     label: "title"
-    parent: null
+    parent:
+      $ref: "#/body"
     children: [ ]
     prov: # must exist, can be empty
       - page_no: 1
@@ -83,7 +95,8 @@ tables: # All tables...
   - dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/table/0"
     hash: 98574
     label: "table"
-    parent: null
+    parent:
+      $ref: "#/body"
     children: [ ]
     caption:
       $ref: "/texts/3"
@@ -117,7 +130,8 @@ figures: # All figures...
   - dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/figures/0"
     hash: 7782482
     label: "figure"
-    parent: null
+    parent:
+      $ref: "#/body"
     caption:
       $ref: "/texts/2"
     references:

diff --git a/test/test_docling_doc.py b/test/test_docling_doc.py
@@ -1,7 +1,7 @@
 import yaml
 import pytest
 from docling_core.types import DoclingDocument, BoundingBox
-from docling_core.types.experimental.document import ProvenanceItem
+from docling_core.types.experimental.document import ProvenanceItem, FileInfo
 
 
 def test_load_serialize_doc():
@@ -19,7 +19,7 @@ def test_load_serialize_doc():
     text_item.prov[0].page_no
 
     # Objects that are references need explicit resolution for now:
-    obj = doc.body[2].resolve(doc=doc)  # Text item with parent
+    obj = doc.body.children[2].resolve(doc=doc)  # Text item with parent
     parent = obj.parent.resolve(doc=doc)  # it is a figure
 
     obj2 = parent.children[0].resolve(
@@ -38,26 +38,25 @@ def test_load_serialize_doc():
     assert doc_reload is not doc  # can't be identical
 
 def test_construct_doc():
-    doc = DoclingDocument(description={}, file_info={})
 
-    # group, heading, paragraph, table, figure, title, list, provenance
-    doc.add_title()
-    doc.add_paragraph(text="Author 1\nAffiliation 1").add_provenance(ProvenanceItem(page_no=1, bbox=BoundingBox(t=12, l=5, r=230, b=40), charspan=(0,22)))
-    doc.add_paragraph(text="Author 2\nAffiliation 2")
-
-    chapter1 = doc.add_group(name="Introduction")
-    chapter1.add_heading(text="1. Introduction", level=2)
-    chapter1.add_paragraph(text="This paper introduces the biggest invention ever made. ...")
-    mylist = chapter1.add_group()
-    mylist.add_item(text="Cooks your favourite meal before you know you want it.")
-    mylist.add_item(text="Cleans up all your dishes.")
-    mylist.add_item(text="Drains your bank account without consent.")
+    doc = DoclingDocument(description={}, file_info=FileInfo(document_hash="xyz"))
 
+    # group, heading, paragraph, table, figure, title, list, provenance
+    doc.add_paragraph(label="text", text="Author 1\nAffiliation 1")
+    doc.add_paragraph(label="text", text="Author 2\nAffiliation 2")
 
+    chapter1 = doc.add_group(name="Introduction") # can be done if such information is present, or omitted.
+    doc.add_heading(parent=chapter1, label="section_header", text="1. Introduction", level=1)
+    doc.add_paragraph(parent=chapter1, label="text", text="This paper introduces the biggest invention ever made. ...")
+    mylist = doc.add_group(parent=chapter1, name="whateverlist")
+    doc.add_paragraph(parent=mylist, label="list_item", text="Cooks your favourite meal before you know you want it.")
+    doc.add_paragraph(parent=mylist, label="list_item", text="Cleans up all your dishes.")
+    doc.add_paragraph(parent=mylist, label="list_item", text="Drains your bank account without consent.")
 
-    sec = doc.add_section(text="1. Introduction")
+    yaml_dump = yaml.safe_dump(
+                doc.model_dump(mode="json", by_alias=True))
 
-    list = sec.add_child(label="container")
-    list.add_child()
-    list.add_child()
+    print(f"\n\n{yaml_dump}")
 
+    restored_doc = DoclingDocument.model_validate(
+        yaml.safe_load(yaml_dump))