From a90cc19ce43fce6ab13ca7a660dff85a6be252f4 Mon Sep 17 00:00:00 2001
From: Christoph Auer <cau@zurich.ibm.com>
Date: Tue, 17 Sep 2024 15:13:32 +0200
Subject: [PATCH 01/34] Draft new docling document format, pydantic model and
 tests

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
---
 docling_core/types/newdoc/__init__.py |   0
 docling_core/types/newdoc/base.py     | 120 ++++++++++++++++++
 docling_core/types/newdoc/document.py |  95 +++++++++++++++
 pyproject.toml                        |   1 +
 test/data/newdoc/dummy_doc.yaml       | 169 ++++++++++++++++++++++++++
 test/test_newdoc.py                   |  36 ++++++
 6 files changed, 421 insertions(+)
 create mode 100644 docling_core/types/newdoc/__init__.py
 create mode 100644 docling_core/types/newdoc/base.py
 create mode 100644 docling_core/types/newdoc/document.py
 create mode 100644 test/data/newdoc/dummy_doc.yaml
 create mode 100644 test/test_newdoc.py

diff --git a/docling_core/types/newdoc/__init__.py b/docling_core/types/newdoc/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/docling_core/types/newdoc/base.py b/docling_core/types/newdoc/base.py
new file mode 100644
index 0000000..b082ea6
--- /dev/null
+++ b/docling_core/types/newdoc/base.py
@@ -0,0 +1,120 @@
+import copy
+from enum import Enum
+from typing import Tuple
+
+from pydantic import BaseModel
+
+
+## All copied from docling
+class CoordOrigin(str, Enum):
+    TOPLEFT = "TOPLEFT"
+    BOTTOMLEFT = "BOTTOMLEFT"
+
+
+class Size(BaseModel):
+    width: float = 0.0
+    height: float = 0.0
+
+
+class BoundingBox(BaseModel):
+    l: float  # left
+    t: float  # top
+    r: float  # right
+    b: float  # bottom
+
+    coord_origin: CoordOrigin = CoordOrigin.TOPLEFT
+
+    @property
+    def width(self):
+        return self.r - self.l
+
+    @property
+    def height(self):
+        return abs(self.t - self.b)
+
+    def scaled(self, scale: float) -> "BoundingBox":
+        out_bbox = copy.deepcopy(self)
+        out_bbox.l *= scale
+        out_bbox.r *= scale
+        out_bbox.t *= scale
+        out_bbox.b *= scale
+
+        return out_bbox
+
+    def normalized(self, page_size: Size) -> "BoundingBox":
+        out_bbox = copy.deepcopy(self)
+        out_bbox.l /= page_size.width
+        out_bbox.r /= page_size.width
+        out_bbox.t /= page_size.height
+        out_bbox.b /= page_size.height
+
+        return out_bbox
+
+    def as_tuple(self):
+        if self.coord_origin == CoordOrigin.TOPLEFT:
+            return (self.l, self.t, self.r, self.b)
+        elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
+            return (self.l, self.b, self.r, self.t)
+
+    @classmethod
+    def from_tuple(cls, coord: Tuple[float, ...], origin: CoordOrigin):
+        if origin == CoordOrigin.TOPLEFT:
+            l, t, r, b = coord[0], coord[1], coord[2], coord[3]
+            if r < l:
+                l, r = r, l
+            if b < t:
+                b, t = t, b
+
+            return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
+        elif origin == CoordOrigin.BOTTOMLEFT:
+            l, b, r, t = coord[0], coord[1], coord[2], coord[3]
+            if r < l:
+                l, r = r, l
+            if b > t:
+                b, t = t, b
+
+            return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
+
+    def area(self) -> float:
+        return (self.r - self.l) * (self.b - self.t)
+
+    def intersection_area_with(self, other: "BoundingBox") -> float:
+        # Calculate intersection coordinates
+        left = max(self.l, other.l)
+        top = max(self.t, other.t)
+        right = min(self.r, other.r)
+        bottom = min(self.b, other.b)
+
+        # Calculate intersection dimensions
+        width = right - left
+        height = bottom - top
+
+        # If the bounding boxes do not overlap, width or height will be negative
+        if width <= 0 or height <= 0:
+            return 0.0
+
+        return width * height
+
+    def to_bottom_left_origin(self, page_height) -> "BoundingBox":
+        if self.coord_origin == CoordOrigin.BOTTOMLEFT:
+            return self
+        elif self.coord_origin == CoordOrigin.TOPLEFT:
+            return BoundingBox(
+                l=self.l,
+                r=self.r,
+                t=page_height - self.t,
+                b=page_height - self.b,
+                coord_origin=CoordOrigin.BOTTOMLEFT,
+            )
+
+    def to_top_left_origin(self, page_height):
+        if self.coord_origin == CoordOrigin.TOPLEFT:
+            return self
+        elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
+            return BoundingBox(
+                l=self.l,
+                r=self.r,
+                t=page_height - self.t,  # self.b
+                b=page_height - self.b,  # self.t
+                coord_origin=CoordOrigin.TOPLEFT,
+            )
diff --git a/docling_core/types/newdoc/document.py b/docling_core/types/newdoc/document.py
new file mode 100644
index 0000000..dd2597d
--- /dev/null
+++ b/docling_core/types/newdoc/document.py
@@ -0,0 +1,95 @@
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+from pydantic import AnyUrl, BaseModel, Field
+
+from docling_core.types.newdoc.base import BoundingBox, Size
+
+
+class FigureData(BaseModel):  # TBD
+    pass
+
+
+class TableData(BaseModel):  # TBD
+    pass
+
+
+class RefItem(BaseModel):
+    cref: str = Field(alias="$ref")
+
+    def resolve(self, doc: "DoclingDocument"):
+        _, path, index_str = self.cref.split("/")
+        index = int(index_str)
+        obj = doc.__getattribute__(path)[index]
+        return obj
+
+
+class ImageRef(BaseModel):
+    format: str  # png, etc.
+    dpi: int  # ...
+    size: Size
+    uri: AnyUrl
+
+
+class ProvenanceItem(BaseModel):
+    page_no: int
+    bbox: BoundingBox
+    charspan: Tuple[int, int]
+
+
+class DocItem(BaseModel):
+    dloc: str  # format spec ({document_hash}{json-path})
+    hash: int
+    label: str
+    parent: Optional[RefItem]
+    children: List[RefItem]
+    prov: List[ProvenanceItem]
+
+
+class TextItem(DocItem):
+    orig: str  # untreated representation
+    text: str  # sanitized representation
+
+
+class FloatingItem(DocItem):
+    caption: Optional[Union[RefItem, TextItem]]
+    references: List[Union[RefItem, TextItem]]
+    footnotes: List[Union[RefItem, TextItem]]
+    data: Any
+    image: Optional[ImageRef]
+
+
+class FigureItem(DocItem):
+    data: FigureData
+
+
+class TableItem(DocItem):
+    data: TableData
+
+
+class KeyValueItem(DocItem):
+    pass
+
+
+ContentItem = Union[TextItem, FigureItem, TableItem, KeyValueItem]
+
+
+class DocumentContent(BaseModel):
+    furniture: List[RefItem] = []
+    body: List[RefItem] = []
+    texts: List[TextItem] = []
+    figures: List[FigureItem] = []
+    tables: List[TableItem] = []
+    key_value_items: List[KeyValueItem] = []
+
+
+class PageItem(DocumentContent):
+    hash: str  # page hash
+    size: Size
+    image: Optional[ImageRef]
+    num_elements: int
+
+
+class DoclingDocument(DocumentContent):
+    description: Any
+    file_info: Any
+    pages: Dict[int, PageItem] = {}  # empty as default
diff --git a/pyproject.toml b/pyproject.toml
index 689d41b..859121a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -119,6 +119,7 @@ module = [
     "json_schema_for_humans.*",
     "pandas.*",
     "tabulate.*",
+    "yaml.*"
 ]
 ignore_missing_imports = true
 
diff --git a/test/data/newdoc/dummy_doc.yaml b/test/data/newdoc/dummy_doc.yaml
new file mode 100644
index 0000000..f092eb8
--- /dev/null
+++ b/test/data/newdoc/dummy_doc.yaml
@@ -0,0 +1,169 @@
+---
+## Document with content + layout info
+description: { } # DescriptionType - TBD
+file_info: # FileInfoType - TBD
+  document_hash: e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5
+furniture: # Headers, footers, framing, navigation elements, all other non-body text
+  - $ref: "/texts/0"
+
+body: # All elements in other arrays, by-reference only
+  - $ref: "/texts/1"
+  - $ref: "/figure/0"
+  - $ref: "/texts/2"
+  - $ref: "/texts/3"
+  - $ref: "/tables/0"
+
+texts: # All elements that have a text-string representation, with actual data
+  - orig: "arXiv:2206.01062v1 [cs.CV] 2 Jun 2022"
+    text: "arXiv:2206.01062v1 [cs.CV] 2 Jun 2022"
+    dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/texts/0"
+    hash: 132103230
+    label: "page_header"
+    parent: null
+    children: [ ]
+    prov:
+      - page_no: 1
+        bbox:
+          l: 21.3
+          t: 52.3
+          b: 476.2
+          r: 35.2
+        charspan: [ 1,423 ] # 2-tuple, references to "orig"
+  - orig: "DocLayNet: A Large Human-Annotated Dataset for\nDocument-Layout Analysis"
+    text: "DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis"
+    dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/texts/1"
+    hash: 2349732 # uint64 hash of dloc
+    label: "title"
+    parent: null
+    children: [ ]
+    prov: # must exist, can be empty
+      - page_no: 1
+        bbox:
+          l: 65.0
+          t: 30.1
+          b: 53.4
+          r: 623.2
+        charspan: [ 1,423 ] # 2-tuple, references to "orig"
+  - orig: "OPERATION (cont.)" # nested inside the figure
+    text: "OPERATION (cont.)"
+    dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/texts/2"
+    hash: 6978483
+    label: "section_header"
+    parent:
+      $ref: "/figures/0"
+    children: [ ]
+    prov:
+      - page_no: 1
+        bbox:
+          l: 323.0
+          t: 354.3
+          b: 334.4
+          r: 376.0
+        charspan: [ 0,734 ]
+  - orig: "Figure 1: Four examples of complex page layouts across dif-\nferent document categories" # nested inside the figure
+    text: "Figure 1: Four examples of complex page layouts across different document categories"
+    dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/texts/3"
+    hash: 6978483
+    label: "caption"
+    parent:
+      $ref: "/figures/0"
+    children: [ ]
+    prov:
+      - page_no: 1
+        bbox:
+          l: 323.0
+          t: 354.3
+          b: 334.4
+          r: 376.0
+          coord_origin: "BOTTOMLEFT"
+        charspan: [ 1,423 ] # 2-tuple, references to "orig"
+
+
+tables: # All tables...
+  - dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/table/0"
+    hash: 98574
+    label: "table"
+    parent: null
+    children: [ ]
+    caption:
+      $ref: "/texts/3"
+    references:
+      - $ref: "/text/??"
+    footnotes:
+      - $ref: "/text/??"
+    image:
+      format: png
+      dpi: 72
+      size:
+        width: 231
+        height: 351
+      uri: "file:///e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5/tables/0.png"
+      #alternatives: base64 encoded string
+    data: # TableData Type
+      grid: [ [ ] ] # list-of-list of TableCell type
+      otsl: "<fcel><ecel>..." # OTSL token string
+      html: "" # ??
+    prov:
+      - page_no: 1
+        bbox:
+          l: 323.0
+          t: 354.3
+          b: 334.4
+          r: 376.0
+          coord_origin: "BOTTOMLEFT"
+        charspan: [ 1,423 ] # 2-tuple, references to "orig"
+
+figures: # All figures...
+  - dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/figures/0"
+    hash: 7782482
+    label: "figure"
+    parent: null
+    caption:
+      $ref: "/texts/2"
+    references:
+      - $ref: "/text/??"
+    footnotes:
+      - $ref: "/text/??"
+
+    data: # FigureData Type
+      classification: "illustration"
+      confidence: 0.78
+      description: "...."
+      # content structure?
+    image:
+      format: png
+      dpi: 72
+      size:
+        width: 231
+        height: 351
+      uri: "file:///e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5/figures/0.png"
+      #alternatives: base64 encoded string
+    children:
+      - $ref: "/texts/2"
+    prov:
+      - page_no: 1
+        bbox:
+          l: 456.3
+          t: 145.8
+          b: 623.4
+          r: 702.5
+        charspan: [ 0,288 ]
+
+key_value_items: [ ] # All KV-items
+
+# We should consider this for pages
+pages: # Optional, for layout documents
+  1:
+    hash: "5b0916ed3ead46e69efcddb2c932afd91d0e25ce6828c39e5617e6ee2bd0cf6e"
+    size:
+      width: 768.23
+      height: 583.15
+    image:
+      format: png
+      dpi: 144
+      size:
+        width: 1536
+        height: 1166
+      uri: "file:///e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5/pages/1.png"
+      #alternatives: base64 encoded string
+    num_elements: 23
\ No newline at end of file
diff --git a/test/test_newdoc.py b/test/test_newdoc.py
new file mode 100644
index 0000000..ff3aa3c
--- /dev/null
+++ b/test/test_newdoc.py
@@ -0,0 +1,36 @@
+import yaml
+
+from docling_core.types.newdoc.document import DoclingDocument
+
+if __name__ == "__main__":
+    # Read YAML file
+    with open("data/newdoc/dummy_doc.yaml", "r") as fp:
+        dict_from_yaml = yaml.safe_load(fp)
+
+    doc = DoclingDocument.model_validate(dict_from_yaml)
+
+    # Objects can be accessed
+    text_item = doc.texts[0]
+
+    # access members
+    text_item.text
+    text_item.prov[0].page_no
+
+    # Objects that are references need explicit resolution for now:
+    obj = doc.body[2].resolve(doc=doc)  # Text item with parent
+    parent = obj.parent.resolve(doc=doc)  # it is a figure
+
+    obj2 = parent.children[0].resolve(
+        doc=doc
+    )  # Child of figure must be the same as obj
+
+    assert obj == obj2
+    assert obj is obj2
+
+    doc_dumped = doc.model_dump(mode="json", by_alias=True)
+    out_yaml = yaml.safe_dump(doc_dumped)
+
+    doc_reload = DoclingDocument.model_validate(yaml.safe_load(out_yaml))
+
+    assert doc_reload == doc  # must be equal
+    assert doc_reload is not doc  # can't be identical

From 43c23b98494d6c286327133dcc37e33d51cab7d3 Mon Sep 17 00:00:00 2001
From: Christoph Auer <cau@zurich.ibm.com>
Date: Wed, 18 Sep 2024 17:01:27 +0200
Subject: [PATCH 02/34] Fix tests to have unique document_hashes per test

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
---
 docling_core/types/__init__.py                |  17 +-
 docling_core/types/doc/base.py                | 316 +++----
 docling_core/types/doc/document.py            | 822 ++----------------
 docling_core/types/legacy/__init__.py         |   6 +
 docling_core/types/legacy/base.py             | 196 +++++
 docling_core/types/{doc => legacy}/doc_ann.py |   2 +-
 docling_core/types/{doc => legacy}/doc_ocr.py |   2 +-
 docling_core/types/{doc => legacy}/doc_raw.py |   2 +-
 docling_core/types/legacy/document.py         | 791 +++++++++++++++++
 docling_core/types/newdoc/__init__.py         |   0
 docling_core/types/newdoc/base.py             | 120 ---
 docling_core/types/newdoc/document.py         |  95 --
 docling_core/types/rec/subject.py             |   2 +-
 docling_core/utils/ds_generate_jsonschema.py  |   4 +-
 test/data/newdoc/dummy_doc.yaml               |   4 +-
 test/test_base.py                             |   2 +-
 test/test_doc_base.py                         |   2 +-
 test/test_doc_schema.py                       |   2 +-
 test/test_doc_schema_extractor.py             |   2 +-
 test/test_docling_doc.py                      |  63 ++
 test/test_json_schema_to_search_mapper.py     |   2 +-
 test/test_newdoc.py                           |  36 -
 22 files changed, 1261 insertions(+), 1227 deletions(-)
 create mode 100644 docling_core/types/legacy/__init__.py
 create mode 100644 docling_core/types/legacy/base.py
 rename docling_core/types/{doc => legacy}/doc_ann.py (95%)
 rename docling_core/types/{doc => legacy}/doc_ocr.py (96%)
 rename docling_core/types/{doc => legacy}/doc_raw.py (98%)
 create mode 100644 docling_core/types/legacy/document.py
 delete mode 100644 docling_core/types/newdoc/__init__.py
 delete mode 100644 docling_core/types/newdoc/base.py
 delete mode 100644 docling_core/types/newdoc/document.py
 create mode 100644 test/test_docling_doc.py
 delete mode 100644 test/test_newdoc.py

diff --git a/docling_core/types/__init__.py b/docling_core/types/__init__.py
index fde140e..30c1f4d 100644
--- a/docling_core/types/__init__.py
+++ b/docling_core/types/__init__.py
@@ -5,10 +5,10 @@
 
 """Define the main types."""
 
-from docling_core.types.doc.base import BoundingBox  # noqa
-from docling_core.types.doc.base import Table  # noqa
-from docling_core.types.doc.base import TableCell  # noqa
-from docling_core.types.doc.base import (  # noqa
+from docling_core.types.legacy.base import BoundingBox as LegacyBoundingBox # noqa
+from docling_core.types.legacy.base import Table  # noqa
+from docling_core.types.legacy.base import TableCell  # noqa
+from docling_core.types.legacy.base import (  # noqa
     BaseCell,
     BaseText,
     PageDimensions,
@@ -16,10 +16,13 @@
     Prov,
     Ref,
 )
-from docling_core.types.doc.document import (  # noqa
+from docling_core.types.legacy.document import (  # noqa
     CCSDocumentDescription as DocumentDescription,
 )
-from docling_core.types.doc.document import CCSFileInfoObject as FileInfoObject  # noqa
-from docling_core.types.doc.document import ExportedCCSDocument as Document  # noqa
+from docling_core.types.legacy.document import CCSFileInfoObject as FileInfoObject  # noqa
+from docling_core.types.legacy.document import ExportedCCSDocument as Document  # noqa
 from docling_core.types.gen.generic import Generic  # noqa
 from docling_core.types.rec.record import Record  # noqa
+
+from docling_core.types.doc.document import DoclingDocument, DocItem, TextItem, FloatingItem, TableItem, FigureItem, TableData, FigureData, PageItem
+from docling_core.types.doc.base import CoordOrigin, BoundingBox
\ No newline at end of file
diff --git a/docling_core/types/doc/base.py b/docling_core/types/doc/base.py
index 2f1eeed..b082ea6 100644
--- a/docling_core/types/doc/base.py
+++ b/docling_core/types/doc/base.py
@@ -1,196 +1,120 @@
-#
-# Copyright IBM Corp. 2024 - 2024
-# SPDX-License-Identifier: MIT
-#
-
-"""Define common models across CCS objects."""
-from typing import Annotated, Literal, Optional, Union
-
-from pydantic import BaseModel, Field, PositiveInt, StrictStr
-
-from docling_core.search.mapping import es_field
-from docling_core.utils.alias import AliasModel
-
-CellData = tuple[float, float, float, float, str, str]
-
-CellHeader = tuple[
-    Literal["x0"],
-    Literal["y0"],
-    Literal["x1"],
-    Literal["y1"],
-    Literal["font"],
-    Literal["text"],
-]
-
-BoundingBox = Annotated[list[float], Field(min_length=4, max_length=4)]
-
-Span = Annotated[list[int], Field(min_length=2, max_length=2)]
-
-
-class CellsContainer(BaseModel):
-    """Cell container."""
-
-    data: Optional[list[CellData]] = None
-    header: CellHeader = ("x0", "y0", "x1", "y1", "font", "text")
-
-
-class S3Resource(BaseModel):
-    """Resource in a cloud object storage."""
-
-    mime: str
-    path: str
-    page: Optional[PositiveInt] = None
-
-
-class S3Data(AliasModel):
-    """Data object in a cloud object storage."""
-
-    pdf_document: Optional[list[S3Resource]] = Field(default=None, alias="pdf-document")
-    pdf_pages: Optional[list[S3Resource]] = Field(default=None, alias="pdf-pages")
-    pdf_images: Optional[list[S3Resource]] = Field(default=None, alias="pdf-images")
-    json_document: Optional[S3Resource] = Field(default=None, alias="json-document")
-    json_meta: Optional[S3Resource] = Field(default=None, alias="json-meta")
-    glm_json_document: Optional[S3Resource] = Field(
-        default=None, alias="glm-json-document"
-    )
-    figures: Optional[list[S3Resource]] = None
-
-
-class S3Reference(AliasModel):
-    """References an s3 resource."""
-
-    ref_s3_data: StrictStr = Field(
-        alias="__ref_s3_data", examples=["#/_s3_data/figures/0"]
-    )
-
-
-class Prov(AliasModel):
-    """Provenance."""
-
-    bbox: BoundingBox
-    page: PositiveInt
-    span: Span
-    ref_s3_data: Optional[StrictStr] = Field(
-        default=None, alias="__ref_s3_data", json_schema_extra=es_field(suppress=True)
-    )
-
-
-class BoundingBoxContainer(BaseModel):
-    """Bounding box container."""
-
-    min: BoundingBox
-    max: BoundingBox
-
-
-class BitmapObject(AliasModel):
-    """Bitmap object."""
-
-    obj_type: str = Field(alias="type")
-    bounding_box: BoundingBoxContainer = Field(
-        json_schema_extra=es_field(suppress=True)
-    )
-    prov: Prov
-
-
-class PageDimensions(BaseModel):
-    """Page dimensions."""
-
-    height: float
-    page: PositiveInt
-    width: float
-
-
-class TableCell(AliasModel):
-    """Table cell."""
-
-    bbox: Optional[BoundingBox] = None
-    spans: Optional[list[Span]] = None
-    text: str = Field(json_schema_extra=es_field(term_vector="with_positions_offsets"))
-    obj_type: str = Field(alias="type")
-
-
-class GlmTableCell(TableCell):
-    """Glm Table cell."""
-
-    col: Optional[int] = Field(default=None, json_schema_extra=es_field(suppress=True))
-    col_header: bool = Field(
-        default=False, alias="col-header", json_schema_extra=es_field(suppress=True)
-    )
-    col_span: Optional[Span] = Field(
-        default=None, alias="col-span", json_schema_extra=es_field(suppress=True)
-    )
-    row: Optional[int] = Field(default=None, json_schema_extra=es_field(suppress=True))
-    row_header: bool = Field(
-        default=False, alias="row-header", json_schema_extra=es_field(suppress=True)
-    )
-    row_span: Optional[Span] = Field(
-        default=None, alias="row-span", json_schema_extra=es_field(suppress=True)
-    )
-
-
-class BaseCell(AliasModel):
-    """Base cell."""
-
-    # FIXME: we need to check why we have bounding_box (this should be in prov)
-    bounding_box: Optional[BoundingBoxContainer] = Field(
-        default=None, alias="bounding-box", json_schema_extra=es_field(suppress=True)
-    )
-    prov: Optional[list[Prov]] = None
-    text: Optional[str] = Field(
-        default=None, json_schema_extra=es_field(term_vector="with_positions_offsets")
-    )
-    obj_type: str = Field(
-        alias="type", json_schema_extra=es_field(type="keyword", ignore_above=8191)
-    )
-
-
-class Table(BaseCell):
-    """Table."""
-
-    num_cols: int = Field(alias="#-cols")
-    num_rows: int = Field(alias="#-rows")
-    data: Optional[list[list[Union[GlmTableCell, TableCell]]]] = None
-    model: Optional[str] = None
-
-
-# FIXME: let's add some figure specific data-types later
-class Figure(BaseCell):
-    """Figure."""
-
-
-class BaseText(AliasModel):
-    """Base model for text objects."""
-
-    text: StrictStr = Field(
-        json_schema_extra=es_field(term_vector="with_positions_offsets")
-    )
-    obj_type: StrictStr = Field(
-        alias="type", json_schema_extra=es_field(type="keyword", ignore_above=8191)
-    )
-    name: Optional[StrictStr] = Field(
-        default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
-    )
-    font: Optional[str] = None
-    prov: Optional[list[Prov]] = None
-
-
-class ListItem(BaseText):
-    """List item."""
-
-    identifier: str
-
-
-class Ref(AliasModel):
-    """Reference."""
-
-    name: str
-    obj_type: str = Field(alias="type")
-    ref: str = Field(alias="$ref")
-
-
-class PageReference(BaseModel):
-    """Page reference."""
-
-    hash: str = Field(json_schema_extra=es_field(type="keyword", ignore_above=8191))
-    model: str = Field(json_schema_extra=es_field(suppress=True))
-    page: PositiveInt = Field(json_schema_extra=es_field(type="short"))
+import copy
+from enum import Enum
+from typing import Tuple
+
+from pydantic import BaseModel
+
+
+## All copied from docling
+class CoordOrigin(str, Enum):
+    TOPLEFT = "TOPLEFT"
+    BOTTOMLEFT = "BOTTOMLEFT"
+
+
+class Size(BaseModel):
+    width: float = 0.0
+    height: float = 0.0
+
+
+class BoundingBox(BaseModel):
+    l: float  # left
+    t: float  # top
+    r: float  # right
+    b: float  # bottom
+
+    coord_origin: CoordOrigin = CoordOrigin.TOPLEFT
+
+    @property
+    def width(self):
+        return self.r - self.l
+
+    @property
+    def height(self):
+        return abs(self.t - self.b)
+
+    def scaled(self, scale: float) -> "BoundingBox":
+        out_bbox = copy.deepcopy(self)
+        out_bbox.l *= scale
+        out_bbox.r *= scale
+        out_bbox.t *= scale
+        out_bbox.b *= scale
+
+        return out_bbox
+
+    def normalized(self, page_size: Size) -> "BoundingBox":
+        out_bbox = copy.deepcopy(self)
+        out_bbox.l /= page_size.width
+        out_bbox.r /= page_size.width
+        out_bbox.t /= page_size.height
+        out_bbox.b /= page_size.height
+
+        return out_bbox
+
+    def as_tuple(self):
+        if self.coord_origin == CoordOrigin.TOPLEFT:
+            return (self.l, self.t, self.r, self.b)
+        elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
+            return (self.l, self.b, self.r, self.t)
+
+    @classmethod
+    def from_tuple(cls, coord: Tuple[float, ...], origin: CoordOrigin):
+        if origin == CoordOrigin.TOPLEFT:
+            l, t, r, b = coord[0], coord[1], coord[2], coord[3]
+            if r < l:
+                l, r = r, l
+            if b < t:
+                b, t = t, b
+
+            return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
+        elif origin == CoordOrigin.BOTTOMLEFT:
+            l, b, r, t = coord[0], coord[1], coord[2], coord[3]
+            if r < l:
+                l, r = r, l
+            if b > t:
+                b, t = t, b
+
+            return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
+
+    def area(self) -> float:
+        return (self.r - self.l) * (self.b - self.t)
+
+    def intersection_area_with(self, other: "BoundingBox") -> float:
+        # Calculate intersection coordinates
+        left = max(self.l, other.l)
+        top = max(self.t, other.t)
+        right = min(self.r, other.r)
+        bottom = min(self.b, other.b)
+
+        # Calculate intersection dimensions
+        width = right - left
+        height = bottom - top
+
+        # If the bounding boxes do not overlap, width or height will be negative
+        if width <= 0 or height <= 0:
+            return 0.0
+
+        return width * height
+
+    def to_bottom_left_origin(self, page_height) -> "BoundingBox":
+        if self.coord_origin == CoordOrigin.BOTTOMLEFT:
+            return self
+        elif self.coord_origin == CoordOrigin.TOPLEFT:
+            return BoundingBox(
+                l=self.l,
+                r=self.r,
+                t=page_height - self.t,
+                b=page_height - self.b,
+                coord_origin=CoordOrigin.BOTTOMLEFT,
+            )
+
+    def to_top_left_origin(self, page_height):
+        if self.coord_origin == CoordOrigin.TOPLEFT:
+            return self
+        elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
+            return BoundingBox(
+                l=self.l,
+                r=self.r,
+                t=page_height - self.t,  # self.b
+                b=page_height - self.b,  # self.t
+                coord_origin=CoordOrigin.TOPLEFT,
+            )
diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
index 427605c..146616b 100644
--- a/docling_core/types/doc/document.py
+++ b/docling_core/types/doc/document.py
@@ -1,791 +1,93 @@
-#
-# Copyright IBM Corp. 2024 - 2024
-# SPDX-License-Identifier: MIT
-#
+from typing import Any, Dict, List, Optional, Tuple, Union
 
-"""Models for the Docling Document data type."""
+from pydantic import AnyUrl, BaseModel, Field
 
-from datetime import datetime
-from enum import Enum
-from typing import Generic, Optional, Tuple, Union
+from docling_core.types.doc.base import BoundingBox, Size
 
-from pydantic import (
-    AnyHttpUrl,
-    BaseModel,
-    Field,
-    NonNegativeInt,
-    StrictStr,
-    model_validator,
-)
-from tabulate import tabulate
 
-from docling_core.search.mapping import es_field
-from docling_core.types.base import (
-    Acquisition,
-    CollectionDocumentInfo,
-    CollectionNameTypeT,
-    DescriptionAdvancedT,
-    DescriptionAnalyticsT,
-    FileInfoObject,
-    Identifier,
-    IdentifierTypeT,
-    LanguageT,
-    Log,
-)
-from docling_core.types.doc.base import (
-    BaseCell,
-    BaseText,
-    BitmapObject,
-    Figure,
-    PageDimensions,
-    PageReference,
-    Ref,
-    S3Data,
-    Table,
-)
-from docling_core.utils.alias import AliasModel
+class FigureData(BaseModel):  # TBD: empty placeholder, no fields declared yet
+    pass
 
 
-class CCSFileInfoDescription(BaseModel, extra="forbid"):
-    """File info description."""
+class TableData(BaseModel):  # TBD: empty placeholder, no fields declared yet
+    pass
 
-    author: Optional[list[StrictStr]] = None
-    keywords: Optional[str] = None
-    subject: Optional[str] = None
-    title: Optional[StrictStr] = None
-    creation_date: Optional[str] = None  # datetime
 
+class RefItem(BaseModel):  # reference to a document item, keyed as "$ref"
+    cref: str = Field(alias="$ref")
 
-class CCSFileInfoObject(FileInfoObject, extra="forbid"):
-    """File info object."""
+    def resolve(self, doc: "DoclingDocument"):
+        # cref must split on "/" into exactly three parts: prefix, attribute, index
+        _, path, index_str = self.cref.split("/")
+        index = int(index_str)
+        return getattr(doc, path)[index]  # look up the list on doc, then index it
 
-    num_pages: Optional[int] = Field(default=None, alias="#-pages")
 
-    collection_name: Optional[str] = Field(
-        default=None,
-        alias="collection-name",
-        json_schema_extra=es_field(type="keyword", ignore_above=8191),
-    )
-    description: Optional[CCSFileInfoDescription] = Field(
-        default=None, json_schema_extra=es_field(suppress=True)
-    )
-    page_hashes: Optional[list[PageReference]] = Field(
-        default=None, alias="page-hashes"
-    )
+class ImageRef(BaseModel):  # image descriptor: format, dpi, size and uri
+    format: str  # png, etc.
+    dpi: int  # presumably resolution in dots per inch (TODO confirm)
+    size: Size
+    uri: AnyUrl
 
 
-class Affiliation(BaseModel, extra="forbid"):
-    """Affiliation."""
+class ProvenanceItem(BaseModel):  # one entry of DocItem.prov: page, box, char span
+    page_no: int
+    bbox: BoundingBox
+    charspan: Tuple[int, int]  # presumably (start, end) offsets (TODO confirm)
 
-    name: str = Field(
-        ...,
-        json_schema_extra=es_field(
-            fields={
-                "lower": {
-                    "normalizer": "lowercase_asciifolding",
-                    "type": "keyword",
-                    "ignore_above": 8191,
-                },
-                "keyword": {"type": "keyword", "ignore_above": 8191},
-            },
-        ),
-    )
-    id: Optional[str] = Field(
-        default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
-    )
-    source: Optional[str] = Field(
-        default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
-    )
 
+class DocItem(BaseModel):  # common base of the item models declared in this file
+    dloc: str  # format spec ({document_hash}{json-path})
+    hash: int
+    label: str
+    parent: Optional[RefItem]  # no default declared; None is an accepted value
+    children: List[RefItem]
+    prov: List[ProvenanceItem]
 
-class Author(BaseModel, extra="forbid"):
-    """Author."""
 
-    name: str = Field(
-        ...,
-        json_schema_extra=es_field(
-            type="text",
-            fields={
-                "lower": {
-                    "normalizer": "lowercase_asciifolding",
-                    "type": "keyword",
-                    "ignore_above": 8191,
-                },
-                "keyword": {"type": "keyword", "ignore_above": 8191},
-            },
-        ),
-    )
-    id: Optional[str] = Field(
-        default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
-    )
-    source: Optional[str] = Field(
-        default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
-    )
-    affiliations: Optional[list[Affiliation]] = None
+class TextItem(DocItem):  # DocItem carrying both raw and sanitized text
+    orig: str  # untreated representation
+    text: str  # sanitized representation
 
 
-class Publication(BaseModel, Generic[IdentifierTypeT], extra="forbid"):
-    """Publication details of a journal or venue."""
+class FloatingItem(DocItem):  # DocItem plus caption/references/footnotes/data/image
+    caption: Optional[Union[RefItem, TextItem]]  # a reference or an inline TextItem
+    references: List[Union[RefItem, TextItem]]
+    footnotes: List[Union[RefItem, TextItem]]
+    data: Any  # untyped payload
+    image: Optional[ImageRef]  # no default declared; None is an accepted value
 
-    identifiers: Optional[list[Identifier[IdentifierTypeT]]] = Field(
-        default=None,
-        description="Unique identifiers of a publication venue.",
-    )
-    name: StrictStr = Field(
-        json_schema_extra=es_field(type="keyword", ignore_above=8191),
-        description="Name of the publication.",
-    )
-    alternate_names: Optional[list[StrictStr]] = Field(
-        default=None,
-        json_schema_extra=es_field(type="text"),
-        title="Alternate Names",
-        description="Other names or abbreviations of this publication.",
-    )
-    type: Optional[list[StrictStr]] = Field(
-        default=None,
-        json_schema_extra=es_field(type="keyword", ignore_above=8191),
-        description="Type of publication (journal article, conference, review,...).",
-    )
-    pages: Optional[StrictStr] = Field(
-        default=None,
-        json_schema_extra=es_field(type="text"),
-        description="Page range in the publication.",
-    )
-    issue: Optional[StrictStr] = Field(
-        default=None,
-        json_schema_extra=es_field(type="keyword", ignore_above=8191),
-        description="Publication issue (issue number).",
-    )
-    volume: Optional[StrictStr] = Field(
-        default=None,
-        json_schema_extra=es_field(type="keyword", ignore_above=8191),
-        description="Publication volume.",
-    )
-    url: Optional[AnyHttpUrl] = Field(
-        default=None,
-        json_schema_extra=es_field(type="keyword", ignore_above=8191),
-        description="URL on the publication site.",
-    )
 
+class FigureItem(DocItem):  # NOTE(review): extends DocItem, not FloatingItem - intended?
+    data: FigureData
 
-class DescriptionLicense(BaseModel, extra="forbid"):
-    """Licence in document description."""
 
-    code: Optional[StrictStr] = Field(
-        default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
-    )
-    text: Optional[StrictStr] = None
+class TableItem(DocItem):  # NOTE(review): extends DocItem, not FloatingItem - intended?
+    data: TableData
 
 
-class CCSDocumentDescription(
-    AliasModel,
-    Generic[
-        DescriptionAdvancedT,
-        DescriptionAnalyticsT,
-        IdentifierTypeT,
-        LanguageT,
-        CollectionNameTypeT,
-    ],
-):
-    """Description in document."""
+class KeyValueItem(DocItem):  # adds no fields beyond DocItem
+    pass
 
-    title: Optional[StrictStr] = None
-    abstract: Optional[list[StrictStr]] = None
-    authors: Optional[list[Author]] = None
-    affiliations: Optional[list[Affiliation]] = None
-    subjects: Optional[list[str]] = Field(
-        default=None,
-        json_schema_extra=es_field(
-            fields={"keyword": {"ignore_above": 8191, "type": "keyword"}}
-        ),
-    )
-    keywords: Optional[list[str]] = Field(
-        default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
-    )
-    publication_date: Optional[datetime] = None
-    languages: Optional[list[LanguageT]] = Field(
-        default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
-    )
-    license_: Optional[DescriptionLicense] = Field(default=None, alias="license")
-    publishers: Optional[list[StrictStr]] = Field(
-        default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
-    )
-    url_refs: Optional[list[str]] = Field(
-        default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
-    )
-    references: Optional[list[Identifier[IdentifierTypeT]]] = None
-    publication: Optional[list[Publication]] = Field(
-        default=None, description="List of publication journals or venues."
-    )
-    reference_count: Optional[NonNegativeInt] = Field(
-        default=None,
-        title="Reference Count",
-        description="Total number of documents referenced by this document.",
-        json_schema_extra=es_field(type="integer"),
-    )
-    citation_count: Optional[NonNegativeInt] = Field(
-        default=None,
-        title="Citation Count",
-        description=(
-            "Total number of citations that this document has received (number "
-            "of documents in whose bibliography this document appears)."
-        ),
-        json_schema_extra=es_field(type="integer"),
-    )
-    citation_date: Optional[datetime] = Field(
-        default=None,
-        title="Citation Count Date",
-        description="Last update date of the citation count.",
-    )
-    advanced: Optional[DescriptionAdvancedT] = None
-    analytics: Optional[DescriptionAnalyticsT] = None
-    logs: list[Log]
-    collection: Optional[CollectionDocumentInfo[CollectionNameTypeT]] = Field(
-        default=None, description="The collection information of this document."
-    )
-    acquisition: Optional[Acquisition] = Field(
-        default=None,
-        description=(
-            "Information on how the document was obtained, for data governance"
-            " purposes."
-        ),
-    )
+ContentItem = Union[TextItem, FigureItem, TableItem, KeyValueItem]  # four item models
 
 
-class MinimalDocument(
-    AliasModel,
-    Generic[
-        DescriptionAdvancedT,
-        DescriptionAnalyticsT,
-        IdentifierTypeT,
-        LanguageT,
-        CollectionNameTypeT,
-    ],
-):
-    """Minimal model for a document."""
+class DocumentContent(BaseModel):  # lists shared by PageItem and DoclingDocument
+    furniture: List[RefItem] = []  # holds references, not the items themselves
+    body: List[RefItem] = []  # holds references, not the items themselves
+    texts: List[TextItem] = []  # item lists; RefItem.resolve indexes lists by name
+    figures: List[FigureItem] = []
+    tables: List[TableItem] = []
+    key_value_items: List[KeyValueItem] = []
 
-    name: StrictStr = Field(alias="_name")
-    obj_type: Optional[StrictStr] = Field("document", alias="type")
-    description: CCSDocumentDescription[
-        DescriptionAdvancedT,
-        DescriptionAnalyticsT,
-        IdentifierTypeT,
-        LanguageT,
-        CollectionNameTypeT,
-    ]
-    file_info: FileInfoObject = Field(alias="file-info")
-    main_text: Optional[list[Union[Ref, BaseText]]] = Field(
-        default=None, alias="main-text"
-    )
-    figures: Optional[list[Figure]] = None
-    tables: Optional[list[Table]] = None
+class PageItem(DocumentContent):  # DocumentContent lists plus per-page fields
+    hash: str  # page hash
+    size: Size
+    image: Optional[ImageRef]  # no default declared; None is an accepted value
+    num_elements: int
 
 
-class CCSDocument(
-    MinimalDocument,
-    Generic[
-        DescriptionAdvancedT,
-        DescriptionAnalyticsT,
-        IdentifierTypeT,
-        LanguageT,
-        CollectionNameTypeT,
-    ],
-):
-    """Model for a CCS-generated document."""
-
-    obj_type: Optional[StrictStr] = Field("pdf-document", alias="type")
-    bitmaps: Optional[list[BitmapObject]] = None
-    equations: Optional[list[BaseCell]] = None
-    footnotes: Optional[list[BaseText]] = None
-    file_info: CCSFileInfoObject = Field(alias="file-info")
-    main_text: Optional[list[Union[Ref, BaseText]]] = Field(
-        default=None,
-        alias="main-text",
-    )
-    page_dimensions: Optional[list[PageDimensions]] = Field(
-        default=None, alias="page-dimensions"
-    )
-    page_footers: Optional[list[BaseText]] = Field(default=None, alias="page-footers")
-    page_headers: Optional[list[BaseText]] = Field(default=None, alias="page-headers")
-    s3_data: Optional[S3Data] = Field(default=None, alias="_s3_data")
-
-    @model_validator(mode="before")
-    @classmethod
-    def from_dict(cls, data):
-        """Validates and fixes the input data."""
-        if not isinstance(data, dict):
-            return data
-        description_collection = data["description"].get("collection")
-        if not description_collection:
-            data["description"].setdefault("collection", {})
-
-        data["description"]["collection"].setdefault("type", "Document")
-        logs = data["description"].get("logs")
-        if not logs:
-            data["description"].setdefault("logs", [])
-
-        abstract = data["description"].get("abstract")
-        if abstract is not None and not isinstance(abstract, list):
-            if isinstance(abstract, str):
-                data["description"]["abstract"] = [abstract]
-            else:
-                data["description"].pop("abstract")
-
-        for key in ["affiliations", "authors"]:
-            descr = data["description"].get(key)
-            if descr is not None and not isinstance(descr, list):
-                if isinstance(descr, dict):
-                    data["description"][key] = [descr]
-                else:
-                    data["description"].pop(key)
-
-        if data.get("main-text"):
-            for item in data["main-text"]:
-                if ref := item.pop("__ref", None):
-                    item["$ref"] = ref
-
-        return data
-
-
-class DocumentToken(Enum):
-    """Class to represent an LLM friendly representation of a Document."""
-
-    BEG_DOCUMENT = "<document>"
-    END_DOCUMENT = "</document>"
-
-    BEG_TITLE = "<title>"
-    END_TITLE = "</title>"
-
-    BEG_ABSTRACT = "<abstract>"
-    END_ABSTRACT = "</abstract>"
-
-    BEG_DOI = "<doi>"
-    END_DOI = "</doi>"
-    BEG_DATE = "<date>"
-    END_DATE = "</date>"
-
-    BEG_AUTHORS = "<authors>"
-    END_AUTHORS = "</authors>"
-    BEG_AUTHOR = "<author>"
-    END_AUTHOR = "</author>"
-
-    BEG_AFFILIATIONS = "<affiliations>"
-    END_AFFILIATIONS = "</affiliations>"
-    BEG_AFFILIATION = "<affiliation>"
-    END_AFFILIATION = "</affiliation>"
-
-    BEG_HEADER = "<section-header>"
-    END_HEADER = "</section-header>"
-    BEG_TEXT = "<text>"
-    END_TEXT = "</text>"
-    BEG_PARAGRAPH = "<paragraph>"
-    END_PARAGRAPH = "</paragraph>"
-    BEG_TABLE = "<table>"
-    END_TABLE = "</table>"
-    BEG_FIGURE = "<figure>"
-    END_FIGURE = "</figure>"
-    BEG_CAPTION = "<caption>"
-    END_CAPTION = "</caption>"
-    BEG_EQUATION = "<equation>"
-    END_EQUATION = "</equation>"
-    BEG_LIST = "<list>"
-    END_LIST = "</list>"
-    BEG_LISTITEM = "<list-item>"
-    END_LISTITEM = "</list-item>"
-
-    BEG_LOCATION = "<location>"
-    END_LOCATION = "</location>"
-    BEG_GROUP = "<group>"
-    END_GROUP = "</group>"
-
-    @classmethod
-    def get_special_tokens(
-        cls,
-        max_rows: int = 100,
-        max_cols: int = 100,
-        max_pages: int = 1000,
-        page_dimension: Tuple[int, int] = (100, 100),
-    ):
-        """Function to get all special document tokens."""
-        special_tokens = [token.value for token in cls]
-
-        # Adding dynamically generated row and col tokens
-        for i in range(0, max_rows):
-            special_tokens += [f"<row_{i}>", f"</row_{i}>"]
-
-        for i in range(0, max_cols):
-            special_tokens += [f"<col_{i}>", f"</col_{i}>"]
-
-        for i in range(6):
-            special_tokens += [f"<section-header-{i}>", f"</section-header-{i}>"]
-
-        # Adding dynamically generated page-tokens
-        for i in range(0, max_pages):
-            special_tokens.append(f"<page_{i}>")
-
-        # Adding dynamically generated location-tokens
-        for i in range(0, max(page_dimension[0], page_dimension[1])):
-            special_tokens.append(f"<loc_{i}>")
-
-        return special_tokens
-
-    @staticmethod
-    def get_page_token(page: int):
-        """Function to get page tokens."""
-        return f"<page_{page}>"
-
-    @staticmethod
-    def get_location_token(val: float, rnorm: int = 100):
-        """Function to get location tokens."""
-        val_ = round(rnorm * val)
-
-        if val_ < 0:
-            return "<loc_0>"
-
-        if val_ > rnorm:
-            return f"<loc_{rnorm}>"
-
-        return f"<loc_{val_}>"
-
-
-class ExportedCCSDocument(
-    MinimalDocument,
-    Generic[
-        DescriptionAdvancedT,
-        DescriptionAnalyticsT,
-        IdentifierTypeT,
-        LanguageT,
-        CollectionNameTypeT,
-    ],
-):
-    """Document model for Docling."""
-
-    obj_type: Optional[StrictStr] = Field(
-        "pdf-document",
-        alias="type",
-        json_schema_extra=es_field(type="keyword", ignore_above=8191),
-    )
-    bitmaps: Optional[list[BitmapObject]] = None
-    equations: Optional[list[BaseCell]] = None
-    footnotes: Optional[list[BaseText]] = None
-    description: CCSDocumentDescription[
-        DescriptionAdvancedT,
-        DescriptionAnalyticsT,
-        IdentifierTypeT,
-        LanguageT,
-        CollectionNameTypeT,
-    ]
-    file_info: CCSFileInfoObject = Field(alias="file-info")
-    main_text: Optional[list[Union[Ref, BaseText]]] = Field(
-        default=None, alias="main-text"
-    )
-    page_dimensions: Optional[list[PageDimensions]] = Field(
-        default=None, alias="page-dimensions"
-    )
-    page_footers: Optional[list[BaseText]] = Field(default=None, alias="page-footers")
-    page_headers: Optional[list[BaseText]] = Field(default=None, alias="page-headers")
-    s3_data: Optional[S3Data] = Field(default=None, alias="_s3_data")
-    identifiers: Optional[list[Identifier[IdentifierTypeT]]] = None
-
-    @model_validator(mode="before")
-    @classmethod
-    def from_dict(cls, data):
-        """Fix ref in main-text."""
-        if not isinstance(data, dict):
-            return data
-        if data.get("main-text"):
-            for item in data["main-text"]:
-                if ref := item.pop("__ref", None):
-                    item["$ref"] = ref
-
-        return data
-
-    def _resolve_ref(self, item: Ref) -> Optional[Union[BaseCell, BaseText]]:
-        """Return the resolved reference.
-
-        Resolved the Ref object within the document.
-        If the object is not found, None is returned.
-        """
-        result: Optional[Union[BaseCell, BaseText]] = None
-
-        # NOTE: currently only resolves refs explicitely, such that we can make
-        # assumptions on ref parts
-        if item.obj_type == "table" and self.tables:
-            parts = item.ref.split("/")
-            result = self.tables[int(parts[2])]
-        elif item.obj_type == "figure" and self.figures:
-            parts = item.ref.split("/")
-            result = self.figures[int(parts[2])]
-        elif item.obj_type == "equation" and self.equations:
-            parts = item.ref.split("/")
-            result = self.equations[int(parts[2])]
-        elif item.obj_type == "footnote" and self.footnotes:
-            parts = item.ref.split("/")
-            result = self.footnotes[int(parts[2])]
-
-        return result
-
-    def export_to_markdown(
-        self,
-        delim: str = "\n\n",
-        main_text_start: int = 0,
-        main_text_stop: Optional[int] = None,
-        main_text_labels: list[str] = [
-            "title",
-            "subtitle-level-1",
-            "paragraph",
-            "caption",
-            "table",
-        ],
-        strict_text: bool = False,
-    ) -> str:
-        r"""Serialize to Markdown.
-
-        Operates on a slice of the document's main_text as defined through arguments
-        main_text_start and main_text_stop; defaulting to the whole main_text.
-
-        Args:
-            delim (str, optional): Delimiter to use when concatenating the various
-                Markdown parts. Defaults to "\n\n".
-            main_text_start (int, optional): Main-text slicing start index (inclusive).
-                Defaults to 0.
-            main_text_end (Optional[int], optional): Main-text slicing stop index
-                (exclusive). Defaults to None.
-
-        Returns:
-            str: The exported Markdown representation.
-        """
-        has_title = False
-        prev_text = ""
-        md_texts: list[str] = []
-
-        if self.main_text is not None:
-            for orig_item in self.main_text[main_text_start:main_text_stop]:
-                markdown_text = ""
-
-                item = (
-                    self._resolve_ref(orig_item)
-                    if isinstance(orig_item, Ref)
-                    else orig_item
-                )
-                if item is None:
-                    continue
-
-                item_type = item.obj_type
-                if isinstance(item, BaseText) and item_type in main_text_labels:
-                    text = item.text
-
-                    # ignore repeated text
-                    if prev_text == text:
-                        continue
-                    else:
-                        prev_text = text
-
-                    # first title match
-                    if item_type == "title" and not has_title:
-                        if strict_text:
-                            markdown_text = f"{text}"
-                        else:
-                            markdown_text = f"# {text}"
-                        has_title = True
-
-                    # secondary titles
-                    elif item_type in {"title", "subtitle-level-1"} or (
-                        has_title and item_type == "title"
-                    ):
-                        if strict_text:
-                            markdown_text = f"{text}"
-                        else:
-                            markdown_text = f"## {text}"
-
-                    # normal text
-                    else:
-                        markdown_text = text
-
-                elif (
-                    isinstance(item, Table)
-                    and item.data
-                    and item_type in main_text_labels
-                    and not strict_text
-                ):
-                    table = []
-                    for row in item.data:
-                        tmp = []
-                        for col in row:
-                            tmp.append(col.text)
-                        table.append(tmp)
-
-                    if len(table) > 1 and len(table[0]) > 0:
-                        try:
-                            md_table = tabulate(
-                                table[1:], headers=table[0], tablefmt="github"
-                            )
-                        except ValueError:
-                            md_table = tabulate(
-                                table[1:],
-                                headers=table[0],
-                                tablefmt="github",
-                                disable_numparse=True,
-                            )
-
-                        markdown_text = md_table
-
-                if markdown_text:
-                    md_texts.append(markdown_text)
-
-        result = delim.join(md_texts)
-        return result
-
-    def export_to_document_tokens(
-        self,
-        delim: str = "\n\n",
-        main_text_start: int = 0,
-        main_text_stop: Optional[int] = None,
-        main_text_labels: list[str] = [
-            "title",
-            "subtitle-level-1",
-            "paragraph",
-            "caption",
-            "table",
-            "figure",
-        ],
-        page_tagging: bool = True,
-        location_tagging: bool = True,
-        location_dimensions: Tuple[int, int] = (100, 100),
-        add_new_line: bool = True,
-    ) -> str:
-        r"""Exports the document content to an DocumentToken format.
-
-        Operates on a slice of the document's main_text as defined through arguments
-        main_text_start and main_text_stop; defaulting to the whole main_text.
-
-        Args:
-            delim (str, optional): The delimiter used to separate text blocks in the
-                exported XML. Default is two newline characters ("\n\n").
-            main_text_start (int, optional): The starting index of the main text to
-                be included in the XML. Default is 0 (the beginning of the text).
-            main_text_stop (Optional[int], optional): The stopping index of the main
-                text. If set to None, the export includes text up to the end.
-                Default is None.
-            main_text_labels (list[str], optional): A list of text labels that
-                categorize the different sections of the document (e.g., "title",
-                "subtitle-level-1", "paragraph", "caption"). Default labels are
-                "title", "subtitle-level-1", "paragraph", and "caption".
-            location_tagging (bool, optional): Determines whether to include
-                location-based tagging in the XML. If True, the exported XML will
-                contain information about the locations of the text elements.
-                Default is True.
-            location_dimensions (Tuple[int, int], optional): Specifies the dimensions
-                (width and height) for the location tagging, if enabled.
-                Default is [100, 100].
-            add_new_line (bool, optional): Whether to add new line characters after
-                each text block. If True, a new line is added after each block of
-                text in the XML. Default is True.
-
-        Returns:
-            str: The content of the document formatted as an XML string.
-        """
-        xml_str = DocumentToken.BEG_DOCUMENT.value
-
-        new_line = ""
-        if add_new_line:
-            new_line = "\n"
-
-        if self.main_text is not None:
-            for orig_item in self.main_text[main_text_start:main_text_stop]:
-
-                item = (
-                    self._resolve_ref(orig_item)
-                    if isinstance(orig_item, Ref)
-                    else orig_item
-                )
-
-                if item is None:
-                    continue
-
-                prov = item.prov
-
-                loc_str = ""  # default is zero
-                if (
-                    location_tagging
-                    and self.page_dimensions is not None
-                    and prov is not None
-                    and len(prov) > 0
-                ):
-
-                    page = prov[0].page
-                    page_dim = self.page_dimensions[page - 1]
-
-                    page_w = float(page_dim.width)
-                    page_h = float(page_dim.height)
-
-                    x0 = float(prov[0].bbox[0]) / float(page_w)
-                    y0 = float(prov[0].bbox[1]) / float(page_h)
-                    x1 = float(prov[0].bbox[2]) / float(page_w)
-                    y1 = float(prov[0].bbox[3]) / float(page_h)
-
-                    page_tok = ""
-                    if page_tagging:
-                        page_tok = DocumentToken.get_page_token(page=page)
-
-                    x0_tok = DocumentToken.get_location_token(
-                        val=min(x0, x1), rnorm=location_dimensions[0]
-                    )
-                    y0_tok = DocumentToken.get_location_token(
-                        val=min(y0, y1), rnorm=location_dimensions[1]
-                    )
-                    x1_tok = DocumentToken.get_location_token(
-                        val=max(x0, x1), rnorm=location_dimensions[0]
-                    )
-                    y1_tok = DocumentToken.get_location_token(
-                        val=max(y0, y1), rnorm=location_dimensions[1]
-                    )
-
-                    # update
-                    loc_str = f"{DocumentToken.BEG_LOCATION.value}"
-                    loc_str += f"{page_tok}"
-                    loc_str += f"{x0_tok}{y0_tok}{x1_tok}{y1_tok}"
-                    loc_str += f"{DocumentToken.END_LOCATION.value}"
-
-                item_type = item.obj_type
-                if isinstance(item, BaseText) and (item_type in main_text_labels):
-                    text = item.text
-
-                    xml_str += f"<{item_type}>{loc_str}{text}</{item_type}>{new_line}"
-
-                elif isinstance(item, Table) and (item_type in main_text_labels):
-
-                    xml_str += f"<{item_type}>{loc_str}"
-
-                    if item.text is not None and len(item.text) > 0:
-                        xml_str += f"{DocumentToken.BEG_CAPTION.value}"
-                        xml_str += (
-                            f"{item.text}{DocumentToken.END_CAPTION.value}{new_line}"
-                        )
-
-                    if item.data is not None and len(item.data) > 0:
-                        for i, row in enumerate(item.data):
-                            xml_str += f"<row_{i}>"
-                            for j, col in enumerate(row):
-                                text = col.text
-                                xml_str += f"<col_{j}>{text}</col_{j}>"
-
-                            xml_str += f"</row_{i}>{new_line}"
-
-                    xml_str += f"</{item_type}>{new_line}"
-
-                elif isinstance(item, Figure) and (item_type in main_text_labels):
-
-                    xml_str += f"<{item_type}>{loc_str}"
-
-                    if item.text is not None and len(item.text) > 0:
-                        xml_str += f"{DocumentToken.BEG_CAPTION.value}"
-                        xml_str += (
-                            f"{item.text}{DocumentToken.END_CAPTION.value}{new_line}"
-                        )
-
-                    xml_str += f"</{item_type}>{new_line}"
-
-        xml_str += DocumentToken.END_DOCUMENT.value
-
-        return xml_str
+class DoclingDocument(DocumentContent):  # DocumentContent lists plus doc-level fields
+    description: Any  # untyped
+    file_info: Any  # untyped
+    pages: Dict[int, PageItem] = {}  # empty as default; int keys presumably page numbers
diff --git a/docling_core/types/legacy/__init__.py b/docling_core/types/legacy/__init__.py
new file mode 100644
index 0000000..79fe213
--- /dev/null
+++ b/docling_core/types/legacy/__init__.py
@@ -0,0 +1,6 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+
+"""Package for models defined by the Document type."""
diff --git a/docling_core/types/legacy/base.py b/docling_core/types/legacy/base.py
new file mode 100644
index 0000000..2f1eeed
--- /dev/null
+++ b/docling_core/types/legacy/base.py
@@ -0,0 +1,196 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+
+"""Define common models across CCS objects."""
+from typing import Annotated, Literal, Optional, Union
+
+from pydantic import BaseModel, Field, PositiveInt, StrictStr
+
+from docling_core.search.mapping import es_field
+from docling_core.utils.alias import AliasModel
+
+CellData = tuple[float, float, float, float, str, str]
+
+CellHeader = tuple[
+    Literal["x0"],
+    Literal["y0"],
+    Literal["x1"],
+    Literal["y1"],
+    Literal["font"],
+    Literal["text"],
+]
+
+BoundingBox = Annotated[list[float], Field(min_length=4, max_length=4)]
+
+Span = Annotated[list[int], Field(min_length=2, max_length=2)]
+
+
+class CellsContainer(BaseModel):
+    """Cell container."""
+
+    data: Optional[list[CellData]] = None
+    header: CellHeader = ("x0", "y0", "x1", "y1", "font", "text")
+
+
+class S3Resource(BaseModel):
+    """Resource in a cloud object storage."""
+
+    mime: str
+    path: str
+    page: Optional[PositiveInt] = None
+
+
+class S3Data(AliasModel):
+    """Data object in a cloud object storage."""
+
+    pdf_document: Optional[list[S3Resource]] = Field(default=None, alias="pdf-document")
+    pdf_pages: Optional[list[S3Resource]] = Field(default=None, alias="pdf-pages")
+    pdf_images: Optional[list[S3Resource]] = Field(default=None, alias="pdf-images")
+    json_document: Optional[S3Resource] = Field(default=None, alias="json-document")
+    json_meta: Optional[S3Resource] = Field(default=None, alias="json-meta")
+    glm_json_document: Optional[S3Resource] = Field(
+        default=None, alias="glm-json-document"
+    )
+    figures: Optional[list[S3Resource]] = None
+
+
+class S3Reference(AliasModel):
+    """References an s3 resource."""
+
+    ref_s3_data: StrictStr = Field(
+        alias="__ref_s3_data", examples=["#/_s3_data/figures/0"]
+    )
+
+
+class Prov(AliasModel):
+    """Provenance."""
+
+    bbox: BoundingBox
+    page: PositiveInt
+    span: Span
+    ref_s3_data: Optional[StrictStr] = Field(
+        default=None, alias="__ref_s3_data", json_schema_extra=es_field(suppress=True)
+    )
+
+
+class BoundingBoxContainer(BaseModel):
+    """Bounding box container."""
+
+    min: BoundingBox
+    max: BoundingBox
+
+
+class BitmapObject(AliasModel):
+    """Bitmap object."""
+
+    obj_type: str = Field(alias="type")
+    bounding_box: BoundingBoxContainer = Field(
+        json_schema_extra=es_field(suppress=True)
+    )
+    prov: Prov
+
+
+class PageDimensions(BaseModel):
+    """Page dimensions."""
+
+    height: float
+    page: PositiveInt
+    width: float
+
+
+class TableCell(AliasModel):
+    """Table cell."""
+
+    bbox: Optional[BoundingBox] = None
+    spans: Optional[list[Span]] = None
+    text: str = Field(json_schema_extra=es_field(term_vector="with_positions_offsets"))
+    obj_type: str = Field(alias="type")
+
+
+class GlmTableCell(TableCell):
+    """Glm Table cell."""
+
+    col: Optional[int] = Field(default=None, json_schema_extra=es_field(suppress=True))
+    col_header: bool = Field(
+        default=False, alias="col-header", json_schema_extra=es_field(suppress=True)
+    )
+    col_span: Optional[Span] = Field(
+        default=None, alias="col-span", json_schema_extra=es_field(suppress=True)
+    )
+    row: Optional[int] = Field(default=None, json_schema_extra=es_field(suppress=True))
+    row_header: bool = Field(
+        default=False, alias="row-header", json_schema_extra=es_field(suppress=True)
+    )
+    row_span: Optional[Span] = Field(
+        default=None, alias="row-span", json_schema_extra=es_field(suppress=True)
+    )
+
+
+class BaseCell(AliasModel):
+    """Base cell."""
+
+    # FIXME: we need to check why we have bounding_box (this should be in prov)
+    bounding_box: Optional[BoundingBoxContainer] = Field(
+        default=None, alias="bounding-box", json_schema_extra=es_field(suppress=True)
+    )
+    prov: Optional[list[Prov]] = None
+    text: Optional[str] = Field(
+        default=None, json_schema_extra=es_field(term_vector="with_positions_offsets")
+    )
+    obj_type: str = Field(
+        alias="type", json_schema_extra=es_field(type="keyword", ignore_above=8191)
+    )
+
+
+class Table(BaseCell):
+    """Table."""
+
+    num_cols: int = Field(alias="#-cols")
+    num_rows: int = Field(alias="#-rows")
+    data: Optional[list[list[Union[GlmTableCell, TableCell]]]] = None
+    model: Optional[str] = None
+
+
+# FIXME: let's add some figure specific data-types later
+class Figure(BaseCell):
+    """Figure."""
+
+
+class BaseText(AliasModel):
+    """Base model for text objects."""
+
+    text: StrictStr = Field(
+        json_schema_extra=es_field(term_vector="with_positions_offsets")
+    )
+    obj_type: StrictStr = Field(
+        alias="type", json_schema_extra=es_field(type="keyword", ignore_above=8191)
+    )
+    name: Optional[StrictStr] = Field(
+        default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
+    )
+    font: Optional[str] = None
+    prov: Optional[list[Prov]] = None
+
+
+class ListItem(BaseText):
+    """List item."""
+
+    identifier: str
+
+
+class Ref(AliasModel):
+    """Reference."""
+
+    name: str
+    obj_type: str = Field(alias="type")
+    ref: str = Field(alias="$ref")
+
+
+class PageReference(BaseModel):
+    """Page reference."""
+
+    hash: str = Field(json_schema_extra=es_field(type="keyword", ignore_above=8191))
+    model: str = Field(json_schema_extra=es_field(suppress=True))
+    page: PositiveInt = Field(json_schema_extra=es_field(type="short"))
diff --git a/docling_core/types/doc/doc_ann.py b/docling_core/types/legacy/doc_ann.py
similarity index 95%
rename from docling_core/types/doc/doc_ann.py
rename to docling_core/types/legacy/doc_ann.py
index f836615..974ea12 100644
--- a/docling_core/types/doc/doc_ann.py
+++ b/docling_core/types/legacy/doc_ann.py
@@ -8,7 +8,7 @@
 
 from pydantic import BaseModel
 
-from docling_core.types.doc.base import BoundingBox
+from docling_core.types.legacy.base import BoundingBox
 
 AnnotationReport = Any  # TODO
 
diff --git a/docling_core/types/doc/doc_ocr.py b/docling_core/types/legacy/doc_ocr.py
similarity index 96%
rename from docling_core/types/doc/doc_ocr.py
rename to docling_core/types/legacy/doc_ocr.py
index 875c0d1..656f54a 100644
--- a/docling_core/types/doc/doc_ocr.py
+++ b/docling_core/types/legacy/doc_ocr.py
@@ -8,7 +8,7 @@
 
 from pydantic import BaseModel, Field
 
-from docling_core.types.doc.base import BoundingBox
+from docling_core.types.legacy.base import BoundingBox
 from docling_core.utils.alias import AliasModel
 
 CoordsOrder = Literal["x1", "y1", "x2", "y2"]
diff --git a/docling_core/types/doc/doc_raw.py b/docling_core/types/legacy/doc_raw.py
similarity index 98%
rename from docling_core/types/doc/doc_raw.py
rename to docling_core/types/legacy/doc_raw.py
index 3e6a7e0..d0b4d71 100644
--- a/docling_core/types/doc/doc_raw.py
+++ b/docling_core/types/legacy/doc_raw.py
@@ -9,7 +9,7 @@
 from pydantic import BaseModel, Field
 from typing_extensions import Annotated
 
-from docling_core.types.doc.base import BoundingBox
+from docling_core.types.legacy.base import BoundingBox
 from docling_core.utils.alias import AliasModel
 
 FontDifferences = dict[str, Any]
diff --git a/docling_core/types/legacy/document.py b/docling_core/types/legacy/document.py
new file mode 100644
index 0000000..bccaf4b
--- /dev/null
+++ b/docling_core/types/legacy/document.py
@@ -0,0 +1,791 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+
+"""Models for the Docling Document data type."""
+
+from datetime import datetime
+from enum import Enum
+from typing import Generic, Optional, Tuple, Union
+
+from pydantic import (
+    AnyHttpUrl,
+    BaseModel,
+    Field,
+    NonNegativeInt,
+    StrictStr,
+    model_validator,
+)
+from tabulate import tabulate
+
+from docling_core.search.mapping import es_field
+from docling_core.types.base import (
+    Acquisition,
+    CollectionDocumentInfo,
+    CollectionNameTypeT,
+    DescriptionAdvancedT,
+    DescriptionAnalyticsT,
+    FileInfoObject,
+    Identifier,
+    IdentifierTypeT,
+    LanguageT,
+    Log,
+)
+from docling_core.types.legacy.base import (
+    BaseCell,
+    BaseText,
+    BitmapObject,
+    Figure,
+    PageDimensions,
+    PageReference,
+    Ref,
+    S3Data,
+    Table,
+)
+from docling_core.utils.alias import AliasModel
+
+
+class CCSFileInfoDescription(BaseModel, extra="forbid"):
+    """File info description."""
+
+    author: Optional[list[StrictStr]] = None
+    keywords: Optional[str] = None
+    subject: Optional[str] = None
+    title: Optional[StrictStr] = None
+    creation_date: Optional[str] = None  # datetime
+
+
+class CCSFileInfoObject(FileInfoObject, extra="forbid"):
+    """File info object."""
+
+    num_pages: Optional[int] = Field(default=None, alias="#-pages")
+
+    collection_name: Optional[str] = Field(
+        default=None,
+        alias="collection-name",
+        json_schema_extra=es_field(type="keyword", ignore_above=8191),
+    )
+    description: Optional[CCSFileInfoDescription] = Field(
+        default=None, json_schema_extra=es_field(suppress=True)
+    )
+    page_hashes: Optional[list[PageReference]] = Field(
+        default=None, alias="page-hashes"
+    )
+
+
+class Affiliation(BaseModel, extra="forbid"):
+    """Affiliation."""
+
+    name: str = Field(
+        ...,
+        json_schema_extra=es_field(
+            fields={
+                "lower": {
+                    "normalizer": "lowercase_asciifolding",
+                    "type": "keyword",
+                    "ignore_above": 8191,
+                },
+                "keyword": {"type": "keyword", "ignore_above": 8191},
+            },
+        ),
+    )
+    id: Optional[str] = Field(
+        default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
+    )
+    source: Optional[str] = Field(
+        default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
+    )
+
+
+class Author(BaseModel, extra="forbid"):
+    """Author."""
+
+    name: str = Field(
+        ...,
+        json_schema_extra=es_field(
+            type="text",
+            fields={
+                "lower": {
+                    "normalizer": "lowercase_asciifolding",
+                    "type": "keyword",
+                    "ignore_above": 8191,
+                },
+                "keyword": {"type": "keyword", "ignore_above": 8191},
+            },
+        ),
+    )
+    id: Optional[str] = Field(
+        default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
+    )
+    source: Optional[str] = Field(
+        default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
+    )
+    affiliations: Optional[list[Affiliation]] = None
+
+
+class Publication(BaseModel, Generic[IdentifierTypeT], extra="forbid"):
+    """Publication details of a journal or venue."""
+
+    identifiers: Optional[list[Identifier[IdentifierTypeT]]] = Field(
+        default=None,
+        description="Unique identifiers of a publication venue.",
+    )
+    name: StrictStr = Field(
+        json_schema_extra=es_field(type="keyword", ignore_above=8191),
+        description="Name of the publication.",
+    )
+    alternate_names: Optional[list[StrictStr]] = Field(
+        default=None,
+        json_schema_extra=es_field(type="text"),
+        title="Alternate Names",
+        description="Other names or abbreviations of this publication.",
+    )
+    type: Optional[list[StrictStr]] = Field(
+        default=None,
+        json_schema_extra=es_field(type="keyword", ignore_above=8191),
+        description="Type of publication (journal article, conference, review,...).",
+    )
+    pages: Optional[StrictStr] = Field(
+        default=None,
+        json_schema_extra=es_field(type="text"),
+        description="Page range in the publication.",
+    )
+    issue: Optional[StrictStr] = Field(
+        default=None,
+        json_schema_extra=es_field(type="keyword", ignore_above=8191),
+        description="Publication issue (issue number).",
+    )
+    volume: Optional[StrictStr] = Field(
+        default=None,
+        json_schema_extra=es_field(type="keyword", ignore_above=8191),
+        description="Publication volume.",
+    )
+    url: Optional[AnyHttpUrl] = Field(
+        default=None,
+        json_schema_extra=es_field(type="keyword", ignore_above=8191),
+        description="URL on the publication site.",
+    )
+
+
+class DescriptionLicense(BaseModel, extra="forbid"):
+    """Licence in document description."""
+
+    code: Optional[StrictStr] = Field(
+        default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
+    )
+    text: Optional[StrictStr] = None
+
+
+class CCSDocumentDescription(
+    AliasModel,
+    Generic[
+        DescriptionAdvancedT,
+        DescriptionAnalyticsT,
+        IdentifierTypeT,
+        LanguageT,
+        CollectionNameTypeT,
+    ],
+):
+    """Description in document."""
+
+    title: Optional[StrictStr] = None
+    abstract: Optional[list[StrictStr]] = None
+    authors: Optional[list[Author]] = None
+    affiliations: Optional[list[Affiliation]] = None
+    subjects: Optional[list[str]] = Field(
+        default=None,
+        json_schema_extra=es_field(
+            fields={"keyword": {"ignore_above": 8191, "type": "keyword"}}
+        ),
+    )
+    keywords: Optional[list[str]] = Field(
+        default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
+    )
+    publication_date: Optional[datetime] = None
+    languages: Optional[list[LanguageT]] = Field(
+        default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
+    )
+    license_: Optional[DescriptionLicense] = Field(default=None, alias="license")
+    publishers: Optional[list[StrictStr]] = Field(
+        default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
+    )
+    url_refs: Optional[list[str]] = Field(
+        default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
+    )
+    references: Optional[list[Identifier[IdentifierTypeT]]] = None
+    publication: Optional[list[Publication]] = Field(
+        default=None, description="List of publication journals or venues."
+    )
+    reference_count: Optional[NonNegativeInt] = Field(
+        default=None,
+        title="Reference Count",
+        description="Total number of documents referenced by this document.",
+        json_schema_extra=es_field(type="integer"),
+    )
+    citation_count: Optional[NonNegativeInt] = Field(
+        default=None,
+        title="Citation Count",
+        description=(
+            "Total number of citations that this document has received (number "
+            "of documents in whose bibliography this document appears)."
+        ),
+        json_schema_extra=es_field(type="integer"),
+    )
+    citation_date: Optional[datetime] = Field(
+        default=None,
+        title="Citation Count Date",
+        description="Last update date of the citation count.",
+    )
+    advanced: Optional[DescriptionAdvancedT] = None
+    analytics: Optional[DescriptionAnalyticsT] = None
+    logs: list[Log]
+    collection: Optional[CollectionDocumentInfo[CollectionNameTypeT]] = Field(
+        default=None, description="The collection information of this document."
+    )
+    acquisition: Optional[Acquisition] = Field(
+        default=None,
+        description=(
+            "Information on how the document was obtained, for data governance"
+            " purposes."
+        ),
+    )
+
+
+class MinimalDocument(
+    AliasModel,
+    Generic[
+        DescriptionAdvancedT,
+        DescriptionAnalyticsT,
+        IdentifierTypeT,
+        LanguageT,
+        CollectionNameTypeT,
+    ],
+):
+    """Minimal model for a document."""
+
+    name: StrictStr = Field(alias="_name")
+    obj_type: Optional[StrictStr] = Field("document", alias="type")
+    description: CCSDocumentDescription[
+        DescriptionAdvancedT,
+        DescriptionAnalyticsT,
+        IdentifierTypeT,
+        LanguageT,
+        CollectionNameTypeT,
+    ]
+    file_info: FileInfoObject = Field(alias="file-info")
+    main_text: Optional[list[Union[Ref, BaseText]]] = Field(
+        default=None, alias="main-text"
+    )
+    figures: Optional[list[Figure]] = None
+    tables: Optional[list[Table]] = None
+
+
+class CCSDocument(
+    MinimalDocument,
+    Generic[
+        DescriptionAdvancedT,
+        DescriptionAnalyticsT,
+        IdentifierTypeT,
+        LanguageT,
+        CollectionNameTypeT,
+    ],
+):
+    """Model for a CCS-generated document."""
+
+    obj_type: Optional[StrictStr] = Field("pdf-document", alias="type")
+    bitmaps: Optional[list[BitmapObject]] = None
+    equations: Optional[list[BaseCell]] = None
+    footnotes: Optional[list[BaseText]] = None
+    file_info: CCSFileInfoObject = Field(alias="file-info")
+    main_text: Optional[list[Union[Ref, BaseText]]] = Field(
+        default=None,
+        alias="main-text",
+    )
+    page_dimensions: Optional[list[PageDimensions]] = Field(
+        default=None, alias="page-dimensions"
+    )
+    page_footers: Optional[list[BaseText]] = Field(default=None, alias="page-footers")
+    page_headers: Optional[list[BaseText]] = Field(default=None, alias="page-headers")
+    s3_data: Optional[S3Data] = Field(default=None, alias="_s3_data")
+
+    @model_validator(mode="before")
+    @classmethod
+    def from_dict(cls, data):
+        """Validates and fixes the input data."""
+        if not isinstance(data, dict):
+            return data
+        description_collection = data["description"].get("collection")
+        if not description_collection:
+            data["description"]["collection"] = {}
+        # Assigned (not setdefault) so an explicit null value is replaced too.
+        data["description"]["collection"].setdefault("type", "Document")
+        logs = data["description"].get("logs")
+        if not logs:
+            data["description"]["logs"] = []
+        # A bare string abstract becomes a one-item list; other types are dropped.
+        abstract = data["description"].get("abstract")
+        if abstract is not None and not isinstance(abstract, list):
+            if isinstance(abstract, str):
+                data["description"]["abstract"] = [abstract]
+            else:
+                data["description"].pop("abstract")
+        # Same for these keys: a single dict is wrapped, anything else is dropped.
+        for key in ["affiliations", "authors"]:
+            descr = data["description"].get(key)
+            if descr is not None and not isinstance(descr, list):
+                if isinstance(descr, dict):
+                    data["description"][key] = [descr]
+                else:
+                    data["description"].pop(key)
+        # Main-text entries may carry "__ref"; the Ref model reads "$ref".
+        if data.get("main-text"):
+            for item in data["main-text"]:
+                if ref := item.pop("__ref", None):
+                    item["$ref"] = ref
+
+        return data
+
+
+class DocumentToken(Enum):
+    """Class to represent an LLM friendly representation of a Document."""
+
+    BEG_DOCUMENT = "<document>"
+    END_DOCUMENT = "</document>"
+
+    BEG_TITLE = "<title>"
+    END_TITLE = "</title>"
+
+    BEG_ABSTRACT = "<abstract>"
+    END_ABSTRACT = "</abstract>"
+
+    BEG_DOI = "<doi>"
+    END_DOI = "</doi>"
+    BEG_DATE = "<date>"
+    END_DATE = "</date>"
+
+    BEG_AUTHORS = "<authors>"
+    END_AUTHORS = "</authors>"
+    BEG_AUTHOR = "<author>"
+    END_AUTHOR = "</author>"
+
+    BEG_AFFILIATIONS = "<affiliations>"
+    END_AFFILIATIONS = "</affiliations>"
+    BEG_AFFILIATION = "<affiliation>"
+    END_AFFILIATION = "</affiliation>"
+
+    BEG_HEADER = "<section-header>"
+    END_HEADER = "</section-header>"
+    BEG_TEXT = "<text>"
+    END_TEXT = "</text>"
+    BEG_PARAGRAPH = "<paragraph>"
+    END_PARAGRAPH = "</paragraph>"
+    BEG_TABLE = "<table>"
+    END_TABLE = "</table>"
+    BEG_FIGURE = "<figure>"
+    END_FIGURE = "</figure>"
+    BEG_CAPTION = "<caption>"
+    END_CAPTION = "</caption>"
+    BEG_EQUATION = "<equation>"
+    END_EQUATION = "</equation>"
+    BEG_LIST = "<list>"
+    END_LIST = "</list>"
+    BEG_LISTITEM = "<list-item>"
+    END_LISTITEM = "</list-item>"
+
+    BEG_LOCATION = "<location>"
+    END_LOCATION = "</location>"
+    BEG_GROUP = "<group>"
+    END_GROUP = "</group>"
+
+    @classmethod
+    def get_special_tokens(
+        cls,
+        max_rows: int = 100,
+        max_cols: int = 100,
+        max_pages: int = 1000,
+        page_dimension: Tuple[int, int] = (100, 100),
+    ):
+        """Function to get all special document tokens."""
+        special_tokens = [token.value for token in cls]
+
+        # Adding dynamically generated row and col tokens
+        for i in range(0, max_rows):
+            special_tokens += [f"<row_{i}>", f"</row_{i}>"]
+
+        for i in range(0, max_cols):
+            special_tokens += [f"<col_{i}>", f"</col_{i}>"]
+
+        for i in range(6):
+            special_tokens += [f"<section-header-{i}>", f"</section-header-{i}>"]
+
+        # Adding dynamically generated page-tokens
+        for i in range(0, max_pages):
+            special_tokens.append(f"<page_{i}>")
+
+        # Adding dynamically generated location-tokens
+        for i in range(0, max(page_dimension[0], page_dimension[1])):
+            special_tokens.append(f"<loc_{i}>")
+
+        return special_tokens
+
+    @staticmethod
+    def get_page_token(page: int):
+        """Function to get page tokens."""
+        return f"<page_{page}>"
+
+    @staticmethod
+    def get_location_token(val: float, rnorm: int = 100):
+        """Function to get location tokens."""
+        val_ = round(rnorm * val)
+
+        if val_ < 0:
+            return "<loc_0>"
+
+        if val_ > rnorm:
+            return f"<loc_{rnorm}>"
+
+        return f"<loc_{val_}>"
+
+
+class ExportedCCSDocument(
+    MinimalDocument,
+    Generic[
+        DescriptionAdvancedT,
+        DescriptionAnalyticsT,
+        IdentifierTypeT,
+        LanguageT,
+        CollectionNameTypeT,
+    ],
+):
+    """Document model for Docling."""
+
+    obj_type: Optional[StrictStr] = Field(
+        "pdf-document",
+        alias="type",
+        json_schema_extra=es_field(type="keyword", ignore_above=8191),
+    )
+    bitmaps: Optional[list[BitmapObject]] = None
+    equations: Optional[list[BaseCell]] = None
+    footnotes: Optional[list[BaseText]] = None
+    description: CCSDocumentDescription[
+        DescriptionAdvancedT,
+        DescriptionAnalyticsT,
+        IdentifierTypeT,
+        LanguageT,
+        CollectionNameTypeT,
+    ]
+    file_info: CCSFileInfoObject = Field(alias="file-info")
+    main_text: Optional[list[Union[Ref, BaseText]]] = Field(
+        default=None, alias="main-text"
+    )
+    page_dimensions: Optional[list[PageDimensions]] = Field(
+        default=None, alias="page-dimensions"
+    )
+    page_footers: Optional[list[BaseText]] = Field(default=None, alias="page-footers")
+    page_headers: Optional[list[BaseText]] = Field(default=None, alias="page-headers")
+    s3_data: Optional[S3Data] = Field(default=None, alias="_s3_data")
+    identifiers: Optional[list[Identifier[IdentifierTypeT]]] = None
+
+    @model_validator(mode="before")
+    @classmethod
+    def from_dict(cls, data):
+        """Fix ref in main-text."""
+        if not isinstance(data, dict):
+            return data
+        if data.get("main-text"):
+            for item in data["main-text"]:
+                if ref := item.pop("__ref", None):
+                    item["$ref"] = ref
+
+        return data
+
+    def _resolve_ref(self, item: Ref) -> Optional[Union[BaseCell, BaseText]]:
+        """Return the resolved reference.
+
+        Resolves the Ref object within the document.
+        If the object is not found, None is returned.
+        """
+        result: Optional[Union[BaseCell, BaseText]] = None
+
+        # NOTE: currently only resolves refs explicitly, such that we can make
+        # assumptions on ref parts
+        if item.obj_type == "table" and self.tables:
+            parts = item.ref.split("/")
+            result = self.tables[int(parts[2])]
+        elif item.obj_type == "figure" and self.figures:
+            parts = item.ref.split("/")
+            result = self.figures[int(parts[2])]
+        elif item.obj_type == "equation" and self.equations:
+            parts = item.ref.split("/")
+            result = self.equations[int(parts[2])]
+        elif item.obj_type == "footnote" and self.footnotes:
+            parts = item.ref.split("/")
+            result = self.footnotes[int(parts[2])]
+
+        return result
+
+    def export_to_markdown(
+        self,
+        delim: str = "\n\n",
+        main_text_start: int = 0,
+        main_text_stop: Optional[int] = None,
+        main_text_labels: list[str] = [
+            "title",
+            "subtitle-level-1",
+            "paragraph",
+            "caption",
+            "table",
+        ],
+        strict_text: bool = False,
+    ) -> str:
+        r"""Serialize to Markdown.
+
+        Operates on a slice of the document's main_text as defined through arguments
+        main_text_start and main_text_stop; defaulting to the whole main_text.
+
+        Args:
+            delim (str, optional): Delimiter to use when concatenating the various
+                Markdown parts. Defaults to "\n\n".
+            main_text_start (int, optional): Main-text slicing start index (inclusive).
+                Defaults to 0.
+            main_text_stop (Optional[int], optional): Main-text slicing stop index
+                (exclusive). Defaults to None.
+
+        Returns:
+            str: The exported Markdown representation.
+        """
+        has_title = False
+        prev_text = ""
+        md_texts: list[str] = []
+
+        if self.main_text is not None:
+            for orig_item in self.main_text[main_text_start:main_text_stop]:
+                markdown_text = ""
+
+                item = (
+                    self._resolve_ref(orig_item)
+                    if isinstance(orig_item, Ref)
+                    else orig_item
+                )
+                if item is None:
+                    continue
+
+                item_type = item.obj_type
+                if isinstance(item, BaseText) and item_type in main_text_labels:
+                    text = item.text
+
+                    # ignore repeated text
+                    if prev_text == text:
+                        continue
+                    else:
+                        prev_text = text
+
+                    # first title match
+                    if item_type == "title" and not has_title:
+                        if strict_text:
+                            markdown_text = f"{text}"
+                        else:
+                            markdown_text = f"# {text}"
+                        has_title = True
+
+                    # secondary titles
+                    elif item_type in {"title", "subtitle-level-1"} or (
+                        has_title and item_type == "title"
+                    ):
+                        if strict_text:
+                            markdown_text = f"{text}"
+                        else:
+                            markdown_text = f"## {text}"
+
+                    # normal text
+                    else:
+                        markdown_text = text
+
+                elif (
+                    isinstance(item, Table)
+                    and item.data
+                    and item_type in main_text_labels
+                    and not strict_text
+                ):
+                    table = []
+                    for row in item.data:
+                        tmp = []
+                        for col in row:
+                            tmp.append(col.text)
+                        table.append(tmp)
+
+                    if len(table) > 1 and len(table[0]) > 0:
+                        try:
+                            md_table = tabulate(
+                                table[1:], headers=table[0], tablefmt="github"
+                            )
+                        except ValueError:
+                            md_table = tabulate(
+                                table[1:],
+                                headers=table[0],
+                                tablefmt="github",
+                                disable_numparse=True,
+                            )
+
+                        markdown_text = md_table
+
+                if markdown_text:
+                    md_texts.append(markdown_text)
+
+        result = delim.join(md_texts)
+        return result
+
+    def export_to_document_tokens(
+        self,
+        delim: str = "\n\n",
+        main_text_start: int = 0,
+        main_text_stop: Optional[int] = None,
+        main_text_labels: list[str] = [
+            "title",
+            "subtitle-level-1",
+            "paragraph",
+            "caption",
+            "table",
+            "figure",
+        ],
+        page_tagging: bool = True,
+        location_tagging: bool = True,
+        location_dimensions: Tuple[int, int] = (100, 100),
+        add_new_line: bool = True,
+    ) -> str:
+        r"""Exports the document content to a DocumentToken format.
+
+        Operates on a slice of the document's main_text as defined through arguments
+        main_text_start and main_text_stop; defaulting to the whole main_text.
+
+        Args:
+            delim (str, optional): The delimiter used to separate text blocks in the
+                exported XML. Default is two newline characters ("\n\n").
+            main_text_start (int, optional): The starting index of the main text to
+                be included in the XML. Default is 0 (the beginning of the text).
+            main_text_stop (Optional[int], optional): The stopping index of the main
+                text. If set to None, the export includes text up to the end.
+                Default is None.
+            main_text_labels (list[str], optional): A list of text labels that
+                categorize the different sections of the document (e.g., "title",
+                "subtitle-level-1", "paragraph", "caption"). Default labels are
+                "title", "subtitle-level-1", "paragraph", "caption", "table", "figure".
+            location_tagging (bool, optional): Determines whether to include
+                location-based tagging in the XML. If True, the exported XML will
+                contain information about the locations of the text elements.
+                Default is True.
+            location_dimensions (Tuple[int, int], optional): Specifies the dimensions
+                (width and height) for the location tagging, if enabled.
+                Default is (100, 100).
+            add_new_line (bool, optional): Whether to add new line characters after
+                each text block. If True, a new line is added after each block of
+                text in the XML. Default is True.
+
+        Returns:
+            str: The content of the document formatted as an XML string.
+        """
+        xml_str = DocumentToken.BEG_DOCUMENT.value
+
+        new_line = ""
+        if add_new_line:
+            new_line = "\n"
+
+        if self.main_text is not None:
+            for orig_item in self.main_text[main_text_start:main_text_stop]:
+
+                item = (
+                    self._resolve_ref(orig_item)
+                    if isinstance(orig_item, Ref)
+                    else orig_item
+                )
+
+                if item is None:
+                    continue
+
+                prov = item.prov
+
+                loc_str = ""  # default is empty (no location tags)
+                if (
+                    location_tagging
+                    and self.page_dimensions is not None
+                    and prov is not None
+                    and len(prov) > 0
+                ):
+
+                    page = prov[0].page
+                    page_dim = self.page_dimensions[page - 1]
+
+                    page_w = float(page_dim.width)
+                    page_h = float(page_dim.height)
+
+                    x0 = float(prov[0].bbox[0]) / float(page_w)
+                    y0 = float(prov[0].bbox[1]) / float(page_h)
+                    x1 = float(prov[0].bbox[2]) / float(page_w)
+                    y1 = float(prov[0].bbox[3]) / float(page_h)
+
+                    page_tok = ""
+                    if page_tagging:
+                        page_tok = DocumentToken.get_page_token(page=page)
+
+                    x0_tok = DocumentToken.get_location_token(
+                        val=min(x0, x1), rnorm=location_dimensions[0]
+                    )
+                    y0_tok = DocumentToken.get_location_token(
+                        val=min(y0, y1), rnorm=location_dimensions[1]
+                    )
+                    x1_tok = DocumentToken.get_location_token(
+                        val=max(x0, x1), rnorm=location_dimensions[0]
+                    )
+                    y1_tok = DocumentToken.get_location_token(
+                        val=max(y0, y1), rnorm=location_dimensions[1]
+                    )
+
+                    # update
+                    loc_str = f"{DocumentToken.BEG_LOCATION.value}"
+                    loc_str += f"{page_tok}"
+                    loc_str += f"{x0_tok}{y0_tok}{x1_tok}{y1_tok}"
+                    loc_str += f"{DocumentToken.END_LOCATION.value}"
+
+                item_type = item.obj_type
+                if isinstance(item, BaseText) and (item_type in main_text_labels):
+                    text = item.text
+
+                    xml_str += f"<{item_type}>{loc_str}{text}</{item_type}>{new_line}"
+
+                elif isinstance(item, Table) and (item_type in main_text_labels):
+
+                    xml_str += f"<{item_type}>{loc_str}"
+
+                    if item.text is not None and len(item.text) > 0:
+                        xml_str += f"{DocumentToken.BEG_CAPTION.value}"
+                        xml_str += (
+                            f"{item.text}{DocumentToken.END_CAPTION.value}{new_line}"
+                        )
+
+                    if item.data is not None and len(item.data) > 0:
+                        for i, row in enumerate(item.data):
+                            xml_str += f"<row_{i}>"
+                            for j, col in enumerate(row):
+                                text = col.text
+                                xml_str += f"<col_{j}>{text}</col_{j}>"
+
+                            xml_str += f"</row_{i}>{new_line}"
+
+                    xml_str += f"</{item_type}>{new_line}"
+
+                elif isinstance(item, Figure) and (item_type in main_text_labels):
+
+                    xml_str += f"<{item_type}>{loc_str}"
+
+                    if item.text is not None and len(item.text) > 0:
+                        xml_str += f"{DocumentToken.BEG_CAPTION.value}"
+                        xml_str += (
+                            f"{item.text}{DocumentToken.END_CAPTION.value}{new_line}"
+                        )
+
+                    xml_str += f"</{item_type}>{new_line}"
+
+        xml_str += DocumentToken.END_DOCUMENT.value
+
+        return xml_str
diff --git a/docling_core/types/newdoc/__init__.py b/docling_core/types/newdoc/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/docling_core/types/newdoc/base.py b/docling_core/types/newdoc/base.py
deleted file mode 100644
index b082ea6..0000000
--- a/docling_core/types/newdoc/base.py
+++ /dev/null
@@ -1,120 +0,0 @@
-import copy
-from enum import Enum
-from typing import Tuple
-
-from pydantic import BaseModel
-
-
-## All copied from docling
-class CoordOrigin(str, Enum):
-    TOPLEFT = "TOPLEFT"
-    BOTTOMLEFT = "BOTTOMLEFT"
-
-
-class Size(BaseModel):
-    width: float = 0.0
-    height: float = 0.0
-
-
-class BoundingBox(BaseModel):
-    l: float  # left
-    t: float  # top
-    r: float  # right
-    b: float  # bottom
-
-    coord_origin: CoordOrigin = CoordOrigin.TOPLEFT
-
-    @property
-    def width(self):
-        return self.r - self.l
-
-    @property
-    def height(self):
-        return abs(self.t - self.b)
-
-    def scaled(self, scale: float) -> "BoundingBox":
-        out_bbox = copy.deepcopy(self)
-        out_bbox.l *= scale
-        out_bbox.r *= scale
-        out_bbox.t *= scale
-        out_bbox.b *= scale
-
-        return out_bbox
-
-    def normalized(self, page_size: Size) -> "BoundingBox":
-        out_bbox = copy.deepcopy(self)
-        out_bbox.l /= page_size.width
-        out_bbox.r /= page_size.width
-        out_bbox.t /= page_size.height
-        out_bbox.b /= page_size.height
-
-        return out_bbox
-
-    def as_tuple(self):
-        if self.coord_origin == CoordOrigin.TOPLEFT:
-            return (self.l, self.t, self.r, self.b)
-        elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
-            return (self.l, self.b, self.r, self.t)
-
-    @classmethod
-    def from_tuple(cls, coord: Tuple[float, ...], origin: CoordOrigin):
-        if origin == CoordOrigin.TOPLEFT:
-            l, t, r, b = coord[0], coord[1], coord[2], coord[3]
-            if r < l:
-                l, r = r, l
-            if b < t:
-                b, t = t, b
-
-            return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
-        elif origin == CoordOrigin.BOTTOMLEFT:
-            l, b, r, t = coord[0], coord[1], coord[2], coord[3]
-            if r < l:
-                l, r = r, l
-            if b > t:
-                b, t = t, b
-
-            return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
-
-    def area(self) -> float:
-        return (self.r - self.l) * (self.b - self.t)
-
-    def intersection_area_with(self, other: "BoundingBox") -> float:
-        # Calculate intersection coordinates
-        left = max(self.l, other.l)
-        top = max(self.t, other.t)
-        right = min(self.r, other.r)
-        bottom = min(self.b, other.b)
-
-        # Calculate intersection dimensions
-        width = right - left
-        height = bottom - top
-
-        # If the bounding boxes do not overlap, width or height will be negative
-        if width <= 0 or height <= 0:
-            return 0.0
-
-        return width * height
-
-    def to_bottom_left_origin(self, page_height) -> "BoundingBox":
-        if self.coord_origin == CoordOrigin.BOTTOMLEFT:
-            return self
-        elif self.coord_origin == CoordOrigin.TOPLEFT:
-            return BoundingBox(
-                l=self.l,
-                r=self.r,
-                t=page_height - self.t,
-                b=page_height - self.b,
-                coord_origin=CoordOrigin.BOTTOMLEFT,
-            )
-
-    def to_top_left_origin(self, page_height):
-        if self.coord_origin == CoordOrigin.TOPLEFT:
-            return self
-        elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
-            return BoundingBox(
-                l=self.l,
-                r=self.r,
-                t=page_height - self.t,  # self.b
-                b=page_height - self.b,  # self.t
-                coord_origin=CoordOrigin.TOPLEFT,
-            )
diff --git a/docling_core/types/newdoc/document.py b/docling_core/types/newdoc/document.py
deleted file mode 100644
index dd2597d..0000000
--- a/docling_core/types/newdoc/document.py
+++ /dev/null
@@ -1,95 +0,0 @@
-from typing import Any, Dict, List, Optional, Tuple, Union
-
-from pydantic import AnyUrl, BaseModel, Field
-
-from docling_core.types.newdoc.base import BoundingBox, Size
-
-
-class FigureData(BaseModel):  # TBD
-    pass
-
-
-class TableData(BaseModel):  # TBD
-    pass
-
-
-class RefItem(BaseModel):
-    cref: str = Field(alias="$ref")
-
-    def resolve(self, doc: "DoclingDocument"):
-        _, path, index_str = self.cref.split("/")
-        index = int(index_str)
-        obj = doc.__getattribute__(path)[index]
-        return obj
-
-
-class ImageRef(BaseModel):
-    format: str  # png, etc.
-    dpi: int  # ...
-    size: Size
-    uri: AnyUrl
-
-
-class ProvenanceItem(BaseModel):
-    page_no: int
-    bbox: BoundingBox
-    charspan: Tuple[int, int]
-
-
-class DocItem(BaseModel):
-    dloc: str  # format spec ({document_hash}{json-path})
-    hash: int
-    label: str
-    parent: Optional[RefItem]
-    children: List[RefItem]
-    prov: List[ProvenanceItem]
-
-
-class TextItem(DocItem):
-    orig: str  # untreated representation
-    text: str  # sanitized representation
-
-
-class FloatingItem(DocItem):
-    caption: Optional[Union[RefItem, TextItem]]
-    references: List[Union[RefItem, TextItem]]
-    footnotes: List[Union[RefItem, TextItem]]
-    data: Any
-    image: Optional[ImageRef]
-
-
-class FigureItem(DocItem):
-    data: FigureData
-
-
-class TableItem(DocItem):
-    data: TableData
-
-
-class KeyValueItem(DocItem):
-    pass
-
-
-ContentItem = Union[TextItem, FigureItem, TableItem, KeyValueItem]
-
-
-class DocumentContent(BaseModel):
-    furniture: List[RefItem] = []
-    body: List[RefItem] = []
-    texts: List[TextItem] = []
-    figures: List[FigureItem] = []
-    tables: List[TableItem] = []
-    key_value_items: List[KeyValueItem] = []
-
-
-class PageItem(DocumentContent):
-    hash: str  # page hash
-    size: Size
-    image: Optional[ImageRef]
-    num_elements: int
-
-
-class DoclingDocument(DocumentContent):
-    description: Any
-    file_info: Any
-    pages: Dict[int, PageItem] = {}  # empty as default
diff --git a/docling_core/types/rec/subject.py b/docling_core/types/rec/subject.py
index 69d2e88..ceb28e3 100644
--- a/docling_core/types/rec/subject.py
+++ b/docling_core/types/rec/subject.py
@@ -15,7 +15,7 @@
     SubjectNameTypeT,
     SubjectTypeT,
 )
-from docling_core.types.doc.base import S3Reference
+from docling_core.types.legacy.base import S3Reference
 from docling_core.utils.alias import AliasModel
 
 
diff --git a/docling_core/utils/ds_generate_jsonschema.py b/docling_core/utils/ds_generate_jsonschema.py
index 67acf19..8bdf5d6 100644
--- a/docling_core/utils/ds_generate_jsonschema.py
+++ b/docling_core/utils/ds_generate_jsonschema.py
@@ -6,7 +6,7 @@
 """Generate the JSON Schema of pydantic models and export them to files.
 
 Example:
-    python docling_core/utils/ds_generate_jsonschema.py doc.base.TableCell
+    python docling_core/utils/ds_generate_jsonschema.py legacy.base.TableCell
 
 """
 import argparse
@@ -48,7 +48,7 @@ def main() -> None:
     """Print the JSON Schema of a model."""
     argparser = argparse.ArgumentParser()
     argparser.add_argument(
-        "class_ref", help="Class reference, e.g., doc.base.TableCell"
+        "class_ref", help="Class reference, e.g., legacy.base.TableCell"
     )
     args = argparser.parse_args()
 
diff --git a/test/data/newdoc/dummy_doc.yaml b/test/data/newdoc/dummy_doc.yaml
index f092eb8..632c164 100644
--- a/test/data/newdoc/dummy_doc.yaml
+++ b/test/data/newdoc/dummy_doc.yaml
@@ -6,7 +6,7 @@ file_info: # FileInfoType - TBD
 furniture: # Headers, footers, framing, navigation elements, all other non-body text
   - $ref: "/texts/0"
 
-body: # All elements in other arrays, by-reference only
+body: # Top-level elements in other arrays, by-reference only, must not have parent.
   - $ref: "/texts/1"
   - $ref: "/figure/0"
   - $ref: "/texts/2"
@@ -20,7 +20,7 @@ texts: # All elements that have a text-string representation, with actual data
     hash: 132103230
     label: "page_header"
     parent: null
-    children: [ ]
+    children: []
     prov:
       - page_no: 1
         bbox:
diff --git a/test/test_base.py b/test/test_base.py
index 89cda5a..76c9cbc 100644
--- a/test/test_base.py
+++ b/test/test_base.py
@@ -20,7 +20,7 @@
     Log,
     StrictDateTime,
 )
-from docling_core.types.doc.document import CCSDocumentDescription
+from docling_core.types.legacy.document import CCSDocumentDescription
 from docling_core.types.rec.record import RecordDescription
 
 
diff --git a/test/test_doc_base.py b/test/test_doc_base.py
index d5a48ff..a1b8186 100644
--- a/test/test_doc_base.py
+++ b/test/test_doc_base.py
@@ -7,7 +7,7 @@
 import pytest
 from pydantic import ValidationError
 
-from docling_core.types.doc.base import Prov, S3Reference
+from docling_core.types.legacy.base import Prov, S3Reference
 
 
 def test_s3_reference():
diff --git a/test/test_doc_schema.py b/test/test_doc_schema.py
index a899b08..d3476f6 100644
--- a/test/test_doc_schema.py
+++ b/test/test_doc_schema.py
@@ -17,7 +17,7 @@
     IdentifierTypeT,
     LanguageT,
 )
-from docling_core.types.doc.document import (
+from docling_core.types.legacy.document import (
     CCSDocument,
     CCSDocumentDescription,
     Publication,
diff --git a/test/test_doc_schema_extractor.py b/test/test_doc_schema_extractor.py
index 9f1f9d9..78dbcd4 100644
--- a/test/test_doc_schema_extractor.py
+++ b/test/test_doc_schema_extractor.py
@@ -8,7 +8,7 @@
 
 from pydantic import ValidationError
 
-from docling_core.types.doc.document import CCSDocument
+from docling_core.types.legacy.document import CCSDocument
 
 
 def test_ccs_document_update():
diff --git a/test/test_docling_doc.py b/test/test_docling_doc.py
new file mode 100644
index 0000000..ee448ca
--- /dev/null
+++ b/test/test_docling_doc.py
@@ -0,0 +1,63 @@
+import yaml
+import pytest
+from docling_core.types import DoclingDocument, BoundingBox
+from docling_core.types.doc.document import ProvenanceItem
+
+
+def test_load_serialize_doc():
+    # Read YAML file
+    with open("test/data/newdoc/dummy_doc.yaml", "r") as fp:
+        dict_from_yaml = yaml.safe_load(fp)
+
+    doc = DoclingDocument.model_validate(dict_from_yaml)
+
+    # Objects can be accessed
+    text_item = doc.texts[0]
+
+    # access members
+    text_item.text
+    text_item.prov[0].page_no
+
+    # Objects that are references need explicit resolution for now:
+    obj = doc.body[2].resolve(doc=doc)  # Text item with parent
+    parent = obj.parent.resolve(doc=doc)  # it is a figure
+
+    obj2 = parent.children[0].resolve(
+        doc=doc
+    )  # Child of figure must be the same as obj
+
+    assert obj == obj2
+    assert obj is obj2
+
+    doc_dumped = doc.model_dump(mode="json", by_alias=True)
+    out_yaml = yaml.safe_dump(doc_dumped)
+
+    doc_reload = DoclingDocument.model_validate(yaml.safe_load(out_yaml))
+
+    assert doc_reload == doc  # must be equal
+    assert doc_reload is not doc  # can't be identical
+
+def test_construct_doc():
+    doc = DoclingDocument(description={}, file_info={})
+
+    # group, heading, paragraph, table, figure, title, list, provenance
+    doc.add_title()
+    doc.add_paragraph(text="Author 1\nAffiliation 1").add_provenance(ProvenanceItem(page_no=1, bbox=BoundingBox(t=12, l=5, r=230, b=40), charspan=(0,22)))
+    doc.add_paragraph(text="Author 2\nAffiliation 2")
+
+    chapter1 = doc.add_group(name="Introduction")
+    chapter1.add_heading(text="1. Introduction", level=2)
+    chapter1.add_paragraph(text="This paper introduces the biggest invention ever made. ...")
+    mylist = chapter1.add_group()
+    mylist.add_item(text="Cooks your favourite meal before you know you want it.")
+    mylist.add_item(text="Cleans up all your dishes.")
+    mylist.add_item(text="Drains your bank account without consent.")
+
+
+
+    sec = doc.add_section(text="1. Introduction")
+
+    list = sec.add_child(label="container")
+    list.add_child()
+    list.add_child()
+
diff --git a/test/test_json_schema_to_search_mapper.py b/test/test_json_schema_to_search_mapper.py
index 413e5b2..ab1abe4 100644
--- a/test/test_json_schema_to_search_mapper.py
+++ b/test/test_json_schema_to_search_mapper.py
@@ -10,7 +10,7 @@
 import jsondiff
 
 from docling_core.search.json_schema_to_search_mapper import JsonSchemaToSearchMapper
-from docling_core.types.doc.document import ExportedCCSDocument
+from docling_core.types.legacy.document import ExportedCCSDocument
 from docling_core.types.rec.record import Record
 
 
diff --git a/test/test_newdoc.py b/test/test_newdoc.py
deleted file mode 100644
index ff3aa3c..0000000
--- a/test/test_newdoc.py
+++ /dev/null
@@ -1,36 +0,0 @@
-import yaml
-
-from docling_core.types.newdoc.document import DoclingDocument
-
-if __name__ == "__main__":
-    # Read YAML file
-    with open("data/newdoc/dummy_doc.yaml", "r") as fp:
-        dict_from_yaml = yaml.safe_load(fp)
-
-    doc = DoclingDocument.model_validate(dict_from_yaml)
-
-    # Objects can be accessed
-    text_item = doc.texts[0]
-
-    # access members
-    text_item.text
-    text_item.prov[0].page_no
-
-    # Objects that are references need explicit resolution for now:
-    obj = doc.body[2].resolve(doc=doc)  # Text item with parent
-    parent = obj.parent.resolve(doc=doc)  # it is a figure
-
-    obj2 = parent.children[0].resolve(
-        doc=doc
-    )  # Child of figure must be the same as obj
-
-    assert obj == obj2
-    assert obj is obj2
-
-    doc_dumped = doc.model_dump(mode="json", by_alias=True)
-    out_yaml = yaml.safe_dump(doc_dumped)
-
-    doc_reload = DoclingDocument.model_validate(yaml.safe_load(out_yaml))
-
-    assert doc_reload == doc  # must be equal
-    assert doc_reload is not doc  # can't be identical

From 384d12a71d863b7aaa5c22b2c622f3f5dcc780b6 Mon Sep 17 00:00:00 2001
From: Christoph Auer <cau@zurich.ibm.com>
Date: Thu, 19 Sep 2024 15:42:24 +0200
Subject: [PATCH 03/34] Manual update from main

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
---
 docling_core/types/legacy/base.py     | 101 +++++++++++++++++++++++++-
 docling_core/types/legacy/document.py |  12 +--
 2 files changed, 105 insertions(+), 8 deletions(-)

diff --git a/docling_core/types/legacy/base.py b/docling_core/types/legacy/base.py
index 2f1eeed..fa61cbf 100644
--- a/docling_core/types/legacy/base.py
+++ b/docling_core/types/legacy/base.py
@@ -4,8 +4,9 @@
 #
 
 """Define common models across CCS objects."""
-from typing import Annotated, Literal, Optional, Union
+from typing import Annotated, List, Literal, Optional, Union
 
+import pandas as pd
 from pydantic import BaseModel, Field, PositiveInt, StrictStr
 
 from docling_core.search.mapping import es_field
@@ -152,6 +153,102 @@ class Table(BaseCell):
     data: Optional[list[list[Union[GlmTableCell, TableCell]]]] = None
     model: Optional[str] = None
 
+    def _get_tablecell_span(self, cell: TableCell, ix: int):
+        if cell.spans is None:
+            span = set()
+        else:
+            span = set([s[ix] for s in cell.spans])
+        if len(span) == 0:
+            return 1, None, None
+        return len(span), min(span), max(span)
+
+    def export_to_dataframe(self) -> pd.DataFrame:
+        """Export the table as a Pandas DataFrame."""
+        if self.data is None or self.num_rows == 0 or self.num_cols == 0:
+            return pd.DataFrame()
+
+        # Count how many rows are column headers
+        num_headers = 0
+        for i, row in enumerate(self.data):
+            if len(row) == 0:
+                raise RuntimeError(f"Invalid table. {len(row)=} but {self.num_cols=}.")
+
+            any_header = False
+            for cell in row:
+                if cell.obj_type == "col_header":
+                    any_header = True
+                    break
+
+            if any_header:
+                num_headers += 1
+            else:
+                break
+
+        # Create the column names from all col_headers
+        columns: Optional[List[str]] = None
+        if num_headers > 0:
+            columns = ["" for _ in range(self.num_cols)]
+            for i in range(num_headers):
+                for j, cell in enumerate(self.data[i]):
+                    col_name = cell.text
+                    if columns[j] != "":
+                        col_name = f".{col_name}"
+                    columns[j] += col_name
+
+        # Create table data
+        table_data = [[cell.text for cell in row] for row in self.data[num_headers:]]
+
+        # Create DataFrame
+        df = pd.DataFrame(table_data, columns=columns)
+
+        return df
+
+    def export_to_html(self) -> str:
+        """Export the table as html."""
+        body = ""
+        nrows = self.num_rows
+        ncols = self.num_cols
+
+        if self.data is None:
+            return ""
+        for i in range(nrows):
+            body += "<tr>"
+            for j in range(ncols):
+                cell: TableCell = self.data[i][j]
+
+                rowspan, rowstart, rowend = self._get_tablecell_span(cell, 0)
+                colspan, colstart, colend = self._get_tablecell_span(cell, 1)
+
+                if rowstart is not None and rowstart != i:
+                    continue
+                if colstart is not None and colstart != j:
+                    continue
+
+                if rowstart is None:
+                    rowstart = i
+                if colstart is None:
+                    colstart = j
+
+                content = cell.text.strip()
+                label = cell.obj_type
+                celltag = "td"
+                if label in ["row_header", "row_multi_header", "row_title"]:
+                    pass
+                elif label in ["col_header", "col_multi_header"]:
+                    celltag = "th"
+
+                opening_tag = f"{celltag}"
+                if rowspan > 1:
+                    opening_tag += f' rowspan="{rowspan}"'
+                if colspan > 1:
+                    opening_tag += f' colspan="{colspan}"'
+
+                body += f"<{opening_tag}>{content}</{celltag}>"
+            body += "</tr>"
+        body = f"<table>{body}</table>"
+
+        return body
+
 
 # FIXME: let's add some figure specific data-types later
 class Figure(BaseCell):
@@ -193,4 +290,4 @@ class PageReference(BaseModel):
 
     hash: str = Field(json_schema_extra=es_field(type="keyword", ignore_above=8191))
     model: str = Field(json_schema_extra=es_field(suppress=True))
-    page: PositiveInt = Field(json_schema_extra=es_field(type="short"))
+    page: PositiveInt = Field(json_schema_extra=es_field(type="short"))
\ No newline at end of file
diff --git a/docling_core/types/legacy/document.py b/docling_core/types/legacy/document.py
index bccaf4b..e54ea9e 100644
--- a/docling_core/types/legacy/document.py
+++ b/docling_core/types/legacy/document.py
@@ -32,7 +32,7 @@
     LanguageT,
     Log,
 )
-from docling_core.types.legacy.base import (
+from docling_core.types.doc.base import (
     BaseCell,
     BaseText,
     BitmapObject,
@@ -410,21 +410,21 @@ def get_special_tokens(
         special_tokens = [token.value for token in cls]
 
         # Adding dynamically generated row and col tokens
-        for i in range(0, max_rows):
+        for i in range(0, max_rows + 1):
             special_tokens += [f"<row_{i}>", f"</row_{i}>"]
 
-        for i in range(0, max_cols):
+        for i in range(0, max_cols + 1):
             special_tokens += [f"<col_{i}>", f"</col_{i}>"]
 
         for i in range(6):
             special_tokens += [f"<section-header-{i}>", f"</section-header-{i}>"]
 
         # Adding dynamically generated page-tokens
-        for i in range(0, max_pages):
+        for i in range(0, max_pages + 1):
             special_tokens.append(f"<page_{i}>")
 
         # Adding dynamically generated location-tokens
-        for i in range(0, max(page_dimension[0], page_dimension[1])):
+        for i in range(0, max(page_dimension[0] + 1, page_dimension[1] + 1)):
             special_tokens.append(f"<loc_{i}>")
 
         return special_tokens
@@ -788,4 +788,4 @@ def export_to_document_tokens(
 
         xml_str += DocumentToken.END_DOCUMENT.value
 
-        return xml_str
+        return xml_str
\ No newline at end of file

From 3d3c2f35507be138d93eb6019173d53241d0fb79 Mon Sep 17 00:00:00 2001
From: Christoph Auer <cau@zurich.ibm.com>
Date: Fri, 20 Sep 2024 10:37:35 +0200
Subject: [PATCH 04/34] Move new-format to experimental path

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
---
 docling_core/types/__init__.py                |  17 +-
 docling_core/types/doc/base.py                | 413 ++++++---
 docling_core/types/{legacy => doc}/doc_ann.py |   2 +-
 docling_core/types/{legacy => doc}/doc_ocr.py |   2 +-
 docling_core/types/{legacy => doc}/doc_raw.py |   2 +-
 docling_core/types/doc/document.py            | 822 ++++++++++++++++--
 .../{legacy => experimental}/__init__.py      |   0
 docling_core/types/experimental/base.py       | 120 +++
 docling_core/types/experimental/document.py   |  93 ++
 docling_core/types/legacy/base.py             | 293 -------
 docling_core/types/legacy/document.py         | 791 -----------------
 docling_core/types/rec/subject.py             |   2 +-
 test/test_base.py                             |   2 +-
 test/test_doc_base.py                         |   2 +-
 test/test_doc_schema.py                       |   2 +-
 test/test_doc_schema_extractor.py             |   2 +-
 test/test_docling_doc.py                      |   2 +-
 test/test_json_schema_to_search_mapper.py     |   2 +-
 18 files changed, 1284 insertions(+), 1285 deletions(-)
 rename docling_core/types/{legacy => doc}/doc_ann.py (95%)
 rename docling_core/types/{legacy => doc}/doc_ocr.py (96%)
 rename docling_core/types/{legacy => doc}/doc_raw.py (98%)
 rename docling_core/types/{legacy => experimental}/__init__.py (100%)
 create mode 100644 docling_core/types/experimental/base.py
 create mode 100644 docling_core/types/experimental/document.py
 delete mode 100644 docling_core/types/legacy/base.py
 delete mode 100644 docling_core/types/legacy/document.py

diff --git a/docling_core/types/__init__.py b/docling_core/types/__init__.py
index 30c1f4d..5f493b8 100644
--- a/docling_core/types/__init__.py
+++ b/docling_core/types/__init__.py
@@ -5,10 +5,10 @@
 
 """Define the main types."""
 
-from docling_core.types.legacy.base import BoundingBox as LegacyBoundingBox # noqa
-from docling_core.types.legacy.base import Table  # noqa
-from docling_core.types.legacy.base import TableCell  # noqa
-from docling_core.types.legacy.base import (  # noqa
+from docling_core.types.doc.base import BoundingBox # noqa
+from docling_core.types.doc.base import Table  # noqa
+from docling_core.types.doc.base import TableCell  # noqa
+from docling_core.types.doc.base import (  # noqa
     BaseCell,
     BaseText,
     PageDimensions,
@@ -16,13 +16,12 @@
     Prov,
     Ref,
 )
-from docling_core.types.legacy.document import (  # noqa
+from docling_core.types.doc.document import (  # noqa
     CCSDocumentDescription as DocumentDescription,
 )
-from docling_core.types.legacy.document import CCSFileInfoObject as FileInfoObject  # noqa
-from docling_core.types.legacy.document import ExportedCCSDocument as Document  # noqa
+from docling_core.types.doc.document import CCSFileInfoObject as FileInfoObject  # noqa
+from docling_core.types.doc.document import ExportedCCSDocument as Document  # noqa
 from docling_core.types.gen.generic import Generic  # noqa
 from docling_core.types.rec.record import Record  # noqa
 
-from docling_core.types.doc.document import DoclingDocument, DocItem, TextItem, FloatingItem, TableItem, FigureItem, TableData, FigureData, PageItem
-from docling_core.types.doc.base import CoordOrigin, BoundingBox
\ No newline at end of file
+from docling_core.types.experimental.document import DoclingDocument, DocItem, TextItem, FloatingItem, TableItem, FigureItem, TableData, FigureData, PageItem
\ No newline at end of file
diff --git a/docling_core/types/doc/base.py b/docling_core/types/doc/base.py
index b082ea6..fa61cbf 100644
--- a/docling_core/types/doc/base.py
+++ b/docling_core/types/doc/base.py
@@ -1,120 +1,293 @@
-import copy
-from enum import Enum
-from typing import Tuple
-
-from pydantic import BaseModel
-
-
-## All copied from docling
-class CoordOrigin(str, Enum):
-    TOPLEFT = "TOPLEFT"
-    BOTTOMLEFT = "BOTTOMLEFT"
-
-
-class Size(BaseModel):
-    width: float = 0.0
-    height: float = 0.0
-
-
-class BoundingBox(BaseModel):
-    l: float  # left
-    t: float  # top
-    r: float  # right
-    b: float  # bottom
-
-    coord_origin: CoordOrigin = CoordOrigin.TOPLEFT
-
-    @property
-    def width(self):
-        return self.r - self.l
-
-    @property
-    def height(self):
-        return abs(self.t - self.b)
-
-    def scaled(self, scale: float) -> "BoundingBox":
-        out_bbox = copy.deepcopy(self)
-        out_bbox.l *= scale
-        out_bbox.r *= scale
-        out_bbox.t *= scale
-        out_bbox.b *= scale
-
-        return out_bbox
-
-    def normalized(self, page_size: Size) -> "BoundingBox":
-        out_bbox = copy.deepcopy(self)
-        out_bbox.l /= page_size.width
-        out_bbox.r /= page_size.width
-        out_bbox.t /= page_size.height
-        out_bbox.b /= page_size.height
-
-        return out_bbox
-
-    def as_tuple(self):
-        if self.coord_origin == CoordOrigin.TOPLEFT:
-            return (self.l, self.t, self.r, self.b)
-        elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
-            return (self.l, self.b, self.r, self.t)
-
-    @classmethod
-    def from_tuple(cls, coord: Tuple[float, ...], origin: CoordOrigin):
-        if origin == CoordOrigin.TOPLEFT:
-            l, t, r, b = coord[0], coord[1], coord[2], coord[3]
-            if r < l:
-                l, r = r, l
-            if b < t:
-                b, t = t, b
-
-            return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
-        elif origin == CoordOrigin.BOTTOMLEFT:
-            l, b, r, t = coord[0], coord[1], coord[2], coord[3]
-            if r < l:
-                l, r = r, l
-            if b > t:
-                b, t = t, b
-
-            return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
-
-    def area(self) -> float:
-        return (self.r - self.l) * (self.b - self.t)
-
-    def intersection_area_with(self, other: "BoundingBox") -> float:
-        # Calculate intersection coordinates
-        left = max(self.l, other.l)
-        top = max(self.t, other.t)
-        right = min(self.r, other.r)
-        bottom = min(self.b, other.b)
-
-        # Calculate intersection dimensions
-        width = right - left
-        height = bottom - top
-
-        # If the bounding boxes do not overlap, width or height will be negative
-        if width <= 0 or height <= 0:
-            return 0.0
-
-        return width * height
-
-    def to_bottom_left_origin(self, page_height) -> "BoundingBox":
-        if self.coord_origin == CoordOrigin.BOTTOMLEFT:
-            return self
-        elif self.coord_origin == CoordOrigin.TOPLEFT:
-            return BoundingBox(
-                l=self.l,
-                r=self.r,
-                t=page_height - self.t,
-                b=page_height - self.b,
-                coord_origin=CoordOrigin.BOTTOMLEFT,
-            )
-
-    def to_top_left_origin(self, page_height):
-        if self.coord_origin == CoordOrigin.TOPLEFT:
-            return self
-        elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
-            return BoundingBox(
-                l=self.l,
-                r=self.r,
-                t=page_height - self.t,  # self.b
-                b=page_height - self.b,  # self.t
-                coord_origin=CoordOrigin.TOPLEFT,
-            )
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+
+"""Define common models across CCS objects."""
+from typing import Annotated, List, Literal, Optional, Union
+
+import pandas as pd
+from pydantic import BaseModel, Field, PositiveInt, StrictStr
+
+from docling_core.search.mapping import es_field
+from docling_core.utils.alias import AliasModel
+
+CellData = tuple[float, float, float, float, str, str]
+
+CellHeader = tuple[
+    Literal["x0"],
+    Literal["y0"],
+    Literal["x1"],
+    Literal["y1"],
+    Literal["font"],
+    Literal["text"],
+]
+
+BoundingBox = Annotated[list[float], Field(min_length=4, max_length=4)]
+
+Span = Annotated[list[int], Field(min_length=2, max_length=2)]
+
+
+class CellsContainer(BaseModel):
+    """Cell container."""
+
+    data: Optional[list[CellData]] = None
+    header: CellHeader = ("x0", "y0", "x1", "y1", "font", "text")
+
+
+class S3Resource(BaseModel):
+    """Resource in a cloud object storage."""
+
+    mime: str
+    path: str
+    page: Optional[PositiveInt] = None
+
+
+class S3Data(AliasModel):
+    """Data object in a cloud object storage."""
+
+    pdf_document: Optional[list[S3Resource]] = Field(default=None, alias="pdf-document")
+    pdf_pages: Optional[list[S3Resource]] = Field(default=None, alias="pdf-pages")
+    pdf_images: Optional[list[S3Resource]] = Field(default=None, alias="pdf-images")
+    json_document: Optional[S3Resource] = Field(default=None, alias="json-document")
+    json_meta: Optional[S3Resource] = Field(default=None, alias="json-meta")
+    glm_json_document: Optional[S3Resource] = Field(
+        default=None, alias="glm-json-document"
+    )
+    figures: Optional[list[S3Resource]] = None
+
+
+class S3Reference(AliasModel):
+    """References an s3 resource."""
+
+    ref_s3_data: StrictStr = Field(
+        alias="__ref_s3_data", examples=["#/_s3_data/figures/0"]
+    )
+
+
+class Prov(AliasModel):
+    """Provenance."""
+
+    bbox: BoundingBox
+    page: PositiveInt
+    span: Span
+    ref_s3_data: Optional[StrictStr] = Field(
+        default=None, alias="__ref_s3_data", json_schema_extra=es_field(suppress=True)
+    )
+
+
+class BoundingBoxContainer(BaseModel):
+    """Bounding box container."""
+
+    min: BoundingBox
+    max: BoundingBox
+
+
+class BitmapObject(AliasModel):
+    """Bitmap object."""
+
+    obj_type: str = Field(alias="type")
+    bounding_box: BoundingBoxContainer = Field(
+        json_schema_extra=es_field(suppress=True)
+    )
+    prov: Prov
+
+
+class PageDimensions(BaseModel):
+    """Page dimensions."""
+
+    height: float
+    page: PositiveInt
+    width: float
+
+
+class TableCell(AliasModel):
+    """Table cell."""
+
+    bbox: Optional[BoundingBox] = None
+    spans: Optional[list[Span]] = None
+    text: str = Field(json_schema_extra=es_field(term_vector="with_positions_offsets"))
+    obj_type: str = Field(alias="type")
+
+
+class GlmTableCell(TableCell):
+    """Glm Table cell."""
+
+    col: Optional[int] = Field(default=None, json_schema_extra=es_field(suppress=True))
+    col_header: bool = Field(
+        default=False, alias="col-header", json_schema_extra=es_field(suppress=True)
+    )
+    col_span: Optional[Span] = Field(
+        default=None, alias="col-span", json_schema_extra=es_field(suppress=True)
+    )
+    row: Optional[int] = Field(default=None, json_schema_extra=es_field(suppress=True))
+    row_header: bool = Field(
+        default=False, alias="row-header", json_schema_extra=es_field(suppress=True)
+    )
+    row_span: Optional[Span] = Field(
+        default=None, alias="row-span", json_schema_extra=es_field(suppress=True)
+    )
+
+
+class BaseCell(AliasModel):
+    """Base cell."""
+
+    # FIXME: we need to check why we have bounding_box (this should be in prov)
+    bounding_box: Optional[BoundingBoxContainer] = Field(
+        default=None, alias="bounding-box", json_schema_extra=es_field(suppress=True)
+    )
+    prov: Optional[list[Prov]] = None
+    text: Optional[str] = Field(
+        default=None, json_schema_extra=es_field(term_vector="with_positions_offsets")
+    )
+    obj_type: str = Field(
+        alias="type", json_schema_extra=es_field(type="keyword", ignore_above=8191)
+    )
+
+
+class Table(BaseCell):
+    """Table."""
+
+    num_cols: int = Field(alias="#-cols")
+    num_rows: int = Field(alias="#-rows")
+    data: Optional[list[list[Union[GlmTableCell, TableCell]]]] = None
+    model: Optional[str] = None
+
+    def _get_tablecell_span(self, cell: TableCell, ix: int):
+        if cell.spans is None:
+            span = set()
+        else:
+            span = set([s[ix] for s in cell.spans])
+        if len(span) == 0:
+            return 1, None, None
+        return len(span), min(span), max(span)
+
+    def export_to_dataframe(self) -> pd.DataFrame:
+        """Export the table as a Pandas DataFrame."""
+        if self.data is None or self.num_rows == 0 or self.num_cols == 0:
+            return pd.DataFrame()
+
+        # Count how many rows are column headers
+        num_headers = 0
+        for i, row in enumerate(self.data):
+            if len(row) == 0:
+                raise RuntimeError(f"Invalid table. {len(row)=} but {self.num_cols=}.")
+
+            any_header = False
+            for cell in row:
+                if cell.obj_type == "col_header":
+                    any_header = True
+                    break
+
+            if any_header:
+                num_headers += 1
+            else:
+                break
+
+        # Create the column names from all col_headers
+        columns: Optional[List[str]] = None
+        if num_headers > 0:
+            columns = ["" for _ in range(self.num_cols)]
+            for i in range(num_headers):
+                for j, cell in enumerate(self.data[i]):
+                    col_name = cell.text
+                    if columns[j] != "":
+                        col_name = f".{col_name}"
+                    columns[j] += col_name
+
+        # Create table data
+        table_data = [[cell.text for cell in row] for row in self.data[num_headers:]]
+
+        # Create DataFrame
+        df = pd.DataFrame(table_data, columns=columns)
+
+        return df
+
+    def export_to_html(self) -> str:
+        """Export the table as html."""
+        body = ""
+        nrows = self.num_rows
+        ncols = self.num_cols
+
+        if self.data is None:
+            return ""
+        for i in range(nrows):
+            body += "<tr>"
+            for j in range(ncols):
+                cell: TableCell = self.data[i][j]
+
+                rowspan, rowstart, rowend = self._get_tablecell_span(cell, 0)
+                colspan, colstart, colend = self._get_tablecell_span(cell, 1)
+
+                if rowstart is not None and rowstart != i:
+                    continue
+                if colstart is not None and colstart != j:
+                    continue
+
+                if rowstart is None:
+                    rowstart = i
+                if colstart is None:
+                    colstart = j
+
+                content = cell.text.strip()
+                label = cell.obj_type
+                celltag = "td"
+                if label in ["row_header", "row_multi_header", "row_title"]:
+                    pass
+                elif label in ["col_header", "col_multi_header"]:
+                    celltag = "th"
+
+                opening_tag = f"{celltag}"
+                if rowspan > 1:
+                    opening_tag += f' rowspan="{rowspan}"'
+                if colspan > 1:
+                    opening_tag += f' colspan="{colspan}"'
+
+                body += f"<{opening_tag}>{content}</{celltag}>"
+            body += "</tr>"
+        body = f"<table>{body}</table>"
+
+        return body
+
+
+# FIXME: let's add some figure specific data-types later
+class Figure(BaseCell):
+    """Figure; adds no fields of its own to BaseCell for now."""
+
+
+class BaseText(AliasModel):
+    """Base model for text objects: required text and type, optional metadata."""
+
+    text: StrictStr = Field(
+        json_schema_extra=es_field(term_vector="with_positions_offsets")
+    )
+    obj_type: StrictStr = Field(
+        alias="type", json_schema_extra=es_field(type="keyword", ignore_above=8191)
+    )
+    name: Optional[StrictStr] = Field(
+        default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
+    )
+    font: Optional[str] = None
+    prov: Optional[list[Prov]] = None
+
+
+class ListItem(BaseText):
+    """List item: a text object that also carries a required identifier."""
+
+    identifier: str
+
+
+class Ref(AliasModel):
+    """Reference: a name, a type label and a target given under "$ref"."""
+
+    name: str
+    obj_type: str = Field(alias="type")
+    ref: str = Field(alias="$ref")
+
+
+class PageReference(BaseModel):
+    """Page reference: a hash, a model name and a positive page number."""
+
+    hash: str = Field(json_schema_extra=es_field(type="keyword", ignore_above=8191))
+    model: str = Field(json_schema_extra=es_field(suppress=True))
+    page: PositiveInt = Field(json_schema_extra=es_field(type="short"))
\ No newline at end of file
diff --git a/docling_core/types/legacy/doc_ann.py b/docling_core/types/doc/doc_ann.py
similarity index 95%
rename from docling_core/types/legacy/doc_ann.py
rename to docling_core/types/doc/doc_ann.py
index 974ea12..f836615 100644
--- a/docling_core/types/legacy/doc_ann.py
+++ b/docling_core/types/doc/doc_ann.py
@@ -8,7 +8,7 @@
 
 from pydantic import BaseModel
 
-from docling_core.types.legacy.base import BoundingBox
+from docling_core.types.doc.base import BoundingBox
 
 AnnotationReport = Any  # TODO
 
diff --git a/docling_core/types/legacy/doc_ocr.py b/docling_core/types/doc/doc_ocr.py
similarity index 96%
rename from docling_core/types/legacy/doc_ocr.py
rename to docling_core/types/doc/doc_ocr.py
index 656f54a..875c0d1 100644
--- a/docling_core/types/legacy/doc_ocr.py
+++ b/docling_core/types/doc/doc_ocr.py
@@ -8,7 +8,7 @@
 
 from pydantic import BaseModel, Field
 
-from docling_core.types.legacy.base import BoundingBox
+from docling_core.types.doc.base import BoundingBox
 from docling_core.utils.alias import AliasModel
 
 CoordsOrder = Literal["x1", "y1", "x2", "y2"]
diff --git a/docling_core/types/legacy/doc_raw.py b/docling_core/types/doc/doc_raw.py
similarity index 98%
rename from docling_core/types/legacy/doc_raw.py
rename to docling_core/types/doc/doc_raw.py
index d0b4d71..3e6a7e0 100644
--- a/docling_core/types/legacy/doc_raw.py
+++ b/docling_core/types/doc/doc_raw.py
@@ -9,7 +9,7 @@
 from pydantic import BaseModel, Field
 from typing_extensions import Annotated
 
-from docling_core.types.legacy.base import BoundingBox
+from docling_core.types.doc.base import BoundingBox
 from docling_core.utils.alias import AliasModel
 
 FontDifferences = dict[str, Any]
diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
index 146616b..e54ea9e 100644
--- a/docling_core/types/doc/document.py
+++ b/docling_core/types/doc/document.py
@@ -1,93 +1,791 @@
-from typing import Any, Dict, List, Optional, Tuple, Union
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
 
-from pydantic import AnyUrl, BaseModel, Field
+"""Models for the Docling Document data type."""
 
-from docling_core.types.doc.base import BoundingBox, Size
+from datetime import datetime
+from enum import Enum
+from typing import Generic, Optional, Tuple, Union
 
+from pydantic import (
+    AnyHttpUrl,
+    BaseModel,
+    Field,
+    NonNegativeInt,
+    StrictStr,
+    model_validator,
+)
+from tabulate import tabulate
 
-class FigureData(BaseModel):  # TBD
-    pass
+from docling_core.search.mapping import es_field
+from docling_core.types.base import (
+    Acquisition,
+    CollectionDocumentInfo,
+    CollectionNameTypeT,
+    DescriptionAdvancedT,
+    DescriptionAnalyticsT,
+    FileInfoObject,
+    Identifier,
+    IdentifierTypeT,
+    LanguageT,
+    Log,
+)
+from docling_core.types.doc.base import (
+    BaseCell,
+    BaseText,
+    BitmapObject,
+    Figure,
+    PageDimensions,
+    PageReference,
+    Ref,
+    S3Data,
+    Table,
+)
+from docling_core.utils.alias import AliasModel
 
 
-class TableData(BaseModel):  # TBD
-    pass
+class CCSFileInfoDescription(BaseModel, extra="forbid"):
+    """Optional descriptive file metadata; unknown fields are rejected."""
 
+    author: Optional[list[StrictStr]] = None
+    keywords: Optional[str] = None
+    subject: Optional[str] = None
+    title: Optional[StrictStr] = None
+    creation_date: Optional[str] = None  # datetime, kept as a string
 
-class RefItem(BaseModel):
-    cref: str = Field(alias="$ref")
 
-    def resolve(self, doc: "DoclingDocument"):
-        _, path, index_str = self.cref.split("/")
-        index = int(index_str)
-        obj = doc.__getattribute__(path)[index]
-        return obj
+class CCSFileInfoObject(FileInfoObject, extra="forbid"):
+    """File info extended with page count, collection, description and hashes."""
 
+    num_pages: Optional[int] = Field(default=None, alias="#-pages")
 
-class ImageRef(BaseModel):
-    format: str  # png, etc.
-    dpi: int  # ...
-    size: Size
-    uri: AnyUrl
+    collection_name: Optional[str] = Field(
+        default=None,
+        alias="collection-name",
+        json_schema_extra=es_field(type="keyword", ignore_above=8191),
+    )
+    description: Optional[CCSFileInfoDescription] = Field(
+        default=None, json_schema_extra=es_field(suppress=True)
+    )
+    page_hashes: Optional[list[PageReference]] = Field(
+        default=None, alias="page-hashes"
+    )
 
 
-class ProvenanceItem(BaseModel):
-    page_no: int
-    bbox: BoundingBox
-    charspan: Tuple[int, int]
+class Affiliation(BaseModel, extra="forbid"):
+    """Affiliation with a required name and an optional id and source."""
 
+    name: str = Field(
+        ...,
+        json_schema_extra=es_field(
+            fields={
+                "lower": {
+                    "normalizer": "lowercase_asciifolding",
+                    "type": "keyword",
+                    "ignore_above": 8191,
+                },
+                "keyword": {"type": "keyword", "ignore_above": 8191},
+            },
+        ),
+    )
+    id: Optional[str] = Field(
+        default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
+    )
+    source: Optional[str] = Field(
+        default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
+    )
 
-class DocItem(BaseModel):
-    dloc: str  # format spec ({document_hash}{json-path})
-    hash: int
-    label: str
-    parent: Optional[RefItem]
-    children: List[RefItem]
-    prov: List[ProvenanceItem]
 
+class Author(BaseModel, extra="forbid"):
+    """Author with a required name and optional id, source and affiliations."""
 
-class TextItem(DocItem):
-    orig: str  # untreated representation
-    text: str  # sanitized representation
+    name: str = Field(
+        ...,
+        json_schema_extra=es_field(
+            type="text",
+            fields={
+                "lower": {
+                    "normalizer": "lowercase_asciifolding",
+                    "type": "keyword",
+                    "ignore_above": 8191,
+                },
+                "keyword": {"type": "keyword", "ignore_above": 8191},
+            },
+        ),
+    )
+    id: Optional[str] = Field(
+        default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
+    )
+    source: Optional[str] = Field(
+        default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
+    )
+    affiliations: Optional[list[Affiliation]] = None
 
 
-class FloatingItem(DocItem):
-    caption: Optional[Union[RefItem, TextItem]]
-    references: List[Union[RefItem, TextItem]]
-    footnotes: List[Union[RefItem, TextItem]]
-    data: Any
-    image: Optional[ImageRef]
+class Publication(BaseModel, Generic[IdentifierTypeT], extra="forbid"):
+    """Publication details of a journal or venue; only the name is required."""
 
+    identifiers: Optional[list[Identifier[IdentifierTypeT]]] = Field(
+        default=None,
+        description="Unique identifiers of a publication venue.",
+    )
+    name: StrictStr = Field(
+        json_schema_extra=es_field(type="keyword", ignore_above=8191),
+        description="Name of the publication.",
+    )
+    alternate_names: Optional[list[StrictStr]] = Field(
+        default=None,
+        json_schema_extra=es_field(type="text"),
+        title="Alternate Names",
+        description="Other names or abbreviations of this publication.",
+    )
+    type: Optional[list[StrictStr]] = Field(
+        default=None,
+        json_schema_extra=es_field(type="keyword", ignore_above=8191),
+        description="Type of publication (journal article, conference, review,...).",
+    )
+    pages: Optional[StrictStr] = Field(
+        default=None,
+        json_schema_extra=es_field(type="text"),
+        description="Page range in the publication.",
+    )
+    issue: Optional[StrictStr] = Field(
+        default=None,
+        json_schema_extra=es_field(type="keyword", ignore_above=8191),
+        description="Publication issue (issue number).",
+    )
+    volume: Optional[StrictStr] = Field(
+        default=None,
+        json_schema_extra=es_field(type="keyword", ignore_above=8191),
+        description="Publication volume.",
+    )
+    url: Optional[AnyHttpUrl] = Field(
+        default=None,
+        json_schema_extra=es_field(type="keyword", ignore_above=8191),
+        description="URL on the publication site.",
+    )
 
-class FigureItem(DocItem):
-    data: FigureData
 
+class DescriptionLicense(BaseModel, extra="forbid"):
+    """License of a document: an optional code and an optional text."""
 
-class TableItem(DocItem):
-    data: TableData
+    code: Optional[StrictStr] = Field(
+        default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
+    )
+    text: Optional[StrictStr] = None
 
 
-class KeyValueItem(DocItem):
-    pass
+class CCSDocumentDescription(
+    AliasModel,
+    Generic[
+        DescriptionAdvancedT,
+        DescriptionAnalyticsT,
+        IdentifierTypeT,
+        LanguageT,
+        CollectionNameTypeT,
+    ],
+):
+    """Document description: bibliographic metadata, logs and collection info."""
 
-ContentItem = Union[TextItem, FigureItem, TableItem, KeyValueItem]
+    title: Optional[StrictStr] = None
+    abstract: Optional[list[StrictStr]] = None
+    authors: Optional[list[Author]] = None
+    affiliations: Optional[list[Affiliation]] = None
+    subjects: Optional[list[str]] = Field(
+        default=None,
+        json_schema_extra=es_field(
+            fields={"keyword": {"ignore_above": 8191, "type": "keyword"}}
+        ),
+    )
+    keywords: Optional[list[str]] = Field(
+        default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
+    )
+    publication_date: Optional[datetime] = None
+    languages: Optional[list[LanguageT]] = Field(
+        default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
+    )
+    license_: Optional[DescriptionLicense] = Field(default=None, alias="license")
+    publishers: Optional[list[StrictStr]] = Field(
+        default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
+    )
+    url_refs: Optional[list[str]] = Field(
+        default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
+    )
+    references: Optional[list[Identifier[IdentifierTypeT]]] = None
+    publication: Optional[list[Publication]] = Field(
+        default=None, description="List of publication journals or venues."
+    )
+    reference_count: Optional[NonNegativeInt] = Field(
+        default=None,
+        title="Reference Count",
+        description="Total number of documents referenced by this document.",
+        json_schema_extra=es_field(type="integer"),
+    )
+    citation_count: Optional[NonNegativeInt] = Field(
+        default=None,
+        title="Citation Count",
+        description=(
+            "Total number of citations that this document has received (number "
+            "of documents in whose bibliography this document appears)."
+        ),
+        json_schema_extra=es_field(type="integer"),
+    )
+    citation_date: Optional[datetime] = Field(
+        default=None,
+        title="Citation Count Date",
+        description="Last update date of the citation count.",
+    )
+    advanced: Optional[DescriptionAdvancedT] = None
+    analytics: Optional[DescriptionAnalyticsT] = None
+    logs: list[Log]  # the only field without a default
+    collection: Optional[CollectionDocumentInfo[CollectionNameTypeT]] = Field(
+        default=None, description="The collection information of this document."
+    )
+    acquisition: Optional[Acquisition] = Field(
+        default=None,
+        description=(
+            "Information on how the document was obtained, for data governance"
+            " purposes."
+        ),
+    )
 
 
-class DocumentContent(BaseModel):
-    furniture: List[RefItem] = []
-    body: List[RefItem] = []
-    texts: List[TextItem] = []
-    figures: List[FigureItem] = []
-    tables: List[TableItem] = []
-    key_value_items: List[KeyValueItem] = []
+class MinimalDocument(
+    AliasModel,
+    Generic[
+        DescriptionAdvancedT,
+        DescriptionAnalyticsT,
+        IdentifierTypeT,
+        LanguageT,
+        CollectionNameTypeT,
+    ],
+):
+    """Minimal document model: description, file info, main text, figures, tables."""
 
-class PageItem(DocumentContent):
-    hash: str  # page hash
-    size: Size
-    image: Optional[ImageRef]
-    num_elements: int
+    name: StrictStr = Field(alias="_name")
+    obj_type: Optional[StrictStr] = Field("document", alias="type")
+    description: CCSDocumentDescription[
+        DescriptionAdvancedT,
+        DescriptionAnalyticsT,
+        IdentifierTypeT,
+        LanguageT,
+        CollectionNameTypeT,
+    ]
+    file_info: FileInfoObject = Field(alias="file-info")
+    main_text: Optional[list[Union[Ref, BaseText]]] = Field(
+        default=None, alias="main-text"
+    )
+    figures: Optional[list[Figure]] = None
+    tables: Optional[list[Table]] = None
 
 
-class DoclingDocument(DocumentContent):
-    description: Any
-    file_info: Any
-    pages: Dict[int, PageItem] = {}  # empty as default
+class CCSDocument(
+    MinimalDocument,
+    Generic[
+        DescriptionAdvancedT,
+        DescriptionAnalyticsT,
+        IdentifierTypeT,
+        LanguageT,
+        CollectionNameTypeT,
+    ],
+):
+    """Model for a CCS-generated document."""
+
+    obj_type: Optional[StrictStr] = Field("pdf-document", alias="type")
+    bitmaps: Optional[list[BitmapObject]] = None
+    equations: Optional[list[BaseCell]] = None
+    footnotes: Optional[list[BaseText]] = None
+    file_info: CCSFileInfoObject = Field(alias="file-info")
+    main_text: Optional[list[Union[Ref, BaseText]]] = Field(
+        default=None,
+        alias="main-text",
+    )
+    page_dimensions: Optional[list[PageDimensions]] = Field(
+        default=None, alias="page-dimensions"
+    )
+    page_footers: Optional[list[BaseText]] = Field(default=None, alias="page-footers")
+    page_headers: Optional[list[BaseText]] = Field(default=None, alias="page-headers")
+    s3_data: Optional[S3Data] = Field(default=None, alias="_s3_data")
+
+    @model_validator(mode="before")
+    @classmethod
+    def from_dict(cls, data):
+        """Validates and fixes the input data."""
+        if not isinstance(data, dict):
+            return data
+        description_collection = data["description"].get("collection")
+        if not description_collection:
+            data["description"].setdefault("collection", {})
+
+        data["description"]["collection"].setdefault("type", "Document")
+        logs = data["description"].get("logs")
+        if not logs:
+            data["description"].setdefault("logs", [])
+
+        abstract = data["description"].get("abstract")
+        if abstract is not None and not isinstance(abstract, list):
+            if isinstance(abstract, str):
+                data["description"]["abstract"] = [abstract]
+            else:
+                data["description"].pop("abstract")
+
+        for key in ["affiliations", "authors"]:
+            descr = data["description"].get(key)
+            if descr is not None and not isinstance(descr, list):
+                if isinstance(descr, dict):
+                    data["description"][key] = [descr]
+                else:
+                    data["description"].pop(key)
+
+        if data.get("main-text"):
+            for item in data["main-text"]:
+                if ref := item.pop("__ref", None):
+                    item["$ref"] = ref
+
+        return data
+
+
+class DocumentToken(Enum):
+    """Class to represent an LLM friendly representation of a Document."""
+
+    BEG_DOCUMENT = "<document>"
+    END_DOCUMENT = "</document>"
+
+    BEG_TITLE = "<title>"
+    END_TITLE = "</title>"
+
+    BEG_ABSTRACT = "<abstract>"
+    END_ABSTRACT = "</abstract>"
+
+    BEG_DOI = "<doi>"
+    END_DOI = "</doi>"
+    BEG_DATE = "<date>"
+    END_DATE = "</date>"
+
+    BEG_AUTHORS = "<authors>"
+    END_AUTHORS = "</authors>"
+    BEG_AUTHOR = "<author>"
+    END_AUTHOR = "</author>"
+
+    BEG_AFFILIATIONS = "<affiliations>"
+    END_AFFILIATIONS = "</affiliations>"
+    BEG_AFFILIATION = "<affiliation>"
+    END_AFFILIATION = "</affiliation>"
+
+    BEG_HEADER = "<section-header>"
+    END_HEADER = "</section-header>"
+    BEG_TEXT = "<text>"
+    END_TEXT = "</text>"
+    BEG_PARAGRAPH = "<paragraph>"
+    END_PARAGRAPH = "</paragraph>"
+    BEG_TABLE = "<table>"
+    END_TABLE = "</table>"
+    BEG_FIGURE = "<figure>"
+    END_FIGURE = "</figure>"
+    BEG_CAPTION = "<caption>"
+    END_CAPTION = "</caption>"
+    BEG_EQUATION = "<equation>"
+    END_EQUATION = "</equation>"
+    BEG_LIST = "<list>"
+    END_LIST = "</list>"
+    BEG_LISTITEM = "<list-item>"
+    END_LISTITEM = "</list-item>"
+
+    BEG_LOCATION = "<location>"
+    END_LOCATION = "</location>"
+    BEG_GROUP = "<group>"
+    END_GROUP = "</group>"
+
+    @classmethod
+    def get_special_tokens(
+        cls,
+        max_rows: int = 100,
+        max_cols: int = 100,
+        max_pages: int = 1000,
+        page_dimension: Tuple[int, int] = (100, 100),
+    ):
+        """Function to get all special document tokens."""
+        special_tokens = [token.value for token in cls]
+
+        # Adding dynamically generated row and col tokens
+        for i in range(0, max_rows + 1):
+            special_tokens += [f"<row_{i}>", f"</row_{i}>"]
+
+        for i in range(0, max_cols + 1):
+            special_tokens += [f"<col_{i}>", f"</col_{i}>"]
+
+        for i in range(6):
+            special_tokens += [f"<section-header-{i}>", f"</section-header-{i}>"]
+
+        # Adding dynamically generated page-tokens
+        for i in range(0, max_pages + 1):
+            special_tokens.append(f"<page_{i}>")
+
+        # Adding dynamically generated location-tokens
+        for i in range(0, max(page_dimension[0] + 1, page_dimension[1] + 1)):
+            special_tokens.append(f"<loc_{i}>")
+
+        return special_tokens
+
+    @staticmethod
+    def get_page_token(page: int):
+        """Function to get page tokens."""
+        return f"<page_{page}>"
+
+    @staticmethod
+    def get_location_token(val: float, rnorm: int = 100):
+        """Function to get location tokens."""
+        val_ = round(rnorm * val)
+
+        if val_ < 0:
+            return "<loc_0>"
+
+        if val_ > rnorm:
+            return f"<loc_{rnorm}>"
+
+        return f"<loc_{val_}>"
+
+
+class ExportedCCSDocument(
+    MinimalDocument,
+    Generic[
+        DescriptionAdvancedT,
+        DescriptionAnalyticsT,
+        IdentifierTypeT,
+        LanguageT,
+        CollectionNameTypeT,
+    ],
+):
+    """Document model for Docling."""
+
+    obj_type: Optional[StrictStr] = Field(
+        "pdf-document",
+        alias="type",
+        json_schema_extra=es_field(type="keyword", ignore_above=8191),
+    )
+    bitmaps: Optional[list[BitmapObject]] = None
+    equations: Optional[list[BaseCell]] = None
+    footnotes: Optional[list[BaseText]] = None
+    description: CCSDocumentDescription[
+        DescriptionAdvancedT,
+        DescriptionAnalyticsT,
+        IdentifierTypeT,
+        LanguageT,
+        CollectionNameTypeT,
+    ]
+    file_info: CCSFileInfoObject = Field(alias="file-info")
+    main_text: Optional[list[Union[Ref, BaseText]]] = Field(
+        default=None, alias="main-text"
+    )
+    page_dimensions: Optional[list[PageDimensions]] = Field(
+        default=None, alias="page-dimensions"
+    )
+    page_footers: Optional[list[BaseText]] = Field(default=None, alias="page-footers")
+    page_headers: Optional[list[BaseText]] = Field(default=None, alias="page-headers")
+    s3_data: Optional[S3Data] = Field(default=None, alias="_s3_data")
+    identifiers: Optional[list[Identifier[IdentifierTypeT]]] = None
+
+    @model_validator(mode="before")
+    @classmethod
+    def from_dict(cls, data):
+        """Fix ref in main-text."""
+        if not isinstance(data, dict):
+            return data
+        if data.get("main-text"):
+            for item in data["main-text"]:
+                if ref := item.pop("__ref", None):
+                    item["$ref"] = ref
+
+        return data
+
+    def _resolve_ref(self, item: Ref) -> Optional[Union[BaseCell, BaseText]]:
+        """Return the resolved reference.
+
+        Resolved the Ref object within the document.
+        If the object is not found, None is returned.
+        """
+        result: Optional[Union[BaseCell, BaseText]] = None
+
+        # NOTE: currently only resolves refs explicitely, such that we can make
+        # assumptions on ref parts
+        if item.obj_type == "table" and self.tables:
+            parts = item.ref.split("/")
+            result = self.tables[int(parts[2])]
+        elif item.obj_type == "figure" and self.figures:
+            parts = item.ref.split("/")
+            result = self.figures[int(parts[2])]
+        elif item.obj_type == "equation" and self.equations:
+            parts = item.ref.split("/")
+            result = self.equations[int(parts[2])]
+        elif item.obj_type == "footnote" and self.footnotes:
+            parts = item.ref.split("/")
+            result = self.footnotes[int(parts[2])]
+
+        return result
+
+    def export_to_markdown(
+        self,
+        delim: str = "\n\n",
+        main_text_start: int = 0,
+        main_text_stop: Optional[int] = None,
+        main_text_labels: list[str] = [
+            "title",
+            "subtitle-level-1",
+            "paragraph",
+            "caption",
+            "table",
+        ],
+        strict_text: bool = False,
+    ) -> str:
+        r"""Serialize to Markdown.
+
+        Operates on a slice of the document's main_text as defined through arguments
+        main_text_start and main_text_stop; defaulting to the whole main_text.
+
+        Args:
+            delim (str, optional): Delimiter to use when concatenating the various
+                Markdown parts. Defaults to "\n\n".
+            main_text_start (int, optional): Main-text slicing start index (inclusive).
+                Defaults to 0.
+            main_text_end (Optional[int], optional): Main-text slicing stop index
+                (exclusive). Defaults to None.
+
+        Returns:
+            str: The exported Markdown representation.
+        """
+        has_title = False
+        prev_text = ""
+        md_texts: list[str] = []
+
+        if self.main_text is not None:
+            for orig_item in self.main_text[main_text_start:main_text_stop]:
+                markdown_text = ""
+
+                item = (
+                    self._resolve_ref(orig_item)
+                    if isinstance(orig_item, Ref)
+                    else orig_item
+                )
+                if item is None:
+                    continue
+
+                item_type = item.obj_type
+                if isinstance(item, BaseText) and item_type in main_text_labels:
+                    text = item.text
+
+                    # ignore repeated text
+                    if prev_text == text:
+                        continue
+                    else:
+                        prev_text = text
+
+                    # first title match
+                    if item_type == "title" and not has_title:
+                        if strict_text:
+                            markdown_text = f"{text}"
+                        else:
+                            markdown_text = f"# {text}"
+                        has_title = True
+
+                    # secondary titles
+                    elif item_type in {"title", "subtitle-level-1"} or (
+                        has_title and item_type == "title"
+                    ):
+                        if strict_text:
+                            markdown_text = f"{text}"
+                        else:
+                            markdown_text = f"## {text}"
+
+                    # normal text
+                    else:
+                        markdown_text = text
+
+                elif (
+                    isinstance(item, Table)
+                    and item.data
+                    and item_type in main_text_labels
+                    and not strict_text
+                ):
+                    table = []
+                    for row in item.data:
+                        tmp = []
+                        for col in row:
+                            tmp.append(col.text)
+                        table.append(tmp)
+
+                    if len(table) > 1 and len(table[0]) > 0:
+                        try:
+                            md_table = tabulate(
+                                table[1:], headers=table[0], tablefmt="github"
+                            )
+                        except ValueError:
+                            md_table = tabulate(
+                                table[1:],
+                                headers=table[0],
+                                tablefmt="github",
+                                disable_numparse=True,
+                            )
+
+                        markdown_text = md_table
+
+                if markdown_text:
+                    md_texts.append(markdown_text)
+
+        result = delim.join(md_texts)
+        return result
+
+    def export_to_document_tokens(
+        self,
+        delim: str = "\n\n",
+        main_text_start: int = 0,
+        main_text_stop: Optional[int] = None,
+        main_text_labels: list[str] = [
+            "title",
+            "subtitle-level-1",
+            "paragraph",
+            "caption",
+            "table",
+            "figure",
+        ],
+        page_tagging: bool = True,
+        location_tagging: bool = True,
+        location_dimensions: Tuple[int, int] = (100, 100),
+        add_new_line: bool = True,
+    ) -> str:
+        r"""Exports the document content to an DocumentToken format.
+
+        Operates on a slice of the document's main_text as defined through arguments
+        main_text_start and main_text_stop; defaulting to the whole main_text.
+
+        Args:
+            delim (str, optional): The delimiter used to separate text blocks in the
+                exported XML. Default is two newline characters ("\n\n").
+            main_text_start (int, optional): The starting index of the main text to
+                be included in the XML. Default is 0 (the beginning of the text).
+            main_text_stop (Optional[int], optional): The stopping index of the main
+                text. If set to None, the export includes text up to the end.
+                Default is None.
+            main_text_labels (list[str], optional): A list of text labels that
+                categorize the different sections of the document (e.g., "title",
+                "subtitle-level-1", "paragraph", "caption"). Default labels are
+                "title", "subtitle-level-1", "paragraph", and "caption".
+            location_tagging (bool, optional): Determines whether to include
+                location-based tagging in the XML. If True, the exported XML will
+                contain information about the locations of the text elements.
+                Default is True.
+            location_dimensions (Tuple[int, int], optional): Specifies the dimensions
+                (width and height) for the location tagging, if enabled.
+                Default is [100, 100].
+            add_new_line (bool, optional): Whether to add new line characters after
+                each text block. If True, a new line is added after each block of
+                text in the XML. Default is True.
+
+        Returns:
+            str: The content of the document formatted as an XML string.
+        """
+        xml_str = DocumentToken.BEG_DOCUMENT.value
+
+        new_line = ""
+        if add_new_line:
+            new_line = "\n"
+
+        if self.main_text is not None:
+            for orig_item in self.main_text[main_text_start:main_text_stop]:
+
+                item = (
+                    self._resolve_ref(orig_item)
+                    if isinstance(orig_item, Ref)
+                    else orig_item
+                )
+
+                if item is None:
+                    continue
+
+                prov = item.prov
+
+                loc_str = ""  # default is zero
+                if (
+                    location_tagging
+                    and self.page_dimensions is not None
+                    and prov is not None
+                    and len(prov) > 0
+                ):
+
+                    page = prov[0].page
+                    page_dim = self.page_dimensions[page - 1]
+
+                    page_w = float(page_dim.width)
+                    page_h = float(page_dim.height)
+
+                    x0 = float(prov[0].bbox[0]) / float(page_w)
+                    y0 = float(prov[0].bbox[1]) / float(page_h)
+                    x1 = float(prov[0].bbox[2]) / float(page_w)
+                    y1 = float(prov[0].bbox[3]) / float(page_h)
+
+                    page_tok = ""
+                    if page_tagging:
+                        page_tok = DocumentToken.get_page_token(page=page)
+
+                    x0_tok = DocumentToken.get_location_token(
+                        val=min(x0, x1), rnorm=location_dimensions[0]
+                    )
+                    y0_tok = DocumentToken.get_location_token(
+                        val=min(y0, y1), rnorm=location_dimensions[1]
+                    )
+                    x1_tok = DocumentToken.get_location_token(
+                        val=max(x0, x1), rnorm=location_dimensions[0]
+                    )
+                    y1_tok = DocumentToken.get_location_token(
+                        val=max(y0, y1), rnorm=location_dimensions[1]
+                    )
+
+                    # update
+                    loc_str = f"{DocumentToken.BEG_LOCATION.value}"
+                    loc_str += f"{page_tok}"
+                    loc_str += f"{x0_tok}{y0_tok}{x1_tok}{y1_tok}"
+                    loc_str += f"{DocumentToken.END_LOCATION.value}"
+
+                item_type = item.obj_type
+                if isinstance(item, BaseText) and (item_type in main_text_labels):
+                    text = item.text
+
+                    xml_str += f"<{item_type}>{loc_str}{text}</{item_type}>{new_line}"
+
+                elif isinstance(item, Table) and (item_type in main_text_labels):
+
+                    xml_str += f"<{item_type}>{loc_str}"
+
+                    if item.text is not None and len(item.text) > 0:
+                        xml_str += f"{DocumentToken.BEG_CAPTION.value}"
+                        xml_str += (
+                            f"{item.text}{DocumentToken.END_CAPTION.value}{new_line}"
+                        )
+
+                    if item.data is not None and len(item.data) > 0:
+                        for i, row in enumerate(item.data):
+                            xml_str += f"<row_{i}>"
+                            for j, col in enumerate(row):
+                                text = col.text
+                                xml_str += f"<col_{j}>{text}</col_{j}>"
+
+                            xml_str += f"</row_{i}>{new_line}"
+
+                    xml_str += f"</{item_type}>{new_line}"
+
+                elif isinstance(item, Figure) and (item_type in main_text_labels):
+
+                    xml_str += f"<{item_type}>{loc_str}"
+
+                    if item.text is not None and len(item.text) > 0:
+                        xml_str += f"{DocumentToken.BEG_CAPTION.value}"
+                        xml_str += (
+                            f"{item.text}{DocumentToken.END_CAPTION.value}{new_line}"
+                        )
+
+                    xml_str += f"</{item_type}>{new_line}"
+
+        xml_str += DocumentToken.END_DOCUMENT.value
+
+        return xml_str
\ No newline at end of file
diff --git a/docling_core/types/legacy/__init__.py b/docling_core/types/experimental/__init__.py
similarity index 100%
rename from docling_core/types/legacy/__init__.py
rename to docling_core/types/experimental/__init__.py
diff --git a/docling_core/types/experimental/base.py b/docling_core/types/experimental/base.py
new file mode 100644
index 0000000..b082ea6
--- /dev/null
+++ b/docling_core/types/experimental/base.py
@@ -0,0 +1,173 @@
+import copy
+from enum import Enum
+from typing import Tuple
+
+from pydantic import BaseModel
+
+
+## All copied from docling
+class CoordOrigin(str, Enum):
+    """Page corner that vertical bounding-box coordinates are measured from."""
+
+    TOPLEFT = "TOPLEFT"
+    BOTTOMLEFT = "BOTTOMLEFT"
+
+
+class Size(BaseModel):
+    """Width and height of a rectangular area; both default to 0.0."""
+
+    width: float = 0.0
+    height: float = 0.0
+
+
+class BoundingBox(BaseModel):
+    """Axis-aligned box described by its left, top, right and bottom edges.
+
+    ``coord_origin`` tells whether ``t`` and ``b`` are measured from the top or
+    from the bottom of the page; ``TOPLEFT`` is the default.
+    """
+
+    l: float  # left
+    t: float  # top
+    r: float  # right
+    b: float  # bottom
+
+    coord_origin: CoordOrigin = CoordOrigin.TOPLEFT
+
+    @property
+    def width(self) -> float:
+        """Horizontal extent ``r - l`` (negative if the edges are swapped)."""
+        return self.r - self.l
+
+    @property
+    def height(self) -> float:
+        """Absolute vertical extent, independent of the coordinate origin."""
+        return abs(self.t - self.b)
+
+    def scaled(self, scale: float) -> "BoundingBox":
+        """Return a copy with all four edges multiplied by ``scale``."""
+        out_bbox = copy.deepcopy(self)
+        out_bbox.l *= scale
+        out_bbox.r *= scale
+        out_bbox.t *= scale
+        out_bbox.b *= scale
+
+        return out_bbox
+
+    def normalized(self, page_size: Size) -> "BoundingBox":
+        """Return a copy with edges divided by the page width and height.
+
+        Raises:
+            ZeroDivisionError: If ``page_size`` has a zero width or height
+                (which is what a default-constructed ``Size`` has).
+        """
+        out_bbox = copy.deepcopy(self)
+        out_bbox.l /= page_size.width
+        out_bbox.r /= page_size.width
+        out_bbox.t /= page_size.height
+        out_bbox.b /= page_size.height
+
+        return out_bbox
+
+    def as_tuple(self):
+        """Return the edges as a 4-tuple.
+
+        The order is ``(l, t, r, b)`` for a top-left origin and ``(l, b, r, t)``
+        for a bottom-left origin.
+        """
+        if self.coord_origin == CoordOrigin.TOPLEFT:
+            return (self.l, self.t, self.r, self.b)
+        elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
+            return (self.l, self.b, self.r, self.t)
+
+    @classmethod
+    def from_tuple(cls, coord: Tuple[float, ...], origin: CoordOrigin):
+        """Build a box from four coordinates, swapping edges given in reverse.
+
+        The first four entries of ``coord`` are read as ``(l, t, r, b)`` for a
+        top-left origin and as ``(l, b, r, t)`` for a bottom-left origin.
+        """
+        if origin == CoordOrigin.TOPLEFT:
+            l, t, r, b = coord[0], coord[1], coord[2], coord[3]
+            if r < l:
+                l, r = r, l
+            if b < t:
+                b, t = t, b
+
+            return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
+        elif origin == CoordOrigin.BOTTOMLEFT:
+            l, b, r, t = coord[0], coord[1], coord[2], coord[3]
+            if r < l:
+                l, r = r, l
+            if b > t:
+                b, t = t, b
+
+            return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
+
+    def area(self) -> float:
+        """Return the area of the box, honoring the coordinate origin."""
+        # With a bottom-left origin ``t`` is the larger coordinate, so the
+        # vertical extent is ``t - b`` rather than ``b - t``.
+        if self.coord_origin == CoordOrigin.BOTTOMLEFT:
+            return (self.r - self.l) * (self.t - self.b)
+        return (self.r - self.l) * (self.b - self.t)
+
+    def intersection_area_with(self, other: "BoundingBox") -> float:
+        """Return the overlap area with ``other``, or 0.0 if they do not overlap.
+
+        Both boxes are assumed to share this box's ``coord_origin``; no
+        conversion is performed.
+        """
+        # Calculate intersection coordinates
+        left = max(self.l, other.l)
+        right = min(self.r, other.r)
+
+        # The top edge has the smaller y for a top-left origin and the larger
+        # y for a bottom-left origin, so the vertical overlap flips with it.
+        if self.coord_origin == CoordOrigin.BOTTOMLEFT:
+            height = min(self.t, other.t) - max(self.b, other.b)
+        else:
+            height = min(self.b, other.b) - max(self.t, other.t)
+
+        # Calculate intersection dimensions
+        width = right - left
+
+        # If the bounding boxes do not overlap, width or height will be negative
+        if width <= 0 or height <= 0:
+            return 0.0
+
+        return width * height
+
+    def to_bottom_left_origin(self, page_height) -> "BoundingBox":
+        """Return the box in bottom-left coordinates for a page of ``page_height``.
+
+        A box that already uses a bottom-left origin is returned as-is (the same
+        object, not a copy).
+        """
+        if self.coord_origin == CoordOrigin.BOTTOMLEFT:
+            return self
+        elif self.coord_origin == CoordOrigin.TOPLEFT:
+            return BoundingBox(
+                l=self.l,
+                r=self.r,
+                t=page_height - self.t,
+                b=page_height - self.b,
+                coord_origin=CoordOrigin.BOTTOMLEFT,
+            )
+
+    def to_top_left_origin(self, page_height):
+        """Return the box in top-left coordinates for a page of ``page_height``.
+
+        A box that already uses a top-left origin is returned as-is (the same
+        object, not a copy).
+        """
+        if self.coord_origin == CoordOrigin.TOPLEFT:
+            return self
+        elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
+            return BoundingBox(
+                l=self.l,
+                r=self.r,
+                t=page_height - self.t,  # self.b
+                b=page_height - self.b,  # self.t
+                coord_origin=CoordOrigin.TOPLEFT,
+            )
diff --git a/docling_core/types/experimental/document.py b/docling_core/types/experimental/document.py
new file mode 100644
index 0000000..6f53007
--- /dev/null
+++ b/docling_core/types/experimental/document.py
@@ -0,0 +1,125 @@
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+from pydantic import AnyUrl, BaseModel, Field
+
+from docling_core.types.experimental.base import BoundingBox, Size
+
+
+class FigureData(BaseModel):  # TBD
+    pass
+
+
+class TableData(BaseModel):  # TBD
+    pass
+
+
+class RefItem(BaseModel):
+    """Reference to another object of a document, stored under ``$ref``."""
+
+    cref: str = Field(alias="$ref")
+
+    def resolve(self, doc: "DoclingDocument"):
+        """Return the object of ``doc`` that this reference points to.
+
+        ``cref`` must have exactly three ``/``-separated parts: the first is
+        ignored, the second names an attribute of ``doc`` and the third is an
+        integer index into that attribute.
+
+        Raises:
+            ValueError: If ``cref`` does not have three parts or its last part
+                is not an integer.
+        """
+        parts = self.cref.split("/")
+        if len(parts) != 3:
+            raise ValueError(f"Malformed reference: {self.cref!r}")
+        _, path, index_str = parts
+        index = int(index_str)
+        return getattr(doc, path)[index]
+
+
+class ImageRef(BaseModel):
+    """Image description: format, dpi, size and URI."""
+
+    format: str  # png, etc.
+    dpi: int  # ...
+    size: Size
+    uri: AnyUrl
+
+
+class ProvenanceItem(BaseModel):
+    """Page number, bounding box and character span; used by ``DocItem.prov``."""
+
+    page_no: int
+    bbox: BoundingBox
+    charspan: Tuple[int, int]
+
+
+class DocItem(BaseModel):
+    """Fields common to every item type defined in this module."""
+
+    dloc: str  # format spec ({document_hash}{json-path})
+    hash: int
+    label: str
+    parent: Optional[RefItem]
+    children: List[RefItem]
+    prov: List[ProvenanceItem]
+
+
+class TextItem(DocItem):
+    """Item carrying text in both its untreated and its sanitized form."""
+
+    orig: str  # untreated representation
+    text: str  # sanitized representation
+
+
+# NOTE(review): nothing in this module derives from FloatingItem; FigureItem and
+# TableItem extend DocItem directly -- confirm whether that is intended.
+class FloatingItem(DocItem):
+    caption: Optional[Union[RefItem, TextItem]]
+    references: List[Union[RefItem, TextItem]]
+    footnotes: List[Union[RefItem, TextItem]]
+    data: Any
+    image: Optional[ImageRef]
+
+
+class FigureItem(DocItem):
+    data: FigureData
+
+
+class TableItem(DocItem):
+    data: TableData
+
+
+class KeyValueItem(DocItem):
+    pass
+
+
+ContentItem = Union[TextItem, FigureItem, TableItem, KeyValueItem]
+
+
+class DocumentContent(BaseModel):
+    """Reference and item lists shared by pages and documents; all default empty."""
+
+    furniture: List[RefItem] = []
+    body: List[RefItem] = []
+    texts: List[TextItem] = []
+    figures: List[FigureItem] = []
+    tables: List[TableItem] = []
+    key_value_items: List[KeyValueItem] = []
+
+
+class PageItem(DocumentContent):
+    """Page entry: the content lists plus hash, size, image and element count."""
+
+    hash: str  # page hash
+    size: Size
+    image: Optional[ImageRef]
+    num_elements: int
+
+
+class DoclingDocument(DocumentContent):
+    """Document: the content lists plus description, file info and pages."""
+
+    description: Any
+    file_info: Any
+    pages: Dict[int, PageItem] = {}  # empty as default
diff --git a/docling_core/types/legacy/base.py b/docling_core/types/legacy/base.py
deleted file mode 100644
index fa61cbf..0000000
--- a/docling_core/types/legacy/base.py
+++ /dev/null
@@ -1,293 +0,0 @@
-#
-# Copyright IBM Corp. 2024 - 2024
-# SPDX-License-Identifier: MIT
-#
-
-"""Define common models across CCS objects."""
-from typing import Annotated, List, Literal, Optional, Union
-
-import pandas as pd
-from pydantic import BaseModel, Field, PositiveInt, StrictStr
-
-from docling_core.search.mapping import es_field
-from docling_core.utils.alias import AliasModel
-
-CellData = tuple[float, float, float, float, str, str]
-
-CellHeader = tuple[
-    Literal["x0"],
-    Literal["y0"],
-    Literal["x1"],
-    Literal["y1"],
-    Literal["font"],
-    Literal["text"],
-]
-
-BoundingBox = Annotated[list[float], Field(min_length=4, max_length=4)]
-
-Span = Annotated[list[int], Field(min_length=2, max_length=2)]
-
-
-class CellsContainer(BaseModel):
-    """Cell container."""
-
-    data: Optional[list[CellData]] = None
-    header: CellHeader = ("x0", "y0", "x1", "y1", "font", "text")
-
-
-class S3Resource(BaseModel):
-    """Resource in a cloud object storage."""
-
-    mime: str
-    path: str
-    page: Optional[PositiveInt] = None
-
-
-class S3Data(AliasModel):
-    """Data object in a cloud object storage."""
-
-    pdf_document: Optional[list[S3Resource]] = Field(default=None, alias="pdf-document")
-    pdf_pages: Optional[list[S3Resource]] = Field(default=None, alias="pdf-pages")
-    pdf_images: Optional[list[S3Resource]] = Field(default=None, alias="pdf-images")
-    json_document: Optional[S3Resource] = Field(default=None, alias="json-document")
-    json_meta: Optional[S3Resource] = Field(default=None, alias="json-meta")
-    glm_json_document: Optional[S3Resource] = Field(
-        default=None, alias="glm-json-document"
-    )
-    figures: Optional[list[S3Resource]] = None
-
-
-class S3Reference(AliasModel):
-    """References an s3 resource."""
-
-    ref_s3_data: StrictStr = Field(
-        alias="__ref_s3_data", examples=["#/_s3_data/figures/0"]
-    )
-
-
-class Prov(AliasModel):
-    """Provenance."""
-
-    bbox: BoundingBox
-    page: PositiveInt
-    span: Span
-    ref_s3_data: Optional[StrictStr] = Field(
-        default=None, alias="__ref_s3_data", json_schema_extra=es_field(suppress=True)
-    )
-
-
-class BoundingBoxContainer(BaseModel):
-    """Bounding box container."""
-
-    min: BoundingBox
-    max: BoundingBox
-
-
-class BitmapObject(AliasModel):
-    """Bitmap object."""
-
-    obj_type: str = Field(alias="type")
-    bounding_box: BoundingBoxContainer = Field(
-        json_schema_extra=es_field(suppress=True)
-    )
-    prov: Prov
-
-
-class PageDimensions(BaseModel):
-    """Page dimensions."""
-
-    height: float
-    page: PositiveInt
-    width: float
-
-
-class TableCell(AliasModel):
-    """Table cell."""
-
-    bbox: Optional[BoundingBox] = None
-    spans: Optional[list[Span]] = None
-    text: str = Field(json_schema_extra=es_field(term_vector="with_positions_offsets"))
-    obj_type: str = Field(alias="type")
-
-
-class GlmTableCell(TableCell):
-    """Glm Table cell."""
-
-    col: Optional[int] = Field(default=None, json_schema_extra=es_field(suppress=True))
-    col_header: bool = Field(
-        default=False, alias="col-header", json_schema_extra=es_field(suppress=True)
-    )
-    col_span: Optional[Span] = Field(
-        default=None, alias="col-span", json_schema_extra=es_field(suppress=True)
-    )
-    row: Optional[int] = Field(default=None, json_schema_extra=es_field(suppress=True))
-    row_header: bool = Field(
-        default=False, alias="row-header", json_schema_extra=es_field(suppress=True)
-    )
-    row_span: Optional[Span] = Field(
-        default=None, alias="row-span", json_schema_extra=es_field(suppress=True)
-    )
-
-
-class BaseCell(AliasModel):
-    """Base cell."""
-
-    # FIXME: we need to check why we have bounding_box (this should be in prov)
-    bounding_box: Optional[BoundingBoxContainer] = Field(
-        default=None, alias="bounding-box", json_schema_extra=es_field(suppress=True)
-    )
-    prov: Optional[list[Prov]] = None
-    text: Optional[str] = Field(
-        default=None, json_schema_extra=es_field(term_vector="with_positions_offsets")
-    )
-    obj_type: str = Field(
-        alias="type", json_schema_extra=es_field(type="keyword", ignore_above=8191)
-    )
-
-
-class Table(BaseCell):
-    """Table."""
-
-    num_cols: int = Field(alias="#-cols")
-    num_rows: int = Field(alias="#-rows")
-    data: Optional[list[list[Union[GlmTableCell, TableCell]]]] = None
-    model: Optional[str] = None
-
-    def _get_tablecell_span(self, cell: TableCell, ix: int):
-        if cell.spans is None:
-            span = set()
-        else:
-            span = set([s[ix] for s in cell.spans])
-        if len(span) == 0:
-            return 1, None, None
-        return len(span), min(span), max(span)
-
-    def export_to_dataframe(self) -> pd.DataFrame:
-        """Export the table as a Pandas DataFrame."""
-        if self.data is None or self.num_rows == 0 or self.num_cols == 0:
-            return pd.DataFrame()
-
-        # Count how many rows are column headers
-        num_headers = 0
-        for i, row in enumerate(self.data):
-            if len(row) == 0:
-                raise RuntimeError(f"Invalid table. {len(row)=} but {self.num_cols=}.")
-
-            any_header = False
-            for cell in row:
-                if cell.obj_type == "col_header":
-                    any_header = True
-                    break
-
-            if any_header:
-                num_headers += 1
-            else:
-                break
-
-        # Create the column names from all col_headers
-        columns: Optional[List[str]] = None
-        if num_headers > 0:
-            columns = ["" for _ in range(self.num_cols)]
-            for i in range(num_headers):
-                for j, cell in enumerate(self.data[i]):
-                    col_name = cell.text
-                    if columns[j] != "":
-                        col_name = f".{col_name}"
-                    columns[j] += col_name
-
-        # Create table data
-        table_data = [[cell.text for cell in row] for row in self.data[num_headers:]]
-
-        # Create DataFrame
-        df = pd.DataFrame(table_data, columns=columns)
-
-        return df
-
-    def export_to_html(self) -> str:
-        """Export the table as html."""
-        body = ""
-        nrows = self.num_rows
-        ncols = self.num_cols
-
-        if self.data is None:
-            return ""
-        for i in range(nrows):
-            body += "<tr>"
-            for j in range(ncols):
-                cell: TableCell = self.data[i][j]
-
-                rowspan, rowstart, rowend = self._get_tablecell_span(cell, 0)
-                colspan, colstart, colend = self._get_tablecell_span(cell, 1)
-
-                if rowstart is not None and rowstart != i:
-                    continue
-                if colstart is not None and colstart != j:
-                    continue
-
-                if rowstart is None:
-                    rowstart = i
-                if colstart is None:
-                    colstart = j
-
-                content = cell.text.strip()
-                label = cell.obj_type
-                celltag = "td"
-                if label in ["row_header", "row_multi_header", "row_title"]:
-                    pass
-                elif label in ["col_header", "col_multi_header"]:
-                    celltag = "th"
-
-                opening_tag = f"{celltag}"
-                if rowspan > 1:
-                    opening_tag += f' rowspan="{rowspan}"'
-                if colspan > 1:
-                    opening_tag += f' colspan="{colspan}"'
-
-                body += f"<{opening_tag}>{content}</{celltag}>"
-            body += "</tr>"
-        body = f"<table>{body}</table>"
-
-        return body
-
-
-# FIXME: let's add some figure specific data-types later
-class Figure(BaseCell):
-    """Figure."""
-
-
-class BaseText(AliasModel):
-    """Base model for text objects."""
-
-    text: StrictStr = Field(
-        json_schema_extra=es_field(term_vector="with_positions_offsets")
-    )
-    obj_type: StrictStr = Field(
-        alias="type", json_schema_extra=es_field(type="keyword", ignore_above=8191)
-    )
-    name: Optional[StrictStr] = Field(
-        default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
-    )
-    font: Optional[str] = None
-    prov: Optional[list[Prov]] = None
-
-
-class ListItem(BaseText):
-    """List item."""
-
-    identifier: str
-
-
-class Ref(AliasModel):
-    """Reference."""
-
-    name: str
-    obj_type: str = Field(alias="type")
-    ref: str = Field(alias="$ref")
-
-
-class PageReference(BaseModel):
-    """Page reference."""
-
-    hash: str = Field(json_schema_extra=es_field(type="keyword", ignore_above=8191))
-    model: str = Field(json_schema_extra=es_field(suppress=True))
-    page: PositiveInt = Field(json_schema_extra=es_field(type="short"))
\ No newline at end of file
diff --git a/docling_core/types/legacy/document.py b/docling_core/types/legacy/document.py
deleted file mode 100644
index 36ffa4b..0000000
--- a/docling_core/types/legacy/document.py
+++ /dev/null
@@ -1,791 +0,0 @@
-#
-# Copyright IBM Corp. 2024 - 2024
-# SPDX-License-Identifier: MIT
-#
-
-"""Models for the Docling Document data type."""
-
-from datetime import datetime
-from enum import Enum
-from typing import Generic, Optional, Tuple, Union
-
-from pydantic import (
-    AnyHttpUrl,
-    BaseModel,
-    Field,
-    NonNegativeInt,
-    StrictStr,
-    model_validator,
-)
-from tabulate import tabulate
-
-from docling_core.search.mapping import es_field
-from docling_core.types.base import (
-    Acquisition,
-    CollectionDocumentInfo,
-    CollectionNameTypeT,
-    DescriptionAdvancedT,
-    DescriptionAnalyticsT,
-    FileInfoObject,
-    Identifier,
-    IdentifierTypeT,
-    LanguageT,
-    Log,
-)
-from docling_core.types.legacy.base import (
-    BaseCell,
-    BaseText,
-    BitmapObject,
-    Figure,
-    PageDimensions,
-    PageReference,
-    Ref,
-    S3Data,
-    Table,
-)
-from docling_core.utils.alias import AliasModel
-
-
-class CCSFileInfoDescription(BaseModel, extra="forbid"):
-    """File info description."""
-
-    author: Optional[list[StrictStr]] = None
-    keywords: Optional[str] = None
-    subject: Optional[str] = None
-    title: Optional[StrictStr] = None
-    creation_date: Optional[str] = None  # datetime
-
-
-class CCSFileInfoObject(FileInfoObject, extra="forbid"):
-    """File info object."""
-
-    num_pages: Optional[int] = Field(default=None, alias="#-pages")
-
-    collection_name: Optional[str] = Field(
-        default=None,
-        alias="collection-name",
-        json_schema_extra=es_field(type="keyword", ignore_above=8191),
-    )
-    description: Optional[CCSFileInfoDescription] = Field(
-        default=None, json_schema_extra=es_field(suppress=True)
-    )
-    page_hashes: Optional[list[PageReference]] = Field(
-        default=None, alias="page-hashes"
-    )
-
-
-class Affiliation(BaseModel, extra="forbid"):
-    """Affiliation."""
-
-    name: str = Field(
-        ...,
-        json_schema_extra=es_field(
-            fields={
-                "lower": {
-                    "normalizer": "lowercase_asciifolding",
-                    "type": "keyword",
-                    "ignore_above": 8191,
-                },
-                "keyword": {"type": "keyword", "ignore_above": 8191},
-            },
-        ),
-    )
-    id: Optional[str] = Field(
-        default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
-    )
-    source: Optional[str] = Field(
-        default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
-    )
-
-
-class Author(BaseModel, extra="forbid"):
-    """Author."""
-
-    name: str = Field(
-        ...,
-        json_schema_extra=es_field(
-            type="text",
-            fields={
-                "lower": {
-                    "normalizer": "lowercase_asciifolding",
-                    "type": "keyword",
-                    "ignore_above": 8191,
-                },
-                "keyword": {"type": "keyword", "ignore_above": 8191},
-            },
-        ),
-    )
-    id: Optional[str] = Field(
-        default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
-    )
-    source: Optional[str] = Field(
-        default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
-    )
-    affiliations: Optional[list[Affiliation]] = None
-
-
-class Publication(BaseModel, Generic[IdentifierTypeT], extra="forbid"):
-    """Publication details of a journal or venue."""
-
-    identifiers: Optional[list[Identifier[IdentifierTypeT]]] = Field(
-        default=None,
-        description="Unique identifiers of a publication venue.",
-    )
-    name: StrictStr = Field(
-        json_schema_extra=es_field(type="keyword", ignore_above=8191),
-        description="Name of the publication.",
-    )
-    alternate_names: Optional[list[StrictStr]] = Field(
-        default=None,
-        json_schema_extra=es_field(type="text"),
-        title="Alternate Names",
-        description="Other names or abbreviations of this publication.",
-    )
-    type: Optional[list[StrictStr]] = Field(
-        default=None,
-        json_schema_extra=es_field(type="keyword", ignore_above=8191),
-        description="Type of publication (journal article, conference, review,...).",
-    )
-    pages: Optional[StrictStr] = Field(
-        default=None,
-        json_schema_extra=es_field(type="text"),
-        description="Page range in the publication.",
-    )
-    issue: Optional[StrictStr] = Field(
-        default=None,
-        json_schema_extra=es_field(type="keyword", ignore_above=8191),
-        description="Publication issue (issue number).",
-    )
-    volume: Optional[StrictStr] = Field(
-        default=None,
-        json_schema_extra=es_field(type="keyword", ignore_above=8191),
-        description="Publication volume.",
-    )
-    url: Optional[AnyHttpUrl] = Field(
-        default=None,
-        json_schema_extra=es_field(type="keyword", ignore_above=8191),
-        description="URL on the publication site.",
-    )
-
-
-class DescriptionLicense(BaseModel, extra="forbid"):
-    """Licence in document description."""
-
-    code: Optional[StrictStr] = Field(
-        default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
-    )
-    text: Optional[StrictStr] = None
-
-
-class CCSDocumentDescription(
-    AliasModel,
-    Generic[
-        DescriptionAdvancedT,
-        DescriptionAnalyticsT,
-        IdentifierTypeT,
-        LanguageT,
-        CollectionNameTypeT,
-    ],
-):
-    """Description in document."""
-
-    title: Optional[StrictStr] = None
-    abstract: Optional[list[StrictStr]] = None
-    authors: Optional[list[Author]] = None
-    affiliations: Optional[list[Affiliation]] = None
-    subjects: Optional[list[str]] = Field(
-        default=None,
-        json_schema_extra=es_field(
-            fields={"keyword": {"ignore_above": 8191, "type": "keyword"}}
-        ),
-    )
-    keywords: Optional[list[str]] = Field(
-        default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
-    )
-    publication_date: Optional[datetime] = None
-    languages: Optional[list[LanguageT]] = Field(
-        default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
-    )
-    license_: Optional[DescriptionLicense] = Field(default=None, alias="license")
-    publishers: Optional[list[StrictStr]] = Field(
-        default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
-    )
-    url_refs: Optional[list[str]] = Field(
-        default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
-    )
-    references: Optional[list[Identifier[IdentifierTypeT]]] = None
-    publication: Optional[list[Publication]] = Field(
-        default=None, description="List of publication journals or venues."
-    )
-    reference_count: Optional[NonNegativeInt] = Field(
-        default=None,
-        title="Reference Count",
-        description="Total number of documents referenced by this document.",
-        json_schema_extra=es_field(type="integer"),
-    )
-    citation_count: Optional[NonNegativeInt] = Field(
-        default=None,
-        title="Citation Count",
-        description=(
-            "Total number of citations that this document has received (number "
-            "of documents in whose bibliography this document appears)."
-        ),
-        json_schema_extra=es_field(type="integer"),
-    )
-    citation_date: Optional[datetime] = Field(
-        default=None,
-        title="Citation Count Date",
-        description="Last update date of the citation count.",
-    )
-    advanced: Optional[DescriptionAdvancedT] = None
-    analytics: Optional[DescriptionAnalyticsT] = None
-    logs: list[Log]
-    collection: Optional[CollectionDocumentInfo[CollectionNameTypeT]] = Field(
-        default=None, description="The collection information of this document."
-    )
-    acquisition: Optional[Acquisition] = Field(
-        default=None,
-        description=(
-            "Information on how the document was obtained, for data governance"
-            " purposes."
-        ),
-    )
-
-
-class MinimalDocument(
-    AliasModel,
-    Generic[
-        DescriptionAdvancedT,
-        DescriptionAnalyticsT,
-        IdentifierTypeT,
-        LanguageT,
-        CollectionNameTypeT,
-    ],
-):
-    """Minimal model for a document."""
-
-    name: StrictStr = Field(alias="_name")
-    obj_type: Optional[StrictStr] = Field("document", alias="type")
-    description: CCSDocumentDescription[
-        DescriptionAdvancedT,
-        DescriptionAnalyticsT,
-        IdentifierTypeT,
-        LanguageT,
-        CollectionNameTypeT,
-    ]
-    file_info: FileInfoObject = Field(alias="file-info")
-    main_text: Optional[list[Union[Ref, BaseText]]] = Field(
-        default=None, alias="main-text"
-    )
-    figures: Optional[list[Figure]] = None
-    tables: Optional[list[Table]] = None
-
-
-class CCSDocument(
-    MinimalDocument,
-    Generic[
-        DescriptionAdvancedT,
-        DescriptionAnalyticsT,
-        IdentifierTypeT,
-        LanguageT,
-        CollectionNameTypeT,
-    ],
-):
-    """Model for a CCS-generated document."""
-
-    obj_type: Optional[StrictStr] = Field("pdf-document", alias="type")
-    bitmaps: Optional[list[BitmapObject]] = None
-    equations: Optional[list[BaseCell]] = None
-    footnotes: Optional[list[BaseText]] = None
-    file_info: CCSFileInfoObject = Field(alias="file-info")
-    main_text: Optional[list[Union[Ref, BaseText]]] = Field(
-        default=None,
-        alias="main-text",
-    )
-    page_dimensions: Optional[list[PageDimensions]] = Field(
-        default=None, alias="page-dimensions"
-    )
-    page_footers: Optional[list[BaseText]] = Field(default=None, alias="page-footers")
-    page_headers: Optional[list[BaseText]] = Field(default=None, alias="page-headers")
-    s3_data: Optional[S3Data] = Field(default=None, alias="_s3_data")
-
-    @model_validator(mode="before")
-    @classmethod
-    def from_dict(cls, data):
-        """Validates and fixes the input data."""
-        if not isinstance(data, dict):
-            return data
-        description_collection = data["description"].get("collection")
-        if not description_collection:
-            data["description"].setdefault("collection", {})
-
-        data["description"]["collection"].setdefault("type", "Document")
-        logs = data["description"].get("logs")
-        if not logs:
-            data["description"].setdefault("logs", [])
-
-        abstract = data["description"].get("abstract")
-        if abstract is not None and not isinstance(abstract, list):
-            if isinstance(abstract, str):
-                data["description"]["abstract"] = [abstract]
-            else:
-                data["description"].pop("abstract")
-
-        for key in ["affiliations", "authors"]:
-            descr = data["description"].get(key)
-            if descr is not None and not isinstance(descr, list):
-                if isinstance(descr, dict):
-                    data["description"][key] = [descr]
-                else:
-                    data["description"].pop(key)
-
-        if data.get("main-text"):
-            for item in data["main-text"]:
-                if ref := item.pop("__ref", None):
-                    item["$ref"] = ref
-
-        return data
-
-
-class DocumentToken(Enum):
-    """Class to represent an LLM friendly representation of a Document."""
-
-    BEG_DOCUMENT = "<document>"
-    END_DOCUMENT = "</document>"
-
-    BEG_TITLE = "<title>"
-    END_TITLE = "</title>"
-
-    BEG_ABSTRACT = "<abstract>"
-    END_ABSTRACT = "</abstract>"
-
-    BEG_DOI = "<doi>"
-    END_DOI = "</doi>"
-    BEG_DATE = "<date>"
-    END_DATE = "</date>"
-
-    BEG_AUTHORS = "<authors>"
-    END_AUTHORS = "</authors>"
-    BEG_AUTHOR = "<author>"
-    END_AUTHOR = "</author>"
-
-    BEG_AFFILIATIONS = "<affiliations>"
-    END_AFFILIATIONS = "</affiliations>"
-    BEG_AFFILIATION = "<affiliation>"
-    END_AFFILIATION = "</affiliation>"
-
-    BEG_HEADER = "<section-header>"
-    END_HEADER = "</section-header>"
-    BEG_TEXT = "<text>"
-    END_TEXT = "</text>"
-    BEG_PARAGRAPH = "<paragraph>"
-    END_PARAGRAPH = "</paragraph>"
-    BEG_TABLE = "<table>"
-    END_TABLE = "</table>"
-    BEG_FIGURE = "<figure>"
-    END_FIGURE = "</figure>"
-    BEG_CAPTION = "<caption>"
-    END_CAPTION = "</caption>"
-    BEG_EQUATION = "<equation>"
-    END_EQUATION = "</equation>"
-    BEG_LIST = "<list>"
-    END_LIST = "</list>"
-    BEG_LISTITEM = "<list-item>"
-    END_LISTITEM = "</list-item>"
-
-    BEG_LOCATION = "<location>"
-    END_LOCATION = "</location>"
-    BEG_GROUP = "<group>"
-    END_GROUP = "</group>"
-
-    @classmethod
-    def get_special_tokens(
-        cls,
-        max_rows: int = 100,
-        max_cols: int = 100,
-        max_pages: int = 1000,
-        page_dimension: Tuple[int, int] = (100, 100),
-    ):
-        """Function to get all special document tokens."""
-        special_tokens = [token.value for token in cls]
-
-        # Adding dynamically generated row and col tokens
-        for i in range(0, max_rows + 1):
-            special_tokens += [f"<row_{i}>", f"</row_{i}>"]
-
-        for i in range(0, max_cols + 1):
-            special_tokens += [f"<col_{i}>", f"</col_{i}>"]
-
-        for i in range(6):
-            special_tokens += [f"<section-header-{i}>", f"</section-header-{i}>"]
-
-        # Adding dynamically generated page-tokens
-        for i in range(0, max_pages + 1):
-            special_tokens.append(f"<page_{i}>")
-
-        # Adding dynamically generated location-tokens
-        for i in range(0, max(page_dimension[0] + 1, page_dimension[1] + 1)):
-            special_tokens.append(f"<loc_{i}>")
-
-        return special_tokens
-
-    @staticmethod
-    def get_page_token(page: int):
-        """Function to get page tokens."""
-        return f"<page_{page}>"
-
-    @staticmethod
-    def get_location_token(val: float, rnorm: int = 100):
-        """Function to get location tokens."""
-        val_ = round(rnorm * val)
-
-        if val_ < 0:
-            return "<loc_0>"
-
-        if val_ > rnorm:
-            return f"<loc_{rnorm}>"
-
-        return f"<loc_{val_}>"
-
-
-class ExportedCCSDocument(
-    MinimalDocument,
-    Generic[
-        DescriptionAdvancedT,
-        DescriptionAnalyticsT,
-        IdentifierTypeT,
-        LanguageT,
-        CollectionNameTypeT,
-    ],
-):
-    """Document model for Docling."""
-
-    obj_type: Optional[StrictStr] = Field(
-        "pdf-document",
-        alias="type",
-        json_schema_extra=es_field(type="keyword", ignore_above=8191),
-    )
-    bitmaps: Optional[list[BitmapObject]] = None
-    equations: Optional[list[BaseCell]] = None
-    footnotes: Optional[list[BaseText]] = None
-    description: CCSDocumentDescription[
-        DescriptionAdvancedT,
-        DescriptionAnalyticsT,
-        IdentifierTypeT,
-        LanguageT,
-        CollectionNameTypeT,
-    ]
-    file_info: CCSFileInfoObject = Field(alias="file-info")
-    main_text: Optional[list[Union[Ref, BaseText]]] = Field(
-        default=None, alias="main-text"
-    )
-    page_dimensions: Optional[list[PageDimensions]] = Field(
-        default=None, alias="page-dimensions"
-    )
-    page_footers: Optional[list[BaseText]] = Field(default=None, alias="page-footers")
-    page_headers: Optional[list[BaseText]] = Field(default=None, alias="page-headers")
-    s3_data: Optional[S3Data] = Field(default=None, alias="_s3_data")
-    identifiers: Optional[list[Identifier[IdentifierTypeT]]] = None
-
-    @model_validator(mode="before")
-    @classmethod
-    def from_dict(cls, data):
-        """Fix ref in main-text."""
-        if not isinstance(data, dict):
-            return data
-        if data.get("main-text"):
-            for item in data["main-text"]:
-                if ref := item.pop("__ref", None):
-                    item["$ref"] = ref
-
-        return data
-
-    def _resolve_ref(self, item: Ref) -> Optional[Union[BaseCell, BaseText]]:
-        """Return the resolved reference.
-
-        Resolved the Ref object within the document.
-        If the object is not found, None is returned.
-        """
-        result: Optional[Union[BaseCell, BaseText]] = None
-
-        # NOTE: currently only resolves refs explicitely, such that we can make
-        # assumptions on ref parts
-        if item.obj_type == "table" and self.tables:
-            parts = item.ref.split("/")
-            result = self.tables[int(parts[2])]
-        elif item.obj_type == "figure" and self.figures:
-            parts = item.ref.split("/")
-            result = self.figures[int(parts[2])]
-        elif item.obj_type == "equation" and self.equations:
-            parts = item.ref.split("/")
-            result = self.equations[int(parts[2])]
-        elif item.obj_type == "footnote" and self.footnotes:
-            parts = item.ref.split("/")
-            result = self.footnotes[int(parts[2])]
-
-        return result
-
-    def export_to_markdown(
-        self,
-        delim: str = "\n\n",
-        main_text_start: int = 0,
-        main_text_stop: Optional[int] = None,
-        main_text_labels: list[str] = [
-            "title",
-            "subtitle-level-1",
-            "paragraph",
-            "caption",
-            "table",
-        ],
-        strict_text: bool = False,
-    ) -> str:
-        r"""Serialize to Markdown.
-
-        Operates on a slice of the document's main_text as defined through arguments
-        main_text_start and main_text_stop; defaulting to the whole main_text.
-
-        Args:
-            delim (str, optional): Delimiter to use when concatenating the various
-                Markdown parts. Defaults to "\n\n".
-            main_text_start (int, optional): Main-text slicing start index (inclusive).
-                Defaults to 0.
-            main_text_end (Optional[int], optional): Main-text slicing stop index
-                (exclusive). Defaults to None.
-
-        Returns:
-            str: The exported Markdown representation.
-        """
-        has_title = False
-        prev_text = ""
-        md_texts: list[str] = []
-
-        if self.main_text is not None:
-            for orig_item in self.main_text[main_text_start:main_text_stop]:
-                markdown_text = ""
-
-                item = (
-                    self._resolve_ref(orig_item)
-                    if isinstance(orig_item, Ref)
-                    else orig_item
-                )
-                if item is None:
-                    continue
-
-                item_type = item.obj_type
-                if isinstance(item, BaseText) and item_type in main_text_labels:
-                    text = item.text
-
-                    # ignore repeated text
-                    if prev_text == text:
-                        continue
-                    else:
-                        prev_text = text
-
-                    # first title match
-                    if item_type == "title" and not has_title:
-                        if strict_text:
-                            markdown_text = f"{text}"
-                        else:
-                            markdown_text = f"# {text}"
-                        has_title = True
-
-                    # secondary titles
-                    elif item_type in {"title", "subtitle-level-1"} or (
-                        has_title and item_type == "title"
-                    ):
-                        if strict_text:
-                            markdown_text = f"{text}"
-                        else:
-                            markdown_text = f"## {text}"
-
-                    # normal text
-                    else:
-                        markdown_text = text
-
-                elif (
-                    isinstance(item, Table)
-                    and item.data
-                    and item_type in main_text_labels
-                    and not strict_text
-                ):
-                    table = []
-                    for row in item.data:
-                        tmp = []
-                        for col in row:
-                            tmp.append(col.text)
-                        table.append(tmp)
-
-                    if len(table) > 1 and len(table[0]) > 0:
-                        try:
-                            md_table = tabulate(
-                                table[1:], headers=table[0], tablefmt="github"
-                            )
-                        except ValueError:
-                            md_table = tabulate(
-                                table[1:],
-                                headers=table[0],
-                                tablefmt="github",
-                                disable_numparse=True,
-                            )
-
-                        markdown_text = md_table
-
-                if markdown_text:
-                    md_texts.append(markdown_text)
-
-        result = delim.join(md_texts)
-        return result
-
-    def export_to_document_tokens(
-        self,
-        delim: str = "\n\n",
-        main_text_start: int = 0,
-        main_text_stop: Optional[int] = None,
-        main_text_labels: list[str] = [
-            "title",
-            "subtitle-level-1",
-            "paragraph",
-            "caption",
-            "table",
-            "figure",
-        ],
-        page_tagging: bool = True,
-        location_tagging: bool = True,
-        location_dimensions: Tuple[int, int] = (100, 100),
-        add_new_line: bool = True,
-    ) -> str:
-        r"""Exports the document content to an DocumentToken format.
-
-        Operates on a slice of the document's main_text as defined through arguments
-        main_text_start and main_text_stop; defaulting to the whole main_text.
-
-        Args:
-            delim (str, optional): The delimiter used to separate text blocks in the
-                exported XML. Default is two newline characters ("\n\n").
-            main_text_start (int, optional): The starting index of the main text to
-                be included in the XML. Default is 0 (the beginning of the text).
-            main_text_stop (Optional[int], optional): The stopping index of the main
-                text. If set to None, the export includes text up to the end.
-                Default is None.
-            main_text_labels (list[str], optional): A list of text labels that
-                categorize the different sections of the document (e.g., "title",
-                "subtitle-level-1", "paragraph", "caption"). Default labels are
-                "title", "subtitle-level-1", "paragraph", and "caption".
-            location_tagging (bool, optional): Determines whether to include
-                location-based tagging in the XML. If True, the exported XML will
-                contain information about the locations of the text elements.
-                Default is True.
-            location_dimensions (Tuple[int, int], optional): Specifies the dimensions
-                (width and height) for the location tagging, if enabled.
-                Default is [100, 100].
-            add_new_line (bool, optional): Whether to add new line characters after
-                each text block. If True, a new line is added after each block of
-                text in the XML. Default is True.
-
-        Returns:
-            str: The content of the document formatted as an XML string.
-        """
-        xml_str = DocumentToken.BEG_DOCUMENT.value
-
-        new_line = ""
-        if add_new_line:
-            new_line = "\n"
-
-        if self.main_text is not None:
-            for orig_item in self.main_text[main_text_start:main_text_stop]:
-
-                item = (
-                    self._resolve_ref(orig_item)
-                    if isinstance(orig_item, Ref)
-                    else orig_item
-                )
-
-                if item is None:
-                    continue
-
-                prov = item.prov
-
-                loc_str = ""  # default is zero
-                if (
-                    location_tagging
-                    and self.page_dimensions is not None
-                    and prov is not None
-                    and len(prov) > 0
-                ):
-
-                    page = prov[0].page
-                    page_dim = self.page_dimensions[page - 1]
-
-                    page_w = float(page_dim.width)
-                    page_h = float(page_dim.height)
-
-                    x0 = float(prov[0].bbox[0]) / float(page_w)
-                    y0 = float(prov[0].bbox[1]) / float(page_h)
-                    x1 = float(prov[0].bbox[2]) / float(page_w)
-                    y1 = float(prov[0].bbox[3]) / float(page_h)
-
-                    page_tok = ""
-                    if page_tagging:
-                        page_tok = DocumentToken.get_page_token(page=page)
-
-                    x0_tok = DocumentToken.get_location_token(
-                        val=min(x0, x1), rnorm=location_dimensions[0]
-                    )
-                    y0_tok = DocumentToken.get_location_token(
-                        val=min(y0, y1), rnorm=location_dimensions[1]
-                    )
-                    x1_tok = DocumentToken.get_location_token(
-                        val=max(x0, x1), rnorm=location_dimensions[0]
-                    )
-                    y1_tok = DocumentToken.get_location_token(
-                        val=max(y0, y1), rnorm=location_dimensions[1]
-                    )
-
-                    # update
-                    loc_str = f"{DocumentToken.BEG_LOCATION.value}"
-                    loc_str += f"{page_tok}"
-                    loc_str += f"{x0_tok}{y0_tok}{x1_tok}{y1_tok}"
-                    loc_str += f"{DocumentToken.END_LOCATION.value}"
-
-                item_type = item.obj_type
-                if isinstance(item, BaseText) and (item_type in main_text_labels):
-                    text = item.text
-
-                    xml_str += f"<{item_type}>{loc_str}{text}</{item_type}>{new_line}"
-
-                elif isinstance(item, Table) and (item_type in main_text_labels):
-
-                    xml_str += f"<{item_type}>{loc_str}"
-
-                    if item.text is not None and len(item.text) > 0:
-                        xml_str += f"{DocumentToken.BEG_CAPTION.value}"
-                        xml_str += (
-                            f"{item.text}{DocumentToken.END_CAPTION.value}{new_line}"
-                        )
-
-                    if item.data is not None and len(item.data) > 0:
-                        for i, row in enumerate(item.data):
-                            xml_str += f"<row_{i}>"
-                            for j, col in enumerate(row):
-                                text = col.text
-                                xml_str += f"<col_{j}>{text}</col_{j}>"
-
-                            xml_str += f"</row_{i}>{new_line}"
-
-                    xml_str += f"</{item_type}>{new_line}"
-
-                elif isinstance(item, Figure) and (item_type in main_text_labels):
-
-                    xml_str += f"<{item_type}>{loc_str}"
-
-                    if item.text is not None and len(item.text) > 0:
-                        xml_str += f"{DocumentToken.BEG_CAPTION.value}"
-                        xml_str += (
-                            f"{item.text}{DocumentToken.END_CAPTION.value}{new_line}"
-                        )
-
-                    xml_str += f"</{item_type}>{new_line}"
-
-        xml_str += DocumentToken.END_DOCUMENT.value
-
-        return xml_str
\ No newline at end of file
diff --git a/docling_core/types/rec/subject.py b/docling_core/types/rec/subject.py
index ceb28e3..69d2e88 100644
--- a/docling_core/types/rec/subject.py
+++ b/docling_core/types/rec/subject.py
@@ -15,7 +15,7 @@
     SubjectNameTypeT,
     SubjectTypeT,
 )
-from docling_core.types.legacy.base import S3Reference
+from docling_core.types.doc.base import S3Reference
 from docling_core.utils.alias import AliasModel
 
 
diff --git a/test/test_base.py b/test/test_base.py
index 76c9cbc..89cda5a 100644
--- a/test/test_base.py
+++ b/test/test_base.py
@@ -20,7 +20,7 @@
     Log,
     StrictDateTime,
 )
-from docling_core.types.legacy.document import CCSDocumentDescription
+from docling_core.types.doc.document import CCSDocumentDescription
 from docling_core.types.rec.record import RecordDescription
 
 
diff --git a/test/test_doc_base.py b/test/test_doc_base.py
index a1b8186..d5a48ff 100644
--- a/test/test_doc_base.py
+++ b/test/test_doc_base.py
@@ -7,7 +7,7 @@
 import pytest
 from pydantic import ValidationError
 
-from docling_core.types.legacy.base import Prov, S3Reference
+from docling_core.types.doc.base import Prov, S3Reference
 
 
 def test_s3_reference():
diff --git a/test/test_doc_schema.py b/test/test_doc_schema.py
index d3476f6..a899b08 100644
--- a/test/test_doc_schema.py
+++ b/test/test_doc_schema.py
@@ -17,7 +17,7 @@
     IdentifierTypeT,
     LanguageT,
 )
-from docling_core.types.legacy.document import (
+from docling_core.types.doc.document import (
     CCSDocument,
     CCSDocumentDescription,
     Publication,
diff --git a/test/test_doc_schema_extractor.py b/test/test_doc_schema_extractor.py
index 78dbcd4..9f1f9d9 100644
--- a/test/test_doc_schema_extractor.py
+++ b/test/test_doc_schema_extractor.py
@@ -8,7 +8,7 @@
 
 from pydantic import ValidationError
 
-from docling_core.types.legacy.document import CCSDocument
+from docling_core.types.doc.document import CCSDocument
 
 
 def test_ccs_document_update():
diff --git a/test/test_docling_doc.py b/test/test_docling_doc.py
index ee448ca..74ca859 100644
--- a/test/test_docling_doc.py
+++ b/test/test_docling_doc.py
@@ -1,7 +1,7 @@
 import yaml
 import pytest
 from docling_core.types import DoclingDocument, BoundingBox
-from docling_core.types.doc.document import ProvenanceItem
+from docling_core.types.experimental.document import ProvenanceItem
 
 
 def test_load_serialize_doc():
diff --git a/test/test_json_schema_to_search_mapper.py b/test/test_json_schema_to_search_mapper.py
index ab1abe4..413e5b2 100644
--- a/test/test_json_schema_to_search_mapper.py
+++ b/test/test_json_schema_to_search_mapper.py
@@ -10,7 +10,7 @@
 import jsondiff
 
 from docling_core.search.json_schema_to_search_mapper import JsonSchemaToSearchMapper
-from docling_core.types.legacy.document import ExportedCCSDocument
+from docling_core.types.doc.document import ExportedCCSDocument
 from docling_core.types.rec.record import Record
 
 

From 7dcbde763ead168b6f2ee44a830ff25099df7239 Mon Sep 17 00:00:00 2001
From: Christoph Auer <cau@zurich.ibm.com>
Date: Fri, 20 Sep 2024 16:53:42 +0200
Subject: [PATCH 05/34] Updates for document construction API and format

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
---
 docling_core/types/__init__.py              |   4 +-
 docling_core/types/doc/base.py              |   2 +-
 docling_core/types/doc/document.py          |   2 +-
 docling_core/types/experimental/document.py | 209 ++++++++++++++++++--
 test/data/newdoc/dummy_doc.yaml             |  38 ++--
 test/test_docling_doc.py                    |  65 +++---
 6 files changed, 258 insertions(+), 62 deletions(-)

diff --git a/docling_core/types/__init__.py b/docling_core/types/__init__.py
index 5f493b8..fde140e 100644
--- a/docling_core/types/__init__.py
+++ b/docling_core/types/__init__.py
@@ -5,7 +5,7 @@
 
 """Define the main types."""
 
-from docling_core.types.doc.base import BoundingBox # noqa
+from docling_core.types.doc.base import BoundingBox  # noqa
 from docling_core.types.doc.base import Table  # noqa
 from docling_core.types.doc.base import TableCell  # noqa
 from docling_core.types.doc.base import (  # noqa
@@ -23,5 +23,3 @@
 from docling_core.types.doc.document import ExportedCCSDocument as Document  # noqa
 from docling_core.types.gen.generic import Generic  # noqa
 from docling_core.types.rec.record import Record  # noqa
-
-from docling_core.types.experimental.document import DoclingDocument, DocItem, TextItem, FloatingItem, TableItem, FigureItem, TableData, FigureData, PageItem
\ No newline at end of file
diff --git a/docling_core/types/doc/base.py b/docling_core/types/doc/base.py
index fa61cbf..b823ca8 100644
--- a/docling_core/types/doc/base.py
+++ b/docling_core/types/doc/base.py
@@ -290,4 +290,4 @@ class PageReference(BaseModel):
 
     hash: str = Field(json_schema_extra=es_field(type="keyword", ignore_above=8191))
     model: str = Field(json_schema_extra=es_field(suppress=True))
-    page: PositiveInt = Field(json_schema_extra=es_field(type="short"))
\ No newline at end of file
+    page: PositiveInt = Field(json_schema_extra=es_field(type="short"))
diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
index e54ea9e..f0e5c2a 100644
--- a/docling_core/types/doc/document.py
+++ b/docling_core/types/doc/document.py
@@ -788,4 +788,4 @@ def export_to_document_tokens(
 
         xml_str += DocumentToken.END_DOCUMENT.value
 
-        return xml_str
\ No newline at end of file
+        return xml_str
diff --git a/docling_core/types/experimental/document.py b/docling_core/types/experimental/document.py
index 6f53007..2a44176 100644
--- a/docling_core/types/experimental/document.py
+++ b/docling_core/types/experimental/document.py
@@ -1,21 +1,35 @@
+import hashlib
+import typing
 from typing import Any, Dict, List, Optional, Tuple, Union
 
-from pydantic import AnyUrl, BaseModel, Field
+from pydantic import AnyUrl, BaseModel, ConfigDict, Field, computed_field
 
 from docling_core.types.experimental.base import BoundingBox, Size
 
+# Uint64 = conint(ge=0, le=(2**64 - 1))  # type: ignore[valid-type]
+Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
+LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
 
-class FigureData(BaseModel):  # TBD
+
+class BaseFigureData(BaseModel):  # TBD
     pass
 
 
-class TableData(BaseModel):  # TBD
+class BaseTableData(BaseModel):  # TBD
     pass
 
 
+class FileInfo(BaseModel):
+    document_hash: str
+
+
 class RefItem(BaseModel):
     cref: str = Field(alias="$ref")
 
+    model_config = ConfigDict(
+        populate_by_name=True,
+    )
+
     def resolve(self, doc: "DoclingDocument"):
         _, path, index_str = self.cref.split("/")
         index = int(index_str)
@@ -36,13 +50,34 @@ class ProvenanceItem(BaseModel):
     charspan: Tuple[int, int]
 
 
-class DocItem(BaseModel):
+class NodeItem(BaseModel):
     dloc: str  # format spec ({document_hash}{json-path})
-    hash: int
+    parent: Optional[RefItem] = None
+    children: List[RefItem] = []
+
+    @computed_field  # type: ignore
+    @property
+    def hash(self) -> Uint64:  # TODO align with hasher on deepsearch-glm
+        if not len(self.dloc):
+            return 0
+        hash_object = hashlib.sha256(self.dloc.encode("utf-8"))
+
+        # Convert the hash to an integer
+        hash_int = int.from_bytes(hash_object.digest(), "big")
+
+        # Mask it to fit within 64 bits
+        return Uint64(hash_int & 0xFFFFFFFFFFFFFFFF)  # 64-bit unsigned integer mask
+
+
+class GroupItem(NodeItem):  # Container type, can't be a leaf node
+    name: str
+
+
+class DocItem(
+    NodeItem
+):  # Base type for any element that carries content, can be a leaf node
     label: str
-    parent: Optional[RefItem]
-    children: List[RefItem]
-    prov: List[ProvenanceItem]
+    prov: List[ProvenanceItem] = []
 
 
 class TextItem(DocItem):
@@ -50,36 +85,44 @@ class TextItem(DocItem):
     text: str  # sanitized representation
 
 
+class Section(TextItem):
+    level: LevelNumber = 1
+
+
 class FloatingItem(DocItem):
-    caption: Optional[Union[RefItem, TextItem]]
-    references: List[Union[RefItem, TextItem]]
-    footnotes: List[Union[RefItem, TextItem]]
-    data: Any
-    image: Optional[ImageRef]
+    caption: Optional[RefItem] = None
+    references: List[RefItem] = []
+    footnotes: List[RefItem] = []
+    image: Optional[ImageRef] = None
 
 
-class FigureItem(DocItem):
-    data: FigureData
+class FigureItem(FloatingItem):
+    data: BaseFigureData
 
 
-class TableItem(DocItem):
-    data: TableData
+class TableItem(FloatingItem):
+    data: BaseTableData
 
 
 class KeyValueItem(DocItem):
     pass
 
+
 ContentItem = Union[TextItem, FigureItem, TableItem, KeyValueItem]
 
 
 class DocumentContent(BaseModel):
-    furniture: List[RefItem] = []
-    body: List[RefItem] = []
+    furniture: GroupItem = GroupItem(
+        name="_root_", dloc="#/furniture"
+    )  # List[RefItem] = []
+    body: GroupItem = GroupItem(name="_root_", dloc="#/body")  # List[RefItem] = []
+    groups: List[GroupItem] = []
     texts: List[TextItem] = []
     figures: List[FigureItem] = []
     tables: List[TableItem] = []
     key_value_items: List[KeyValueItem] = []
 
+
 class PageItem(DocumentContent):
     hash: str  # page hash
     size: Size
@@ -89,5 +132,131 @@ class PageItem(DocumentContent):
 
 class DoclingDocument(DocumentContent):
     description: Any
-    file_info: Any
+    file_info: FileInfo
     pages: Dict[int, PageItem] = {}  # empty as default
+
+    # def add_furniture_group(self, name: str):
+    #    group = GroupItem(name=name)
+    #    self.furniture.children.append(group)
+    #    return group
+    def resolve_cref(self, obj):
+        path = obj.dloc.split("#")[1]
+        return path
+
+    def add_group(self, name: str, parent: Optional[GroupItem] = None) -> GroupItem:
+        if not parent:
+            parent = self.body
+            parent_cref = "#/body"
+        else:
+            parent_cref = self.resolve_cref(parent)
+
+        group_index = len(self.groups)
+        cref = f"#/groups/{group_index}"
+        dloc = f"{self.file_info.document_hash}{cref}"
+
+        group = GroupItem(name=name, dloc=dloc, parent=RefItem(cref=parent_cref))
+        self.groups.append(group)
+        parent.children.append(RefItem(cref=cref))
+
+        return group
+
+    def add_paragraph(
+        self,
+        label: str,
+        text: str,
+        orig: Optional[str] = None,
+        prov: Optional[ProvenanceItem] = None,
+        parent: Optional[GroupItem] = None,
+        item_cls=TextItem,
+    ):
+        if not parent:
+            parent = self.body
+            parent_cref = "#/body"
+        else:
+            parent_cref = self.resolve_cref(parent)
+
+        if not orig:
+            orig = text
+
+        text_index = len(self.texts)
+        cref = f"#/texts/{text_index}"
+        dloc = f"{self.file_info.document_hash}{cref}"
+        text_item = item_cls(
+            label=label,
+            text=text,
+            orig=orig,
+            dloc=dloc,
+            parent=RefItem(cref=parent_cref),
+        )
+        if prov:
+            text_item.prov.append(prov)
+
+        self.texts.append(text_item)
+        parent.children.append(RefItem(cref=cref))
+
+        return text_item
+
+    def add_table(
+        self,
+        data: BaseTableData,
+        caption: Optional[RefItem] = None,
+        prov: Optional[ProvenanceItem] = None,
+        parent: Optional[GroupItem] = None,
+    ):
+        if not parent:
+            parent = self.body
+
+        table_index = len(self.tables)
+        cref = f"#/tables/{table_index}"
+        dloc = f"{self.file_info.document_hash}{cref}"
+
+        tbl_item = TableItem(label="table", data=data, dloc=dloc, parent=parent)
+        if prov:
+            tbl_item.prov.append(prov)
+        if caption:
+            tbl_item.caption = caption
+
+        self.tables.append(tbl_item)
+        parent.children.append(RefItem(cref=cref))
+
+        return tbl_item
+
+    def add_figure(
+        self,
+        data: BaseFigureData,
+        caption: Optional[RefItem] = None,
+        prov: Optional[ProvenanceItem] = None,
+        parent: Optional[GroupItem] = None,
+    ):
+        if not parent:
+            parent = self.body
+
+        figure_index = len(self.figures)
+        cref = f"#/figures/{figure_index}"
+        dloc = f"{self.file_info.document_hash}{cref}"
+
+        fig_item = FigureItem(label="figure", data=data, dloc=dloc, parent=parent)
+        if prov:
+            fig_item.prov.append(prov)
+        if caption:
+            fig_item.caption = caption
+
+        self.figures.append(fig_item)
+        parent.children.append(RefItem(cref=cref))
+
+        return fig_item
+
+    def add_heading(
+        self,
+        label: str,
+        text: str,
+        orig: Optional[str] = None,
+        level: LevelNumber = 1,
+        prov: Optional[ProvenanceItem] = None,
+        parent: Optional[GroupItem] = None,
+    ):
+        item: Section = self.add_paragraph(
+            label, text, orig, prov, parent, item_cls=Section
+        )
+        item.level = level
+        return item
diff --git a/test/data/newdoc/dummy_doc.yaml b/test/data/newdoc/dummy_doc.yaml
index 632c164..28763a1 100644
--- a/test/data/newdoc/dummy_doc.yaml
+++ b/test/data/newdoc/dummy_doc.yaml
@@ -3,15 +3,25 @@
 description: { } # DescriptionType - TBD
 file_info: # FileInfoType - TBD
   document_hash: e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5
-furniture: # Headers, footers, framing, navigation elements, all other non-body text
-  - $ref: "/texts/0"
 
-body: # Top-level elements in other arrays, by-reference only, must not have parent.
-  - $ref: "/texts/1"
-  - $ref: "/figure/0"
-  - $ref: "/texts/2"
-  - $ref: "/texts/3"
-  - $ref: "/tables/0"
+furniture: # Top level element for any headers, footers, framing, navigation elements, all other non-body text
+  name: "_root_"
+  dloc: "#/furniture"
+  parent: null
+  children:
+    - $ref: "/texts/0"
+
+body: # Top-level element for anything in the document body
+  name: "_root_"
+  dloc: "#/body"
+  parent: null
+  children:
+    - $ref: "/texts/1"
+    - $ref: "/figure/0"
+    - $ref: "/texts/2"
+    - $ref: "/tables/0"
+
+groups: [] # Any group that is nested deeper in either body or furniture children
 
 texts: # All elements that have a text-string representation, with actual data
   - orig: "arXiv:2206.01062v1 [cs.CV] 2 Jun 2022"
@@ -19,7 +29,8 @@ texts: # All elements that have a text-string representation, with actual data
     dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/texts/0"
     hash: 132103230
     label: "page_header"
-    parent: null
+    parent:
+      $ref: "#/furniture"
     children: []
     prov:
       - page_no: 1
@@ -34,7 +45,8 @@ texts: # All elements that have a text-string representation, with actual data
     dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/texts/1"
     hash: 2349732 # uint64 hash of dloc
     label: "title"
-    parent: null
+    parent:
+      $ref: "#/body"
     children: [ ]
     prov: # must exist, can be empty
       - page_no: 1
@@ -83,7 +95,8 @@ tables: # All tables...
   - dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/table/0"
     hash: 98574
     label: "table"
-    parent: null
+    parent:
+      $ref: "#/body"
     children: [ ]
     caption:
       $ref: "/texts/3"
@@ -117,7 +130,8 @@ figures: # All figures...
   - dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/figures/0"
     hash: 7782482
     label: "figure"
-    parent: null
+    parent:
+      $ref: "#/body"
     caption:
       $ref: "/texts/2"
     references:
diff --git a/test/test_docling_doc.py b/test/test_docling_doc.py
index 74ca859..b0b0c73 100644
--- a/test/test_docling_doc.py
+++ b/test/test_docling_doc.py
@@ -1,7 +1,6 @@
 import yaml
-import pytest
-from docling_core.types import DoclingDocument, BoundingBox
-from docling_core.types.experimental.document import ProvenanceItem
+
+from docling_core.types.experimental.document import DoclingDocument, FileInfo
 
 
 def test_load_serialize_doc():
@@ -19,7 +18,7 @@ def test_load_serialize_doc():
     text_item.prov[0].page_no
 
     # Objects that are references need explicit resolution for now:
-    obj = doc.body[2].resolve(doc=doc)  # Text item with parent
+    obj = doc.body.children[2].resolve(doc=doc)  # Text item with parent
     parent = obj.parent.resolve(doc=doc)  # it is a figure
 
     obj2 = parent.children[0].resolve(
@@ -37,27 +36,43 @@ def test_load_serialize_doc():
     assert doc_reload == doc  # must be equal
     assert doc_reload is not doc  # can't be identical
 
-def test_construct_doc():
-    doc = DoclingDocument(description={}, file_info={})
-
-    # group, heading, paragraph, table, figure, title, list, provenance
-    doc.add_title()
-    doc.add_paragraph(text="Author 1\nAffiliation 1").add_provenance(ProvenanceItem(page_no=1, bbox=BoundingBox(t=12, l=5, r=230, b=40), charspan=(0,22)))
-    doc.add_paragraph(text="Author 2\nAffiliation 2")
-
-    chapter1 = doc.add_group(name="Introduction")
-    chapter1.add_heading(text="1. Introduction", level=2)
-    chapter1.add_paragraph(text="This paper introduces the biggest invention ever made. ...")
-    mylist = chapter1.add_group()
-    mylist.add_item(text="Cooks your favourite meal before you know you want it.")
-    mylist.add_item(text="Cleans up all your dishes.")
-    mylist.add_item(text="Drains your bank account without consent.")
 
+def test_construct_doc():
 
+    doc = DoclingDocument(description={}, file_info=FileInfo(document_hash="xyz"))
 
-    sec = doc.add_section(text="1. Introduction")
-
-    list = sec.add_child(label="container")
-    list.add_child()
-    list.add_child()
-
+    # group, heading, paragraph, table, figure, title, list, provenance
+    doc.add_paragraph(label="text", text="Author 1\nAffiliation 1")
+    doc.add_paragraph(label="text", text="Author 2\nAffiliation 2")
+
+    chapter1 = doc.add_group(
+        name="Introduction"
+    )  # can be done if such information is present, or omitted.
+    doc.add_heading(
+        parent=chapter1, label="section_header", text="1. Introduction", level=1
+    )
+    doc.add_paragraph(
+        parent=chapter1,
+        label="text",
+        text="This paper introduces the biggest invention ever made. ...",
+    )
+    mylist = doc.add_group(parent=chapter1, name="whateverlist")
+    doc.add_paragraph(
+        parent=mylist,
+        label="list_item",
+        text="Cooks your favourite meal before you know you want it.",
+    )
+    doc.add_paragraph(
+        parent=mylist, label="list_item", text="Cleans up all your dishes."
+    )
+    doc.add_paragraph(
+        parent=mylist,
+        label="list_item",
+        text="Drains your bank account without consent.",
+    )
+
+    yaml_dump = yaml.safe_dump(doc.model_dump(mode="json", by_alias=True))
+
+    print(f"\n\n{yaml_dump}")
+
+    DoclingDocument.model_validate(yaml.safe_load(yaml_dump))

From 940f6cd31e6e1943f5af97dfc14fb74170a351b6 Mon Sep 17 00:00:00 2001
From: Christoph Auer <cau@zurich.ibm.com>
Date: Fri, 20 Sep 2024 19:35:14 +0200
Subject: [PATCH 06/34] Add comments

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
---
 .../{newdoc => experimental}/dummy_doc.yaml   | 27 +++++++++++--------
 test/test_docling_doc.py                      |  2 +-
 2 files changed, 17 insertions(+), 12 deletions(-)
 rename test/data/{newdoc => experimental}/dummy_doc.yaml (83%)

diff --git a/test/data/newdoc/dummy_doc.yaml b/test/data/experimental/dummy_doc.yaml
similarity index 83%
rename from test/data/newdoc/dummy_doc.yaml
rename to test/data/experimental/dummy_doc.yaml
index 28763a1..087fac6 100644
--- a/test/data/newdoc/dummy_doc.yaml
+++ b/test/data/experimental/dummy_doc.yaml
@@ -1,29 +1,34 @@
 ---
-## Document with content + layout info
+## Document with content + optional layout info
 description: { } # DescriptionType - TBD
-file_info: # FileInfoType - TBD
+file_info: # FileInfo type
   document_hash: e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5
 
-furniture: # Top level element for any headers, footers, framing, navigation elements, all other non-body text
+# Root element for any headers, footers, framing, navigation elements, all other non-body text, type GroupItem
+furniture:
   name: "_root_"
   dloc: "#/furniture"
-  parent: null
-  children:
+  parent: null # Only root elements have no parent.
+  children: # only the first-level children appear here, as references (RefItem)
     - $ref: "/texts/0"
 
-body: # Top-level element for anything in the document body
+# Root element for anything in the document body, type GroupItem
+body:
   name: "_root_"
   dloc: "#/body"
-  parent: null
-  children:
+  parent: null # Only root elements have no parent.
+  children: # only the first-level children appear here, as references (RefItem)
     - $ref: "/texts/1"
     - $ref: "/figure/0"
     - $ref: "/texts/2"
     - $ref: "/tables/0"
 
-groups: [] # Any group that is nested deeper in either body or furniture children
+# All groups of items nested deeper in body or furniture roots, type List[GroupItem]
+groups: [] # The parent + children relations capture nesting and reading-order.
 
-texts: # All elements that have a text-string representation, with actual data
+# All elements that have a text-string representation, type TextItem.
+# This is a flat list of all elements without implied order.
+texts:
   - orig: "arXiv:2206.01062v1 [cs.CV] 2 Jun 2022"
     text: "arXiv:2206.01062v1 [cs.CV] 2 Jun 2022"
     dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/texts/0"
@@ -153,7 +158,7 @@ figures: # All figures...
       uri: "file:///e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5/figures/0.png"
       #alternatives: base64 encoded striong
     children:
-      - $ref: "/texts/2"
+      - $ref: "/texts/2" # This text element appears inside the figure, hence it is a child.
     prov:
       - page_no: 1
         bbox:
diff --git a/test/test_docling_doc.py b/test/test_docling_doc.py
index b0b0c73..8cd8a43 100644
--- a/test/test_docling_doc.py
+++ b/test/test_docling_doc.py
@@ -5,7 +5,7 @@
 
 def test_load_serialize_doc():
     # Read YAML file
-    with open("test/data/newdoc/dummy_doc.yaml", "r") as fp:
+    with open("test/data/experimental/dummy_doc.yaml", "r") as fp:
         dict_from_yaml = yaml.safe_load(fp)
 
     doc = DoclingDocument.model_validate(dict_from_yaml)

From ccbe241e9086d5c3bdde3e6b8df5dd5cc0e52d90 Mon Sep 17 00:00:00 2001
From: Christoph Auer <cau@zurich.ibm.com>
Date: Sat, 21 Sep 2024 10:42:52 +0200
Subject: [PATCH 07/34] Add BaseTableData and table cell typing

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
---
 docling_core/types/experimental/document.py | 110 +++++++++++++++++---
 poetry.lock                                 |  24 ++++-
 pyproject.toml                              |   1 +
 test/data/experimental/dummy_doc.yaml       |   6 +-
 test/test_docling_doc.py                    |  57 +++++++++-
 5 files changed, 181 insertions(+), 17 deletions(-)

diff --git a/docling_core/types/experimental/document.py b/docling_core/types/experimental/document.py
index 2a44176..8521366 100644
--- a/docling_core/types/experimental/document.py
+++ b/docling_core/types/experimental/document.py
@@ -2,11 +2,11 @@
 import typing
 from typing import Any, Dict, List, Optional, Tuple, Union
 
-from pydantic import AnyUrl, BaseModel, ConfigDict, Field, computed_field
+from pydantic import AnyUrl, BaseModel, ConfigDict, Field, computed_field, model_validator
+from pydantic_extra_types.semantic_version import SemanticVersion
 
 from docling_core.types.experimental.base import BoundingBox, Size
 
-# Uint64 = conint(ge=0, le=(2**64 - 1))  # type: ignore[valid-type]
 Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
 LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
 
@@ -15,8 +15,75 @@ class BaseFigureData(BaseModel):  # TBD
     pass
 
 
+class TableCell(BaseModel):
+    bbox: Optional[BoundingBox] = None
+    row_span: int = 1
+    col_span: int = 1
+    start_row_offset_idx: int
+    end_row_offset_idx: int
+    start_col_offset_idx: int
+    end_col_offset_idx: int
+    text: str
+    column_header: bool = False
+    row_header: bool = False
+    row_section: bool = False
+
+    @model_validator(mode="before")
+    @classmethod
+    def from_dict_format(cls, data: Any) -> Any:
+        if isinstance(data, Dict):
+            if not "bbox" in data or data["bbox"] == None:
+                return data
+            text = data["bbox"].get("token", "")
+            if not len(text):
+                text_cells = data.pop("text_cell_bboxes", None)
+                if text_cells:
+                    for el in text_cells:
+                        text += el["token"] + " "
+
+                text = text.strip()
+            data["text"] = text
+
+        return data
+
 class BaseTableData(BaseModel):  # TBD
-    pass
+    table_cells: List[TableCell]
+    num_rows: int = 0
+    num_cols: int = 0
+
+    @computed_field
+    @property
+    def grid(self) -> List[List[TableCell]]:         # TODO compute grid representation on the fly from table_cells
+
+        # Initialise empty table data grid (only empty cells)
+        table_data = [
+            [
+                TableCell(
+                    text="",
+                    start_row_offset_idx=i,
+                    end_row_offset_idx=i+1,
+                    start_col_offset_idx=j,
+                    end_col_offset_idx=j+1
+                )
+                for j in range(self.num_cols)
+            ]
+            for i in range(self.num_rows)
+        ]
+
+        # Overwrite cells in table data for which there is actual cell content.
+        for cell in self.table_cells:
+            for i in range(
+                    min(cell.start_row_offset_idx, self.num_rows),
+                    min(cell.end_row_offset_idx, self.num_rows),
+            ):
+                for j in range(
+                        min(cell.start_col_offset_idx, self.num_cols),
+                        min(cell.end_col_offset_idx, self.num_cols),
+                ):
+                    table_data[i][j] = cell
+
+        return table_data
+
 
 
 class FileInfo(BaseModel):
@@ -96,6 +163,7 @@ class FloatingItem(DocItem):
     image: Optional[ImageRef] = None
 
 
+
 class FigureItem(FloatingItem):
     data: BaseFigureData
 
@@ -111,28 +179,33 @@ class KeyValueItem(DocItem):
 ContentItem = Union[TextItem, FigureItem, TableItem, KeyValueItem]
 
 
-class DocumentContent(BaseModel):
+class DocumentTrees(BaseModel):
     furniture: GroupItem = GroupItem(
         name="_root_", dloc="#/furniture"
     )  # List[RefItem] = []
     body: GroupItem = GroupItem(name="_root_", dloc="#/body")  # List[RefItem] = []
-    groups: List[GroupItem] = []
-    texts: List[TextItem] = []
-    figures: List[FigureItem] = []
-    tables: List[TableItem] = []
-    key_value_items: List[KeyValueItem] = []
 
 
-class PageItem(DocumentContent):
+class PageItem(DocumentTrees):
+    # A page carries separate root items for furniture and body, only referencing items on the page
     hash: str  # page hash
     size: Size
     image: Optional[ImageRef]
     num_elements: int
+    page_no: int
 
 
-class DoclingDocument(DocumentContent):
+class DoclingDocument(DocumentTrees):
+    version: str = "0.0.1" #= SemanticVersion(version="0.0.1")
     description: Any
     file_info: FileInfo
+
+    groups: List[GroupItem] = []
+    texts: List[TextItem] = []
+    figures: List[FigureItem] = []
+    tables: List[TableItem] = []
+    key_value_items: List[KeyValueItem] = []
+
     pages: Dict[int, PageItem] = {}  # empty as default
 
     # def add_furniture_group(self, name: str):
@@ -199,18 +272,21 @@ def add_paragraph(
     def add_table(
         self,
         data: BaseTableData,
-        caption: Optional[RefItem] = None,
+        caption: Optional[RefItem] = None, # This is not cool yet.
         prov: Optional[ProvenanceItem] = None,
         parent: Optional[GroupItem] = None,
     ):
         if not parent:
             parent = self.body
+            parent_cref = "#/body"
+        else:
+            parent_cref = self.resolve_cref(parent)
 
         table_index = len(self.tables)
         cref = f"#/tables/{table_index}"
         dloc = f"{self.file_info.document_hash}{cref}"
 
-        tbl_item = TableItem(label="table", data=data, dloc=dloc, parent=parent)
+        tbl_item = TableItem(label="table", data=data, dloc=dloc, parent=RefItem(cref=parent_cref))
         if prov:
             tbl_item.prov.append(prov)
         if caption:
@@ -260,3 +336,11 @@ def add_heading(
         )
         item.level = level
         return item
+
+
+    def num_pages(self):
+        return len(self.pages.values())
+
+    def build_page_trees(self):
+        # TODO: For every PageItem, update the furniture and body trees from the main doc.
+        pass
\ No newline at end of file
diff --git a/poetry.lock b/poetry.lock
index e094bf8..841b8c8 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1426,6 +1426,28 @@ files = [
 [package.dependencies]
 typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0"
 
+[[package]]
+name = "pydantic-extra-types"
+version = "2.9.0"
+description = "Extra Pydantic types."
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "pydantic_extra_types-2.9.0-py3-none-any.whl", hash = "sha256:f0bb975508572ba7bf3390b7337807588463b7248587e69f43b1ad7c797530d0"},
+    {file = "pydantic_extra_types-2.9.0.tar.gz", hash = "sha256:e061c01636188743bb69f368dcd391f327b8cfbfede2fe1cbb1211b06601ba3b"},
+]
+
+[package.dependencies]
+pydantic = ">=2.5.2"
+
+[package.extras]
+all = ["pendulum (>=3.0.0,<4.0.0)", "phonenumbers (>=8,<9)", "pycountry (>=23)", "python-ulid (>=1,<2)", "python-ulid (>=1,<3)", "pytz (>=2024.1)", "semver (>=3.0.2)", "tzdata (>=2024.1)"]
+pendulum = ["pendulum (>=3.0.0,<4.0.0)"]
+phonenumbers = ["phonenumbers (>=8,<9)"]
+pycountry = ["pycountry (>=23)"]
+python-ulid = ["python-ulid (>=1,<2)", "python-ulid (>=1,<3)"]
+semver = ["semver (>=3.0.2)"]
+
 [[package]]
 name = "pydocstyle"
 version = "6.3.0"
@@ -2125,4 +2147,4 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools",
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "86b4ffdea1897655891a9264f523f88952fec428c41dc18c9c51a52d203865e9"
+content-hash = "aef6fb9068e74833732b5b1ee8fa804baf68a1875a5e983293bf0af6ebd336dc"
diff --git a/pyproject.toml b/pyproject.toml
index b3a2a1f..b199fd0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -54,6 +54,7 @@ json-schema-for-humans = "^1.0.0"
 pyproject-toml = "^0.0.10"
 tabulate = "^0.9.0"
 pandas = "^2.2.2"
+pydantic-extra-types = "^2.9.0"
 
 [tool.poetry.group.dev.dependencies]
 black = "^24.4.2"
diff --git a/test/data/experimental/dummy_doc.yaml b/test/data/experimental/dummy_doc.yaml
index 087fac6..bd93ced 100644
--- a/test/data/experimental/dummy_doc.yaml
+++ b/test/data/experimental/dummy_doc.yaml
@@ -118,7 +118,8 @@ tables: # All tables...
       uri: "file:///e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5/tables/0.png"
       #alternatives: base64 encoded striong
     data: # TableData Type
-      grid: [ [ ] ] # list-of-list of TableCell type
+      table_cells: [] # flat list of TableCell type
+      grid: [[]] # list-of-list of TableCell type
       otsl: "<fcel><ecel>..." # OTSL token string
       html: "" # ??
     prov:
@@ -185,4 +186,5 @@ pages: # Optional, for layout documents
         height: 1166
       uri: "file:///e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5/pages/1.png"
       #alternatives: base64 encoded string
-    num_elements: 23
\ No newline at end of file
+    num_elements: 23
+    page_no: 1
\ No newline at end of file
diff --git a/test/test_docling_doc.py b/test/test_docling_doc.py
index 8cd8a43..c96fe0b 100644
--- a/test/test_docling_doc.py
+++ b/test/test_docling_doc.py
@@ -1,6 +1,6 @@
 import yaml
 
-from docling_core.types.experimental.document import DoclingDocument, FileInfo
+from docling_core.types.experimental.document import DoclingDocument, FileInfo, TableCell, BaseTableData
 
 
 def test_load_serialize_doc():
@@ -70,6 +70,61 @@ def test_construct_doc():
         label="list_item",
         text="Drains your bank account without consent.",
     )
+    # Make some table cells
+    table_cells = []
+    table_cells.append(TableCell(
+        row_span=2,
+        start_row_offset_idx=0,
+        end_row_offset_idx=1,
+        start_col_offset_idx=0,
+        end_col_offset_idx=1,
+        text="Product"
+    ))
+    table_cells.append(TableCell(
+        col_span=2,
+        start_row_offset_idx=0,
+        end_row_offset_idx=1,
+        start_col_offset_idx=1,
+        end_col_offset_idx=3,
+        text="Years"
+    ))
+    table_cells.append(TableCell(
+        start_row_offset_idx=1,
+        end_row_offset_idx=2,
+        start_col_offset_idx=1,
+        end_col_offset_idx=2,
+        text="2016"
+    ))
+    table_cells.append(TableCell(
+        start_row_offset_idx=1,
+        end_row_offset_idx=2,
+        start_col_offset_idx=2,
+        end_col_offset_idx=3,
+        text="2017"
+    ))
+    table_cells.append(TableCell(
+        start_row_offset_idx=2,
+        end_row_offset_idx=3,
+        start_col_offset_idx=0,
+        end_col_offset_idx=1,
+        text="Apple"
+    ))
+    table_cells.append(TableCell(
+        start_row_offset_idx=2,
+        end_row_offset_idx=3,
+        start_col_offset_idx=1,
+        end_col_offset_idx=2,
+        text="49823"
+    ))
+    table_cells.append(TableCell(
+        start_row_offset_idx=2,
+        end_row_offset_idx=3,
+        start_col_offset_idx=2,
+        end_col_offset_idx=3,
+        text="695944"
+    ))
+    table_el = BaseTableData(num_rows=3, num_cols=3, table_cells=table_cells)
+    doc.add_table(data=table_el)
 
     yaml_dump = yaml.safe_dump(doc.model_dump(mode="json", by_alias=True))
 

From 0685709324e8a5e7f818af0450b6f455aa0ba8cb Mon Sep 17 00:00:00 2001
From: Christoph Auer <cau@zurich.ibm.com>
Date: Mon, 23 Sep 2024 10:52:38 +0200
Subject: [PATCH 08/34] Tree element iterator, several API fixes

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
---
 docling_core/types/experimental/document.py | 117 +++++++++++-----
 test/data/experimental/dummy_doc.yaml       |  21 +--
 test/test_docling_doc.py                    | 145 ++++++++++++--------
 3 files changed, 175 insertions(+), 108 deletions(-)

diff --git a/docling_core/types/experimental/document.py b/docling_core/types/experimental/document.py
index 8521366..27fb516 100644
--- a/docling_core/types/experimental/document.py
+++ b/docling_core/types/experimental/document.py
@@ -2,8 +2,14 @@
 import typing
 from typing import Any, Dict, List, Optional, Tuple, Union
 
-from pydantic import AnyUrl, BaseModel, ConfigDict, Field, computed_field, model_validator
-from pydantic_extra_types.semantic_version import SemanticVersion
+from pydantic import (
+    AnyUrl,
+    BaseModel,
+    ConfigDict,
+    Field,
+    computed_field,
+    model_validator,
+)
 
 from docling_core.types.experimental.base import BoundingBox, Size
 
@@ -46,14 +52,19 @@ def from_dict_format(cls, data: Any) -> Any:
 
         return data
 
+
 class BaseTableData(BaseModel):  # TBD
     table_cells: List[TableCell]
     num_rows: int = 0
     num_cols: int = 0
 
-    @computed_field
+    @computed_field  # type: ignore
     @property
-    def grid(self) -> List[List[TableCell]]:         # TODO compute grid representation on the fly from table_cells
+    def grid(
+        self,
+    ) -> List[
+        List[TableCell]
+    ]:  # TODO compute grid representation on the fly from table_cells
 
         # Initialise empty table data grid (only empty cells)
         table_data = [
@@ -61,9 +72,9 @@ def grid(self) -> List[List[TableCell]]:         # TODO compute grid representat
                 TableCell(
                     text="",
                     start_row_offset_idx=i,
-                    end_row_offset_idx=i+1,
+                    end_row_offset_idx=i + 1,
                     start_col_offset_idx=j,
-                    end_col_offset_idx=j+1
+                    end_col_offset_idx=j + 1,
                 )
                 for j in range(self.num_cols)
             ]
@@ -73,19 +84,18 @@ def grid(self) -> List[List[TableCell]]:         # TODO compute grid representat
         # Overwrite cells in table data for which there is actual cell content.
         for cell in self.table_cells:
             for i in range(
-                    min(cell.start_row_offset_idx, self.num_rows),
-                    min(cell.end_row_offset_idx, self.num_rows),
+                min(cell.start_row_offset_idx, self.num_rows),
+                min(cell.end_row_offset_idx, self.num_rows),
             ):
                 for j in range(
-                        min(cell.start_col_offset_idx, self.num_cols),
-                        min(cell.end_col_offset_idx, self.num_cols),
+                    min(cell.start_col_offset_idx, self.num_cols),
+                    min(cell.end_col_offset_idx, self.num_cols),
                 ):
                     table_data[i][j] = cell
 
         return table_data
 
 
-
 class FileInfo(BaseModel):
     document_hash: str
 
@@ -93,14 +103,28 @@ class FileInfo(BaseModel):
 class RefItem(BaseModel):
     cref: str = Field(alias="$ref")
 
+    # This method makes RefItem compatible with DocItem
+    def get_ref(self):
+        return self
+
     model_config = ConfigDict(
         populate_by_name=True,
     )
 
     def resolve(self, doc: "DoclingDocument"):
-        _, path, index_str = self.cref.split("/")
-        index = int(index_str)
-        obj = doc.__getattribute__(path)[index]
+        path_components = self.cref.split("/")
+        if len(path_components) > 2:
+            _, path, index_str = path_components
+        else:
+            _, path = path_components
+            index_str = None
+
+        if index_str:
+            index = int(index_str)
+            obj = doc.__getattribute__(path)[index]
+        else:
+            obj = doc.__getattribute__(path)
+
         return obj
 
 
@@ -122,6 +146,9 @@ class NodeItem(BaseModel):
     parent: Optional[RefItem] = None
     children: List[RefItem] = []
 
+    def get_ref(self):
+        return RefItem(cref=f"#{self.dloc.split('#')[1]}")
+
     @computed_field  # type: ignore
     @property
     def hash(self) -> Uint64:  # TODO align with hasher on deepsearch-glm
@@ -163,7 +190,6 @@ class FloatingItem(DocItem):
     image: Optional[ImageRef] = None
 
 
-
 class FigureItem(FloatingItem):
     data: BaseFigureData
 
@@ -196,7 +222,7 @@ class PageItem(DocumentTrees):
 
 
 class DoclingDocument(DocumentTrees):
-    version: str = "0.0.1" #= SemanticVersion(version="0.0.1")
+    version: str = "0.0.1"  # = SemanticVersion(version="0.0.1")
     description: Any
     file_info: FileInfo
 
@@ -212,22 +238,16 @@ class DoclingDocument(DocumentTrees):
     #    group = GroupItem(name=name)
     #    self.furniture.children.append(group)
     #    return group
-    def resolve_cref(self, obj):
-        path = obj.dloc.split("#")[1]
-        return path
 
     def add_group(self, name: str, parent: Optional[GroupItem] = None) -> GroupItem:
         if not parent:
             parent = self.body
-            parent_cref = "#/body"
-        else:
-            parent_cref = self.resolve_cref(parent)
 
         group_index = len(self.groups)
         cref = f"#/groups/{group_index}"
         dloc = f"{self.file_info.document_hash}{cref}"
 
-        group = GroupItem(name=name, dloc=dloc, parent=RefItem(cref=parent_cref))
+        group = GroupItem(name=name, dloc=dloc, parent=parent.get_ref())
         self.groups.append(group)
         parent.children.append(RefItem(cref=cref))
 
@@ -244,9 +264,6 @@ def add_paragraph(
     ):
         if not parent:
             parent = self.body
-            parent_cref = "#/body"
-        else:
-            parent_cref = self.resolve_cref(parent)
 
         if not orig:
             orig = text
@@ -259,7 +276,7 @@ def add_paragraph(
             text=text,
             orig=orig,
             dloc=dloc,
-            parent=RefItem(cref=parent_cref),
+            parent=parent.get_ref(),
         )
         if prov:
             text_item.prov.append(prov)
@@ -272,25 +289,24 @@ def add_paragraph(
     def add_table(
         self,
         data: BaseTableData,
-        caption: Optional[RefItem] = None, # This is not cool yet.
+        caption: Optional[Union[TextItem, RefItem]] = None,  # This is not cool yet.
         prov: Optional[ProvenanceItem] = None,
         parent: Optional[GroupItem] = None,
     ):
         if not parent:
             parent = self.body
-            parent_cref = "#/body"
-        else:
-            parent_cref = self.resolve_cref(parent)
 
         table_index = len(self.tables)
         cref = f"#/tables/{table_index}"
         dloc = f"{self.file_info.document_hash}{cref}"
 
-        tbl_item = TableItem(label="table", data=data, dloc=dloc, parent=RefItem(cref=parent_cref))
+        tbl_item = TableItem(
+            label="table", data=data, dloc=dloc, parent=parent.get_ref()
+        )
         if prov:
             tbl_item.prov.append(prov)
         if caption:
-            tbl_item.caption = caption
+            tbl_item.caption = caption.get_ref()
 
         self.tables.append(tbl_item)
         parent.children.append(RefItem(cref=cref))
@@ -300,7 +316,7 @@ def add_table(
     def add_figure(
         self,
         data: BaseFigureData,
-        caption: Optional[RefItem] = None,
+        caption: Optional[Union[TextItem, RefItem]] = None,
         prov: Optional[ProvenanceItem] = None,
         parent: Optional[GroupItem] = None,
     ):
@@ -311,11 +327,13 @@ def add_figure(
         cref = f"#/figures/{figure_index}"
         dloc = f"{self.file_info.document_hash}{cref}"
 
-        fig_item = FigureItem(label="figure", data=data, dloc=dloc, parent=parent)
+        fig_item = FigureItem(
+            label="figure", data=data, dloc=dloc, parent=parent.get_ref()
+        )
         if prov:
             fig_item.prov.append(prov)
         if caption:
-            fig_item.caption = caption
+            fig_item.caption = caption.get_ref()
 
         self.figures.append(fig_item)
         parent.children.append(RefItem(cref=cref))
@@ -337,10 +355,33 @@ def add_heading(
         item.level = level
         return item
 
-
     def num_pages(self):
         return len(self.pages.values())
 
     def build_page_trees(self):
         # TODO: For every PageItem, update the furniture and body trees from the main doc.
-        pass
\ No newline at end of file
+        pass
+
+    def iterate_elements(
+        self,
+        root: Optional[NodeItem] = None,
+        omit_groups: bool = True,
+        traverse_figures: bool = True,
+    ) -> typing.Iterable[NodeItem]:
+        # Yield the current node
+        if not root:
+            root = self.body
+
+        if omit_groups and not isinstance(root, GroupItem):
+            yield root
+
+        # Traverse children
+        for child_ref in root.children:
+            child = child_ref.resolve(self)
+
+            if isinstance(child, NodeItem):
+                # If the child is a NodeItem, recursively traverse it
+                if isinstance(child, FigureItem) and traverse_figures:
+                    yield from self.iterate_elements(child)
+            else:  # leaf
+                yield child
diff --git a/test/data/experimental/dummy_doc.yaml b/test/data/experimental/dummy_doc.yaml
index bd93ced..a3b29f8 100644
--- a/test/data/experimental/dummy_doc.yaml
+++ b/test/data/experimental/dummy_doc.yaml
@@ -19,8 +19,8 @@ body:
   parent: null # Only root elements have no parent.
   children: # only the first-level children appear here, as references (RefItem)
     - $ref: "/texts/1"
-    - $ref: "/figure/0"
-    - $ref: "/texts/2"
+    - $ref: "/figures/0"
+    - $ref: "/texts/3"
     - $ref: "/tables/0"
 
 # All groups of items nested deeper in body or furniture roots, type List[GroupItem]
@@ -83,7 +83,7 @@ texts:
     hash: 6978483
     label: "caption"
     parent:
-      $ref: "/figures/0"
+      $ref: "#/body"
     children: [ ]
     prov:
       - page_no: 1
@@ -103,12 +103,6 @@ tables: # All tables...
     parent:
       $ref: "#/body"
     children: [ ]
-    caption:
-      $ref: "/texts/3"
-    references:
-      - $ref: "/text/??"
-    footnotes:
-      - $ref: "/text/??"
     image:
       format: png
       dpi: 72
@@ -139,13 +133,8 @@ figures: # All figures...
     parent:
       $ref: "#/body"
     caption:
-      $ref: "/texts/2"
-    references:
-      - $ref: "/text/??"
-    footnotes:
-      - $ref: "/text/??"
-
-    data: # FigureData Type
+      $ref: "/texts/3"
+    data: # BaseFigureData Type
       classification: "illustration"
       confidence: 0.78
       description: "...."
diff --git a/test/test_docling_doc.py b/test/test_docling_doc.py
index c96fe0b..a8cb8ea 100644
--- a/test/test_docling_doc.py
+++ b/test/test_docling_doc.py
@@ -1,6 +1,12 @@
 import yaml
 
-from docling_core.types.experimental.document import DoclingDocument, FileInfo, TableCell, BaseTableData
+from docling_core.types.experimental.document import (
+    BaseFigureData,
+    BaseTableData,
+    DoclingDocument,
+    FileInfo,
+    TableCell,
+)
 
 
 def test_load_serialize_doc():
@@ -18,7 +24,7 @@ def test_load_serialize_doc():
     text_item.prov[0].page_no
 
     # Objects that are references need explicit resolution for now:
-    obj = doc.body.children[2].resolve(doc=doc)  # Text item with parent
+    obj = doc.texts[2]  # Text item with parent
     parent = obj.parent.resolve(doc=doc)  # it is a figure
 
     obj2 = parent.children[0].resolve(
@@ -36,6 +42,11 @@ def test_load_serialize_doc():
     assert doc_reload == doc  # must be equal
     assert doc_reload is not doc  # can't be identical
 
+    ### Iterate all elements
+
+    for item in doc.iterate_elements():
+        print(f"Item: {item}")
+
 
 def test_construct_doc():
 
@@ -72,62 +83,88 @@ def test_construct_doc():
     )
     # Make some table cells
     table_cells = []
-    table_cells.append(TableCell(
-        row_span=2,
-        start_row_offset_idx=0,
-        end_row_offset_idx=1,
-        start_col_offset_idx=0,
-        end_col_offset_idx=1,
-        text="Product"
-    ))
-    table_cells.append(TableCell(
-        col_span=2,
-        start_row_offset_idx=0,
-        end_row_offset_idx=1,
-        start_col_offset_idx=1,
-        end_col_offset_idx=3,
-        text="Years"
-    ))
-    table_cells.append(TableCell(
-        start_row_offset_idx=1,
-        end_row_offset_idx=2,
-        start_col_offset_idx=1,
-        end_col_offset_idx=2,
-        text="2016"
-    ))
-    table_cells.append(TableCell(
-        start_row_offset_idx=1,
-        end_row_offset_idx=2,
-        start_col_offset_idx=2,
-        end_col_offset_idx=3,
-        text="2017"
-    ))
-    table_cells.append(TableCell(
-        start_row_offset_idx=2,
-        end_row_offset_idx=3,
-        start_col_offset_idx=0,
-        end_col_offset_idx=1,
-        text="Apple"
-    ))
-    table_cells.append(TableCell(
-        start_row_offset_idx=2,
-        end_row_offset_idx=3,
-        start_col_offset_idx=1,
-        end_col_offset_idx=2,
-        text="49823"
-    ))
-    table_cells.append(TableCell(
-        start_row_offset_idx=2,
-        end_row_offset_idx=3,
-        start_col_offset_idx=2,
-        end_col_offset_idx=3,
-        text="695944"
-    ))
+    table_cells.append(
+        TableCell(
+            row_span=2,
+            start_row_offset_idx=0,
+            end_row_offset_idx=1,
+            start_col_offset_idx=0,
+            end_col_offset_idx=1,
+            text="Product",
+        )
+    )
+    table_cells.append(
+        TableCell(
+            col_span=2,
+            start_row_offset_idx=0,
+            end_row_offset_idx=1,
+            start_col_offset_idx=1,
+            end_col_offset_idx=3,
+            text="Years",
+        )
+    )
+    table_cells.append(
+        TableCell(
+            start_row_offset_idx=1,
+            end_row_offset_idx=2,
+            start_col_offset_idx=1,
+            end_col_offset_idx=2,
+            text="2016",
+        )
+    )
+    table_cells.append(
+        TableCell(
+            start_row_offset_idx=1,
+            end_row_offset_idx=2,
+            start_col_offset_idx=2,
+            end_col_offset_idx=3,
+            text="2017",
+        )
+    )
+    table_cells.append(
+        TableCell(
+            start_row_offset_idx=2,
+            end_row_offset_idx=3,
+            start_col_offset_idx=0,
+            end_col_offset_idx=1,
+            text="Apple",
+        )
+    )
+    table_cells.append(
+        TableCell(
+            start_row_offset_idx=2,
+            end_row_offset_idx=3,
+            start_col_offset_idx=1,
+            end_col_offset_idx=2,
+            text="49823",
+        )
+    )
+    table_cells.append(
+        TableCell(
+            start_row_offset_idx=2,
+            end_row_offset_idx=3,
+            start_col_offset_idx=2,
+            end_col_offset_idx=3,
+            text="695944",
+        )
+    )
     table_el = BaseTableData(num_rows=3, num_cols=3, table_cells=table_cells)
     doc.add_table(data=table_el)
 
+    fig_caption = doc.add_paragraph(
+        label="caption", text="This is the caption of figure 1."
+    )
+    doc.add_figure(data=BaseFigureData(), caption=fig_caption.get_ref())
+
+    ### Iterate all elements
+
+    for item in doc.iterate_elements():
+        print(f"Item: {item}")
+
+    ### Serialize and deserialize stuff
+
     yaml_dump = yaml.safe_dump(doc.model_dump(mode="json", by_alias=True))
 
-    print(f"\n\n{yaml_dump}")
+    # print(f"\n\n{yaml_dump}")
 
     DoclingDocument.model_validate(yaml.safe_load(yaml_dump))

From f791f74c61b866e83f1c3fd22fb189f80a7ae0c7 Mon Sep 17 00:00:00 2001
From: Christoph Auer <cau@zurich.ibm.com>
Date: Mon, 23 Sep 2024 13:57:31 +0200
Subject: [PATCH 09/34] Turn captions into list field

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
---
 docling_core/types/experimental/document.py | 8 ++++----
 test/data/experimental/dummy_doc.yaml       | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/docling_core/types/experimental/document.py b/docling_core/types/experimental/document.py
index 27fb516..520aab5 100644
--- a/docling_core/types/experimental/document.py
+++ b/docling_core/types/experimental/document.py
@@ -54,7 +54,7 @@ def from_dict_format(cls, data: Any) -> Any:
 
 
 class BaseTableData(BaseModel):  # TBD
-    table_cells: List[TableCell]
+    table_cells: List[TableCell] = []
     num_rows: int = 0
     num_cols: int = 0
 
@@ -184,7 +184,7 @@ class Section(TextItem):
 
 
 class FloatingItem(DocItem):
-    caption: Optional[RefItem] = None
+    captions: List[RefItem] = []
     references: List[RefItem] = []
     footnotes: List[RefItem] = []
     image: Optional[ImageRef] = None
@@ -306,7 +306,7 @@ def add_table(
         if prov:
             tbl_item.prov.append(prov)
         if caption:
-            tbl_item.caption = caption.get_ref()
+            tbl_item.captions.append(caption.get_ref())
 
         self.tables.append(tbl_item)
         parent.children.append(RefItem(cref=cref))
@@ -333,7 +333,7 @@ def add_figure(
         if prov:
             fig_item.prov.append(prov)
         if caption:
-            fig_item.caption = caption.get_ref()
+            fig_item.captions.append(caption.get_ref())
 
         self.figures.append(fig_item)
         parent.children.append(RefItem(cref=cref))
diff --git a/test/data/experimental/dummy_doc.yaml b/test/data/experimental/dummy_doc.yaml
index a3b29f8..d2a6470 100644
--- a/test/data/experimental/dummy_doc.yaml
+++ b/test/data/experimental/dummy_doc.yaml
@@ -132,8 +132,8 @@ figures: # All figures...
     label: "figure"
     parent:
       $ref: "#/body"
-    caption:
-      $ref: "/texts/3"
+    captions:
+      - $ref: "/texts/3"
     data: # BaseFigureData Type
       classification: "illustration"
       confidence: 0.78

From 4f1c190791e16604932fb145cb68e8ce5a17496b Mon Sep 17 00:00:00 2001
From: Christoph Auer <cau@zurich.ibm.com>
Date: Tue, 24 Sep 2024 08:11:05 +0200
Subject: [PATCH 10/34] Add export methods to DoclingDocument and types

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
---
 docling_core/types/experimental/document.py | 527 +++++++++++++++++++-
 test/test_docling_doc.py                    |  16 +-
 2 files changed, 541 insertions(+), 2 deletions(-)

diff --git a/docling_core/types/experimental/document.py b/docling_core/types/experimental/document.py
index 520aab5..41fa935 100644
--- a/docling_core/types/experimental/document.py
+++ b/docling_core/types/experimental/document.py
@@ -2,6 +2,7 @@
 import typing
 from typing import Any, Dict, List, Optional, Tuple, Union
 
+import pandas as pd
 from pydantic import (
     AnyUrl,
     BaseModel,
@@ -10,7 +11,9 @@
     computed_field,
     model_validator,
 )
+from tabulate import tabulate
 
+from docling_core.types.doc.tokens import DocumentToken
 from docling_core.types.experimental.base import BoundingBox, Size
 
 Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
@@ -173,11 +176,79 @@ class DocItem(
     label: str
     prov: List[ProvenanceItem] = []
 
+    def get_location_tokens(
+        self,
+        new_line: str,
+        page_w: float,
+        page_h: float,
+        xsize: int = 100,
+        ysize: int = 100,
+        add_page_index: bool = True,
+    ) -> str:
+        """Get the location string for each provenance entry of this item."""
+        if not len(self.prov):
+            return ""
+
+        location = ""
+        for prov in self.prov:
+
+            page_i = -1
+            if add_page_index:
+                page_i = prov.page_no
+
+            loc_str = DocumentToken.get_location(
+                bbox=prov.bbox.to_bottom_left_origin(page_h).as_tuple(),
+                page_w=page_w,
+                page_h=page_h,
+                xsize=xsize,
+                ysize=ysize,
+                page_i=page_i,
+            )
+            location += f"{loc_str}{new_line}"
+
+        return location
+
 
 class TextItem(DocItem):
     orig: str  # untreated representation
     text: str  # sanitized representation
 
+    def export_to_document_tokens(
+        self,
+        new_line: str = "\n",
+        page_w: float = 0.0,
+        page_h: float = 0.0,
+        xsize: int = 100,
+        ysize: int = 100,
+        add_location: bool = True,
+        add_content: bool = True,
+        add_page_index: bool = True,
+    ):
+        """Export text element to document tokens format."""
+        body = f"<{self.label}>"
+        # body = f"<{self.name}>"
+
+        assert DocumentToken.is_known_token(
+            body
+        ), f"failed DocumentToken.is_known_token({body})"
+
+        if add_location:
+            body += self.get_location_tokens(
+                new_line="",
+                page_w=page_w,
+                page_h=page_h,
+                xsize=xsize,
+                ysize=ysize,
+                add_page_index=add_page_index,
+            )
+
+        if add_content and self.text is not None:
+            body += self.text.strip()
+
+        body += f"</{self.label}>{new_line}"
+
+        return body
+
 
 class Section(TextItem):
     level: LevelNumber = 1
@@ -193,10 +264,233 @@ class FloatingItem(DocItem):
 class FigureItem(FloatingItem):
     data: BaseFigureData
 
+    def export_to_document_tokens(
+        self,
+        doc: "DoclingDocument",
+        new_line: str = "\n",
+        page_w: float = 0.0,
+        page_h: float = 0.0,
+        xsize: int = 100,
+        ysize: int = 100,
+        add_location: bool = True,
+        add_caption: bool = True,
+        add_content: bool = True,  # not used at the moment
+        add_page_index: bool = True,
+    ):
+        """Export figure to document tokens format."""
+        body = f"{DocumentToken.BEG_FIGURE.value}{new_line}"
+
+        if add_location:
+            body += self.get_location_tokens(
+                new_line=new_line,
+                page_w=page_w,
+                page_h=page_h,
+                xsize=xsize,
+                ysize=ysize,
+                add_page_index=add_page_index,
+            )
+
+        if add_caption and len(self.captions):
+            text = ""
+            for cap in self.captions:
+                text += cap.resolve(doc).text
+
+            if len(text):
+                body += f"{DocumentToken.BEG_CAPTION.value}"
+                body += f"{text.strip()}"
+                body += f"{DocumentToken.END_CAPTION.value}"
+                body += f"{new_line}"
+
+        body += f"{DocumentToken.END_FIGURE.value}{new_line}"
+
+        return body
+
 
 class TableItem(FloatingItem):
     data: BaseTableData
 
+    def export_to_dataframe(self) -> pd.DataFrame:
+        """Export the table as a Pandas DataFrame."""
+        if self.data is None or self.data.num_rows == 0 or self.data.num_cols == 0:
+            return pd.DataFrame()
+
+        # Count how many rows are column headers
+        num_headers = 0
+        for i, row in enumerate(self.data.grid):
+            if len(row) == 0:
+                raise RuntimeError(
+                    f"Invalid table. {len(row)=} but {self.data.num_cols=}."
+                )
+
+            any_header = False
+            for cell in row:
+                if cell.column_header:
+                    any_header = True
+                    break
+
+            if any_header:
+                num_headers += 1
+            else:
+                break
+
+        # Create the column names from all col_headers
+        columns: Optional[List[str]] = None
+        if num_headers > 0:
+            columns = ["" for _ in range(self.data.num_cols)]
+            for i in range(num_headers):
+                for j, cell in enumerate(self.data.grid[i]):
+                    col_name = cell.text
+                    if columns[j] != "":
+                        col_name = f".{col_name}"
+                    columns[j] += col_name
+
+        # Create table data
+        table_data = [
+            [cell.text for cell in row] for row in self.data.grid[num_headers:]
+        ]
+
+        # Create DataFrame
+        df = pd.DataFrame(table_data, columns=columns)
+
+        return df
+
+    def export_to_html(self) -> str:
+        """Export the table as html."""
+        body = ""
+        nrows = self.data.num_rows
+        ncols = self.data.num_cols
+
+        if not len(self.data.table_cells):
+            return ""
+        for i in range(nrows):
+            body += "<tr>"
+            for j in range(ncols):
+                cell: TableCell = self.data.grid[i][j]
+
+                rowspan, rowstart, rowend = (
+                    cell.row_span,
+                    cell.start_row_offset_idx,
+                    cell.end_row_offset_idx,
+                )
+                colspan, colstart, colend = (
+                    cell.col_span,
+                    cell.start_col_offset_idx,
+                    cell.end_col_offset_idx,
+                )
+
+                if rowstart != i:
+                    continue
+                if colstart != j:
+                    continue
+
+                content = cell.text.strip()
+                celltag = "td"
+                if cell.column_header:
+                    celltag = "th"
+
+                opening_tag = f"{celltag}"
+                if rowspan > 1:
+                    opening_tag += f' rowspan="{rowspan}"'
+                if colspan > 1:
+                    opening_tag += f' colspan="{colspan}"'
+
+                body += f"<{opening_tag}>{content}</{celltag}>"
+            body += "</tr>"
+        body = f"<table>{body}</table>"
+
+        return body
+
+    def export_to_document_tokens(
+        self,
+        doc: "DoclingDocument",
+        new_line: str = "\n",
+        page_w: float = 0.0,
+        page_h: float = 0.0,
+        xsize: int = 100,
+        ysize: int = 100,
+        add_location: bool = True,
+        add_caption: bool = True,
+        add_content: bool = True,
+        add_cell_location: bool = True,
+        add_cell_label: bool = True,
+        add_cell_text: bool = True,
+        add_page_index: bool = True,
+    ):
+        """Export table to document tokens format."""
+        body = f"{DocumentToken.BEG_TABLE.value}{new_line}"
+
+        if add_location:
+            body += self.get_location_tokens(
+                new_line=new_line,
+                page_w=page_w,
+                page_h=page_h,
+                xsize=xsize,
+                ysize=ysize,
+                add_page_index=add_page_index,
+            )
+
+        if add_caption and len(self.captions):
+            text = ""
+            for cap in self.captions:
+                text += cap.resolve(doc).text
+
+            if len(text):
+                body += f"{DocumentToken.BEG_CAPTION.value}"
+                body += f"{text.strip()}"
+                body += f"{DocumentToken.END_CAPTION.value}"
+                body += f"{new_line}"
+
+        if add_content and len(self.data.table_cells) > 0:
+            for i, row in enumerate(self.data.grid):
+                body += f"<row_{i}>"
+                for j, col in enumerate(row):
+
+                    text = ""
+                    if add_cell_text:
+                        text = col.text.strip()
+
+                    cell_loc = ""
+                    if (
+                        col.bbox is not None
+                        and add_cell_location
+                        and add_page_index
+                        and self.prov is not None
+                        and len(self.prov) > 0
+                    ):
+                        cell_loc = DocumentToken.get_location(
+                            bbox=col.bbox.to_bottom_left_origin(page_h).as_tuple(),
+                            page_w=page_w,
+                            page_h=page_h,
+                            xsize=xsize,
+                            ysize=ysize,
+                            page_i=self.prov[0].page_no,
+                        )
+                    elif (
+                        col.bbox is not None
+                        and add_cell_location
+                        and not add_page_index
+                    ):
+                        cell_loc = DocumentToken.get_location(
+                            bbox=col.bbox.to_bottom_left_origin(page_h).as_tuple(),
+                            page_w=page_w,
+                            page_h=page_h,
+                            xsize=xsize,
+                            ysize=ysize,
+                            page_i=-1,
+                        )
+
+                    cell_label = ""
+                    if add_cell_label:
+                        cell_label = f"<{'col_header' if col.column_header else 'row_header' if col.row_header else 'row_section' if col.row_section else 'body'}>"
+
+                    body += f"<col_{j}>{cell_loc}{cell_label}{text}</col_{j}>"
+
+                body += f"</row_{i}>{new_line}"
+
+        body += f"{DocumentToken.END_TABLE.value}{new_line}"
+
+        return body
+
 
 class KeyValueItem(DocItem):
     pass
@@ -381,7 +675,238 @@ def iterate_elements(
 
             if isinstance(child, NodeItem):
                 # If the child is a NodeItem, recursively traverse it
-                if isinstance(child, FigureItem) and traverse_figures:
+                if not isinstance(child, FigureItem) or traverse_figures:
                     yield from self.iterate_elements(child)
             else:  # leaf
                 yield child
+
+    def export_to_markdown(
+        self,
+        delim: str = "\n\n",
+        from_element: int = 0,
+        to_element: Optional[int] = None,
+        labels: list[str] = [
+            "title",
+            "subtitle-level-1",
+            "paragraph",
+            "caption",
+            "table",
+            "Text",
+            "text",
+        ],
+        strict_text: bool = False,
+    ) -> str:
+        r"""Serialize to Markdown.
+
+        Operates on a slice of the document's body as defined through arguments
+        from_element and to_element; defaulting to the whole body.
+
+        Args:
+            delim (str, optional): Delimiter to use when concatenating the various
+                Markdown parts. Defaults to "\n\n".
+            from_element (int, optional): Body slicing start index (inclusive).
+                Defaults to 0.
+            to_element (Optional[int], optional): Body slicing stop index
+                (exclusive). Defaults to None.
+
+        Returns:
+            str: The exported Markdown representation.
+        """
+        has_title = False
+        prev_text = ""
+        md_texts: list[str] = []
+
+        skip_count = 0
+        if len(self.body.children):
+            for ix, item in enumerate(self.iterate_elements(self.body)):
+                if skip_count < from_element:
+                    skip_count += 1
+                    continue  # skip as many items as you want
+
+                if to_element and ix >= to_element:
+                    break
+
+                markdown_text = ""
+
+                if isinstance(item, DocItem):
+                    item_type = item.label
+
+                    if isinstance(item, TextItem) and item_type in labels:
+                        text = item.text
+
+                        # ignore repeated text
+                        if prev_text == text or text is None:
+                            continue
+                        else:
+                            prev_text = text
+
+                        # first title match
+                        if item_type == "title" and not has_title:
+                            if strict_text:
+                                markdown_text = f"{text}"
+                            else:
+                                markdown_text = f"# {text}"
+                            has_title = True
+
+                        # secondary titles
+                        elif item_type in {"title", "subtitle-level-1"} or (
+                            has_title and item_type == "title"
+                        ):
+                            if strict_text:
+                                markdown_text = f"{text}"
+                            else:
+                                markdown_text = f"## {text}"
+
+                        # normal text
+                        else:
+                            markdown_text = text
+
+                    elif (
+                        isinstance(item, TableItem)
+                        and item.data
+                        and item_type in labels
+                        and not strict_text
+                    ):
+                        table = []
+                        for row in item.data.grid:
+                            tmp = []
+                            for col in row:
+                                tmp.append(col.text)
+                            table.append(tmp)
+
+                        if len(table) > 1 and len(table[0]) > 0:
+                            try:
+                                md_table = tabulate(
+                                    table[1:], headers=table[0], tablefmt="github"
+                                )
+                            except ValueError:
+                                md_table = tabulate(
+                                    table[1:],
+                                    headers=table[0],
+                                    tablefmt="github",
+                                    disable_numparse=True,
+                                )
+
+                            markdown_text = md_table
+
+                if markdown_text:
+                    md_texts.append(markdown_text)
+
+        result = delim.join(md_texts)
+        return result
+
+    def export_to_document_tokens(
+        self,
+        delim: str = "\n\n",
+        from_element: int = 0,
+        to_element: Optional[int] = None,
+        labels: list[str] = [
+            "title",
+            "subtitle-level-1",
+            "paragraph",
+            "caption",
+            "table",
+            "figure",
+            "text",
+        ],
+        xsize: int = 100,
+        ysize: int = 100,
+        add_location: bool = True,
+        add_content: bool = True,
+        add_page_index: bool = True,
+        # table specific flags
+        add_table_cell_location: bool = False,
+        add_table_cell_label: bool = True,
+        add_table_cell_text: bool = True,
+    ) -> str:
+        r"""Exports the document content to a DocumentToken format.
+
+        Operates on a slice of the document's body as defined through arguments
+        from_element and to_element; defaulting to the whole body.
+
+        Returns:
+            str: The content of the document formatted as a DocTags string.
+        """
+        new_line = ""
+        if delim:
+            new_line = "\n"
+
+        doctags = f"{DocumentToken.BEG_DOCUMENT.value}{new_line}"
+
+        # pagedims = self.get_map_to_page_dimensions()
+
+        skip_count = 0
+        if len(self.body.children):
+            for ix, item in enumerate(self.iterate_elements(self.body)):
+
+                if skip_count < from_element:
+                    skip_count += 1
+                    continue  # skip as many items as you want
+
+                if to_element and ix >= to_element:
+                    break
+
+                prov = item.prov
+
+                page_i = -1
+                page_w = 0.0
+                page_h = 0.0
+
+                if add_location and len(self.pages) and len(prov) > 0:
+
+                    page_i = prov[0].page_no
+                    page_dim = self.pages[page_i - 1].size
+
+                    page_w = float(page_dim.width)
+                    page_h = float(page_dim.height)
+
+                item_type = item.label
+                if isinstance(item, TextItem) and (item_type in labels):
+
+                    doctags += item.export_to_document_tokens(
+                        new_line=new_line,
+                        page_w=page_w,
+                        page_h=page_h,
+                        xsize=xsize,
+                        ysize=ysize,
+                        add_location=add_location,
+                        add_content=add_content,
+                        add_page_index=add_page_index,
+                    )
+
+                elif isinstance(item, TableItem) and (item_type in labels):
+
+                    doctags += item.export_to_document_tokens(
+                        doc=self,
+                        new_line=new_line,
+                        page_w=page_w,
+                        page_h=page_h,
+                        xsize=xsize,
+                        ysize=ysize,
+                        add_caption=True,
+                        add_location=add_location,
+                        add_content=add_content,
+                        add_cell_location=add_table_cell_location,
+                        add_cell_label=add_table_cell_label,
+                        add_cell_text=add_table_cell_text,
+                        add_page_index=add_page_index,
+                    )
+
+                elif isinstance(item, FigureItem) and (item_type in labels):
+
+                    doctags += item.export_to_document_tokens(
+                        doc=self,
+                        new_line=new_line,
+                        page_w=page_w,
+                        page_h=page_h,
+                        xsize=xsize,
+                        ysize=ysize,
+                        add_caption=True,
+                        add_location=add_location,
+                        add_content=add_content,
+                        add_page_index=add_page_index,
+                    )
+
+        doctags += DocumentToken.END_DOCUMENT.value
+
+        return doctags
diff --git a/test/test_docling_doc.py b/test/test_docling_doc.py
index a8cb8ea..4e25a5d 100644
--- a/test/test_docling_doc.py
+++ b/test/test_docling_doc.py
@@ -87,7 +87,7 @@ def test_construct_doc():
         TableCell(
             row_span=2,
             start_row_offset_idx=0,
-            end_row_offset_idx=1,
+            end_row_offset_idx=2,
             start_col_offset_idx=0,
             end_col_offset_idx=1,
             text="Product",
@@ -161,6 +161,20 @@ def test_construct_doc():
     for item in doc.iterate_elements():
         print(f"Item: {item}")
 
+    ## Export stuff
+
+    print(doc.export_to_markdown())
+    print(doc.export_to_document_tokens())
+
+    for table in doc.tables:
+        table.export_to_html()
+        table.export_to_dataframe()
+        table.export_to_document_tokens(doc)
+        1 == 1
+
+    for fig in doc.figures:
+        fig.export_to_document_tokens(doc)
+
     ### Serialize and deserialize stuff
 
     yaml_dump = yaml.safe_dump(doc.model_dump(mode="json", by_alias=True))

From 0a1e6ce9559ffccf50c5e63c33962ac8fde35648 Mon Sep 17 00:00:00 2001
From: Christoph Auer <cau@zurich.ibm.com>
Date: Tue, 24 Sep 2024 11:16:52 +0200
Subject: [PATCH 11/34] Change DoclingDocument.iterate_elements and add print
 tree function

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
---
 docling_core/types/experimental/document.py | 24 ++++++++++++++-------
 test/test_docling_doc.py                    |  9 ++++----
 2 files changed, 21 insertions(+), 12 deletions(-)

diff --git a/docling_core/types/experimental/document.py b/docling_core/types/experimental/document.py
index 41fa935..e92c047 100644
--- a/docling_core/types/experimental/document.py
+++ b/docling_core/types/experimental/document.py
@@ -659,15 +659,16 @@ def build_page_trees(self):
     def iterate_elements(
         self,
         root: Optional[NodeItem] = None,
-        omit_groups: bool = True,
+        with_groups: bool = False,
         traverse_figures: bool = True,
-    ) -> typing.Iterable[NodeItem]:
+        level=0,
+    ) -> typing.Iterable[Tuple[NodeItem, int]]:  # tuple of node and level
         # Yield the current node
         if not root:
             root = self.body
 
-        if omit_groups and not isinstance(root, GroupItem):
-            yield root
+        if not isinstance(root, GroupItem) or with_groups:
+            yield root, level
 
         # Traverse children
         for child_ref in root.children:
@@ -676,9 +677,16 @@ def iterate_elements(
             if isinstance(child, NodeItem):
                 # If the child is a NodeItem, recursively traverse it
                 if not isinstance(child, FigureItem) or traverse_figures:
-                    yield from self.iterate_elements(child)
+                    yield from self.iterate_elements(child, level=level + 1)
             else:  # leaf
-                yield child
+                yield child, level
+
+    def print_element_tree(self):
+        for ix, (item, level) in enumerate(self.iterate_elements(with_groups=True)):
+            if isinstance(item, GroupItem):
+                print(" " * level, f"{ix}: {item.name}")
+            elif isinstance(item, DocItem):
+                print(" " * level, f"{ix}: {item.label}")
 
     def export_to_markdown(
         self,
@@ -718,7 +726,7 @@ def export_to_markdown(
 
         skip_count = 0
         if len(self.body.children):
-            for ix, item in enumerate(self.iterate_elements(self.body)):
+            for ix, (item, level) in enumerate(self.iterate_elements(self.body)):
                 if skip_count < from_element:
                     skip_count += 1
                     continue  # skip as many items as you want
@@ -837,7 +845,7 @@ def export_to_document_tokens(
 
         skip_count = 0
         if len(self.body.children):
-            for ix, item in enumerate(self.iterate_elements(self.body)):
+            for ix, (item, level) in enumerate(self.iterate_elements(self.body)):
 
                 if skip_count < from_element:
                     skip_count += 1
diff --git a/test/test_docling_doc.py b/test/test_docling_doc.py
index 4e25a5d..ee1a9f9 100644
--- a/test/test_docling_doc.py
+++ b/test/test_docling_doc.py
@@ -44,8 +44,8 @@ def test_load_serialize_doc():
 
     ### Iterate all elements
 
-    for item in doc.iterate_elements():
-        print(f"Item: {item}")
+    for item, level in doc.iterate_elements():
+        print(f"Item: {item} at level {level}")
 
 
 def test_construct_doc():
@@ -158,7 +158,7 @@ def test_construct_doc():
 
     ### Iterate all elements
 
-    for item in doc.iterate_elements():
+    for item, level in doc.iterate_elements():
         print(f"Item: {item}")
 
     ## Export stuff
@@ -170,11 +170,12 @@ def test_construct_doc():
         table.export_to_html()
         table.export_to_dataframe()
         table.export_to_document_tokens(doc)
-        1 == 1
 
     for fig in doc.figures:
         fig.export_to_document_tokens(doc)
 
+    doc.print_element_tree()
+
     ### Serialize and deserialize stuff
 
     yaml_dump = yaml.safe_dump(doc.model_dump(mode="json", by_alias=True))

From a83ff0056138d83ac2cb52bfb2ab1728ff86972f Mon Sep 17 00:00:00 2001
From: Christoph Auer <cau@zurich.ibm.com>
Date: Tue, 24 Sep 2024 15:53:41 +0200
Subject: [PATCH 12/34] Introduce label enum types, apply everywhere

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
---
 docling_core/types/experimental/labels.py     |    0
 .../experimental/2206.01062.experimental.yaml | 3689 +++++++++++++++++
 2 files changed, 3689 insertions(+)
 create mode 100644 docling_core/types/experimental/labels.py
 create mode 100644 test/data/experimental/2206.01062.experimental.yaml

diff --git a/docling_core/types/experimental/labels.py b/docling_core/types/experimental/labels.py
new file mode 100644
index 0000000..e69de29
diff --git a/test/data/experimental/2206.01062.experimental.yaml b/test/data/experimental/2206.01062.experimental.yaml
new file mode 100644
index 0000000..76f8480
--- /dev/null
+++ b/test/data/experimental/2206.01062.experimental.yaml
@@ -0,0 +1,3689 @@
+body:
+  children:
+  - $ref: '#/texts/0'
+  - $ref: '#/texts/1'
+  - $ref: '#/texts/2'
+  - $ref: '#/texts/3'
+  - $ref: '#/texts/4'
+  - $ref: '#/texts/5'
+  - $ref: '#/texts/6'
+  - $ref: '#/texts/7'
+  - $ref: '#/texts/8'
+  - $ref: '#/texts/9'
+  - $ref: '#/texts/10'
+  - $ref: '#/texts/11'
+  - $ref: '#/texts/12'
+  - $ref: '#/figures/0'
+  - $ref: '#/texts/13'
+  - $ref: '#/texts/14'
+  - $ref: '#/texts/15'
+  - $ref: '#/texts/16'
+  - $ref: '#/texts/17'
+  - $ref: '#/texts/18'
+  - $ref: '#/texts/19'
+  - $ref: '#/texts/20'
+  - $ref: '#/texts/21'
+  - $ref: '#/texts/22'
+  - $ref: '#/texts/23'
+  - $ref: '#/texts/24'
+  - $ref: '#/texts/25'
+  - $ref: '#/texts/26'
+  - $ref: '#/texts/27'
+  - $ref: '#/texts/28'
+  - $ref: '#/texts/29'
+  - $ref: '#/texts/30'
+  - $ref: '#/texts/31'
+  - $ref: '#/texts/32'
+  - $ref: '#/texts/33'
+  - $ref: '#/texts/34'
+  - $ref: '#/texts/35'
+  - $ref: '#/texts/36'
+  - $ref: '#/texts/37'
+  - $ref: '#/texts/38'
+  - $ref: '#/texts/39'
+  - $ref: '#/figures/1'
+  - $ref: '#/texts/40'
+  - $ref: '#/texts/41'
+  - $ref: '#/texts/42'
+  - $ref: '#/texts/43'
+  - $ref: '#/texts/44'
+  - $ref: '#/texts/45'
+  - $ref: '#/texts/46'
+  - $ref: '#/texts/47'
+  - $ref: '#/texts/48'
+  - $ref: '#/texts/49'
+  - $ref: '#/texts/50'
+  - $ref: '#/tables/0'
+  - $ref: '#/texts/51'
+  - $ref: '#/texts/52'
+  - $ref: '#/texts/53'
+  - $ref: '#/texts/54'
+  - $ref: '#/texts/55'
+  - $ref: '#/texts/56'
+  - $ref: '#/texts/57'
+  - $ref: '#/texts/58'
+  - $ref: '#/texts/59'
+  - $ref: '#/texts/60'
+  - $ref: '#/texts/61'
+  - $ref: '#/texts/62'
+  - $ref: '#/texts/63'
+  - $ref: '#/texts/64'
+  - $ref: '#/texts/65'
+  - $ref: '#/texts/66'
+  - $ref: '#/texts/67'
+  - $ref: '#/texts/68'
+  - $ref: '#/texts/69'
+  - $ref: '#/texts/70'
+  - $ref: '#/figures/2'
+  - $ref: '#/texts/71'
+  - $ref: '#/texts/72'
+  - $ref: '#/texts/73'
+  - $ref: '#/tables/1'
+  - $ref: '#/texts/74'
+  - $ref: '#/texts/75'
+  - $ref: '#/texts/76'
+  - $ref: '#/figures/3'
+  - $ref: '#/texts/77'
+  - $ref: '#/texts/78'
+  - $ref: '#/texts/79'
+  - $ref: '#/texts/80'
+  - $ref: '#/texts/81'
+  - $ref: '#/texts/82'
+  - $ref: '#/tables/2'
+  - $ref: '#/texts/83'
+  - $ref: '#/texts/84'
+  - $ref: '#/texts/85'
+  - $ref: '#/texts/86'
+  - $ref: '#/texts/87'
+  - $ref: '#/tables/3'
+  - $ref: '#/texts/88'
+  - $ref: '#/texts/89'
+  - $ref: '#/texts/90'
+  - $ref: '#/texts/91'
+  - $ref: '#/texts/92'
+  - $ref: '#/texts/93'
+  - $ref: '#/tables/4'
+  - $ref: '#/texts/94'
+  - $ref: '#/texts/95'
+  - $ref: '#/texts/96'
+  - $ref: '#/texts/97'
+  - $ref: '#/texts/98'
+  - $ref: '#/texts/99'
+  - $ref: '#/texts/100'
+  - $ref: '#/texts/101'
+  - $ref: '#/texts/102'
+  - $ref: '#/texts/103'
+  - $ref: '#/texts/104'
+  - $ref: '#/texts/105'
+  - $ref: '#/texts/106'
+  - $ref: '#/texts/107'
+  - $ref: '#/texts/108'
+  - $ref: '#/texts/109'
+  - $ref: '#/texts/110'
+  - $ref: '#/texts/111'
+  - $ref: '#/texts/112'
+  - $ref: '#/texts/113'
+  - $ref: '#/texts/114'
+  - $ref: '#/texts/115'
+  - $ref: '#/texts/116'
+  - $ref: '#/texts/117'
+  - $ref: '#/figures/4'
+  - $ref: '#/texts/118'
+  - $ref: '#/texts/119'
+  - $ref: '#/texts/120'
+  - $ref: '#/texts/121'
+  - $ref: '#/texts/122'
+  - $ref: '#/texts/123'
+  - $ref: '#/texts/124'
+  - $ref: '#/texts/125'
+  - $ref: '#/texts/126'
+  - $ref: '#/texts/127'
+  dloc: '#/body'
+  hash: 1876595454579351028
+  name: _root_
+  parent: null
+description: {}
+figures:
+- captions:
+  - $ref: '#/texts/12'
+  children: []
+  data: {}
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/figures/0
+  footnotes: []
+  hash: 3823827261264467155
+  image: null
+  label: picture
+  parent:
+    $ref: '#/body'
+  prov: []
+  references: []
+- captions:
+  - $ref: '#/texts/39'
+  children: []
+  data: {}
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/figures/1
+  footnotes: []
+  hash: 2717789230650946439
+  image: null
+  label: picture
+  parent:
+    $ref: '#/body'
+  prov: []
+  references: []
+- captions:
+  - $ref: '#/texts/70'
+  children: []
+  data: {}
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/figures/2
+  footnotes: []
+  hash: 11874686886604579344
+  image: null
+  label: picture
+  parent:
+    $ref: '#/body'
+  prov: []
+  references: []
+- captions:
+  - $ref: '#/texts/76'
+  children: []
+  data: {}
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/figures/3
+  footnotes: []
+  hash: 13157758373214615403
+  image: null
+  label: picture
+  parent:
+    $ref: '#/body'
+  prov: []
+  references: []
+- captions:
+  - $ref: '#/texts/117'
+  children: []
+  data: {}
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/figures/4
+  footnotes: []
+  hash: 3241646916892239195
+  image: null
+  label: picture
+  parent:
+    $ref: '#/body'
+  prov: []
+  references: []
+file_info:
+  document_hash: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc
+furniture:
+  children: []
+  dloc: '#/furniture'
+  hash: 5280524054814059340
+  name: _root_
+  parent: null
+groups: []
+key_value_items: []
+pages:
+  '1':
+    hash: 3c76b6d3fd82865e42c51d5cbd7d1a9996dba7902643b919acc581e866b92716
+    image: null
+    page_no: 1
+    size:
+      height: 792.0
+      width: 612.0
+  '2':
+    hash: 5ccfaddd314d3712cbabc857c8c0f33d1268341ce37b27089857cbf09f0522d4
+    image: null
+    page_no: 2
+    size:
+      height: 792.0
+      width: 612.0
+  '3':
+    hash: d2dc51ad0a01ee9486ffe248649ee1cd10ce35773de8e4b21abf30d310f4fc26
+    image: null
+    page_no: 3
+    size:
+      height: 792.0
+      width: 612.0
+  '4':
+    hash: 310121977375f8f1106412189943bd70f121629b2b4d35394077233dedbfb041
+    image: null
+    page_no: 4
+    size:
+      height: 792.0
+      width: 612.0
+  '5':
+    hash: 09fa72b602eb0640669844acabc17ef494802a4a9188aeaaf0e0131c496e6951
+    image: null
+    page_no: 5
+    size:
+      height: 792.0
+      width: 612.0
+  '6':
+    hash: ec3fa60f136f3d9f5fa790ab27f5d1c14e5622573c52377b909b591d0be0ea44
+    image: null
+    page_no: 6
+    size:
+      height: 792.0
+      width: 612.0
+  '7':
+    hash: ec1bc56fe581ce95615b1fab11c3ba8fc89662acf2f53446decd380a155b06dd
+    image: null
+    page_no: 7
+    size:
+      height: 792.0
+      width: 612.0
+  '8':
+    hash: fbd2b06876dddc19ee08e0a9751d978c03e6943b74bedf1d83d6528cd4f8954d
+    image: null
+    page_no: 8
+    size:
+      height: 792.0
+      width: 612.0
+  '9':
+    hash: 6cfa4eb4410fa9972da289dbf8d8cc585d317a192e1214c778ddd7768e98f311
+    image: null
+    page_no: 9
+    size:
+      height: 792.0
+      width: 612.0
+tables:
+- captions:
+  - $ref: '#/texts/50'
+  children: []
+  data:
+    grid: []
+    num_cols: 0
+    num_rows: 0
+    table_cells: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/tables/0
+  footnotes: []
+  hash: 14148577749296175318
+  image: null
+  label: table
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 498.30108642578125
+      coord_origin: BOTTOMLEFT
+      l: 98.96420288085938
+      r: 512.7739868164062
+      t: 654.1231689453125
+    charspan:
+    - 0
+    - 0
+    page_no: 4
+  references: []
+- captions:
+  - $ref: '#/texts/73'
+  children: []
+  data:
+    grid: []
+    num_cols: 0
+    num_rows: 0
+    table_cells: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/tables/1
+  footnotes: []
+  hash: 17333450552515386005
+  image: null
+  label: table
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 440.30438232421875
+      coord_origin: BOTTOMLEFT
+      l: 61.93328094482422
+      r: 285.75616455078125
+      t: 596.587158203125
+    charspan:
+    - 0
+    - 0
+    page_no: 6
+  references: []
+- captions:
+  - $ref: '#/texts/82'
+  children: []
+  data:
+    grid: []
+    num_cols: 0
+    num_rows: 0
+    table_cells: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/tables/2
+  footnotes: []
+  hash: 16080913497667217474
+  image: null
+  label: table
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 496.419189453125
+      coord_origin: BOTTOMLEFT
+      l: 80.5073471069336
+      r: 267.3428649902344
+      t: 640.9814453125
+    charspan:
+    - 0
+    - 0
+    page_no: 7
+  references: []
+- captions:
+  - $ref: '#/texts/87'
+  children: []
+  data:
+    grid: []
+    num_cols: 0
+    num_rows: 0
+    table_cells: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/tables/3
+  footnotes: []
+  hash: 7071974284449481758
+  image: null
+  label: table
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 485.2873840332031
+      coord_origin: BOTTOMLEFT
+      l: 353.065185546875
+      r: 523.3069458007812
+      t: 641.25341796875
+    charspan:
+    - 0
+    - 0
+    page_no: 7
+  references: []
+- captions:
+  - $ref: '#/texts/93'
+  children: []
+  data:
+    grid: []
+    num_cols: 0
+    num_rows: 0
+    table_cells: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/tables/4
+  footnotes: []
+  hash: 8754037299649738038
+  image: null
+  label: table
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 452.12615966796875
+      coord_origin: BOTTOMLEFT
+      l: 72.87370300292969
+      r: 274.87945556640625
+      t: 619.3699951171875
+    charspan:
+    - 0
+    - 0
+    page_no: 8
+  references: []
+texts:
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/0
+  hash: 5801389470470321019
+  label: section_header
+  orig: 'DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis'
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 672.3833618164062
+      coord_origin: BOTTOMLEFT
+      l: 107.30000305175781
+      r: 505.1857604980469
+      t: 709.082275390625
+    charspan:
+    - 0
+    - 71
+    page_no: 1
+  text: 'DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis'
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/1
+  hash: 8511179082257553176
+  label: text
+  orig: Birgit Pfitzmann IBM Research Rueschlikon, Switzerland bpf@zurich.ibm.com
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 611.2825317382812
+      coord_origin: BOTTOMLEFT
+      l: 90.94670867919922
+      r: 193.91998291015625
+      t: 658.7803344726562
+    charspan:
+    - 0
+    - 73
+    page_no: 1
+  text: Birgit Pfitzmann IBM Research Rueschlikon, Switzerland bpf@zurich.ibm.com
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/2
+  hash: 8279137503716887272
+  label: text
+  orig: Christoph Auer IBM Research Rueschlikon, Switzerland cau@zurich.ibm.com
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 611.7597045898438
+      coord_origin: BOTTOMLEFT
+      l: 254.97935485839844
+      r: 357.8802490234375
+      t: 658.7174072265625
+    charspan:
+    - 0
+    - 71
+    page_no: 1
+  text: Christoph Auer IBM Research Rueschlikon, Switzerland cau@zurich.ibm.com
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/3
+  hash: 16452346600845753706
+  label: text
+  orig: Michele Dolfi IBM Research Rueschlikon, Switzerland dol@zurich.ibm.com
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 611.7597045898438
+      coord_origin: BOTTOMLEFT
+      l: 419.0672302246094
+      r: 522.0595703125
+      t: 658.9878540039062
+    charspan:
+    - 0
+    - 70
+    page_no: 1
+  text: Michele Dolfi IBM Research Rueschlikon, Switzerland dol@zurich.ibm.com
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/4
+  hash: 5753518757297767565
+  label: text
+  orig: Ahmed S. Nassar IBM Research Rueschlikon, Switzerland ahn@zurich.ibm.com
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 553.3746948242188
+      coord_origin: BOTTOMLEFT
+      l: 171.90907287597656
+      r: 275.3072509765625
+      t: 600.1580200195312
+    charspan:
+    - 0
+    - 72
+    page_no: 1
+  text: Ahmed S. Nassar IBM Research Rueschlikon, Switzerland ahn@zurich.ibm.com
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/5
+  hash: 400399309987224909
+  label: text
+  orig: Peter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 553.3746948242188
+      coord_origin: BOTTOMLEFT
+      l: 336.5292053222656
+      r: 439.84405517578125
+      t: 599.942626953125
+    charspan:
+    - 0
+    - 68
+    page_no: 1
+  text: Peter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/6
+  hash: 7981313731349902307
+  label: section_header
+  orig: ABSTRACT
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 533.9879760742188
+      coord_origin: BOTTOMLEFT
+      l: 53.33011245727539
+      r: 112.2127456665039
+      t: 544.47509765625
+    charspan:
+    - 0
+    - 8
+    page_no: 1
+  text: ABSTRACT
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/7
+  hash: 18185954695676845569
+  label: text
+  orig: Accurate document layout analysis is a key requirement for highquality PDF
+    document conversion. With the recent availability of public, large ground-truth
+    datasets such as PubLayNet and DocBank, deep-learning models have proven to be
+    very effective at layout detection and segmentation. While these datasets are
+    of adequate size to train such models, they severely lack in layout variability
+    since they are sourced from scientific article repositories such as PubMed and
+    arXiv only. Consequently, the accuracy of the layout segmentation drops significantly
+    when these models are applied on more challenging and diverse layouts. In this
+    paper, we present DocLayNet , a new, publicly available, document-layout annotation
+    dataset in COCO format. It contains 80863 manually annotated pages from diverse
+    data sources to represent a wide variability in layouts. For each PDF page, the
+    layout annotations provide labelled bounding-boxes with a choice of 11 distinct
+    classes. DocLayNet also provides a subset of double- and triple-annotated pages
+    to determine the inter-annotator agreement. In multiple experiments, we provide
+    baseline accuracy scores (in mAP) for a set of popular object detection models.
+    We also demonstrate that these models fall approximately 10% behind the inter-annotator
+    agreement. Furthermore, we provide evidence that DocLayNet is of sufficient size.
+    Lastly, we compare models trained on PubLayNet, DocBank and DocLayNet, showing
+    that layout predictions of the DocLayNettrained models are more robust and thus
+    the preferred choice for general-purpose document-layout analysis.
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 257.10565185546875
+      coord_origin: BOTTOMLEFT
+      l: 52.857933044433594
+      r: 295.5601806640625
+      t: 529.5941162109375
+    charspan:
+    - 0
+    - 1595
+    page_no: 1
+  text: Accurate document layout analysis is a key requirement for highquality PDF
+    document conversion. With the recent availability of public, large ground-truth
+    datasets such as PubLayNet and DocBank, deep-learning models have proven to be
+    very effective at layout detection and segmentation. While these datasets are
+    of adequate size to train such models, they severely lack in layout variability
+    since they are sourced from scientific article repositories such as PubMed and
+    arXiv only. Consequently, the accuracy of the layout segmentation drops significantly
+    when these models are applied on more challenging and diverse layouts. In this
+    paper, we present DocLayNet , a new, publicly available, document-layout annotation
+    dataset in COCO format. It contains 80863 manually annotated pages from diverse
+    data sources to represent a wide variability in layouts. For each PDF page, the
+    layout annotations provide labelled bounding-boxes with a choice of 11 distinct
+    classes. DocLayNet also provides a subset of double- and triple-annotated pages
+    to determine the inter-annotator agreement. In multiple experiments, we provide
+    baseline accuracy scores (in mAP) for a set of popular object detection models.
+    We also demonstrate that these models fall approximately 10% behind the inter-annotator
+    agreement. Furthermore, we provide evidence that DocLayNet is of sufficient size.
+    Lastly, we compare models trained on PubLayNet, DocBank and DocLayNet, showing
+    that layout predictions of the DocLayNettrained models are more robust and thus
+    the preferred choice for general-purpose document-layout analysis.
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/8
+  hash: 17745163365450621279
+  label: section_header
+  orig: CCS CONCEPTS
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 230.69398498535156
+      coord_origin: BOTTOMLEFT
+      l: 53.36912155151367
+      r: 134.81988525390625
+      t: 241.21551513671875
+    charspan:
+    - 0
+    - 12
+    page_no: 1
+  text: CCS CONCEPTS
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/9
+  hash: 12529050007388097730
+  label: text
+  orig: "\xB7 Information systems \u2192 Document structure ; \xB7 Applied computing\
+    \ \u2192 Document analysis ; \xB7 Computing methodologies \u2192 Machine learning\
+    \ ; Computer vision ; Object detection ;"
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 194.8704071044922
+      coord_origin: BOTTOMLEFT
+      l: 53.02470016479492
+      r: 297.8529357910156
+      t: 226.241455078125
+    charspan:
+    - 0
+    - 170
+    page_no: 1
+  text: "\xB7 Information systems \u2192 Document structure ; \xB7 Applied computing\
+    \ \u2192 Document analysis ; \xB7 Computing methodologies \u2192 Machine learning\
+    \ ; Computer vision ; Object detection ;"
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/10
+  hash: 11569407347437572994
+  label: text
+  orig: Permission to make digital or hard copies of part or all of this work for
+    personal or classroom use is granted without fee provided that copies are not
+    made or distributed for profit or commercial advantage and that copies bear this
+    notice and the full citation on the first page. Copyrights for third-party components
+    of this work must be honored. For all other uses, contact the owner/author(s).
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 117.82738494873047
+      coord_origin: BOTTOMLEFT
+      l: 53.33460235595703
+      r: 295.11798095703125
+      t: 158.33511352539062
+    charspan:
+    - 0
+    - 397
+    page_no: 1
+  text: Permission to make digital or hard copies of part or all of this work for
+    personal or classroom use is granted without fee provided that copies are not
+    made or distributed for profit or commercial advantage and that copies bear this
+    notice and the full citation on the first page. Copyrights for third-party components
+    of this work must be honored. For all other uses, contact the owner/author(s).
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/11
+  hash: 13344389659304761998
+  label: text
+  orig: "KDD '22, August 14-18, 2022, Washington, DC, USA \xA9 2022 Copyright held\
+    \ by the owner/author(s). ACM ISBN 978-1-4503-9385-0/22/08. https://doi.org/10.1145/3534678.3539043"
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 85.73310852050781
+      coord_origin: BOTTOMLEFT
+      l: 53.31700134277344
+      r: 197.8627471923828
+      t: 116.91976928710938
+    charspan:
+    - 0
+    - 168
+    page_no: 1
+  text: "KDD '22, August 14-18, 2022, Washington, DC, USA \xA9 2022 Copyright held\
+    \ by the owner/author(s). ACM ISBN 978-1-4503-9385-0/22/08. https://doi.org/10.1145/3534678.3539043"
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/12
+  hash: 3162927929825665449
+  label: caption
+  orig: 'Figure 1: Four examples of complex page layouts across different document
+    categories'
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 232.3291473388672
+      coord_origin: BOTTOMLEFT
+      l: 317.2291564941406
+      r: 559.8057861328125
+      t: 252.12974548339844
+    charspan:
+    - 0
+    - 84
+    page_no: 1
+  text: 'Figure 1: Four examples of complex page layouts across different document
+    categories'
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/13
+  hash: 13011367304084404613
+  label: section_header
+  orig: KEYWORDS
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 189.22499084472656
+      coord_origin: BOTTOMLEFT
+      l: 317.11431884765625
+      r: 379.82049560546875
+      t: 199.97215270996094
+    charspan:
+    - 0
+    - 8
+    page_no: 1
+  text: KEYWORDS
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/14
+  hash: 16726456449567869739
+  label: text
+  orig: PDF document conversion, layout segmentation, object-detection, data set,
+    Machine Learning
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 164.9988250732422
+      coord_origin: BOTTOMLEFT
+      l: 317.2037658691406
+      r: 559.2164306640625
+      t: 184.67845153808594
+    charspan:
+    - 0
+    - 90
+    page_no: 1
+  text: PDF document conversion, layout segmentation, object-detection, data set,
+    Machine Learning
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/15
+  hash: 5718230321549514887
+  label: section_header
+  orig: 'ACM Reference Format:'
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 144.41390991210938
+      coord_origin: BOTTOMLEFT
+      l: 317.3434753417969
+      r: 404.6536560058594
+      t: 152.36439514160156
+    charspan:
+    - 0
+    - 21
+    page_no: 1
+  text: 'ACM Reference Format:'
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/16
+  hash: 17635312130661974579
+  label: text
+  orig: 'Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter
+    Staar. 2022. DocLayNet: A Large Human-Annotated Dataset for DocumentLayout Analysis.
+    In Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data
+    Mining (KDD ''22), August 14-18, 2022, Washington, DC, USA. ACM, New York, NY,
+    USA, 9 pages. https://doi.org/10.1145/ 3534678.3539043'
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 84.62297058105469
+      coord_origin: BOTTOMLEFT
+      l: 317.1117248535156
+      r: 559.5494995117188
+      t: 142.41151428222656
+    charspan:
+    - 0
+    - 374
+    page_no: 1
+  text: 'Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter
+    Staar. 2022. DocLayNet: A Large Human-Annotated Dataset for DocumentLayout Analysis.
+    In Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data
+    Mining (KDD ''22), August 14-18, 2022, Washington, DC, USA. ACM, New York, NY,
+    USA, 9 pages. https://doi.org/10.1145/ 3534678.3539043'
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/17
+  hash: 5293186016864745982
+  label: page_header
+  orig: "KDD \u201922, August 14-18, 2022, Washington, DC, USA Birgit Pfitzmann, Christoph\
+    \ Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar"
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 722.7692260742188
+      coord_origin: BOTTOMLEFT
+      l: 53.19501876831055
+      r: 558.4357299804688
+      t: 732.1524047851562
+    charspan:
+    - 0
+    - 130
+    page_no: 2
+  text: "KDD \u201922, August 14-18, 2022, Washington, DC, USA Birgit Pfitzmann, Christoph\
+    \ Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar"
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/18
+  hash: 5428450824043951937
+  label: section_header
+  orig: 1 INTRODUCTION
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 695.8309936523438
+      coord_origin: BOTTOMLEFT
+      l: 53.79800033569336
+      r: 156.52899169921875
+      t: 706.4523315429688
+    charspan:
+    - 0
+    - 14
+    page_no: 2
+  text: 1 INTRODUCTION
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/19
+  hash: 15747194476520587400
+  label: text
+  orig: Despite the substantial improvements achieved with machine-learning (ML) approaches
+    and deep neural networks in recent years, document conversion remains a challenging
+    problem, as demonstrated by the numerous public competitions held on this topic
+    [1-4]. The challenge originates from the huge variability in PDF documents regarding
+    layout, language and formats (scanned, programmatic or a combination of both).
+    Engineering a single ML model that can be applied on all types of documents and
+    provides high-quality layout segmentation remains to this day extremely challenging
+    [5]. To highlight the variability in document layouts, we show a few example documents
+    from the DocLayNet dataset in Figure 1.
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 562.986572265625
+      coord_origin: BOTTOMLEFT
+      l: 52.80397415161133
+      r: 303.1766357421875
+      t: 681.3472290039062
+    charspan:
+    - 0
+    - 702
+    page_no: 2
+  text: Despite the substantial improvements achieved with machine-learning (ML) approaches
+    and deep neural networks in recent years, document conversion remains a challenging
+    problem, as demonstrated by the numerous public competitions held on this topic
+    [1-4]. The challenge originates from the huge variability in PDF documents regarding
+    layout, language and formats (scanned, programmatic or a combination of both).
+    Engineering a single ML model that can be applied on all types of documents and
+    provides high-quality layout segmentation remains to this day extremely challenging
+    [5]. To highlight the variability in document layouts, we show a few example documents
+    from the DocLayNet dataset in Figure 1.
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/20
+  hash: 9815825593984971365
+  label: text
+  orig: 'A key problem in the process of document conversion is to understand the
+    structure of a single document page, i.e. which segments of text should be grouped
+    together in a unit. To train models for this task, there are currently two large
+    datasets available to the community, PubLayNet [6] and DocBank [7]. They were
+    introduced in 2019 and 2020 respectively and significantly accelerated the implementation
+    of layout detection and segmentation models due to their sizes of 300K and 500K
+    ground-truth pages. These sizes were achieved by leveraging an automation approach.
+    The benefit of automated ground-truth generation is obvious: one can generate
+    large ground-truth datasets at virtually no cost. However, the automation introduces
+    a constraint on the variability in the dataset, because corresponding structured
+    source data must be available. PubLayNet and DocBank were both generated from
+    scientific document repositories (PubMed and arXiv), which provide XML or L A
+    T E X sources. Those scientific documents present a limited variability in their
+    layouts, because they are typeset in uniform templates provided by the publishers.
+    Obviously, documents such as technical manuals, annual company reports, legal
+    text, government tenders, etc. have very different and partially unique layouts.
+    As a consequence, the layout predictions obtained from models trained on PubLayNet
+    or DocBank is very reasonable when applied on scientific documents. However, for
+    more artistic or free-style layouts, we see sub-par prediction quality from these
+    models, which we demonstrate in Section 5.'
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 289.0808410644531
+      coord_origin: BOTTOMLEFT
+      l: 52.89326477050781
+      r: 295.5641174316406
+      t: 561.2902221679688
+    charspan:
+    - 0
+    - 1580
+    page_no: 2
+  text: 'A key problem in the process of document conversion is to understand the
+    structure of a single document page, i.e. which segments of text should be grouped
+    together in a unit. To train models for this task, there are currently two large
+    datasets available to the community, PubLayNet [6] and DocBank [7]. They were
+    introduced in 2019 and 2020 respectively and significantly accelerated the implementation
+    of layout detection and segmentation models due to their sizes of 300K and 500K
+    ground-truth pages. These sizes were achieved by leveraging an automation approach.
+    The benefit of automated ground-truth generation is obvious: one can generate
+    large ground-truth datasets at virtually no cost. However, the automation introduces
+    a constraint on the variability in the dataset, because corresponding structured
+    source data must be available. PubLayNet and DocBank were both generated from
+    scientific document repositories (PubMed and arXiv), which provide XML or L A
+    T E X sources. Those scientific documents present a limited variability in their
+    layouts, because they are typeset in uniform templates provided by the publishers.
+    Obviously, documents such as technical manuals, annual company reports, legal
+    text, government tenders, etc. have very different and partially unique layouts.
+    As a consequence, the layout predictions obtained from models trained on PubLayNet
+    or DocBank is very reasonable when applied on scientific documents. However, for
+    more artistic or free-style layouts, we see sub-par prediction quality from these
+    models, which we demonstrate in Section 5.'
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/21
+  hash: 562391038162260731
+  label: text
+  orig: 'In this paper, we present the DocLayNet dataset. It provides pageby-page
+    layout annotation ground-truth using bounding-boxes for 11 distinct class labels
+    on 80863 unique document pages, of which a fraction carry double- or triple-annotations.
+    DocLayNet is similar in spirit to PubLayNet and DocBank and will likewise be made
+    available to the public 1 in order to stimulate the document-layout analysis community.
+    It distinguishes itself in the following aspects:'
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 212.36782836914062
+      coord_origin: BOTTOMLEFT
+      l: 53.12458419799805
+      r: 295.56396484375
+      t: 287.0208740234375
+    charspan:
+    - 0
+    - 462
+    page_no: 2
+  text: 'In this paper, we present the DocLayNet dataset. It provides pageby-page
+    layout annotation ground-truth using bounding-boxes for 11 distinct class labels
+    on 80863 unique document pages, of which a fraction carry double- or triple-annotations.
+    DocLayNet is similar in spirit to PubLayNet and DocBank and will likewise be made
+    available to the public 1 in order to stimulate the document-layout analysis community.
+    It distinguishes itself in the following aspects:'
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/22
+  hash: 1687994490476660946
+  label: list_item
+  orig: '(1) Human Annotation : In contrast to PubLayNet and DocBank, we relied on
+    human annotation instead of automation approaches to generate the data set.'
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 176.96405029296875
+      coord_origin: BOTTOMLEFT
+      l: 64.64593505859375
+      r: 295.5616455078125
+      t: 208.28524780273438
+    charspan:
+    - 0
+    - 149
+    page_no: 2
+  text: '(1) Human Annotation : In contrast to PubLayNet and DocBank, we relied on
+    human annotation instead of automation approaches to generate the data set.'
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/23
+  hash: 8977022680477147526
+  label: list_item
+  orig: '(2) Large Layout Variability : We include diverse and complex layouts from
+    a large variety of public sources.'
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 154.92233276367188
+      coord_origin: BOTTOMLEFT
+      l: 64.50244140625
+      r: 294.3029479980469
+      t: 174.95782470703125
+    charspan:
+    - 0
+    - 109
+    page_no: 2
+  text: '(2) Large Layout Variability : We include diverse and complex layouts from
+    a large variety of public sources.'
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/24
+  hash: 2133234466113940345
+  label: list_item
+  orig: '(3) Detailed Label Set : We define 11 class labels to distinguish layout
+    features in high detail. PubLayNet provides 5 labels; DocBank provides 13, although
+    not a superset of ours.'
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 121.99307250976562
+      coord_origin: BOTTOMLEFT
+      l: 64.18266296386719
+      r: 294.6838073730469
+      t: 153.57122802734375
+    charspan:
+    - 0
+    - 180
+    page_no: 2
+  text: '(3) Detailed Label Set : We define 11 class labels to distinguish layout
+    features in high detail. PubLayNet provides 5 labels; DocBank provides 13, although
+    not a superset of ours.'
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/25
+  hash: 15972412295294232993
+  label: list_item
+  orig: '(4) Redundant Annotations : A fraction of the pages in the DocLayNet data
+    set carry more than one human annotation.'
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 99.92230987548828
+      coord_origin: BOTTOMLEFT
+      l: 64.30329132080078
+      r: 295.56439208984375
+      t: 120.3491439819336
+    charspan:
+    - 0
+    - 115
+    page_no: 2
+  text: '(4) Redundant Annotations : A fraction of the pages in the DocLayNet data
+    set carry more than one human annotation.'
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/26
+  hash: 338444530349878300
+  label: footnote
+  orig: $^{1}$https://developer.ibm.com/exchanges/data/all/doclaynet
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 82.76702880859375
+      coord_origin: BOTTOMLEFT
+      l: 53.60314178466797
+      r: 216.05824279785156
+      t: 90.63584899902344
+    charspan:
+    - 0
+    - 60
+    page_no: 2
+  text: $^{1}$https://developer.ibm.com/exchanges/data/all/doclaynet
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/27
+  hash: 3617521057052249807
+  label: text
+  orig: This enables experimentation with annotation uncertainty and quality control
+    analysis.
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 685.3028564453125
+      coord_origin: BOTTOMLEFT
+      l: 341.2403564453125
+      r: 558.5009765625
+      t: 705.5034790039062
+    charspan:
+    - 0
+    - 86
+    page_no: 2
+  text: This enables experimentation with annotation uncertainty and quality control
+    analysis.
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/28
+  hash: 14788267481324200655
+  label: list_item
+  orig: '(5) Pre-defined Train-, Test- & Validation-set : Like DocBank, we provide
+    fixed train-, test- & validation-sets to ensure proportional representation of
+    the class-labels. Further, we prevent leakage of unique layouts across sets, which
+    has a large effect on model accuracy scores.'
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 630.4351806640625
+      coord_origin: BOTTOMLEFT
+      l: 328.06146240234375
+      r: 559.7210083007812
+      t: 683.4995727539062
+    charspan:
+    - 0
+    - 280
+    page_no: 2
+  text: '(5) Pre-defined Train-, Test- & Validation-set : Like DocBank, we provide
+    fixed train-, test- & validation-sets to ensure proportional representation of
+    the class-labels. Further, we prevent leakage of unique layouts across sets, which
+    has a large effect on model accuracy scores.'
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/29
+  hash: 18119595765995049833
+  label: text
+  orig: All aspects outlined above are detailed in Section 3. In Section 4, we will
+    elaborate on how we designed and executed this large-scale human annotation campaign.
+    We will also share key insights and lessons learned that might prove helpful for
+    other parties planning to set up annotation campaigns.
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 571.292724609375
+      coord_origin: BOTTOMLEFT
+      l: 317.0706787109375
+      r: 559.1903076171875
+      t: 624.9239501953125
+    charspan:
+    - 0
+    - 297
+    page_no: 2
+  text: All aspects outlined above are detailed in Section 3. In Section 4, we will
+    elaborate on how we designed and executed this large-scale human annotation campaign.
+    We will also share key insights and lessons learned that might prove helpful for
+    other parties planning to set up annotation campaigns.
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/30
+  hash: 2634162194000949275
+  label: text
+  orig: In Section 5, we will present baseline accuracy numbers for a variety of object
+    detection methods (Faster R-CNN, Mask R-CNN and YOLOv5) trained on DocLayNet.
+    We further show how the model performance is impacted by varying the DocLayNet
+    dataset size, reducing the label set and modifying the train/test-split. Last
+    but not least, we compare the performance of models trained on PubLayNet, DocBank
+    and DocLayNet and demonstrate that a model trained on DocLayNet provides overall
+    more robust layout recovery.
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 483.6390686035156
+      coord_origin: BOTTOMLEFT
+      l: 316.9918518066406
+      r: 559.5819702148438
+      t: 569.6455078125
+    charspan:
+    - 0
+    - 506
+    page_no: 2
+  text: In Section 5, we will present baseline accuracy numbers for a variety of object
+    detection methods (Faster R-CNN, Mask R-CNN and YOLOv5) trained on DocLayNet.
+    We further show how the model performance is impacted by varying the DocLayNet
+    dataset size, reducing the label set and modifying the train/test-split. Last
+    but not least, we compare the performance of models trained on PubLayNet, DocBank
+    and DocLayNet and demonstrate that a model trained on DocLayNet provides overall
+    more robust layout recovery.
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/31
+  hash: 12785294041260556899
+  label: section_header
+  orig: 2 RELATED WORK
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 460.4820251464844
+      coord_origin: BOTTOMLEFT
+      l: 317.33935546875
+      r: 422.0046081542969
+      t: 471.2471923828125
+    charspan:
+    - 0
+    - 14
+    page_no: 2
+  text: 2 RELATED WORK
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/32
+  hash: 15532515360198720027
+  label: text
+  orig: While early approaches in document-layout analysis used rulebased algorithms
+    and heuristics [8], the problem is lately addressed with deep learning methods.
+    The most common approach is to leverage object detection models [9-15]. In the
+    last decade, the accuracy and speed of these models has increased dramatically.
+    Furthermore, most state-of-the-art object detection methods can be trained and
+    applied with very little work, thanks to a standardisation effort of the ground-truth
+    data format [16] and common deep-learning frameworks [17]. Reference data sets
+    such as PubLayNet [6] and DocBank provide their data in the commonly accepted
+    COCO format [16].
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 327.7038269042969
+      coord_origin: BOTTOMLEFT
+      l: 316.9687805175781
+      r: 559.7161254882812
+      t: 446.38397216796875
+    charspan:
+    - 0
+    - 655
+    page_no: 2
+  text: While early approaches in document-layout analysis used rulebased algorithms
+    and heuristics [8], the problem is lately addressed with deep learning methods.
+    The most common approach is to leverage object detection models [9-15]. In the
+    last decade, the accuracy and speed of these models has increased dramatically.
+    Furthermore, most state-of-the-art object detection methods can be trained and
+    applied with very little work, thanks to a standardisation effort of the ground-truth
+    data format [16] and common deep-learning frameworks [17]. Reference data sets
+    such as PubLayNet [6] and DocBank provide their data in the commonly accepted
+    COCO format [16].
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/33
+  hash: 7759908539731899164
+  label: text
+  orig: Lately, new types of ML models for document-layout analysis have emerged in
+    the community [18-21]. These models do not approach the problem of layout analysis
+    purely based on an image representation of the page, as computer vision methods
+    do. Instead, they combine the text tokens and image representation of a page in
+    order to obtain a segmentation. While the reported accuracies appear to be promising,
+    a broadly accepted data format which links geometric and textual features has
+    yet to establish.
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 239.59246826171875
+      coord_origin: BOTTOMLEFT
+      l: 317.156982421875
+      r: 559.1864624023438
+      t: 325.6906433105469
+    charspan:
+    - 0
+    - 500
+    page_no: 2
+  text: Lately, new types of ML models for document-layout analysis have emerged in
+    the community [18-21]. These models do not approach the problem of layout analysis
+    purely based on an image representation of the page, as computer vision methods
+    do. Instead, they combine the text tokens and image representation of a page in
+    order to obtain a segmentation. While the reported accuracies appear to be promising,
+    a broadly accepted data format which links geometric and textual features has
+    yet to establish.
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/34
+  hash: 13224162835784204794
+  label: section_header
+  orig: 3 THE DOCLAYNET DATASET
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 216.37100219726562
+      coord_origin: BOTTOMLEFT
+      l: 317.58740234375
+      r: 477.8531799316406
+      t: 226.6800994873047
+    charspan:
+    - 0
+    - 23
+    page_no: 2
+  text: 3 THE DOCLAYNET DATASET
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/35
+  hash: 13840788721079437184
+  label: text
+  orig: DocLayNet contains 80863 PDF pages. Among these, 7059 carry two instances
+    of human annotations, and 1591 carry three. This amounts to 91104 total annotation
+    instances. The annotations provide layout information in the shape of labeled,
+    rectangular boundingboxes. We define 11 distinct labels for layout features, namely
+    Caption , Footnote , Formula , List-item , Page-footer , Page-header , Picture
+    , Section-header , Table , Text , and Title . Our reasoning for picking this particular
+    label set is detailed in Section 4.
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 116.19312286376953
+      coord_origin: BOTTOMLEFT
+      l: 317.11236572265625
+      r: 559.7131958007812
+      t: 202.27523803710938
+    charspan:
+    - 0
+    - 522
+    page_no: 2
+  text: DocLayNet contains 80863 PDF pages. Among these, 7059 carry two instances
+    of human annotations, and 1591 carry three. This amounts to 91104 total annotation
+    instances. The annotations provide layout information in the shape of labeled,
+    rectangular boundingboxes. We define 11 distinct labels for layout features, namely
+    Caption , Footnote , Formula , List-item , Page-footer , Page-header , Picture
+    , Section-header , Table , Text , and Title . Our reasoning for picking this particular
+    label set is detailed in Section 4.
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/36
+  hash: 8382469735566893423
+  label: text
+  orig: In addition to open intellectual property constraints for the source documents,
+    we required that the documents in DocLayNet adhere to a few conditions. Firstly,
+    we kept scanned documents
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 83.59282684326172
+      coord_origin: BOTTOMLEFT
+      l: 317.34619140625
+      r: 558.5303344726562
+      t: 114.41421508789062
+    charspan:
+    - 0
+    - 186
+    page_no: 2
+  text: In addition to open intellectual property constraints for the source documents,
+    we required that the documents in DocLayNet adhere to a few conditions. Firstly,
+    we kept scanned documents
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/37
+  hash: 15463014254960213695
+  label: page_header
+  orig: 'DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis'
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 722.95458984375
+      coord_origin: BOTTOMLEFT
+      l: 53.4626579284668
+      r: 347.0511779785156
+      t: 732.11474609375
+    charspan:
+    - 0
+    - 71
+    page_no: 3
+  text: 'DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis'
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/38
+  hash: 202003194997475932
+  label: page_header
+  orig: "KDD \u201922, August 14-18, 2022, Washington, DC, USA"
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 723.0569458007812
+      coord_origin: BOTTOMLEFT
+      l: 365.31488037109375
+      r: 558.807861328125
+      t: 731.9796142578125
+    charspan:
+    - 0
+    - 48
+    page_no: 3
+  text: "KDD \u201922, August 14-18, 2022, Washington, DC, USA"
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/39
+  hash: 16454164006377695992
+  label: caption
+  orig: 'Figure 2: Distribution of DocLayNet pages across document categories.'
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 424.931396484375
+      coord_origin: BOTTOMLEFT
+      l: 53.244232177734375
+      r: 294.5379943847656
+      t: 510.7526550292969
+    charspan:
+    - 0
+    - 513
+    page_no: 3
+  text: 'Figure 2: Distribution of DocLayNet pages across document categories.'
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/40
+  hash: 17429156214159736783
+  label: text
+  orig: The pages in DocLayNet can be grouped into six distinct categories, namely
+    Financial Reports , Manuals , Scientific Articles , Laws & Regulations , Patents
+    and Government Tenders . Each document category was sourced from various repositories.
+    For example, Financial Reports contain both free-style format annual reports 2
+    which expose company-specific, artistic layouts as well as the more formal SEC
+    filings. The two largest categories ( Financial Reports and Manuals ) contain
+    a large amount of free-style layouts in order to obtain maximum variability. In
+    the other four categories, we boosted the variability by mixing documents from
+    independent providers, such as different government websites or publishers. In
+    Figure 2, we show the document categories contained in DocLayNet with their respective
+    sizes.
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 282.6438293457031
+      coord_origin: BOTTOMLEFT
+      l: 53.10974884033203
+      r: 295.5604553222656
+      t: 423.1407775878906
+    charspan:
+    - 0
+    - 810
+    page_no: 3
+  text: The pages in DocLayNet can be grouped into six distinct categories, namely
+    Financial Reports , Manuals , Scientific Articles , Laws & Regulations , Patents
+    and Government Tenders . Each document category was sourced from various repositories.
+    For example, Financial Reports contain both free-style format annual reports 2
+    which expose company-specific, artistic layouts as well as the more formal SEC
+    filings. The two largest categories ( Financial Reports and Manuals ) contain
+    a large amount of free-style layouts in order to obtain maximum variability. In
+    the other four categories, we boosted the variability by mixing documents from
+    independent providers, such as different government websites or publishers. In
+    Figure 2, we show the document categories contained in DocLayNet with their respective
+    sizes.
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/41
+  hash: 4835758972077135061
+  label: text
+  orig: We did not control the document selection with regard to language. The vast
+    majority of documents contained in DocLayNet (close to 95%) are published in English
+    language. However, DocLayNet also contains a number of documents in other languages
+    such as German (2.5%), French (1.0%) and Japanese (1.0%). While the document language
+    has negligible impact on the performance of computer vision methods such as object
+    detection and segmentation models, it might prove challenging for layout analysis
+    methods which exploit textual features.
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 183.77932739257812
+      coord_origin: BOTTOMLEFT
+      l: 52.8973388671875
+      r: 295.5615539550781
+      t: 281.3227233886719
+    charspan:
+    - 0
+    - 535
+    page_no: 3
+  text: We did not control the document selection with regard to language. The vast
+    majority of documents contained in DocLayNet (close to 95%) are published in English
+    language. However, DocLayNet also contains a number of documents in other languages
+    such as German (2.5%), French (1.0%) and Japanese (1.0%). While the document language
+    has negligible impact on the performance of computer vision methods such as object
+    detection and segmentation models, it might prove challenging for layout analysis
+    methods which exploit textual features.
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/42
+  hash: 6442074878702101187
+  label: text
+  orig: To ensure that future benchmarks in the document-layout analysis community
+    can be easily compared, we have split up DocLayNet into pre-defined train-, test-
+    and validation-sets. In this way, we can avoid spurious variations in the evaluation
+    scores due to random splitting in train-, test- and validation-sets. We also ensured
+    that less frequent labels are represented in train and test sets in equal proportions.
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 106.8985824584961
+      coord_origin: BOTTOMLEFT
+      l: 53.209388732910156
+      r: 295.56396484375
+      t: 182.471923828125
+    charspan:
+    - 0
+    - 413
+    page_no: 3
+  text: To ensure that future benchmarks in the document-layout analysis community
+    can be easily compared, we have split up DocLayNet into pre-defined train-, test-
+    and validation-sets. In this way, we can avoid spurious variations in the evaluation
+    scores due to random splitting in train-, test- and validation-sets. We also ensured
+    that less frequent labels are represented in train and test sets in equal proportions.
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/43
+  hash: 13873304636238013732
+  label: footnote
+  orig: $^{2}$e.g. AAPL from https://www.annualreports.com/
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 83.35768127441406
+      coord_origin: BOTTOMLEFT
+      l: 53.352603912353516
+      r: 195.78997802734375
+      t: 91.47167205810547
+    charspan:
+    - 0
+    - 51
+    page_no: 3
+  text: $^{2}$e.g. AAPL from https://www.annualreports.com/
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/44
+  hash: 6259711523792429489
+  label: text
+  orig: Table 1 shows the overall frequency and distribution of the labels among the
+    different sets. Importantly, we ensure that subsets are only split on full-document
+    boundaries. This avoids that pages of the same document are spread over train,
+    test and validation set, which can give an undesired evaluation advantage to models
+    and lead to overestimation of their prediction accuracy. We will show the impact
+    of this decision in Section 5.
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 630.5088500976562
+      coord_origin: BOTTOMLEFT
+      l: 317.0691833496094
+      r: 559.1918334960938
+      t: 705.8527221679688
+    charspan:
+    - 0
+    - 435
+    page_no: 3
+  text: Table 1 shows the overall frequency and distribution of the labels among the
+    different sets. Importantly, we ensure that subsets are only split on full-document
+    boundaries. This avoids that pages of the same document are spread over train,
+    test and validation set, which can give an undesired evaluation advantage to models
+    and lead to overestimation of their prediction accuracy. We will show the impact
+    of this decision in Section 5.
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/45
+  hash: 9126253445878309540
+  label: text
+  orig: "In order to accommodate the different types of models currently in use by\
+    \ the community, we provide DocLayNet in an augmented COCO format [16]. This entails\
+    \ the standard COCO ground-truth file (in JSON format) with the associated page\
+    \ images (in PNG format, 1025 \xD7 1025 pixels). Furthermore, custom fields have\
+    \ been added to each COCO record to specify document category, original document\
+    \ filename and page number. In addition, we also provide the original PDF pages,\
+    \ as well as sidecar files containing parsed PDF text and text-cell coordinates\
+    \ (in JSON). All additional files are linked to the primary page images by their\
+    \ matching filenames."
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 520.8086547851562
+      coord_origin: BOTTOMLEFT
+      l: 317.05938720703125
+      r: 558.862060546875
+      t: 628.44580078125
+    charspan:
+    - 0
+    - 645
+    page_no: 3
+  text: "In order to accommodate the different types of models currently in use by\
+    \ the community, we provide DocLayNet in an augmented COCO format [16]. This entails\
+    \ the standard COCO ground-truth file (in JSON format) with the associated page\
+    \ images (in PNG format, 1025 \xD7 1025 pixels). Furthermore, custom fields have\
+    \ been added to each COCO record to specify document category, original document\
+    \ filename and page number. In addition, we also provide the original PDF pages,\
+    \ as well as sidecar files containing parsed PDF text and text-cell coordinates\
+    \ (in JSON). All additional files are linked to the primary page images by their\
+    \ matching filenames."
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/46
+  hash: 17722516482300246985
+  label: text
+  orig: Despite being cost-intense and far less scalable than automation, human annotation
+    has several benefits over automated groundtruth generation. The first and most
+    obvious reason to leverage human annotations is the freedom to annotate any type
+    of document without requiring a programmatic source. For most PDF documents, the
+    original source document is not available. The latter is not a hard constraint
+    with human annotation, but it is for automated methods. A second reason to use
+    human annotations is that the latter usually provide a more natural interpretation
+    of the page layout. The human-interpreted layout can significantly deviate from
+    the programmatic layout used in typesetting. For example, "invisible" tables might
+    be used solely for aligning text paragraphs on columns. Such typesetting tricks
+    might be interpreted by automated methods incorrectly as an actual table, while
+    the human annotation will interpret it correctly as Text or other styles. The
+    same applies to multi-line text elements, when authors decided to space them as
+    "invisible" list elements without bullet symbols. A third reason to gather ground-truth
+    through human annotation is to estimate a "natural" upper bound on the segmentation
+    accuracy. As we will show in Section 4, certain documents featuring complex layouts
+    can have different but equally acceptable layout interpretations. This natural
+    upper bound for segmentation accuracy can be found by annotating the same pages
+    multiple times by different people and evaluating the inter-annotator agreement.
+    Such a baseline consistency evaluation is very useful to define expectations for
+    a good target accuracy in trained deep neural network models and avoid overfitting
+    (see Table 1). On the flip side, achieving high annotation consistency proved
+    to be a key challenge in human annotation, as we outline in Section 4.
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 203.11082458496094
+      coord_origin: BOTTOMLEFT
+      l: 316.88604736328125
+      r: 559.7215576171875
+      t: 518.6715087890625
+    charspan:
+    - 0
+    - 1854
+    page_no: 3
+  text: Despite being cost-intense and far less scalable than automation, human annotation
+    has several benefits over automated groundtruth generation. The first and most
+    obvious reason to leverage human annotations is the freedom to annotate any type
+    of document without requiring a programmatic source. For most PDF documents, the
+    original source document is not available. The latter is not a hard constraint
+    with human annotation, but it is for automated methods. A second reason to use
+    human annotations is that the latter usually provide a more natural interpretation
+    of the page layout. The human-interpreted layout can significantly deviate from
+    the programmatic layout used in typesetting. For example, "invisible" tables might
+    be used solely for aligning text paragraphs on columns. Such typesetting tricks
+    might be interpreted by automated methods incorrectly as an actual table, while
+    the human annotation will interpret it correctly as Text or other styles. The
+    same applies to multi-line text elements, when authors decided to space them as
+    "invisible" list elements without bullet symbols. A third reason to gather ground-truth
+    through human annotation is to estimate a "natural" upper bound on the segmentation
+    accuracy. As we will show in Section 4, certain documents featuring complex layouts
+    can have different but equally acceptable layout interpretations. This natural
+    upper bound for segmentation accuracy can be found by annotating the same pages
+    multiple times by different people and evaluating the inter-annotator agreement.
+    Such a baseline consistency evaluation is very useful to define expectations for
+    a good target accuracy in trained deep neural network models and avoid overfitting
+    (see Table 1). On the flip side, achieving high annotation consistency proved
+    to be a key challenge in human annotation, as we outline in Section 4.
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/47
+  hash: 8217803899333050095
+  label: section_header
+  orig: 4 ANNOTATION CAMPAIGN
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 174.8409881591797
+      coord_origin: BOTTOMLEFT
+      l: 317.66510009765625
+      r: 470.2132568359375
+      t: 185.15008544921875
+    charspan:
+    - 0
+    - 21
+    page_no: 3
+  text: 4 ANNOTATION CAMPAIGN
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/48
+  hash: 11051011402111064878
+  label: text
+  orig: The annotation campaign was carried out in four phases. In phase one, we identified
+    and prepared the data sources for annotation. In phase two, we determined the
+    class labels and how annotations should be done on the documents in order to obtain
+    maximum consistency. The latter was guided by a detailed requirement analysis
+    and exhaustive experiments. In phase three, we trained the annotation staff and
+    performed exams for quality assurance. In phase four,
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 85.38961791992188
+      coord_origin: BOTTOMLEFT
+      l: 317.0245056152344
+      r: 559.7138061523438
+      t: 160.93588256835938
+    charspan:
+    - 0
+    - 457
+    page_no: 3
+  text: The annotation campaign was carried out in four phases. In phase one, we identified
+    and prepared the data sources for annotation. In phase two, we determined the
+    class labels and how annotations should be done on the documents in order to obtain
+    maximum consistency. The latter was guided by a detailed requirement analysis
+    and exhaustive experiments. In phase three, we trained the annotation staff and
+    performed exams for quality assurance. In phase four,
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/49
+  hash: 6768525952307611424
+  label: page_header
+  orig: "KDD \u201922, August 14-18, 2022, Washington, DC, USA Birgit Pfitzmann, Christoph\
+    \ Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar"
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 723.0101318359375
+      coord_origin: BOTTOMLEFT
+      l: 53.345272064208984
+      r: 558.5491943359375
+      t: 732.1525268554688
+    charspan:
+    - 0
+    - 130
+    page_no: 4
+  text: "KDD \u201922, August 14-18, 2022, Washington, DC, USA Birgit Pfitzmann, Christoph\
+    \ Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar"
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/50
+  hash: 5520931533029632037
+  label: caption
+  orig: ''
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 498.30108642578125
+      coord_origin: BOTTOMLEFT
+      l: 98.96420288085938
+      r: 512.7739868164062
+      t: 654.1231689453125
+    charspan:
+    - 0
+    - 0
+    page_no: 4
+  text: ''
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/51
+  hash: 10610193690990616567
+  label: text
+  orig: we distributed the annotation workload and performed continuous quality controls.
+    Phase one and two required a small team of experts only. For phases three and
+    four, a group of 40 dedicated annotators were assembled and supervised.
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 116.45683288574219
+      coord_origin: BOTTOMLEFT
+      l: 52.954681396484375
+      r: 294.3648681640625
+      t: 158.3203887939453
+    charspan:
+    - 0
+    - 231
+    page_no: 4
+  text: we distributed the annotation workload and performed continuous quality controls.
+    Phase one and two required a small team of experts only. For phases three and
+    four, a group of 40 dedicated annotators were assembled and supervised.
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/52
+  hash: 8450678124529756923
+  label: text
+  orig: 'Phase 1: Data selection and preparation. Our inclusion criteria for documents
+    were described in Section 3. A large effort went into ensuring that all documents
+    are free to use. The data sources'
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 83.57982635498047
+      coord_origin: BOTTOMLEFT
+      l: 53.368797302246094
+      r: 295.5584411621094
+      t: 114.14925384521484
+    charspan:
+    - 0
+    - 193
+    page_no: 4
+  text: 'Phase 1: Data selection and preparation. Our inclusion criteria for documents
+    were described in Section 3. A large effort went into ensuring that all documents
+    are free to use. The data sources'
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/53
+  hash: 12151724778915504838
+  label: text
+  orig: include publication repositories such as arXiv$^{3}$, government offices,
+    company websites as well as data directory services for financial reports and
+    patents. Scanned documents were excluded wherever possible because they can be
+    rotated or skewed. This would not allow us to perform annotation with rectangular
+    bounding-boxes and therefore complicate the annotation process.
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 416.48919677734375
+      coord_origin: BOTTOMLEFT
+      l: 317.2582702636719
+      r: 559.1853637695312
+      t: 481.0997619628906
+    charspan:
+    - 0
+    - 376
+    page_no: 4
+  text: include publication repositories such as arXiv$^{3}$, government offices,
+    company websites as well as data directory services for financial reports and
+    patents. Scanned documents were excluded wherever possible because they can be
+    rotated or skewed. This would not allow us to perform annotation with rectangular
+    bounding-boxes and therefore complicate the annotation process.
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/54
+  hash: 15887983992023577324
+  label: text
+  orig: Preparation work included uploading and parsing the sourced PDF documents
+    in the Corpus Conversion Service (CCS) [22], a cloud-native platform which provides
+    a visual annotation interface and allows for dataset inspection and analysis.
+    The annotation interface of CCS is shown in Figure 3. The desired balance of pages
+    between the different document categories was achieved by selective subsampling
+    of pages with certain desired properties. For example, we made sure to include
+    the title page of each document and bias the remaining page selection to those
+    with figures or tables. The latter was achieved by leveraging pre-trained object
+    detection models from PubLayNet, which helped us estimate how many figures and
+    tables a given page contains.
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 284.9187316894531
+      coord_origin: BOTTOMLEFT
+      l: 317.0777587890625
+      r: 559.7130737304688
+      t: 415.02398681640625
+    charspan:
+    - 0
+    - 746
+    page_no: 4
+  text: Preparation work included uploading and parsing the sourced PDF documents
+    in the Corpus Conversion Service (CCS) [22], a cloud-native platform which provides
+    a visual annotation interface and allows for dataset inspection and analysis.
+    The annotation interface of CCS is shown in Figure 3. The desired balance of pages
+    between the different document categories was achieved by selective subsampling
+    of pages with certain desired properties. For example, we made sure to include
+    the title page of each document and bias the remaining page selection to those
+    with figures or tables. The latter was achieved by leveraging pre-trained object
+    detection models from PubLayNet, which helped us estimate how many figures and
+    tables a given page contains.
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/55
+  hash: 11975880209884411763
+  label: text
+  orig: 'Phase 2: Label selection and guideline. We reviewed the collected documents
+    and identified the most common structural features they exhibit. This was achieved
+    by identifying recurrent layout elements and lead us to the definition of 11 distinct
+    class labels. These 11 class labels are Caption , Footnote , Formula , List-item
+    , Pagefooter , Page-header , Picture , Section-header , Table , Text , and Title
+    . Critical factors that were considered for the choice of these class labels were
+    (1) the overall occurrence of the label, (2) the specificity of the label, (3)
+    recognisability on a single page (i.e. no need for context from previous or next
+    page) and (4) overall coverage of the page. Specificity ensures that the choice
+    of label is not ambiguous, while coverage ensures that all meaningful items on
+    a page can be annotated. We refrained from class labels that are very specific
+    to a document category, such as Abstract in the Scientific Articles category.
+    We also avoided class labels that are tightly linked to the semantics of the text.
+    Labels such as Author and Affiliation , as seen in DocBank, are often only distinguishable
+    by discriminating on'
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 98.9438247680664
+      coord_origin: BOTTOMLEFT
+      l: 316.9024963378906
+      r: 559.7176513671875
+      t: 283.8972473144531
+    charspan:
+    - 0
+    - 1159
+    page_no: 4
+  text: 'Phase 2: Label selection and guideline. We reviewed the collected documents
+    and identified the most common structural features they exhibit. This was achieved
+    by identifying recurrent layout elements and lead us to the definition of 11 distinct
+    class labels. These 11 class labels are Caption , Footnote , Formula , List-item
+    , Pagefooter , Page-header , Picture , Section-header , Table , Text , and Title
+    . Critical factors that were considered for the choice of these class labels were
+    (1) the overall occurrence of the label, (2) the specificity of the label, (3)
+    recognisability on a single page (i.e. no need for context from previous or next
+    page) and (4) overall coverage of the page. Specificity ensures that the choice
+    of label is not ambiguous, while coverage ensures that all meaningful items on
+    a page can be annotated. We refrained from class labels that are very specific
+    to a document category, such as Abstract in the Scientific Articles category.
+    We also avoided class labels that are tightly linked to the semantics of the text.
+    Labels such as Author and Affiliation , as seen in DocBank, are often only distinguishable
+    by discriminating on'
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/56
+  hash: 723840717012406728
+  label: footnote
+  orig: $^{3}$https://arxiv.org/
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 82.5821304321289
+      coord_origin: BOTTOMLEFT
+      l: 317.7030029296875
+      r: 369.40142822265625
+      t: 90.54422760009766
+    charspan:
+    - 0
+    - 24
+    page_no: 4
+  text: $^{3}$https://arxiv.org/
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/57
+  hash: 15020658425504633198
+  label: page_header
+  orig: 'DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis'
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 723.0143432617188
+      coord_origin: BOTTOMLEFT
+      l: 53.456207275390625
+      r: 347.07373046875
+      t: 732.0245361328125
+    charspan:
+    - 0
+    - 71
+    page_no: 5
+  text: 'DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis'
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/58
+  hash: 17688098678887076514
+  label: page_header
+  orig: "KDD \u201922, August 14-18, 2022, Washington, DC, USA"
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 723.0404663085938
+      coord_origin: BOTTOMLEFT
+      l: 365.2621765136719
+      r: 558.9374389648438
+      t: 731.9317626953125
+    charspan:
+    - 0
+    - 48
+    page_no: 5
+  text: "KDD \u201922, August 14-18, 2022, Washington, DC, USA"
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/59
+  hash: 938373213925944417
+  label: text
+  orig: the textual content of an element, which goes beyond visual layout recognition,
+    in particular outside the Scientific Articles category.
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 684.8170166015625
+      coord_origin: BOTTOMLEFT
+      l: 53.24338912963867
+      r: 294.04541015625
+      t: 705.5283813476562
+    charspan:
+    - 0
+    - 135
+    page_no: 5
+  text: the textual content of an element, which goes beyond visual layout recognition,
+    in particular outside the Scientific Articles category.
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/60
+  hash: 11375379645979730878
+  label: text
+  orig: At first sight, the task of visual document-layout interpretation appears
+    intuitive enough to obtain plausible annotations in most cases. However, during
+    early trial-runs in the core team, we observed many cases in which annotators
+    use different annotation styles, especially for documents with challenging layouts.
+    For example, if a figure is presented with subfigures, one annotator might draw
+    a single figure bounding-box, while another might annotate each subfigure separately.
+    The same applies for lists, where one might annotate all list items in one block
+    or each list item separately. In essence, we observed that challenging layouts
+    would be annotated in different but plausible ways. To illustrate this, we show
+    in Figure 4 multiple examples of plausible but inconsistent annotations on the
+    same pages.
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 542.8159790039062
+      coord_origin: BOTTOMLEFT
+      l: 53.124725341796875
+      r: 295.5592346191406
+      t: 683.8748168945312
+    charspan:
+    - 0
+    - 812
+    page_no: 5
+  text: At first sight, the task of visual document-layout interpretation appears
+    intuitive enough to obtain plausible annotations in most cases. However, during
+    early trial-runs in the core team, we observed many cases in which annotators
+    use different annotation styles, especially for documents with challenging layouts.
+    For example, if a figure is presented with subfigures, one annotator might draw
+    a single figure bounding-box, while another might annotate each subfigure separately.
+    The same applies for lists, where one might annotate all list items in one block
+    or each list item separately. In essence, we observed that challenging layouts
+    would be annotated in different but plausible ways. To illustrate this, we show
+    in Figure 4 multiple examples of plausible but inconsistent annotations on the
+    same pages.
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/61
+  hash: 9627588927681567008
+  label: text
+  orig: 'Obviously, this inconsistency in annotations is not desirable for datasets
+    which are intended to be used for model training. To minimise these inconsistencies,
+    we created a detailed annotation guideline. While perfect consistency across 40
+    annotation staff members is clearly not possible to achieve, we saw a huge improvement
+    in annotation consistency after the introduction of our annotation guideline.
+    A few selected, non-trivial highlights of the guideline are:'
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 455.16583251953125
+      coord_origin: BOTTOMLEFT
+      l: 53.339271545410156
+      r: 295.56005859375
+      t: 541.1383666992188
+    charspan:
+    - 0
+    - 465
+    page_no: 5
+  text: 'Obviously, this inconsistency in annotations is not desirable for datasets
+    which are intended to be used for model training. To minimise these inconsistencies,
+    we created a detailed annotation guideline. While perfect consistency across 40
+    annotation staff members is clearly not possible to achieve, we saw a huge improvement
+    in annotation consistency after the introduction of our annotation guideline.
+    A few selected, non-trivial highlights of the guideline are:'
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/62
+  hash: 5251173547193772936
+  label: list_item
+  orig: (1) Every list-item is an individual object instance with class label List-item
+    . This definition is different from PubLayNet and DocBank, where all list-items
+    are grouped together into one List object.
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 402.13092041015625
+      coord_origin: BOTTOMLEFT
+      l: 64.39098358154297
+      r: 294.42474365234375
+      t: 444.29510498046875
+    charspan:
+    - 0
+    - 202
+    page_no: 5
+  text: (1) Every list-item is an individual object instance with class label List-item
+    . This definition is different from PubLayNet and DocBank, where all list-items
+    are grouped together into one List object.
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/63
+  hash: 4202075218951637034
+  label: list_item
+  orig: (2) A List-item is a paragraph with hanging indentation. Singleline elements
+    can qualify as List-item if the neighbour elements expose hanging indentation.
+    Bullet or enumeration symbols are not a requirement.
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 358.39984130859375
+      coord_origin: BOTTOMLEFT
+      l: 64.31100463867188
+      r: 295.563720703125
+      t: 400.2758483886719
+    charspan:
+    - 0
+    - 208
+    page_no: 5
+  text: (2) A List-item is a paragraph with hanging indentation. Singleline elements
+    can qualify as List-item if the neighbour elements expose hanging indentation.
+    Bullet or enumeration symbols are not a requirement.
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/64
+  hash: 1780046845976491258
+  label: list_item
+  orig: (3) For every Caption , there must be exactly one corresponding Picture or
+    Table .
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 336.4728698730469
+      coord_origin: BOTTOMLEFT
+      l: 64.26787567138672
+      r: 294.60943603515625
+      t: 356.2404479980469
+    charspan:
+    - 0
+    - 82
+    page_no: 5
+  text: (3) For every Caption , there must be exactly one corresponding Picture or
+    Table .
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/65
+  hash: 3653862969821232020
+  label: list_item
+  orig: (4) Connected sub-pictures are grouped together in one Picture object.
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 314.5648193359375
+      coord_origin: BOTTOMLEFT
+      l: 64.2632064819336
+      r: 294.7487487792969
+      t: 334.179443359375
+    charspan:
+    - 0
+    - 70
+    page_no: 5
+  text: (4) Connected sub-pictures are grouped together in one Picture object.
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/66
+  hash: 5448053117976841193
+  label: list_item
+  orig: (5) Formula numbers are included in a Formula object.
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 303.59686279296875
+      coord_origin: BOTTOMLEFT
+      l: 63.9930305480957
+      r: 264.5057067871094
+      t: 312.8252868652344
+    charspan:
+    - 0
+    - 53
+    page_no: 5
+  text: (5) Formula numbers are included in a Formula object.
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/67
+  hash: 5907142507865067888
+  label: list_item
+  orig: (6) Emphasised text (e.g. in italic or bold) at the beginning of a paragraph
+    is not considered a Section-header , unless it appears exclusively on its own
+    line.
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 270.048095703125
+      coord_origin: BOTTOMLEFT
+      l: 64.07823181152344
+      r: 295.0240783691406
+      t: 301.5160827636719
+    charspan:
+    - 0
+    - 160
+    page_no: 5
+  text: (6) Emphasised text (e.g. in italic or bold) at the beginning of a paragraph
+    is not considered a Section-header , unless it appears exclusively on its own
+    line.
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/68
+  hash: 13967274008596264343
+  label: text
+  orig: The complete annotation guideline is over 100 pages long and a detailed description
+    is obviously out of scope for this paper. Nevertheless, it will be made publicly
+    available alongside with DocLayNet for future reference.
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 217.798828125
+      coord_origin: BOTTOMLEFT
+      l: 52.994422912597656
+      r: 295.5625305175781
+      t: 259.6097106933594
+    charspan:
+    - 0
+    - 221
+    page_no: 5
+  text: The complete annotation guideline is over 100 pages long and a detailed description
+    is obviously out of scope for this paper. Nevertheless, it will be made publicly
+    available alongside with DocLayNet for future reference.
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/69
+  hash: 889554423716143140
+  label: text
+  orig: 'Phase 3: Training. After a first trial with a small group of people, we realised
+    that providing the annotation guideline and a set of random practice pages did
+    not yield the desired quality level for layout annotation. Therefore we prepared
+    a subset of pages with two different complexity levels, each with a practice and
+    an exam part. 974 pages were reference-annotated by one proficient core team member.
+    Annotation staff were then given the task to annotate the same subsets (blinded
+    from the reference). By comparing the annotations of each staff member with the
+    reference annotations, we could quantify how closely their annotations matched
+    the reference. Only after passing two exam levels with high annotation quality,
+    staff were admitted into the production phase. Practice iterations'
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 86.24749755859375
+      coord_origin: BOTTOMLEFT
+      l: 53.26631546020508
+      r: 295.562255859375
+      t: 215.95584106445312
+    charspan:
+    - 0
+    - 792
+    page_no: 5
+  text: 'Phase 3: Training. After a first trial with a small group of people, we realised
+    that providing the annotation guideline and a set of random practice pages did
+    not yield the desired quality level for layout annotation. Therefore we prepared
+    a subset of pages with two different complexity levels, each with a practice and
+    an exam part. 974 pages were reference-annotated by one proficient core team member.
+    Annotation staff were then given the task to annotate the same subsets (blinded
+    from the reference). By comparing the annotations of each staff member with the
+    reference annotations, we could quantify how closely their annotations matched
+    the reference. Only after passing two exam levels with high annotation quality,
+    staff were admitted into the production phase. Practice iterations'
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/70
+  hash: 15729892622341582110
+  label: caption
+  orig: 'Figure 4: Examples of plausible annotation alternatives for the same page.
+    Criteria in our annotation guideline can resolve cases '
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 722.92333984375
+      coord_origin: BOTTOMLEFT
+      l: 53.30706024169922
+      r: 558.4274291992188
+      t: 732.1127319335938
+    charspan:
+    - 0
+    - 130
+    page_no: 6
+  text: 'Figure 4: Examples of plausible annotation alternatives for the same page.
+    Criteria in our annotation guideline can resolve cases '
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/71
+  hash: 14428809639626034083
+  label: text
+  orig: were carried out over a timeframe of 12 weeks, after which 8 of the 40 initially
+    allocated annotators did not pass the bar.
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 247.1688232421875
+      coord_origin: BOTTOMLEFT
+      l: 316.8349914550781
+      r: 558.204345703125
+      t: 266.81207275390625
+    charspan:
+    - 0
+    - 123
+    page_no: 5
+  text: were carried out over a timeframe of 12 weeks, after which 8 of the 40 initially
+    allocated annotators did not pass the bar.
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/72
+  hash: 15056578085083744975
+  label: text
+  orig: 'Phase 4: Production annotation. The previously selected 80K pages were annotated
+    with the defined 11 class labels by 32 annotators. This production phase took
+    around three months to complete. All annotations were created online through CCS,
+    which visualises the programmatic PDF text-cells as an overlay on the page. The
+    page annotation are obtained by drawing rectangular bounding-boxes, as shown in
+    Figure 3. With regard to the annotation practices, we implemented a few constraints
+    and capabilities on the tooling level. First, we only allow non-overlapping, vertically
+    oriented, rectangular boxes. For the large majority of documents, this constraint
+    was sufficient and it speeds up the annotation considerably in comparison with
+    arbitrary segmentation shapes. Second, annotator staff were not able to see each
+    other''s annotations. This was enforced by design to avoid any bias in the annotation,
+    which could skew the numbers of the inter-annotator agreement (see Table 1). We
+    wanted'
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 82.7375717163086
+      coord_origin: BOTTOMLEFT
+      l: 317.00592041015625
+      r: 559.7149047851562
+      t: 245.28392028808594
+    charspan:
+    - 0
+    - 987
+    page_no: 5
+  text: 'Phase 4: Production annotation. The previously selected 80K pages were annotated
+    with the defined 11 class labels by 32 annotators. This production phase took
+    around three months to complete. All annotations were created online through CCS,
+    which visualises the programmatic PDF text-cells as an overlay on the page. The
+    page annotation are obtained by drawing rectangular bounding-boxes, as shown in
+    Figure 3. With regard to the annotation practices, we implemented a few constraints
+    and capabilities on the tooling level. First, we only allow non-overlapping, vertically
+    oriented, rectangular boxes. For the large majority of documents, this constraint
+    was sufficient and it speeds up the annotation considerably in comparison with
+    arbitrary segmentation shapes. Second, annotator staff were not able to see each
+    other''s annotations. This was enforced by design to avoid any bias in the annotation,
+    which could skew the numbers of the inter-annotator agreement (see Table 1). We
+    wanted'
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/73
+  hash: 2641059782471010186
+  label: caption
+  orig: 'Table 2: Prediction performance (mAP@0.5-0.95) of object detection networks
+    on DocLayNet test set. The MRCNN (Mask R-CNN) and FRCNN (Faster R-CNN) models
+    with ResNet-50 or ResNet-101 backbone were trained based on the network architectures
+    from the detectron2 model zoo (Mask R-CNN R50, R101-FPN 3x, Faster R-CNN R101-FPN
+    3x), with default configurations. The YOLO implementation utilized was YOLOv5x6
+    [13]. All models were initialised using pre-trained weights from the COCO 2017
+    dataset.'
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 440.30438232421875
+      coord_origin: BOTTOMLEFT
+      l: 61.93328094482422
+      r: 285.75616455078125
+      t: 596.587158203125
+    charspan:
+    - 0
+    - 584
+    page_no: 6
+  text: 'Table 2: Prediction performance (mAP@0.5-0.95) of object detection networks
+    on DocLayNet test set. The MRCNN (Mask R-CNN) and FRCNN (Faster R-CNN) models
+    with ResNet-50 or ResNet-101 backbone were trained based on the network architectures
+    from the detectron2 model zoo (Mask R-CNN R50, R101-FPN 3x, Faster R-CNN R101-FPN
+    3x), with default configurations. The YOLO implementation utilized was YOLOv5x6
+    [13]. All models were initialised using pre-trained weights from the COCO 2017
+    dataset.'
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/74
+  hash: 2611073847515650604
+  label: text
+  orig: to avoid this at any cost in order to have clear, unbiased baseline numbers
+    for human document-layout annotation. Third, we introduced the feature of snapping
+    boxes around text segments to obtain a pixel-accurate annotation and again reduce
+    time and effort. The CCS annotation tool automatically shrinks every user-drawn
+    box to the minimum bounding-box around the enclosed text-cells for all purely
+    text-based segments, which excludes only Table and Picture . For the latter, we
+    instructed annotation staff to minimise inclusion of surrounding whitespace while
+    including all graphical lines. A downside of snapping boxes to enclosed text cells
+    is that some wrongly parsed PDF pages cannot be annotated correctly and need to
+    be skipped. Fourth, we established a way to flag pages as rejected for cases where
+    no valid annotation according to the label guidelines could be achieved. Example
+    cases for this would be PDF pages that render incorrectly or contain layouts that
+    are impossible to capture with non-overlapping rectangles. Such rejected pages
+    are not contained in the final dataset. With all these measures in place, experienced
+    annotation staff managed to annotate a single page in a typical timeframe of 20s
+    to 60s, depending on its complexity.
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 214.2948760986328
+      coord_origin: BOTTOMLEFT
+      l: 53.25688552856445
+      r: 295.5561218261719
+      t: 421.4337158203125
+    charspan:
+    - 0
+    - 1252
+    page_no: 6
+  text: to avoid this at any cost in order to have clear, unbiased baseline numbers
+    for human document-layout annotation. Third, we introduced the feature of snapping
+    boxes around text segments to obtain a pixel-accurate annotation and again reduce
+    time and effort. The CCS annotation tool automatically shrinks every user-drawn
+    box to the minimum bounding-box around the enclosed text-cells for all purely
+    text-based segments, which excludes only Table and Picture . For the latter, we
+    instructed annotation staff to minimise inclusion of surrounding whitespace while
+    including all graphical lines. A downside of snapping boxes to enclosed text cells
+    is that some wrongly parsed PDF pages cannot be annotated correctly and need to
+    be skipped. Fourth, we established a way to flag pages as rejected for cases where
+    no valid annotation according to the label guidelines could be achieved. Example
+    cases for this would be PDF pages that render incorrectly or contain layouts that
+    are impossible to capture with non-overlapping rectangles. Such rejected pages
+    are not contained in the final dataset. With all these measures in place, experienced
+    annotation staff managed to annotate a single page in a typical timeframe of 20s
+    to 60s, depending on its complexity.
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/75
+  hash: 19275708379815350
+  label: section_header
+  orig: 5 EXPERIMENTS
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 193.5609893798828
+      coord_origin: BOTTOMLEFT
+      l: 53.62337875366211
+      r: 147.4853515625
+      t: 203.87008666992188
+    charspan:
+    - 0
+    - 13
+    page_no: 6
+  text: 5 EXPERIMENTS
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/76
+  hash: 12611643145785449119
+  label: caption
+  orig: 'Figure 5: Prediction performance (mAP@0.5-0.95) of a Mask R-CNN network'
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 722.9555053710938
+      coord_origin: BOTTOMLEFT
+      l: 53.35094451904297
+      r: 347.0172424316406
+      t: 732.038818359375
+    charspan:
+    - 0
+    - 71
+    page_no: 7
+  text: 'Figure 5: Prediction performance (mAP@0.5-0.95) of a Mask R-CNN network'
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/77
+  hash: 10405449111938146973
+  label: text
+  orig: paper and leave the detailed evaluation of more recent methods mentioned in
+    Section 2 for future work.
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 388.6548156738281
+      coord_origin: BOTTOMLEFT
+      l: 317.2011413574219
+      r: 558.2041625976562
+      t: 408.8042297363281
+    charspan:
+    - 0
+    - 102
+    page_no: 6
+  text: paper and leave the detailed evaluation of more recent methods mentioned in
+    Section 2 for future work.
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/78
+  hash: 5970556147693056683
+  label: text
+  orig: In this section, we will present several aspects related to the performance
+    of object detection models on DocLayNet. Similarly as in PubLayNet, we will evaluate
+    the quality of their predictions using mean average precision (mAP) with 10 overlaps
+    that range from 0.5 to 0.95 in steps of 0.05 (mAP@0.5-0.95). These scores are
+    computed by leveraging the evaluation code provided by the COCO API [16].
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 311.45587158203125
+      coord_origin: BOTTOMLEFT
+      l: 317.0830078125
+      r: 558.4364013671875
+      t: 386.632568359375
+    charspan:
+    - 0
+    - 397
+    page_no: 6
+  text: In this section, we will present several aspects related to the performance
+    of object detection models on DocLayNet. Similarly as in PubLayNet, we will evaluate
+    the quality of their predictions using mean average precision (mAP) with 10 overlaps
+    that range from 0.5 to 0.95 in steps of 0.05 (mAP@0.5-0.95). These scores are
+    computed by leveraging the evaluation code provided by the COCO API [16].
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/79
+  hash: 7797862272567426572
+  label: section_header
+  orig: Baselines for Object Detection
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 284.5037841796875
+      coord_origin: BOTTOMLEFT
+      l: 317.1941223144531
+      r: 466.8532409667969
+      t: 295.42913818359375
+    charspan:
+    - 0
+    - 30
+    page_no: 6
+  text: Baselines for Object Detection
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/80
+  hash: 7611035121604324850
+  label: text
+  orig: "In Table 2, we present baseline experiments (given in mAP) on Mask R-CNN\
+    \ [12], Faster R-CNN [11], and YOLOv5 [13]. Both training and evaluation were\
+    \ performed on RGB images with dimensions of 1025 \xD7 1025 pixels. For training,\
+    \ we only used one annotation in case of redundantly annotated pages. As one can\
+    \ observe, the variation in mAP between the models is rather low, but overall\
+    \ between 6 and 10% lower than the mAP computed from the pairwise human annotations\
+    \ on triple-annotated pages. This gives a good indication that the DocLayNet dataset\
+    \ poses a worthwhile challenge for the research community to close the gap between\
+    \ human recognition and ML approaches. It is interesting to see that Mask R-CNN\
+    \ and Faster R-CNN produce very comparable mAP scores, indicating that pixel-based\
+    \ image segmentation derived from bounding-boxes does not help to obtain better\
+    \ predictions. On the other hand, the more recent Yolov5x model does very well\
+    \ and even out-performs humans on selected labels such as Text , Table and Picture\
+    \ . This is not entirely surprising, as Text , Table and Picture are abundant\
+    \ and the most visually distinctive in a document."
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 85.2998275756836
+      coord_origin: BOTTOMLEFT
+      l: 317.0144348144531
+      r: 558.7822875976562
+      t: 280.8944396972656
+    charspan:
+    - 0
+    - 1146
+    page_no: 6
+  text: "In Table 2, we present baseline experiments (given in mAP) on Mask R-CNN\
+    \ [12], Faster R-CNN [11], and YOLOv5 [13]. Both training and evaluation were\
+    \ performed on RGB images with dimensions of 1025 \xD7 1025 pixels. For training,\
+    \ we only used one annotation in case of redundantly annotated pages. As one can\
+    \ observe, the variation in mAP between the models is rather low, but overall\
+    \ between 6 and 10% lower than the mAP computed from the pairwise human annotations\
+    \ on triple-annotated pages. This gives a good indication that the DocLayNet dataset\
+    \ poses a worthwhile challenge for the research community to close the gap between\
+    \ human recognition and ML approaches. It is interesting to see that Mask R-CNN\
+    \ and Faster R-CNN produce very comparable mAP scores, indicating that pixel-based\
+    \ image segmentation derived from bounding-boxes does not help to obtain better\
+    \ predictions. On the other hand, the more recent Yolov5x model does very well\
+    \ and even out-performs humans on selected labels such as Text , Table and Picture\
+    \ . This is not entirely surprising, as Text , Table and Picture are abundant\
+    \ and the most visually distinctive in a document."
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/81
+  hash: 4524736109232879114
+  label: page_header
+  orig: "KDD \u201922, August 14-18, 2022, Washington, DC, USA"
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 723.0802001953125
+      coord_origin: BOTTOMLEFT
+      l: 365.1936950683594
+      r: 558.7797241210938
+      t: 731.8773803710938
+    charspan:
+    - 0
+    - 48
+    page_no: 7
+  text: "KDD \u201922, August 14-18, 2022, Washington, DC, USA"
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/82
+  hash: 8640149219266946286
+  label: caption
+  orig: 'Table 3: Performance of a Mask R-CNN R50 network in mAP@0.5-0.95 scores trained
+    on DocLayNet with different class label sets. The reduced label sets were obtained
+    by either down-mapping or '
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 496.419189453125
+      coord_origin: BOTTOMLEFT
+      l: 80.5073471069336
+      r: 267.3428649902344
+      t: 640.9814453125
+    charspan:
+    - 0
+    - 189
+    page_no: 7
+  text: 'Table 3: Performance of a Mask R-CNN R50 network in mAP@0.5-0.95 scores trained
+    on DocLayNet with different class label sets. The reduced label sets were obtained
+    by either down-mapping or '
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/83
+  hash: 6812192561276511295
+  label: section_header
+  orig: Learning Curve
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 461.592041015625
+      coord_origin: BOTTOMLEFT
+      l: 53.446834564208984
+      r: 131.05624389648438
+      t: 472.6955871582031
+    charspan:
+    - 0
+    - 14
+    page_no: 7
+  text: Learning Curve
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/84
+  hash: 5745526209173602420
+  label: text
+  orig: One of the fundamental questions related to any dataset is if it is "large
+    enough". To answer this question for DocLayNet, we performed a data ablation study
+    in which we evaluated a Mask R-CNN model trained on increasing fractions of the
+    DocLayNet dataset. As can be seen in Figure 5, the mAP score rises sharply in
+    the beginning and eventually levels out. To estimate the error-bar on the metrics,
+    we ran the training five times on the entire data-set. This resulted in a 1% error-bar,
+    depicted by the shaded area in Figure 5. In the inset of Figure 5, we show the
+    exact same data-points, but with a logarithmic scale on the x-axis. As is expected,
+    the mAP score increases linearly as a function of the data-size in the inset.
+    The curve ultimately flattens out between the 80% and 100% mark, with the 80%
+    mark falling within the error-bars of the 100% mark. This provides a good indication
+    that the model would not improve significantly by yet increasing the data size.
+    Rather, it would probably benefit more from improved data consistency (as discussed
+    in Section 3), data augmentation methods [23], or the addition of more document
+    categories and styles.
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 262.38037109375
+      coord_origin: BOTTOMLEFT
+      l: 52.78499984741211
+      r: 295.558349609375
+      t: 457.72955322265625
+    charspan:
+    - 0
+    - 1157
+    page_no: 7
+  text: One of the fundamental questions related to any dataset is if it is "large
+    enough". To answer this question for DocLayNet, we performed a data ablation study
+    in which we evaluated a Mask R-CNN model trained on increasing fractions of the
+    DocLayNet dataset. As can be seen in Figure 5, the mAP score rises sharply in
+    the beginning and eventually levels out. To estimate the error-bar on the metrics,
+    we ran the training five times on the entire data-set. This resulted in a 1% error-bar,
+    depicted by the shaded area in Figure 5. In the inset of Figure 5, we show the
+    exact same data-points, but with a logarithmic scale on the x-axis. As is expected,
+    the mAP score increases linearly as a function of the data-size in the inset.
+    The curve ultimately flattens out between the 80% and 100% mark, with the 80%
+    mark falling within the error-bars of the 100% mark. This provides a good indication
+    that the model would not improve significantly by yet increasing the data size.
+    Rather, it would probably benefit more from improved data consistency (as discussed
+    in Section 3), data augmentation methods [23], or the addition of more document
+    categories and styles.
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/85
+  hash: 7824280854281589640
+  label: section_header
+  orig: Impact of Class Labels
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 239.1809844970703
+      coord_origin: BOTTOMLEFT
+      l: 53.37664794921875
+      r: 164.3289794921875
+      t: 250.044677734375
+    charspan:
+    - 0
+    - 22
+    page_no: 7
+  text: Impact of Class Labels
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/86
+  hash: 17522119297822048539
+  label: text
+  orig: "The choice and number of labels can have a significant effect on the overall\
+    \ model performance. Since PubLayNet, DocBank and DocLayNet all have different\
+    \ label sets, it is of particular interest to understand and quantify this influence\
+    \ of the label set on the model performance. We investigate this by either down-mapping\
+    \ labels into more common ones (e.g. Caption \u2192 Text ) or excluding them from\
+    \ the annotations entirely. Furthermore, it must be stressed that all mappings\
+    \ and exclusions were performed on the data before model training. In Table 3,\
+    \ we present the mAP scores for a Mask R-CNN R50 network on different label sets.\
+    \ Where a label is down-mapped, we show its corresponding label, otherwise it\
+    \ was excluded. We present three different label sets, with 6, 5 and 4 different\
+    \ labels respectively. The set of 5 labels contains the same labels as PubLayNet.\
+    \ However, due to the different definition of"
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 83.39567565917969
+      coord_origin: BOTTOMLEFT
+      l: 53.06760787963867
+      r: 295.5567932128906
+      t: 235.12689208984375
+    charspan:
+    - 0
+    - 910
+    page_no: 7
+  text: "The choice and number of labels can have a significant effect on the overall\
+    \ model performance. Since PubLayNet, DocBank and DocLayNet all have different\
+    \ label sets, it is of particular interest to understand and quantify this influence\
+    \ of the label set on the model performance. We investigate this by either down-mapping\
+    \ labels into more common ones (e.g. Caption \u2192 Text ) or excluding them from\
+    \ the annotations entirely. Furthermore, it must be stressed that all mappings\
+    \ and exclusions were performed on the data before model training. In Table 3,\
+    \ we present the mAP scores for a Mask R-CNN R50 network on different label sets.\
+    \ Where a label is down-mapped, we show its corresponding label, otherwise it\
+    \ was excluded. We present three different label sets, with 6, 5 and 4 different\
+    \ labels respectively. The set of 5 labels contains the same labels as PubLayNet.\
+    \ However, due to the different definition of"
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/87
+  hash: 12736595303563933946
+  label: caption
+  orig: 'Table 4: Performance of a Mask R-CNN R50 network with document-wise and page-wise
+    split for different label sets. Naive page-wise '
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 485.2873840332031
+      coord_origin: BOTTOMLEFT
+      l: 353.065185546875
+      r: 523.3069458007812
+      t: 641.25341796875
+    charspan:
+    - 0
+    - 130
+    page_no: 7
+  text: 'Table 4: Performance of a Mask R-CNN R50 network with document-wise and page-wise
+    split for different label sets. Naive page-wise '
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/88
+  hash: 7783869837125225
+  label: text
+  orig: lists in PubLayNet (grouped list-items) versus DocLayNet (separate list-items),
+    the label set of size 4 is the closest to PubLayNet, in the assumption that the
+    List is down-mapped to Text in PubLayNet. The results in Table 3 show that the
+    prediction accuracy on the remaining class labels does not change significantly
+    when other classes are merged into them. The overall macro-average improves by
+    around 5%, in particular when Page-footer and Page-header are excluded.
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 375.50982666015625
+      coord_origin: BOTTOMLEFT
+      l: 317.03326416015625
+      r: 559.5849609375
+      t: 460.6855163574219
+    charspan:
+    - 0
+    - 469
+    page_no: 7
+  text: lists in PubLayNet (grouped list-items) versus DocLayNet (separate list-items),
+    the label set of size 4 is the closest to PubLayNet, in the assumption that the
+    List is down-mapped to Text in PubLayNet. The results in Table 3 show that the
+    prediction accuracy on the remaining class labels does not change significantly
+    when other classes are merged into them. The overall macro-average improves by
+    around 5%, in particular when Page-footer and Page-header are excluded.
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/89
+  hash: 5117058535300881242
+  label: section_header
+  orig: Impact of Document Split in Train and Test Set
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 351.4896545410156
+      coord_origin: BOTTOMLEFT
+      l: 317.4661865234375
+      r: 549.860595703125
+      t: 362.8900451660156
+    charspan:
+    - 0
+    - 46
+    page_no: 7
+  text: Impact of Document Split in Train and Test Set
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/90
+  hash: 1569991188631703948
+  label: text
+  orig: "Many documents in DocLayNet have a unique styling. In order to avoid overfitting\
+    \ on a particular style, we have split the train-, test- and validation-sets of\
+    \ DocLayNet on document boundaries, i.e. every document contributes pages to only\
+    \ one set. To the best of our knowledge, this was not considered in PubLayNet\
+    \ or DocBank. To quantify how this affects model performance, we trained and evaluated\
+    \ a Mask R-CNN R50 model on a modified dataset version. Here, the train-, test-\
+    \ and validation-sets were obtained by a randomised draw over the individual pages.\
+    \ As can be seen in Table 4, the difference in model performance is surprisingly\
+    \ large: pagewise splitting gains \u02DC 10% in mAP over the document-wise splitting.\
+    \ Thus, random page-wise splitting of DocLayNet can easily lead to accidental\
+    \ overestimation of model performance and should be avoided."
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 196.5628204345703
+      coord_origin: BOTTOMLEFT
+      l: 316.9546813964844
+      r: 559.7138061523438
+      t: 348.10198974609375
+    charspan:
+    - 0
+    - 852
+    page_no: 7
+  text: "Many documents in DocLayNet have a unique styling. In order to avoid overfitting\
+    \ on a particular style, we have split the train-, test- and validation-sets of\
+    \ DocLayNet on document boundaries, i.e. every document contributes pages to only\
+    \ one set. To the best of our knowledge, this was not considered in PubLayNet\
+    \ or DocBank. To quantify how this affects model performance, we trained and evaluated\
+    \ a Mask R-CNN R50 model on a modified dataset version. Here, the train-, test-\
+    \ and validation-sets were obtained by a randomised draw over the individual pages.\
+    \ As can be seen in Table 4, the difference in model performance is surprisingly\
+    \ large: pagewise splitting gains \u02DC 10% in mAP over the document-wise splitting.\
+    \ Thus, random page-wise splitting of DocLayNet can easily lead to accidental\
+    \ overestimation of model performance and should be avoided."
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/91
+  hash: 16424003151594388576
+  label: section_header
+  orig: Dataset Comparison
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 173.20875549316406
+      coord_origin: BOTTOMLEFT
+      l: 317.3337707519531
+      r: 418.5477600097656
+      t: 183.94322204589844
+    charspan:
+    - 0
+    - 18
+    page_no: 7
+  text: Dataset Comparison
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/92
+  hash: 3914983503582730759
+  label: text
+  orig: Throughout this paper, we claim that DocLayNet's wider variety of document
+    layouts leads to more robust layout detection models. In Table 5, we provide evidence
+    for that. We trained models on each of the available datasets (PubLayNet, DocBank
+    and DocLayNet) and evaluated them on the test sets of the other datasets. Due
+    to the different label sets and annotation styles, a direct comparison is not
+    possible. Hence, we focussed on the common labels among the datasets. Between
+    PubLayNet and DocLayNet, these are Picture ,
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 83.24566650390625
+      coord_origin: BOTTOMLEFT
+      l: 316.7283935546875
+      r: 559.1881713867188
+      t: 168.86700439453125
+    charspan:
+    - 0
+    - 521
+    page_no: 7
+  text: Throughout this paper, we claim that DocLayNet's wider variety of document
+    layouts leads to more robust layout detection models. In Table 5, we provide evidence
+    for that. We trained models on each of the available datasets (PubLayNet, DocBank
+    and DocLayNet) and evaluated them on the test sets of the other datasets. Due
+    to the different label sets and annotation styles, a direct comparison is not
+    possible. Hence, we focussed on the common labels among the datasets. Between
+    PubLayNet and DocLayNet, these are Picture ,
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/93
+  hash: 1407046376659880848
+  label: caption
+  orig: 'Table 5: Prediction Performance (mAP@0.5-0.95) of a Mask R-CNN R50 network
+    across the PubLayNet, DocBank & DocLayNet data-sets. By evaluating on common label
+    classes of each dataset, we observe that the DocLayNet-trained model has much
+    less pronounced variations in performance across all datasets.'
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 452.12615966796875
+      coord_origin: BOTTOMLEFT
+      l: 72.87370300292969
+      r: 274.87945556640625
+      t: 619.3699951171875
+    charspan:
+    - 0
+    - 573
+    page_no: 8
+  text: 'Table 5: Prediction Performance (mAP@0.5-0.95) of a Mask R-CNN R50 network
+    across the PubLayNet, DocBank & DocLayNet data-sets. By evaluating on common label
+    classes of each dataset, we observe that the DocLayNet-trained model has much
+    less pronounced variations in performance across all datasets.'
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/94
+  hash: 908797690688183444
+  label: text
+  orig: Section-header , Table and Text . Before training, we either mapped or excluded
+    DocLayNet's other labels as specified in table 3, and also PubLayNet's List to
+    Text . Note that the different clustering of lists (by list-element vs. whole
+    list objects) naturally decreases the mAP score for Text .
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 348.85986328125
+      coord_origin: BOTTOMLEFT
+      l: 53.279537200927734
+      r: 294.6396789550781
+      t: 401.5162658691406
+    charspan:
+    - 0
+    - 295
+    page_no: 8
+  text: Section-header , Table and Text . Before training, we either mapped or excluded
+    DocLayNet's other labels as specified in table 3, and also PubLayNet's List to
+    Text . Note that the different clustering of lists (by list-element vs. whole
+    list objects) naturally decreases the mAP score for Text .
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/95
+  hash: 10214562574915474626
+  label: text
+  orig: For comparison of DocBank with DocLayNet, we trained only on Picture and Table
+    clusters of each dataset. We had to exclude Text because successive paragraphs
+    are often grouped together into a single object in DocBank. This paragraph grouping
+    is incompatible with the individual paragraphs of DocLayNet. As can be seen in
+    Table 5, DocLayNet trained models yield better performance compared to the previous
+    datasets. It is noteworthy that the models trained on PubLayNet and DocBank perform
+    very well on their own test set, but have a much lower performance on the foreign
+    datasets. While this also applies to DocLayNet, the difference is far less pronounced.
+    Thus we conclude that DocLayNet trained models are overall more robust and will
+    produce better results for challenging, unseen layouts.
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 205.98951721191406
+      coord_origin: BOTTOMLEFT
+      l: 53.04817581176758
+      r: 295.55908203125
+      t: 346.9607849121094
+    charspan:
+    - 0
+    - 793
+    page_no: 8
+  text: For comparison of DocBank with DocLayNet, we trained only on Picture and Table
+    clusters of each dataset. We had to exclude Text because successive paragraphs
+    are often grouped together into a single object in DocBank. This paragraph grouping
+    is incompatible with the individual paragraphs of DocLayNet. As can be seen in
+    Table 5, DocLayNet trained models yield better performance compared to the previous
+    datasets. It is noteworthy that the models trained on PubLayNet and DocBank perform
+    very well on their own test set, but have a much lower performance on the foreign
+    datasets. While this also applies to DocLayNet, the difference is far less pronounced.
+    Thus we conclude that DocLayNet trained models are overall more robust and will
+    produce better results for challenging, unseen layouts.
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/96
+  hash: 13986119087538501170
+  label: section_header
+  orig: Example Predictions
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 176.33340454101562
+      coord_origin: BOTTOMLEFT
+      l: 53.05388259887695
+      r: 156.02235412597656
+      t: 187.29098510742188
+    charspan:
+    - 0
+    - 19
+    page_no: 8
+  text: Example Predictions
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/97
+  hash: 1038155047615801598
+  label: text
+  orig: To conclude this section, we illustrate the quality of layout predictions
+    one can expect from DocLayNet-trained models by providing a selection of examples
+    without any further post-processing applied. Figure 6 shows selected layout predictions
+    on pages from the test-set of DocLayNet. Results look decent in general across
+    document categories, however one can also observe mistakes such as overlapping
+    clusters of different classes, or entirely missing boxes due to low confidence.
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 86.64982604980469
+      coord_origin: BOTTOMLEFT
+      l: 53.07720184326172
+      r: 295.5584411621094
+      t: 172.26492309570312
+    charspan:
+    - 0
+    - 481
+    page_no: 8
+  text: To conclude this section, we illustrate the quality of layout predictions
+    one can expect from DocLayNet-trained models by providing a selection of examples
+    without any further post-processing applied. Figure 6 shows selected layout predictions
+    on pages from the test-set of DocLayNet. Results look decent in general across
+    document categories, however one can also observe mistakes such as overlapping
+    clusters of different classes, or entirely missing boxes due to low confidence.
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/98
+  hash: 8801089031972856173
+  label: section_header
+  orig: 6 CONCLUSION
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 695.8309936523438
+      coord_origin: BOTTOMLEFT
+      l: 317.4961853027344
+      r: 405.7296142578125
+      t: 706.4700317382812
+    charspan:
+    - 0
+    - 12
+    page_no: 8
+  text: 6 CONCLUSION
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/99
+  hash: 15710626894768820561
+  label: text
+  orig: In this paper, we presented the DocLayNet dataset. It provides the document
+    conversion and layout analysis research community a new and challenging dataset
+    to improve and fine-tune novel ML methods on. In contrast to many other datasets,
+    DocLayNet was created by human annotation in order to obtain reliable layout ground-truth
+    on a wide variety of publication- and typesettingstyles. Including a large proportion
+    of documents outside the scientific publishing domain adds significant value in
+    this respect.
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 605.4117431640625
+      coord_origin: BOTTOMLEFT
+      l: 317.0487976074219
+      r: 559.7137451171875
+      t: 691.6207275390625
+    charspan:
+    - 0
+    - 507
+    page_no: 8
+  text: In this paper, we presented the DocLayNet dataset. It provides the document
+    conversion and layout analysis research community a new and challenging dataset
+    to improve and fine-tune novel ML methods on. In contrast to many other datasets,
+    DocLayNet was created by human annotation in order to obtain reliable layout ground-truth
+    on a wide variety of publication- and typesettingstyles. Including a large proportion
+    of documents outside the scientific publishing domain adds significant value in
+    this respect.
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/100
+  hash: 5999203225419292280
+  label: text
+  orig: To date, there is still a significant gap between human and ML accuracy on
+    the layout interpretation task, and we hope that this work will inspire the research
+    community to close that gap.
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 474.2935791015625
+      coord_origin: BOTTOMLEFT
+      l: 317.1865234375
+      r: 558.6325073242188
+      t: 505.4895324707031
+    charspan:
+    - 0
+    - 188
+    page_no: 8
+  text: To date, there is still a significant gap between human and ML accuracy on
+    the layout interpretation task, and we hope that this work will inspire the research
+    community to close that gap.
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/101
+  hash: 4445410344359338123
+  label: section_header
+  orig: REFERENCES
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 446.5990295410156
+      coord_origin: BOTTOMLEFT
+      l: 317.4455871582031
+      r: 387.5806579589844
+      t: 457.4013366699219
+    charspan:
+    - 0
+    - 10
+    page_no: 8
+  text: REFERENCES
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/102
+  hash: 16616106884325138631
+  label: list_item
+  orig: "[1] Max G\xF6bel, Tamir Hassan, Ermelinda Oro, and Giorgio Orsi. Icdar 2013\
+    \ table competition. In 2013 12th International Conference on Document Analysis\
+    \ and Recognition , pages 1449-1453, 2013."
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 420.8371276855469
+      coord_origin: BOTTOMLEFT
+      l: 320.5848693847656
+      r: 559.0187377929688
+      t: 444.4063415527344
+    charspan:
+    - 0
+    - 191
+    page_no: 8
+  text: "[1] Max G\xF6bel, Tamir Hassan, Ermelinda Oro, and Giorgio Orsi. Icdar 2013\
+    \ table competition. In 2013 12th International Conference on Document Analysis\
+    \ and Recognition , pages 1449-1453, 2013."
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/103
+  hash: 16787040176255257341
+  label: list_item
+  orig: '[2] Christian Clausner, Apostolos Antonacopoulos, and Stefan Pletschacher.
+    Icdar2017 competition on recognition of documents with complex layouts rdcl2017.
+    In 2017 14th IAPR International Conference on Document Analysis and Recognition
+    (ICDAR) , volume 01, pages 1404-1410, 2017.'
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 388.9571228027344
+      coord_origin: BOTTOMLEFT
+      l: 320.76806640625
+      r: 559.7276000976562
+      t: 420.2254333496094
+    charspan:
+    - 0
+    - 279
+    page_no: 8
+  text: '[2] Christian Clausner, Apostolos Antonacopoulos, and Stefan Pletschacher.
+    Icdar2017 competition on recognition of documents with complex layouts rdcl2017.
+    In 2017 14th IAPR International Conference on Document Analysis and Recognition
+    (ICDAR) , volume 01, pages 1404-1410, 2017.'
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/104
+  hash: 16229494543393695243
+  label: list_item
+  orig: "[3] Herv\xE9 D\xE9jean, Jean-Luc Meunier, Liangcai Gao, Yilun Huang, Yu Fang,\
+    \ Florian Kleber, and Eva-Maria Lang. ICDAR 2019 Competition on Table Detection\
+    \ and Recognition (cTDaR), April 2019. http://sac.founderit.com/."
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 364.88128662109375
+      coord_origin: BOTTOMLEFT
+      l: 320.58111572265625
+      r: 558.4269409179688
+      t: 388.028076171875
+    charspan:
+    - 0
+    - 213
+    page_no: 8
+  text: "[3] Herv\xE9 D\xE9jean, Jean-Luc Meunier, Liangcai Gao, Yilun Huang, Yu Fang,\
+    \ Florian Kleber, and Eva-Maria Lang. ICDAR 2019 Competition on Table Detection\
+    \ and Recognition (cTDaR), April 2019. http://sac.founderit.com/."
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/105
+  hash: 15423145939859734104
+  label: list_item
+  orig: '[4] Antonio Jimeno Yepes, Peter Zhong, and Douglas Burdick. Competition on
+    scientific literature parsing. In Proceedings of the International Conference
+    on Document Analysis and Recognition , ICDAR, pages 605-617. LNCS 12824, SpringerVerlag,
+    sep 2021.'
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 333.173095703125
+      coord_origin: BOTTOMLEFT
+      l: 320.72210693359375
+      r: 559.3787231445312
+      t: 364.17962646484375
+    charspan:
+    - 0
+    - 251
+    page_no: 8
+  text: '[4] Antonio Jimeno Yepes, Peter Zhong, and Douglas Burdick. Competition on
+    scientific literature parsing. In Proceedings of the International Conference
+    on Document Analysis and Recognition , ICDAR, pages 605-617. LNCS 12824, SpringerVerlag,
+    sep 2021.'
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/106
+  hash: 5249151387680038785
+  label: list_item
+  orig: '[5] Logan Markewich, Hao Zhang, Yubin Xing, Navid Lambert-Shirzad, Jiang
+    Zhexin, Roy Lee, Zhi Li, and Seok-Bum Ko. Segmentation for document layout analysis:
+    not dead yet. International Journal on Document Analysis and Recognition (IJDAR)
+    , pages 1-11, 01 2022.'
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 300.9960021972656
+      coord_origin: BOTTOMLEFT
+      l: 320.47723388671875
+      r: 559.2555541992188
+      t: 332.2057800292969
+    charspan:
+    - 0
+    - 261
+    page_no: 8
+  text: '[5] Logan Markewich, Hao Zhang, Yubin Xing, Navid Lambert-Shirzad, Jiang
+    Zhexin, Roy Lee, Zhi Li, and Seok-Bum Ko. Segmentation for document layout analysis:
+    not dead yet. International Journal on Document Analysis and Recognition (IJDAR)
+    , pages 1-11, 01 2022.'
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/107
+  hash: 16511389590086473870
+  label: list_item
+  orig: '[6] Xu Zhong, Jianbin Tang, and Antonio Jimeno-Yepes. Publaynet: Largest
+    dataset ever for document layout analysis. In Proceedings of the International
+    Conference on Document Analysis and Recognition , ICDAR, pages 1015-1022, sep
+    2019.'
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 277.3751220703125
+      coord_origin: BOTTOMLEFT
+      l: 320.7210998535156
+      r: 558.6044921875
+      t: 300.1542053222656
+    charspan:
+    - 0
+    - 235
+    page_no: 8
+  text: '[6] Xu Zhong, Jianbin Tang, and Antonio Jimeno-Yepes. Publaynet: Largest
+    dataset ever for document layout analysis. In Proceedings of the International
+    Conference on Document Analysis and Recognition , ICDAR, pages 1015-1022, sep
+    2019.'
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/108
+  hash: 5841239213590061604
+  label: list_item
+  orig: '[7] Minghao Li, Yiheng Xu, Lei Cui, Shaohan Huang, Furu Wei, Zhoujun Li,
+    and Ming Zhou. Docbank: A benchmark dataset for document layout analysis. In Proceedings
+    of the 28th International Conference on Computational Linguistics , COLING, pages
+    949-960. International Committee on Computational Linguistics, dec 2020.'
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 237.53111267089844
+      coord_origin: BOTTOMLEFT
+      l: 320.7048034667969
+      r: 559.0962524414062
+      t: 276.57550048828125
+    charspan:
+    - 0
+    - 316
+    page_no: 8
+  text: '[7] Minghao Li, Yiheng Xu, Lei Cui, Shaohan Huang, Furu Wei, Zhoujun Li,
+    and Ming Zhou. Docbank: A benchmark dataset for document layout analysis. In Proceedings
+    of the 28th International Conference on Computational Linguistics , COLING, pages
+    949-960. International Committee on Computational Linguistics, dec 2020.'
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/109
+  hash: 11745041684012725305
+  label: list_item
+  orig: '[8] Riaz Ahmad, Muhammad Tanvir Afzal, and M. Qadir. Information extraction
+    from pdf sources based on rule-based system using integrated formats. In SemWebEval@ESWC
+    , 2016.'
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 213.6141357421875
+      coord_origin: BOTTOMLEFT
+      l: 320.6175537109375
+      r: 558.9022216796875
+      t: 236.84490966796875
+    charspan:
+    - 0
+    - 172
+    page_no: 8
+  text: '[8] Riaz Ahmad, Muhammad Tanvir Afzal, and M. Qadir. Information extraction
+    from pdf sources based on rule-based system using integrated formats. In SemWebEval@ESWC
+    , 2016.'
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/110
+  hash: 8213734949810000799
+  label: list_item
+  orig: '[9] Ross B. Girshick, Jeff Donahue, Trevor Darrell, and Jitendra Malik. Rich
+    feature hierarchies for accurate object detection and semantic segmentation. In
+    IEEE Conference on Computer Vision and Pattern Recognition , CVPR, pages 580-587.
+    IEEE Computer Society, jun 2014.'
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 181.74110412597656
+      coord_origin: BOTTOMLEFT
+      l: 320.695556640625
+      r: 559.2744750976562
+      t: 212.77767944335938
+    charspan:
+    - 0
+    - 271
+    page_no: 8
+  text: '[9] Ross B. Girshick, Jeff Donahue, Trevor Darrell, and Jitendra Malik. Rich
+    feature hierarchies for accurate object detection and semantic segmentation. In
+    IEEE Conference on Computer Vision and Pattern Recognition , CVPR, pages 580-587.
+    IEEE Computer Society, jun 2014.'
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/111
+  hash: 4869209929442963000
+  label: list_item
+  orig: '[10] Ross B. Girshick. Fast R-CNN. In 2015 IEEE International Conference
+    on Computer Vision , ICCV, pages 1440-1448. IEEE Computer Society, dec 2015.'
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 165.5072479248047
+      coord_origin: BOTTOMLEFT
+      l: 317.74908447265625
+      r: 558.8585205078125
+      t: 181.0753173828125
+    charspan:
+    - 0
+    - 149
+    page_no: 8
+  text: '[10] Ross B. Girshick. Fast R-CNN. In 2015 IEEE International Conference
+    on Computer Vision , ICCV, pages 1440-1448. IEEE Computer Society, dec 2015.'
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/112
+  hash: 16420654594074141837
+  label: list_item
+  orig: '[11] Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. Faster r-cnn:
+    Towards real-time object detection with region proposal networks. IEEE Transactions
+    on Pattern Analysis and Machine Intelligence , 39(6):1137-1149, 2017.'
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 141.8831329345703
+      coord_origin: BOTTOMLEFT
+      l: 317.71527099609375
+      r: 558.4170532226562
+      t: 164.63047790527344
+    charspan:
+    - 0
+    - 227
+    page_no: 8
+  text: '[11] Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. Faster r-cnn:
+    Towards real-time object detection with region proposal networks. IEEE Transactions
+    on Pattern Analysis and Machine Intelligence , 39(6):1137-1149, 2017.'
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/113
+  hash: 453358893855311407
+  label: list_item
+  orig: "[12] Kaiming He, Georgia Gkioxari, Piotr Doll\xE1r, and Ross B. Girshick.\
+    \ Mask R-CNN. In IEEE International Conference on Computer Vision , ICCV, pages\
+    \ 2980-2988. IEEE Computer Society, Oct 2017."
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 117.60646057128906
+      coord_origin: BOTTOMLEFT
+      l: 317.5010986328125
+      r: 559.278076171875
+      t: 141.50643920898438
+    charspan:
+    - 0
+    - 192
+    page_no: 8
+  text: "[12] Kaiming He, Georgia Gkioxari, Piotr Doll\xE1r, and Ross B. Girshick.\
+    \ Mask R-CNN. In IEEE International Conference on Computer Vision , ICCV, pages\
+    \ 2980-2988. IEEE Computer Society, Oct 2017."
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/114
+  hash: 3393294654140361785
+  label: list_item
+  orig: '[13] Glenn Jocher, Alex Stoken, Ayush Chaurasia, Jirka Borovec, NanoCode012,
+    TaoXie, Yonghye Kwon, Kalen Michael, Liu Changyu, Jiacong Fang, Abhiram V, Laughing,
+    tkianai, yxNONG, Piotr Skalski, Adam Hogan, Jebastin Nadar, imyhxy, Lorenzo Mammana,
+    Alex Wang, Cristi Fati, Diego Montes, Jan Hajek, Laurentiu'
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 86.09910583496094
+      coord_origin: BOTTOMLEFT
+      l: 317.4837341308594
+      r: 559.0487670898438
+      t: 116.94155883789062
+    charspan:
+    - 0
+    - 305
+    page_no: 8
+  text: '[13] Glenn Jocher, Alex Stoken, Ayush Chaurasia, Jirka Borovec, NanoCode012,
+    TaoXie, Yonghye Kwon, Kalen Michael, Liu Changyu, Jiacong Fang, Abhiram V, Laughing,
+    tkianai, yxNONG, Piotr Skalski, Adam Hogan, Jebastin Nadar, imyhxy, Lorenzo Mammana,
+    Alex Wang, Cristi Fati, Diego Montes, Jan Hajek, Laurentiu'
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/115
+  hash: 13779849536941554365
+  label: page_header
+  orig: 'DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis'
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 722.9329223632812
+      coord_origin: BOTTOMLEFT
+      l: 53.55940246582031
+      r: 347.0838623046875
+      t: 731.9924926757812
+    charspan:
+    - 0
+    - 71
+    page_no: 9
+  text: 'DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis'
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/116
+  hash: 16675011465179482522
+  label: page_header
+  orig: "KDD \u201922, August 14-18, 2022, Washington, DC, USA"
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 723.0497436523438
+      coord_origin: BOTTOMLEFT
+      l: 365.1275329589844
+      r: 558.905029296875
+      t: 731.96435546875
+    charspan:
+    - 0
+    - 48
+    page_no: 9
+  text: "KDD \u201922, August 14-18, 2022, Washington, DC, USA"
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/117
+  hash: 12604501010340547619
+  label: caption
+  orig: 'Figure 6: Example layout predictions on selected pages from the DocLayNet
+    test-set. (A, D) exhibit favourable results on coloured backgrounds. (B, C) show
+    accurate list-item and paragraph '
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 226.54010009765625
+      coord_origin: BOTTOMLEFT
+      l: 317.53033447265625
+      r: 559.0158081054688
+      t: 249.28826904296875
+    charspan:
+    - 0
+    - 188
+    page_no: 9
+  text: 'Figure 6: Example layout predictions on selected pages from the DocLayNet
+    test-set. (A, D) exhibit favourable results on coloured backgrounds. (B, C) show
+    accurate list-item and paragraph '
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/118
+  hash: 15606020167439278095
+  label: text
+  orig: 'Diaconu, Mai Thanh Minh, Marc, albinxavi, fatih, oleg, and wanghao yang.
+    ultralytics/yolov5: v6.0 - yolov5n nano models, roboflow integration, tensorflow
+    export, opencv dnn support, October 2021.'
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 242.22409057617188
+      coord_origin: BOTTOMLEFT
+      l: 68.69137573242188
+      r: 295.22406005859375
+      t: 265.4314270019531
+    charspan:
+    - 0
+    - 195
+    page_no: 9
+  text: 'Diaconu, Mai Thanh Minh, Marc, albinxavi, fatih, oleg, and wanghao yang.
+    ultralytics/yolov5: v6.0 - yolov5n nano models, roboflow integration, tensorflow
+    export, opencv dnn support, October 2021.'
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/119
+  hash: 14342144244909907366
+  label: list_item
+  orig: '[14] Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier,
+    Alexander Kirillov, and Sergey Zagoruyko. End-to-end object detection with transformers.
+    CoRR , abs/2005.12872, 2020.'
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 218.56314086914062
+      coord_origin: BOTTOMLEFT
+      l: 53.56020736694336
+      r: 295.12176513671875
+      t: 241.63282775878906
+    charspan:
+    - 0
+    - 190
+    page_no: 9
+  text: '[14] Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier,
+    Alexander Kirillov, and Sergey Zagoruyko. End-to-end object detection with transformers.
+    CoRR , abs/2005.12872, 2020.'
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/120
+  hash: 8955370194868803712
+  label: list_item
+  orig: '[15] Mingxing Tan, Ruoming Pang, and Quoc V. Le. Efficientdet: Scalable and
+    efficient object detection. CoRR , abs/1911.09070, 2019.'
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 202.62213134765625
+      coord_origin: BOTTOMLEFT
+      l: 53.61275863647461
+      r: 294.3653869628906
+      t: 217.57615661621094
+    charspan:
+    - 0
+    - 132
+    page_no: 9
+  text: '[15] Mingxing Tan, Ruoming Pang, and Quoc V. Le. Efficientdet: Scalable and
+    efficient object detection. CoRR , abs/1911.09070, 2019.'
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/121
+  hash: 13212807811422473787
+  label: list_item
+  orig: "[16] Tsung-Yi Lin, Michael Maire, Serge J. Belongie, Lubomir D. Bourdev,\
+    \ Ross B. Girshick, James Hays, Pietro Perona, Deva Ramanan, Piotr Doll\xE1r,\
+    \ and C. Lawrence Zitnick. Microsoft COCO: common objects in context, 2014."
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 178.71910095214844
+      coord_origin: BOTTOMLEFT
+      l: 53.668941497802734
+      r: 295.2226257324219
+      t: 201.57443237304688
+    charspan:
+    - 0
+    - 219
+    page_no: 9
+  text: "[16] Tsung-Yi Lin, Michael Maire, Serge J. Belongie, Lubomir D. Bourdev,\
+    \ Ross B. Girshick, James Hays, Pietro Perona, Deva Ramanan, Piotr Doll\xE1r,\
+    \ and C. Lawrence Zitnick. Microsoft COCO: common objects in context, 2014."
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/122
+  hash: 7441487755804462640
+  label: list_item
+  orig: '[17] Yuxin Wu, Alexander Kirillov, Francisco Massa, Wan-Yen Lo, and Ross
+    Girshick. Detectron2, 2019.'
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 162.77911376953125
+      coord_origin: BOTTOMLEFT
+      l: 53.54263687133789
+      r: 295.1200866699219
+      t: 178.3345947265625
+    charspan:
+    - 0
+    - 100
+    page_no: 9
+  text: '[17] Yuxin Wu, Alexander Kirillov, Francisco Massa, Wan-Yen Lo, and Ross
+    Girshick. Detectron2, 2019.'
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/123
+  hash: 17408271425993029853
+  label: list_item
+  orig: '[18] Nikolaos Livathinos, Cesar Berrospi, Maksym Lysak, Viktor Kuropiatnyk,
+    Ahmed Nassar, Andre Carvalho, Michele Dolfi, Christoph Auer, Kasper Dinkla, and
+    Peter W. J. Staar. Robust pdf document conversion using recurrent neural networks.
+    In Proceedings of the 35th Conference on Artificial Intelligence , AAAI, pages
+    1513715145, feb 2021.'
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 122.92810821533203
+      coord_origin: BOTTOMLEFT
+      l: 53.569610595703125
+      r: 294.8847351074219
+      t: 162.23497009277344
+    charspan:
+    - 0
+    - 339
+    page_no: 9
+  text: '[18] Nikolaos Livathinos, Cesar Berrospi, Maksym Lysak, Viktor Kuropiatnyk,
+    Ahmed Nassar, Andre Carvalho, Michele Dolfi, Christoph Auer, Kasper Dinkla, and
+    Peter W. J. Staar. Robust pdf document conversion using recurrent neural networks.
+    In Proceedings of the 35th Conference on Artificial Intelligence , AAAI, pages
+    1513715145, feb 2021.'
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/124
+  hash: 8781691199018342705
+  label: list_item
+  orig: '[19] Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, and Ming Zhou.
+    Layoutlm: Pre-training of text and layout for document image understanding. In
+    Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery
+    and Data Mining , KDD, pages 1192-1200, New York, USA, 2020. Association for Computing
+    Machinery.'
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 82.67352294921875
+      coord_origin: BOTTOMLEFT
+      l: 53.4610595703125
+      r: 295.22174072265625
+      t: 122.19474029541016
+    charspan:
+    - 0
+    - 336
+    page_no: 9
+  text: '[19] Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, and Ming Zhou.
+    Layoutlm: Pre-training of text and layout for document image understanding. In
+    Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery
+    and Data Mining , KDD, pages 1192-1200, New York, USA, 2020. Association for Computing
+    Machinery.'
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/125
+  hash: 2159895940565677367
+  label: list_item
+  orig: '[20] Shoubin Li, Xuyan Ma, Shuaiqun Pan, Jun Hu, Lin Shi, and Qing Wang.
+    Vtlayout: Fusion of visual and text features for document layout analysis, 2021.'
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 249.62921142578125
+      coord_origin: BOTTOMLEFT
+      l: 317.6278076171875
+      r: 559.0263671875
+      t: 265.5798645019531
+    charspan:
+    - 0
+    - 153
+    page_no: 9
+  text: '[20] Shoubin Li, Xuyan Ma, Shuaiqun Pan, Jun Hu, Lin Shi, and Qing Wang.
+    Vtlayout: Fusion of visual and text features for document layout analysis, 2021.'
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/126
+  hash: 15008793456124101567
+  label: list_item
+  orig: '[22] Peter W J Staar, Michele Dolfi, Christoph Auer, and Costas Bekas. Corpus
+    conversion service: A machine learning platform to ingest documents at scale.
+    In Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery
+    and Data Mining , KDD, pages 774-782. ACM, 2018.'
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 194.28546142578125
+      coord_origin: BOTTOMLEFT
+      l: 317.6616516113281
+      r: 559.275390625
+      t: 225.54457092285156
+    charspan:
+    - 0
+    - 290
+    page_no: 9
+  text: '[22] Peter W J Staar, Michele Dolfi, Christoph Auer, and Costas Bekas. Corpus
+    conversion service: A machine learning platform to ingest documents at scale.
+    In Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery
+    and Data Mining , KDD, pages 774-782. ACM, 2018.'
+- children: []
+  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/127
+  hash: 17416865681467935095
+  label: list_item
+  orig: '[23] Connor Shorten and Taghi M. Khoshgoftaar. A survey on image data augmentation
+    for deep learning. Journal of Big Data , 6(1):60, 2019.'
+  parent:
+    $ref: '#/body'
+  prov:
+  - bbox:
+      b: 178.71212768554688
+      coord_origin: BOTTOMLEFT
+      l: 317.65606689453125
+      r: 559.3782958984375
+      t: 193.30506896972656
+    charspan:
+    - 0
+    - 138
+    page_no: 9
+  text: '[23] Connor Shorten and Taghi M. Khoshgoftaar. A survey on image data augmentation
+    for deep learning. Journal of Big Data , 6(1):60, 2019.'
+version: 0.0.1

From 622f625df626bee12f15b6218bc831996925bb97 Mon Sep 17 00:00:00 2001
From: Christoph Auer <cau@zurich.ibm.com>
Date: Tue, 24 Sep 2024 16:14:23 +0200
Subject: [PATCH 13/34] Introduce provenance info, use enum labels

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
---
 docling_core/types/experimental/base.py     |  3 +
 docling_core/types/experimental/document.py | 82 +++++++++++----------
 docling_core/types/experimental/labels.py   | 36 +++++++++
 test/data/experimental/dummy_doc.yaml       |  2 +-
 test/test_docling_doc.py                    | 82 ++++++++++++---------
 5 files changed, 130 insertions(+), 75 deletions(-)

diff --git a/docling_core/types/experimental/base.py b/docling_core/types/experimental/base.py
index b082ea6..13fbefd 100644
--- a/docling_core/types/experimental/base.py
+++ b/docling_core/types/experimental/base.py
@@ -15,6 +15,9 @@ class Size(BaseModel):
     width: float = 0.0
     height: float = 0.0
 
+    def as_tuple(self):
+        return (self.width, self.height)
+
 
 class BoundingBox(BaseModel):
     l: float  # left
diff --git a/docling_core/types/experimental/document.py b/docling_core/types/experimental/document.py
index e92c047..dcc0c66 100644
--- a/docling_core/types/experimental/document.py
+++ b/docling_core/types/experimental/document.py
@@ -15,6 +15,7 @@
 
 from docling_core.types.doc.tokens import DocumentToken
 from docling_core.types.experimental.base import BoundingBox, Size
+from docling_core.types.experimental.labels import PageLabel
 
 Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
 LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
@@ -173,14 +174,13 @@ class GroupItem(NodeItem):  # Container type, can't be a leaf node
 class DocItem(
     NodeItem
 ):  # Base type for any element that carries content, can be a leaf node
-    label: str
+    label: PageLabel
     prov: List[ProvenanceItem] = []
 
     def get_location_tokens(
         self,
+        doc: "DoclingDocument",
         new_line: str,
-        page_w: float,
-        page_h: float,
         xsize: int = 100,
         ysize: int = 100,
         add_page_index: bool = True,
@@ -191,6 +191,7 @@ def get_location_tokens(
 
         location = ""
         for prov in self.prov:
+            page_w, page_h = doc.pages[prov.page_no].size.as_tuple()
 
             page_i = -1
             if add_page_index:
@@ -215,9 +216,8 @@ class TextItem(DocItem):
 
     def export_to_document_tokens(
         self,
+        doc: "DoclingDocument",
         new_line: str = "\n",
-        page_w: float = 0.0,
-        page_h: float = 0.0,
         xsize: int = 100,
         ysize: int = 100,
         add_location: bool = True,
@@ -225,7 +225,7 @@ def export_to_document_tokens(
         add_page_index: bool = True,
     ):
         """Export text element to document tokens format."""
-        body = f"<{self.label}>"
+        body = f"<{self.label.value}>"
         # body = f"<{self.name}>"
 
         assert DocumentToken.is_known_token(
@@ -234,9 +234,8 @@ def export_to_document_tokens(
 
         if add_location:
             body += self.get_location_tokens(
+                doc=doc,
                 new_line="",
-                page_w=page_w,
-                page_h=page_h,
                 xsize=xsize,
                 ysize=ysize,
                 add_page_index=add_page_index,
@@ -282,9 +281,8 @@ def export_to_document_tokens(
 
         if add_location:
             body += self.get_location_tokens(
+                doc=doc,
                 new_line=new_line,
-                page_w=page_w,
-                page_h=page_h,
                 xsize=xsize,
                 ysize=ysize,
                 add_page_index=add_page_index,
@@ -404,8 +402,6 @@ def export_to_document_tokens(
         self,
         doc: "DoclingDocument",
         new_line: str = "\n",
-        page_w: float = 0.0,
-        page_h: float = 0.0,
         xsize: int = 100,
         ysize: int = 100,
         add_location: bool = True,
@@ -421,9 +417,8 @@ def export_to_document_tokens(
 
         if add_location:
             body += self.get_location_tokens(
+                doc=doc,
                 new_line=new_line,
-                page_w=page_w,
-                page_h=page_h,
                 xsize=xsize,
                 ysize=ysize,
                 add_page_index=add_page_index,
@@ -454,9 +449,9 @@ def export_to_document_tokens(
                         col.bbox is not None
                         and add_cell_location
                         and add_page_index
-                        and self.prov is not None
                         and len(self.prov) > 0
                     ):
+                        page_w, page_h = doc.pages[self.prov[0].page_no].size.as_tuple()
                         cell_loc = DocumentToken.get_location(
                             bbox=col.bbox.to_bottom_left_origin(page_h).as_tuple(),
                             page_w=page_w,
@@ -469,7 +464,10 @@ def export_to_document_tokens(
                         col.bbox is not None
                         and add_cell_location
                         and not add_page_index
+                        and len(self.prov) > 0
                     ):
+                        page_w, page_h = doc.pages[self.prov[0].page_no].size.as_tuple()
+
                         cell_loc = DocumentToken.get_location(
                             bbox=col.bbox.to_bottom_left_origin(page_h).as_tuple(),
                             page_w=page_w,
@@ -506,12 +504,11 @@ class DocumentTrees(BaseModel):
     body: GroupItem = GroupItem(name="_root_", dloc="#/body")  # List[RefItem] = []
 
 
-class PageItem(DocumentTrees):
+class PageItem(BaseModel):
     # A page carries separate root items for furniture and body, only referencing items on the page
     hash: str  # page hash
     size: Size
-    image: Optional[ImageRef]
-    num_elements: int
+    image: Optional[ImageRef] = None
     page_no: int
 
 
@@ -595,7 +592,7 @@ def add_table(
         dloc = f"{self.file_info.document_hash}{cref}"
 
         tbl_item = TableItem(
-            label="table", data=data, dloc=dloc, parent=parent.get_ref()
+            label=PageLabel.TABLE, data=data, dloc=dloc, parent=parent.get_ref()
         )
         if prov:
             tbl_item.prov.append(prov)
@@ -622,7 +619,7 @@ def add_figure(
         dloc = f"{self.file_info.document_hash}{cref}"
 
         fig_item = FigureItem(
-            label="figure", data=data, dloc=dloc, parent=parent.get_ref()
+            label=PageLabel.PICTURE, data=data, dloc=dloc, parent=parent.get_ref()
         )
         if prov:
             fig_item.prov.append(prov)
@@ -636,7 +633,7 @@ def add_figure(
 
     def add_heading(
         self,
-        label: str,
+        label: PageLabel,
         text: str,
         orig: Optional[str] = None,
         level: LevelNumber = 1,
@@ -661,14 +658,23 @@ def iterate_elements(
         root: Optional[NodeItem] = None,
         with_groups: bool = False,
         traverse_figures: bool = True,
-        level=0,
+        page_no: Optional[int] = None,
+        _level=0,  # fixed parameter, carries through the node nesting level
     ) -> typing.Iterable[Tuple[NodeItem, int]]:  # tuple of node and level
         # Yield the current node
         if not root:
             root = self.body
 
         if not isinstance(root, GroupItem) or with_groups:
-            yield root, level
+            if isinstance(root, DocItem):
+                if page_no is not None:
+                    for prov in root.prov:
+                        if prov.page_no == page_no:
+                            yield root, _level
+                else:
+                    yield root, _level
+            else:
+                yield root, _level
 
         # Traverse children
         for child_ref in root.children:
@@ -677,9 +683,7 @@ def iterate_elements(
             if isinstance(child, NodeItem):
                 # If the child is a NodeItem, recursively traverse it
                 if not isinstance(child, FigureItem) or traverse_figures:
-                    yield from self.iterate_elements(child, level=level + 1)
-            else:  # leaf
-                yield child, level
+                    yield from self.iterate_elements(child, _level=_level + 1)
 
     def print_element_tree(self):
         for ix, (item, level) in enumerate(self.iterate_elements(with_groups=True)):
@@ -811,11 +815,12 @@ def export_to_document_tokens(
         labels: list[str] = [
             "title",
             "subtitle-level-1",
-            "paragraph",
+            "Section-header" "paragraph",
             "caption",
             "table",
             "figure",
             "text",
+            "Text",
         ],
         xsize: int = 100,
         ysize: int = 100,
@@ -857,24 +862,21 @@ def export_to_document_tokens(
                 prov = item.prov
 
                 page_i = -1
-                page_w = 0.0
-                page_h = 0.0
 
                 if add_location and len(self.pages) and len(prov) > 0:
 
-                    page_i = prov[0].page
-                    page_dim = self.pages[page_i - 1].size
+                    page_i = prov[0].page_no
+                    page_dim = self.pages[page_i].size
 
-                    page_w = float(page_dim.width)
-                    page_h = float(page_dim.height)
+                    float(page_dim.width)
+                    float(page_dim.height)
 
                 item_type = item.label
                 if isinstance(item, TextItem) and (item_type in labels):
 
                     doctags += item.export_to_document_tokens(
+                        doc=self,
                         new_line=new_line,
-                        page_w=page_w,
-                        page_h=page_h,
                         xsize=xsize,
                         ysize=ysize,
                         add_location=add_location,
@@ -887,8 +889,6 @@ def export_to_document_tokens(
                     doctags += item.export_to_document_tokens(
                         doc=self,
                         new_line=new_line,
-                        page_w=page_w,
-                        page_h=page_h,
                         xsize=xsize,
                         ysize=ysize,
                         add_caption=True,
@@ -905,8 +905,6 @@ def export_to_document_tokens(
                     doctags += item.export_to_document_tokens(
                         doc=self,
                         new_line=new_line,
-                        page_w=page_w,
-                        page_h=page_h,
                         xsize=xsize,
                         ysize=ysize,
                         add_caption=True,
@@ -918,3 +916,9 @@ def export_to_document_tokens(
         doctags += DocumentToken.END_DOCUMENT.value
 
         return doctags
+
+    def add_page(self, page_no: int, size: Size, hash: str) -> PageItem:
+        pitem = PageItem(page_no=page_no, size=size, hash=hash)
+
+        self.pages[page_no] = pitem
+        return pitem
diff --git a/docling_core/types/experimental/labels.py b/docling_core/types/experimental/labels.py
index e69de29..7da7fd0 100644
--- a/docling_core/types/experimental/labels.py
+++ b/docling_core/types/experimental/labels.py
@@ -0,0 +1,36 @@
+from enum import Enum
+
+
+class PageLabel(str, Enum):
+    # DocLayNet v2
+    CAPTION = "caption"
+    FOOTNOTE = "footnote"
+    FORMULA = "formula"
+    LIST_ITEM = "list_item"
+    PAGE_FOOTER = "page_footer"
+    PAGE_HEADER = "page_header"
+    PICTURE = "picture"
+    SECTION_HEADER = "section_header"
+    TABLE = "table"
+    TEXT = "text"
+    TITLE = "title"
+    DOCUMENT_INDEX = "document_index"
+    CODE = "code"
+    CHECKBOX_SELECTED = "checkbox_selected"
+    CHECKBOX_UNSELECTED = "checkbox_unselected"
+    FORM = "form"
+    KEY_VALUE_REGION = "key_value_region"
+
+    # Additional labels for markup-based formats (e.g. HTML, Word)
+    LIST = "list"  # group label for list container (not the list-items)
+    PARAGRAPH = "paragraph"  # explicitly a paragraph and not arbitrary text
+    REFERENCE = "reference"
+
+    # To be completed...
+
+
+class TableLabel(str, Enum):
+    COLUMN_HEADER = "col_header"
+    ROW_HEADER = "row_header"
+    SECTION = "row_section"
+    BODY = "body"
diff --git a/test/data/experimental/dummy_doc.yaml b/test/data/experimental/dummy_doc.yaml
index d2a6470..d9a6230 100644
--- a/test/data/experimental/dummy_doc.yaml
+++ b/test/data/experimental/dummy_doc.yaml
@@ -129,7 +129,7 @@ tables: # All tables...
 figures: # All figures...
   - dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/figures/0"
     hash: 7782482
-    label: "figure"
+    label: "picture"
     parent:
       $ref: "#/body"
     captions:
diff --git a/test/test_docling_doc.py b/test/test_docling_doc.py
index ee1a9f9..15c3add 100644
--- a/test/test_docling_doc.py
+++ b/test/test_docling_doc.py
@@ -7,9 +7,10 @@
     FileInfo,
     TableCell,
 )
+from docling_core.types.experimental.labels import PageLabel
 
 
-def test_load_serialize_doc():
+def test_reference_doc():
     # Read YAML file
     with open("test/data/experimental/dummy_doc.yaml", "r") as fp:
         dict_from_yaml = yaml.safe_load(fp)
@@ -48,19 +49,58 @@ def test_load_serialize_doc():
         print(f"Item: {item} at level {level}")
 
 
+def test_parse_doc():
+    with open(
+        "test/data/experimental/2206.01062.experimental.yaml",
+        "r",
+    ) as fp:
+        dict_from_yaml = yaml.safe_load(fp)
+
+    doc = DoclingDocument.model_validate(dict_from_yaml)
+
+    _test_export_methods(doc)
+    _test_serialize_and_reload(doc)
+
+
 def test_construct_doc():
 
-    doc = DoclingDocument(description={}, file_info=FileInfo(document_hash="xyz"))
+    doc = _construct_doc()
+    _test_export_methods(doc)
+    _test_serialize_and_reload(doc)
+
+
+def _test_serialize_and_reload(doc):
+    ### Serialize and deserialize stuff
+    yaml_dump = yaml.safe_dump(doc.model_dump(mode="json", by_alias=True))
+    # print(f"\n\n{yaml_dump}")
+    DoclingDocument.model_validate(yaml.safe_load(yaml_dump))
+
+
+def _test_export_methods(doc):
+    ### Iterate all elements
+    doc.print_element_tree()
+    ## Export stuff
+    print(doc.export_to_markdown())
+    print(doc.export_to_document_tokens())
+    for table in doc.tables:
+        table.export_to_html()
+        table.export_to_dataframe()
+        table.export_to_document_tokens(doc)
+    for fig in doc.figures:
+        fig.export_to_document_tokens(doc)
+    doc.print_element_tree()
 
+
+def _construct_doc() -> DoclingDocument:
+    doc = DoclingDocument(description={}, file_info=FileInfo(document_hash="xyz"))
     # group, heading, paragraph, table, figure, title, list, provenance
     doc.add_paragraph(label="text", text="Author 1\nAffiliation 1")
     doc.add_paragraph(label="text", text="Author 2\nAffiliation 2")
-
     chapter1 = doc.add_group(
         name="Introduction"
     )  # can be done if such information is present, or ommitted.
     doc.add_heading(
-        parent=chapter1, label="section_header", text="1. Introduction", level=1
+        parent=chapter1, label=PageLabel.SECTION_HEADER, text="1. Introduction", level=1
     )
     doc.add_paragraph(
         parent=chapter1,
@@ -74,11 +114,11 @@ def test_construct_doc():
         text="Cooks your favourite meal before you know you want it.",
     )
     doc.add_paragraph(
-        parent=mylist, label="list_item", text="Cleans up all your dishes."
+        parent=mylist, label=PageLabel.LIST_ITEM, text="Cleans up all your dishes."
     )
     doc.add_paragraph(
         parent=mylist,
-        label="list_item",
+        label=PageLabel.LIST_ITEM,
         text="Drains your bank account without consent.",
     )
     # Make some table cells
@@ -150,36 +190,8 @@ def test_construct_doc():
     )
     table_el = BaseTableData(num_rows=3, num_cols=3, table_cells=table_cells)
     doc.add_table(data=table_el)
-
     fig_caption = doc.add_paragraph(
         label="caption", text="This is the caption of figure 1."
     )
     doc.add_figure(data=BaseFigureData(), caption=fig_caption.get_ref())
-
-    ### Iterate all elements
-
-    for item, level in doc.iterate_elements():
-        print(f"Item: {item}")
-
-    ## Export stuff
-
-    print(doc.export_to_markdown())
-    print(doc.export_to_document_tokens())
-
-    for table in doc.tables:
-        table.export_to_html()
-        table.export_to_dataframe()
-        table.export_to_document_tokens(doc)
-
-    for fig in doc.figures:
-        fig.export_to_document_tokens(doc)
-
-    doc.print_element_tree()
-
-    ### Serialize and deserialize stuff
-
-    yaml_dump = yaml.safe_dump(doc.model_dump(mode="json", by_alias=True))
-
-    # print(f"\n\n{yaml_dump}")
-
-    DoclingDocument.model_validate(yaml.safe_load(yaml_dump))
+    return doc

From b50d53c05bf755ddb73c7d33ececdb542877662a Mon Sep 17 00:00:00 2001
From: Christoph Auer <cau@zurich.ibm.com>
Date: Wed, 25 Sep 2024 12:01:06 +0200
Subject: [PATCH 14/34] Update formatting

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
---
 docling_core/types/experimental/document.py | 25 ++++++++++++++-------
 docling_core/types/experimental/labels.py   |  8 ++++---
 test/test_docling_doc.py                    | 14 +++++++-----
 3 files changed, 30 insertions(+), 17 deletions(-)

diff --git a/docling_core/types/experimental/document.py b/docling_core/types/experimental/document.py
index 1b3953b..7e76cbd 100644
--- a/docling_core/types/experimental/document.py
+++ b/docling_core/types/experimental/document.py
@@ -15,7 +15,7 @@
 
 from docling_core.types.doc.tokens import DocumentToken
 from docling_core.types.experimental.base import BoundingBox, Size
-from docling_core.types.experimental.labels import PageLabel, GroupLabel
+from docling_core.types.experimental.labels import DocItemLabel, GroupLabel
 
 Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
 LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
@@ -175,7 +175,7 @@ class GroupItem(NodeItem):  # Container type, can't be a leaf node
 class DocItem(
     NodeItem
 ):  # Base type for any element that carries content, can be a leaf node
-    label: PageLabel
+    label: DocItemLabel
     prov: List[ProvenanceItem] = []
 
     def get_location_tokens(
@@ -531,7 +531,12 @@ class DoclingDocument(DocumentTrees):
     #    self.furniture.children.append(group)
     #    return group
 
-    def add_group(self, label: Optional[GroupLabel] = None, name: Optional[str] = None, parent: Optional[GroupItem] = None) -> GroupItem:
+    def add_group(
+        self,
+        label: Optional[GroupLabel] = None,
+        name: Optional[str] = None,
+        parent: Optional[GroupItem] = None,
+    ) -> GroupItem:
         if not parent:
             parent = self.body
 
@@ -598,7 +603,7 @@ def add_table(
         dloc = f"{self.file_info.document_hash}{cref}"
 
         tbl_item = TableItem(
-            label=PageLabel.TABLE, data=data, dloc=dloc, parent=parent.get_ref()
+            label=DocItemLabel.TABLE, data=data, dloc=dloc, parent=parent.get_ref()
         )
         if prov:
             tbl_item.prov.append(prov)
@@ -625,7 +630,7 @@ def add_figure(
         dloc = f"{self.file_info.document_hash}{cref}"
 
         fig_item = FigureItem(
-            label=PageLabel.PICTURE, data=data, dloc=dloc, parent=parent.get_ref()
+            label=DocItemLabel.PICTURE, data=data, dloc=dloc, parent=parent.get_ref()
         )
         if prov:
             fig_item.prov.append(prov)
@@ -639,7 +644,7 @@ def add_figure(
 
     def add_heading(
         self,
-        label: PageLabel,
+        label: DocItemLabel,
         text: str,
         orig: Optional[str] = None,
         level: LevelNumber = 1,
@@ -689,7 +694,9 @@ def iterate_elements(
             if isinstance(child, NodeItem):
                 # If the child is a NodeItem, recursively traverse it
                 if not isinstance(child, FigureItem) or traverse_figures:
-                    yield from self.iterate_elements(child, _level=_level + 1, with_groups=with_groups)
+                    yield from self.iterate_elements(
+                        child, _level=_level + 1, with_groups=with_groups
+                    )
 
     def print_element_tree(self):
         for ix, (item, level) in enumerate(self.iterate_elements(with_groups=True)):
@@ -855,7 +862,6 @@ def export_to_document_tokens(
 
         skip_count = 0
         for ix, (item, level) in enumerate(self.iterate_elements(self.body)):
-
             if skip_count < from_element:
                 skip_count += 1
                 continue  # skip as many items as you want
@@ -863,6 +869,9 @@ def export_to_document_tokens(
             if to_element and ix >= to_element:
                 break
 
+            if not isinstance(item, DocItem):
+                continue
+
             prov = item.prov
 
             page_i = -1
diff --git a/docling_core/types/experimental/labels.py b/docling_core/types/experimental/labels.py
index 4e472a7..ca81c21 100644
--- a/docling_core/types/experimental/labels.py
+++ b/docling_core/types/experimental/labels.py
@@ -1,7 +1,7 @@
 from enum import Enum
 
 
-class PageLabel(str, Enum): # Don't call it that. ItemLabel, DocItemLabel...
+class DocItemLabel(str, Enum):
     # DocLayNet v2
     CAPTION = "caption"
     FOOTNOTE = "footnote"
@@ -27,15 +27,17 @@ class PageLabel(str, Enum): # Don't call it that. ItemLabel, DocItemLabel...
 
     # To be completed...
 
+
 class GroupLabel(str, Enum):
     UNSPECIFIED = "unspecified"
     LIST = "list"  # group label for list container (not the list-items)
     CHAPTER = "chapter"
     SECTION = "section"
 
-    #...
+    # ...
+
 
-class TableLabel(str, Enum):
+class TableCellLabel(str, Enum):
     COLUMN_HEADER = "col_header"
     ROW_HEADER = "row_header"
     SECTION = "row_section"
diff --git a/test/test_docling_doc.py b/test/test_docling_doc.py
index 5e96965..51501f2 100644
--- a/test/test_docling_doc.py
+++ b/test/test_docling_doc.py
@@ -7,7 +7,7 @@
     FileInfo,
     TableCell,
 )
-from docling_core.types.experimental.labels import PageLabel, GroupLabel
+from docling_core.types.experimental.labels import DocItemLabel, GroupLabel
 
 
 def test_reference_doc():
@@ -96,11 +96,13 @@ def _construct_doc() -> DoclingDocument:
     doc.add_paragraph(label="text", text="Author 1\nAffiliation 1")
     doc.add_paragraph(label="text", text="Author 2\nAffiliation 2")
     chapter1 = doc.add_group(
-        label=GroupLabel.CHAPTER,
-        name="Introduction"
+        label=GroupLabel.CHAPTER, name="Introduction"
     )  # can be done if such information is present, or ommitted.
     doc.add_heading(
-        parent=chapter1, label=PageLabel.SECTION_HEADER, text="1. Introduction", level=1
+        parent=chapter1,
+        label=DocItemLabel.SECTION_HEADER,
+        text="1. Introduction",
+        level=1,
     )
     doc.add_paragraph(
         parent=chapter1,
@@ -114,11 +116,11 @@ def _construct_doc() -> DoclingDocument:
         text="Cooks your favourite meal before you know you want it.",
     )
     doc.add_paragraph(
-        parent=mylist, label=PageLabel.LIST_ITEM, text="Cleans up all your dishes."
+        parent=mylist, label=DocItemLabel.LIST_ITEM, text="Cleans up all your dishes."
     )
     doc.add_paragraph(
         parent=mylist,
-        label=PageLabel.LIST_ITEM,
+        label=DocItemLabel.LIST_ITEM,
         text="Drains your bank account without consent.",
     )
     # Make some table cells

From 4c12a696ac3572ab0ae55286b8428913158c7c95 Mon Sep 17 00:00:00 2001
From: Christoph Auer <cau@zurich.ibm.com>
Date: Wed, 25 Sep 2024 13:40:12 +0200
Subject: [PATCH 15/34] Docstrings and linter fixes

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
---
 docling_core/types/experimental/base.py       |  46 +++-
 docling_core/types/experimental/document.py   | 231 ++++++++++++++++--
 docling_core/types/experimental/labels.py     |   8 +
 .../experimental/2206.01062.experimental.yaml |   2 +
 test/data/experimental/dummy_doc.yaml         |   4 +-
 5 files changed, 263 insertions(+), 28 deletions(-)

diff --git a/docling_core/types/experimental/base.py b/docling_core/types/experimental/base.py
index 13fbefd..afe1dd7 100644
--- a/docling_core/types/experimental/base.py
+++ b/docling_core/types/experimental/base.py
@@ -1,3 +1,5 @@
+"""Models for the base data types."""
+
 import copy
 from enum import Enum
 from typing import Tuple
@@ -5,21 +7,27 @@
 from pydantic import BaseModel
 
 
-## All copied from docling
 class CoordOrigin(str, Enum):
+    """CoordOrigin."""
+
     TOPLEFT = "TOPLEFT"
     BOTTOMLEFT = "BOTTOMLEFT"
 
 
 class Size(BaseModel):
+    """Size."""
+
     width: float = 0.0
     height: float = 0.0
 
     def as_tuple(self):
+        """as_tuple."""
         return (self.width, self.height)
 
 
 class BoundingBox(BaseModel):
+    """BoundingBox."""
+
     l: float  # left
     t: float  # top
     r: float  # right
@@ -29,13 +37,20 @@ class BoundingBox(BaseModel):
 
     @property
     def width(self):
+        """width."""
         return self.r - self.l
 
     @property
     def height(self):
+        """height."""
         return abs(self.t - self.b)
 
     def scaled(self, scale: float) -> "BoundingBox":
+        """scaled.
+
+        :param scale: float:
+
+        """
         out_bbox = copy.deepcopy(self)
         out_bbox.l *= scale
         out_bbox.r *= scale
@@ -45,6 +60,11 @@ def scaled(self, scale: float) -> "BoundingBox":
         return out_bbox
 
     def normalized(self, page_size: Size) -> "BoundingBox":
+        """normalized.
+
+        :param page_size: Size:
+
+        """
         out_bbox = copy.deepcopy(self)
         out_bbox.l /= page_size.width
         out_bbox.r /= page_size.width
@@ -54,6 +74,7 @@ def normalized(self, page_size: Size) -> "BoundingBox":
         return out_bbox
 
     def as_tuple(self):
+        """as_tuple."""
         if self.coord_origin == CoordOrigin.TOPLEFT:
             return (self.l, self.t, self.r, self.b)
         elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
@@ -61,6 +82,13 @@ def as_tuple(self):
 
     @classmethod
     def from_tuple(cls, coord: Tuple[float, ...], origin: CoordOrigin):
+        """from_tuple.
+
+        :param coord: Tuple[float, ...]: with TOPLEFT origin the first
+            four values are read as (l, t, r, b)
+        :param origin: CoordOrigin:
+
+        """
         if origin == CoordOrigin.TOPLEFT:
             l, t, r, b = coord[0], coord[1], coord[2], coord[3]
             if r < l:
@@ -79,9 +107,15 @@ def from_tuple(cls, coord: Tuple[float, ...], origin: CoordOrigin):
             return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
 
     def area(self) -> float:
+        """area."""
         return (self.r - self.l) * (self.b - self.t)
 
     def intersection_area_with(self, other: "BoundingBox") -> float:
+        """intersection_area_with.
+
+        :param other: "BoundingBox":
+
+        """
         # Calculate intersection coordinates
         left = max(self.l, other.l)
         top = max(self.t, other.t)
@@ -99,6 +133,11 @@ def intersection_area_with(self, other: "BoundingBox") -> float:
         return width * height
 
     def to_bottom_left_origin(self, page_height) -> "BoundingBox":
+        """to_bottom_left_origin.
+
+        :param page_height:
+
+        """
         if self.coord_origin == CoordOrigin.BOTTOMLEFT:
             return self
         elif self.coord_origin == CoordOrigin.TOPLEFT:
@@ -111,6 +150,11 @@ def to_bottom_left_origin(self, page_height) -> "BoundingBox":
             )
 
     def to_top_left_origin(self, page_height):
+        """to_top_left_origin.
+
+        :param page_height:
+
+        """
         if self.coord_origin == CoordOrigin.TOPLEFT:
             return self
         elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
diff --git a/docling_core/types/experimental/document.py b/docling_core/types/experimental/document.py
index 7e76cbd..a944deb 100644
--- a/docling_core/types/experimental/document.py
+++ b/docling_core/types/experimental/document.py
@@ -1,3 +1,5 @@
+"""Models for the Docling Document data type."""
+
 import hashlib
 import typing
 from typing import Any, Dict, List, Optional, Tuple, Union
@@ -22,10 +24,12 @@
 
 
 class BaseFigureData(BaseModel):  # TBD
-    pass
+    """BaseFigureData."""
 
 
 class TableCell(BaseModel):
+    """TableCell."""
+
     bbox: Optional[BoundingBox] = None
     row_span: int = 1
     col_span: int = 1
@@ -41,8 +45,9 @@ class TableCell(BaseModel):
     @model_validator(mode="before")
     @classmethod
     def from_dict_format(cls, data: Any) -> Any:
+        """from_dict_format."""
         if isinstance(data, Dict):
-            if not "bbox" in data or data["bbox"] == None:
+            if "bbox" not in data or data["bbox"] is None:
                 return data
             text = data["bbox"].get("token", "")
             if not len(text):
@@ -58,6 +63,8 @@ def from_dict_format(cls, data: Any) -> Any:
 
 
 class BaseTableData(BaseModel):  # TBD
+    """BaseTableData."""
+
     table_cells: List[TableCell] = []
     num_rows: int = 0
     num_cols: int = 0
@@ -69,7 +76,7 @@ def grid(
     ) -> List[
         List[TableCell]
     ]:  # TODO compute grid representation on the fly from table_cells
-
+        """grid."""
         # Initialise empty table data grid (only empty cells)
         table_data = [
             [
@@ -101,14 +108,19 @@ def grid(
 
 
 class FileInfo(BaseModel):
+    """FileInfo."""
+
     document_hash: str
 
 
 class RefItem(BaseModel):
+    """RefItem."""
+
     cref: str = Field(alias="$ref")
 
     # This method makes RefItem compatible with DocItem
     def get_ref(self):
+        """get_ref."""
         return self
 
     model_config = ConfigDict(
@@ -116,6 +128,7 @@ def get_ref(self):
     )
 
     def resolve(self, doc: "DoclingDocument"):
+        """resolve."""
         path_components = self.cref.split("/")
         if len(path_components) > 2:
             _, path, index_str = path_components
@@ -133,6 +146,8 @@ def resolve(self, doc: "DoclingDocument"):
 
 
 class ImageRef(BaseModel):
+    """ImageRef."""
+
     format: str  # png, etc.
     dpi: int  # ...
     size: Size
@@ -140,22 +155,28 @@ class ImageRef(BaseModel):
 
 
 class ProvenanceItem(BaseModel):
+    """ProvenanceItem."""
+
     page_no: int
     bbox: BoundingBox
     charspan: Tuple[int, int]
 
 
 class NodeItem(BaseModel):
+    """NodeItem."""
+
     dloc: str  # format spec ({document_hash}{json-path})
     parent: Optional[RefItem] = None
     children: List[RefItem] = []
 
     def get_ref(self):
+        """get_ref."""
         return RefItem(cref=f"#{self.dloc.split('#')[1]}")
 
     @computed_field  # type: ignore
     @property
     def hash(self) -> Uint64:  # TODO align with hasher on deepsearch-glm
+        """hash."""
         if not len(self.dloc):
             return 0
         hash_object = hashlib.sha256(self.dloc.encode("utf-8"))
@@ -168,6 +189,8 @@ def hash(self) -> Uint64:  # TODO align with hasher on deepsearch-glm
 
 
 class GroupItem(NodeItem):  # Container type, can't be a leaf node
+    """GroupItem."""
+
     name: Optional[str] = None
     label: GroupLabel = GroupLabel.UNSPECIFIED
 
@@ -175,6 +198,8 @@ class GroupItem(NodeItem):  # Container type, can't be a leaf node
 class DocItem(
     NodeItem
 ):  # Base type for any element that carries content, can be a leaf node
+    """DocItem."""
+
     label: DocItemLabel
     prov: List[ProvenanceItem] = []
 
@@ -212,6 +237,8 @@ def get_location_tokens(
 
 
 class TextItem(DocItem):
+    """TextItem."""
+
     orig: str  # untreated representation
     text: str  # sanitized representation
 
@@ -225,7 +252,17 @@ def export_to_document_tokens(
         add_content: bool = True,
         add_page_index: bool = True,
     ):
-        """Export text element to document tokens format."""
+        r"""Export text element to document tokens format.
+
+        :param doc: "DoclingDocument":
+        :param new_line: str:  (Default value = "\n")
+        :param xsize: int:  (Default value = 100)
+        :param ysize: int:  (Default value = 100)
+        :param add_location: bool:  (Default value = True)
+        :param add_content: bool:  (Default value = True)
+        :param add_page_index: bool:  (Default value = True)
+
+        """
         body = f"<{self.label.value}>"
         # body = f"<{self.name}>"
 
@@ -251,10 +288,14 @@ def export_to_document_tokens(
 
 
 class Section(TextItem):
+    """Section."""
+
     level: LevelNumber = 1
 
 
 class FloatingItem(DocItem):
+    """FloatingItem."""
+
     captions: List[RefItem] = []
     references: List[RefItem] = []
     footnotes: List[RefItem] = []
@@ -262,6 +303,8 @@ class FloatingItem(DocItem):
 
 
 class FigureItem(FloatingItem):
+    """FigureItem."""
+
     data: BaseFigureData
 
     def export_to_document_tokens(
@@ -277,7 +320,20 @@ def export_to_document_tokens(
         add_content: bool = True,  # not used at the moment
         add_page_index: bool = True,
     ):
-        """Export figure to document tokens format."""
+        r"""Export figure to document tokens format.
+
+        :param doc: "DoclingDocument":
+        :param new_line: str:  (Default value = "\n")
+        :param page_w: float:  (Default value = 0.0)
+        :param page_h: float:  (Default value = 0.0)
+        :param xsize: int:  (Default value = 100)
+        :param ysize: int:  (Default value = 100)
+        :param add_location: bool:  (Default value = True)
+        :param add_caption: bool:  (Default value = True)
+        :param add_content: bool:  (Default value = True), not used at the moment
+        :param add_page_index: bool:  (Default value = True)
+
+        """
         body = f"{DocumentToken.BEG_FIGURE.value}{new_line}"
 
         if add_location:
@@ -306,6 +362,8 @@ def export_to_document_tokens(
 
 
 class TableItem(FloatingItem):
+    """TableItem."""
+
     data: BaseTableData
 
     def export_to_dataframe(self) -> pd.DataFrame:
@@ -366,15 +424,13 @@ def export_to_html(self) -> str:
             for j in range(ncols):
                 cell: TableCell = self.data.grid[i][j]
 
-                rowspan, rowstart, rowend = (
+                rowspan, rowstart = (
                     cell.row_span,
                     cell.start_row_offset_idx,
-                    cell.end_row_offset_idx,
                 )
-                colspan, colstart, colend = (
+                colspan, colstart = (
                     cell.col_span,
                     cell.start_col_offset_idx,
-                    cell.end_col_offset_idx,
                 )
 
                 if rowstart != i:
@@ -413,7 +469,21 @@ def export_to_document_tokens(
         add_cell_text: bool = True,
         add_page_index: bool = True,
     ):
-        """Export table to document tokens format."""
+        r"""Export table to document tokens format.
+
+        :param doc: "DoclingDocument":
+        :param new_line: str:  (Default value = "\n")
+        :param xsize: int:  (Default value = 100)
+        :param ysize: int:  (Default value = 100)
+        :param add_location: bool:  (Default value = True)
+        :param add_caption: bool:  (Default value = True)
+        :param add_content: bool:  (Default value = True)
+        :param add_cell_location: bool:  (Default value = True)
+        :param add_cell_label: bool:  (Default value = True)
+        :param add_cell_text: bool:  (Default value = True)
+        :param add_page_index: bool:  (Default value = True)
+
+        """
         body = f"{DocumentToken.BEG_TABLE.value}{new_line}"
 
         if add_location:
@@ -480,7 +550,14 @@ def export_to_document_tokens(
 
                     cell_label = ""
                     if add_cell_label:
-                        cell_label = f"<{'col_header' if col.column_header else 'row_header' if col.row_header else 'row_section' if col.row_section else 'body'}>"
+                        if col.column_header:
+                            cell_label = "<col_header>"
+                        elif col.row_header:
+                            cell_label = "<row_header>"
+                        elif col.row_section:
+                            cell_label = "<row_section>"
+                        else:
+                            cell_label = "<body>"
 
                     body += f"<col_{j}>{cell_loc}{cell_label}{text}</col_{j}>"
 
@@ -492,13 +569,15 @@ def export_to_document_tokens(
 
 
 class KeyValueItem(DocItem):
-    pass
+    """KeyValueItem."""
 
 
 ContentItem = Union[TextItem, FigureItem, TableItem, KeyValueItem]
 
 
 class DocumentTrees(BaseModel):
+    """DocumentTrees."""
+
     furniture: GroupItem = GroupItem(
         name="_root_", dloc="#/furniture"
     )  # List[RefItem] = []
@@ -506,7 +585,10 @@ class DocumentTrees(BaseModel):
 
 
 class PageItem(BaseModel):
-    # A page carries separate root items for furniture and body, only referencing items on the page
+    """PageItem."""
+
+    # A page carries separate root items for furniture and body,
+    # only referencing items on the page
     hash: str  # page hash
     size: Size
     image: Optional[ImageRef] = None
@@ -514,6 +596,8 @@ class PageItem(BaseModel):
 
 
 class DoclingDocument(DocumentTrees):
+    """DoclingDocument."""
+
     version: str = "0.0.1"  # = SemanticVersion(version="0.0.1")
     description: Any
     file_info: FileInfo
@@ -537,6 +621,13 @@ def add_group(
         name: Optional[str] = None,
         parent: Optional[GroupItem] = None,
     ) -> GroupItem:
+        """add_group.
+
+        :param label: Optional[GroupLabel]:  (Default value = None)
+        :param name: Optional[str]:  (Default value = None)
+        :param parent: Optional[GroupItem]:  (Default value = None)
+
+        """
         if not parent:
             parent = self.body
 
@@ -564,6 +655,16 @@ def add_paragraph(
         parent: Optional[GroupItem] = None,
         item_cls=TextItem,
     ):
+        """add_paragraph.
+
+        :param label: str:
+        :param text: str:
+        :param orig: Optional[str]:  (Default value = None)
+        :param prov: Optional[ProvenanceItem]:  (Default value = None)
+        :param parent: Optional[GroupItem]:  (Default value = None)
+        :param item_cls:  (Default value = TextItem)
+
+        """
         if not parent:
             parent = self.body
 
@@ -595,6 +696,15 @@ def add_table(
         prov: Optional[ProvenanceItem] = None,
         parent: Optional[GroupItem] = None,
     ):
+        """add_table.
+
+        :param data: BaseTableData:
+        :param caption: Optional[Union[TextItem, RefItem]]:
+            (Default value = None); signature comment: "This is not cool yet."
+        :param prov: Optional[ProvenanceItem]:  (Default value = None)
+        :param parent: Optional[GroupItem]:  (Default value = None)
+
+        """
         if not parent:
             parent = self.body
 
@@ -622,6 +732,15 @@ def add_figure(
         prov: Optional[ProvenanceItem] = None,
         parent: Optional[GroupItem] = None,
     ):
+        """add_figure.
+
+        :param data: BaseFigureData:
+        :param caption: Optional[Union[TextItem, RefItem]]:
+            (Default value = None)
+        :param prov: Optional[ProvenanceItem]:  (Default value = None)
+        :param parent: Optional[GroupItem]:  (Default value = None)
+
+        """
         if not parent:
             parent = self.body
 
@@ -651,6 +770,16 @@ def add_heading(
         prov: Optional[ProvenanceItem] = None,
         parent: Optional[GroupItem] = None,
     ):
+        """add_heading.
+
+        :param label: DocItemLabel:
+        :param text: str:
+        :param orig: Optional[str]:  (Default value = None)
+        :param level: LevelNumber:  (Default value = 1)
+        :param prov: Optional[ProvenanceItem]:  (Default value = None)
+        :param parent: Optional[GroupItem]:  (Default value = None)
+
+        """
         item: Section = self.add_paragraph(
             label, text, orig, prov, parent, item_cls=Section
         )
@@ -658,11 +787,13 @@ def add_heading(
         return item
 
     def num_pages(self):
+        """num_pages."""
         return len(self.pages.values())
 
     def build_page_trees(self):
-        # TODO: For every PageItem, update the furniture and body trees from the main doc.
-        pass
+        """build_page_trees."""
+        # TODO: For every PageItem, update the furniture and body trees
+        # from the main doc.
 
     def iterate_elements(
         self,
@@ -672,7 +803,16 @@ def iterate_elements(
         page_no: Optional[int] = None,
         _level=0,  # fixed parameter, carries through the node nesting level
     ) -> typing.Iterable[Tuple[NodeItem, int]]:  # tuple of node and level
-        # Yield the current node
+        """iterate_elements.
+
+        :param root: Optional[NodeItem]:  (Default value = None)
+        :param with_groups: bool:  (Default value = False)
+        :param traverse_figures: bool:  (Default value = True)
+        :param page_no: Optional[int]:  (Default value = None)
+        :param _level:  (Default value = 0)
+            fixed parameter, carries through the node
+            nesting level
+        """
         if not root:
             root = self.body
 
@@ -699,6 +839,7 @@ def iterate_elements(
                     )
 
     def print_element_tree(self):
+        """print_element_tree."""
         for ix, (item, level) in enumerate(self.iterate_elements(with_groups=True)):
             if isinstance(item, GroupItem):
                 print(" " * level, f"{ix}: {item.label.value} with name={item.name}")
@@ -726,16 +867,29 @@ def export_to_markdown(
         Operates on a slice of the document's main_text as defined through arguments
         main_text_start and main_text_stop; defaulting to the whole main_text.
 
-        Args:
-            delim (str, optional): Delimiter to use when concatenating the various
+        :param delim: Delimiter to use when concatenating the various
                 Markdown parts. Defaults to "\n\n".
-            from_element (int, optional): Body slicing start index (inclusive).
+        :type delim: str
+        :param from_element: Body slicing start index (inclusive).
                 Defaults to 0.
-            to_element (Optional[int], optional): Body slicing stop index
+        :type from_element: int
+        :param to_element: Body slicing stop index
                 (exclusive). Defaults to None.
-
-        Returns:
-            str: The exported Markdown representation.
+        :type to_element: Optional[int]
+        :param delim: str:  (Default value = "\n\n")
+        :param from_element: int:  (Default value = 0)
+        :param to_element: Optional[int]:  (Default value = None)
+        :param labels: list[str]:  (Default value is the list below)
+            "title",
+            "subtitle-level-1",
+            "paragraph",
+            "caption",
+            "table",
+            "Text",
+            "text"
+        :param strict_text: bool:  (Default value = False)
+        :returns: The exported Markdown representation.
+        :rtype: str
         """
         has_title = False
         prev_text = ""
@@ -849,8 +1003,28 @@ def export_to_document_tokens(
         Operates on a slice of the document's body as defined through arguments
         from_element and to_element; defaulting to the whole main_text.
 
-        Returns:
-            str: The content of the document formatted as a DocTags string.
+        :param delim: str:  (Default value = "\n\n")
+        :param from_element: int:  (Default value = 0)
+        :param to_element: Optional[int]:  (Default value = None)
+        :param labels: list[str]:  (Default value is the list below)
+            "title",
+            "subtitle-level-1",
+            "Section-header" "paragraph",  (sic -- TODO confirm missing comma)
+            "caption",
+            "table",
+            "figure",
+            "text",
+            "Text"
+        :param xsize: int:  (Default value = 100)
+        :param ysize: int:  (Default value = 100)
+        :param add_location: bool:  (Default value = True)
+        :param add_content: bool:  (Default value = True)
+        :param add_page_index: bool:  (Default value = True)
+        :param add_table_cell_location: bool:  (table specific flag)
+        :param add_table_cell_label: bool:  (Default value = True)
+        :param add_table_cell_text: bool:  (Default value = True)
+        :returns: The content of the document formatted as a DocTags string.
+        :rtype: str
         """
         new_line = ""
         if delim:
@@ -931,6 +1105,13 @@ def export_to_document_tokens(
         return doctags
 
     def add_page(self, page_no: int, size: Size, hash: str) -> PageItem:
+        """add_page.
+
+        :param page_no: int:
+        :param size: Size:
+        :param hash: str:
+
+        """
         pitem = PageItem(page_no=page_no, size=size, hash=hash)
 
         self.pages[page_no] = pitem
diff --git a/docling_core/types/experimental/labels.py b/docling_core/types/experimental/labels.py
index ca81c21..99abfca 100644
--- a/docling_core/types/experimental/labels.py
+++ b/docling_core/types/experimental/labels.py
@@ -1,7 +1,11 @@
+"""Models for the labels types."""
+
 from enum import Enum
 
 
 class DocItemLabel(str, Enum):
+    """DocItemLabel."""
+
     # DocLayNet v2
     CAPTION = "caption"
     FOOTNOTE = "footnote"
@@ -29,6 +33,8 @@ class DocItemLabel(str, Enum):
 
 
 class GroupLabel(str, Enum):
+    """GroupLabel."""
+
     UNSPECIFIED = "unspecified"
     LIST = "list"  # group label for list container (not the list-items)
     CHAPTER = "chapter"
@@ -38,6 +44,8 @@ class GroupLabel(str, Enum):
 
 
 class TableCellLabel(str, Enum):
+    """TableCellLabel."""
+
     COLUMN_HEADER = "col_header"
     ROW_HEADER = "row_header"
     SECTION = "row_section"
diff --git a/test/data/experimental/2206.01062.experimental.yaml b/test/data/experimental/2206.01062.experimental.yaml
index 76f8480..efd8bb1 100644
--- a/test/data/experimental/2206.01062.experimental.yaml
+++ b/test/data/experimental/2206.01062.experimental.yaml
@@ -140,6 +140,7 @@ body:
   - $ref: '#/texts/127'
   dloc: '#/body'
   hash: 1876595454579351028
+  label: unspecified
   name: _root_
   parent: null
 description: {}
@@ -215,6 +216,7 @@ furniture:
   children: []
   dloc: '#/furniture'
   hash: 5280524054814059340
+  label: unspecified
   name: _root_
   parent: null
 groups: []
diff --git a/test/data/experimental/dummy_doc.yaml b/test/data/experimental/dummy_doc.yaml
index d9a6230..8b0116d 100644
--- a/test/data/experimental/dummy_doc.yaml
+++ b/test/data/experimental/dummy_doc.yaml
@@ -92,7 +92,7 @@ texts:
           t: 354.3
           b: 334.4
           r: 376.0
-          coord_origin: "BOTTOMLEFT"
+          coord_origin: BOTTOMLEFT
         charspan: [ 1,423 ] # 2-tuple, references to "orig"
 
 
@@ -123,7 +123,7 @@ tables: # All tables...
           t: 354.3
           b: 334.4
           r: 376.0
-          coord_origin: "BOTTOMLEFT"
+          coord_origin: BOTTOMLEFT
         charspan: [ 1,423 ] # 2-tuple, references to "orig"
 
 figures: # All figures...

From 8251f9944cd169d56984885cba253f1e250cdc2f Mon Sep 17 00:00:00 2001
From: Christoph Auer <cau@zurich.ibm.com>
Date: Wed, 25 Sep 2024 13:52:02 +0200
Subject: [PATCH 16/34] Lockfile rollback, since updating breaks tests

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
---
 poetry.lock    | 1079 +++++++++++++++++++++---------------------------
 pyproject.toml |    2 -
 2 files changed, 477 insertions(+), 604 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index 0d72bb9..4ab6882 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -13,22 +13,22 @@ files = [
 
 [[package]]
 name = "attrs"
-version = "24.2.0"
+version = "23.2.0"
 description = "Classes Without Boilerplate"
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "attrs-24.2.0-py3-none-any.whl", hash = "sha256:81921eb96de3191c8258c199618104dd27ac608d9366f5e35d011eae1867ede2"},
-    {file = "attrs-24.2.0.tar.gz", hash = "sha256:5cfb1b9148b5b086569baec03f20d7b6bf3bcacc9a42bebf87ffaaca362f6346"},
+    {file = "attrs-23.2.0-py3-none-any.whl", hash = "sha256:99b87a485a5820b23b879f04c2305b44b951b502fd64be915879d77a7e8fc6f1"},
+    {file = "attrs-23.2.0.tar.gz", hash = "sha256:935dc3b529c262f6cf76e50877d35a4bd3c1de194fd41f47a2b7ae8f19971f30"},
 ]
 
 [package.extras]
-benchmark = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-codspeed", "pytest-mypy-plugins", "pytest-xdist[psutil]"]
-cov = ["cloudpickle", "coverage[toml] (>=5.3)", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"]
-dev = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pre-commit", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"]
-docs = ["cogapp", "furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier (<24.7)"]
-tests = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"]
-tests-mypy = ["mypy (>=1.11.1)", "pytest-mypy-plugins"]
+cov = ["attrs[tests]", "coverage[toml] (>=5.3)"]
+dev = ["attrs[tests]", "pre-commit"]
+docs = ["furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier", "zope-interface"]
+tests = ["attrs[tests-no-zope]", "zope-interface"]
+tests-mypy = ["mypy (>=1.6)", "pytest-mypy-plugins"]
+tests-no-zope = ["attrs[tests-mypy]", "cloudpickle", "hypothesis", "pympler", "pytest (>=4.3.0)", "pytest-xdist[psutil]"]
 
 [[package]]
 name = "autoflake"
@@ -45,50 +45,35 @@ files = [
 pyflakes = ">=3.0.0"
 tomli = {version = ">=2.0.1", markers = "python_version < \"3.11\""}
 
-[[package]]
-name = "backports-tarfile"
-version = "1.2.0"
-description = "Backport of CPython tarfile module"
-optional = false
-python-versions = ">=3.8"
-files = [
-    {file = "backports.tarfile-1.2.0-py3-none-any.whl", hash = "sha256:77e284d754527b01fb1e6fa8a1afe577858ebe4e9dad8919e34c862cb399bc34"},
-    {file = "backports_tarfile-1.2.0.tar.gz", hash = "sha256:d75e02c268746e1b8144c278978b6e98e85de6ad16f8e4b0844a154557eca991"},
-]
-
-[package.extras]
-docs = ["furo", "jaraco.packaging (>=9.3)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"]
-testing = ["jaraco.test", "pytest (!=8.0.*)", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)"]
-
 [[package]]
 name = "black"
-version = "24.8.0"
+version = "24.4.2"
 description = "The uncompromising code formatter."
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "black-24.8.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:09cdeb74d494ec023ded657f7092ba518e8cf78fa8386155e4a03fdcc44679e6"},
-    {file = "black-24.8.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:81c6742da39f33b08e791da38410f32e27d632260e599df7245cccee2064afeb"},
-    {file = "black-24.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:707a1ca89221bc8a1a64fb5e15ef39cd755633daa672a9db7498d1c19de66a42"},
-    {file = "black-24.8.0-cp310-cp310-win_amd64.whl", hash = "sha256:d6417535d99c37cee4091a2f24eb2b6d5ec42b144d50f1f2e436d9fe1916fe1a"},
-    {file = "black-24.8.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:fb6e2c0b86bbd43dee042e48059c9ad7830abd5c94b0bc518c0eeec57c3eddc1"},
-    {file = "black-24.8.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:837fd281f1908d0076844bc2b801ad2d369c78c45cf800cad7b61686051041af"},
-    {file = "black-24.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:62e8730977f0b77998029da7971fa896ceefa2c4c4933fcd593fa599ecbf97a4"},
-    {file = "black-24.8.0-cp311-cp311-win_amd64.whl", hash = "sha256:72901b4913cbac8972ad911dc4098d5753704d1f3c56e44ae8dce99eecb0e3af"},
-    {file = "black-24.8.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:7c046c1d1eeb7aea9335da62472481d3bbf3fd986e093cffd35f4385c94ae368"},
-    {file = "black-24.8.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:649f6d84ccbae73ab767e206772cc2d7a393a001070a4c814a546afd0d423aed"},
-    {file = "black-24.8.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2b59b250fdba5f9a9cd9d0ece6e6d993d91ce877d121d161e4698af3eb9c1018"},
-    {file = "black-24.8.0-cp312-cp312-win_amd64.whl", hash = "sha256:6e55d30d44bed36593c3163b9bc63bf58b3b30e4611e4d88a0c3c239930ed5b2"},
-    {file = "black-24.8.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:505289f17ceda596658ae81b61ebbe2d9b25aa78067035184ed0a9d855d18afd"},
-    {file = "black-24.8.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:b19c9ad992c7883ad84c9b22aaa73562a16b819c1d8db7a1a1a49fb7ec13c7d2"},
-    {file = "black-24.8.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1f13f7f386f86f8121d76599114bb8c17b69d962137fc70efe56137727c7047e"},
-    {file = "black-24.8.0-cp38-cp38-win_amd64.whl", hash = "sha256:f490dbd59680d809ca31efdae20e634f3fae27fba3ce0ba3208333b713bc3920"},
-    {file = "black-24.8.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:eab4dd44ce80dea27dc69db40dab62d4ca96112f87996bca68cd75639aeb2e4c"},
-    {file = "black-24.8.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3c4285573d4897a7610054af5a890bde7c65cb466040c5f0c8b732812d7f0e5e"},
-    {file = "black-24.8.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9e84e33b37be070ba135176c123ae52a51f82306def9f7d063ee302ecab2cf47"},
-    {file = "black-24.8.0-cp39-cp39-win_amd64.whl", hash = "sha256:73bbf84ed136e45d451a260c6b73ed674652f90a2b3211d6a35e78054563a9bb"},
-    {file = "black-24.8.0-py3-none-any.whl", hash = "sha256:972085c618ee94f402da1af548a4f218c754ea7e5dc70acb168bfaca4c2542ed"},
-    {file = "black-24.8.0.tar.gz", hash = "sha256:2500945420b6784c38b9ee885af039f5e7471ef284ab03fa35ecdde4688cd83f"},
+    {file = "black-24.4.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:dd1b5a14e417189db4c7b64a6540f31730713d173f0b63e55fabd52d61d8fdce"},
+    {file = "black-24.4.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8e537d281831ad0e71007dcdcbe50a71470b978c453fa41ce77186bbe0ed6021"},
+    {file = "black-24.4.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eaea3008c281f1038edb473c1aa8ed8143a5535ff18f978a318f10302b254063"},
+    {file = "black-24.4.2-cp310-cp310-win_amd64.whl", hash = "sha256:7768a0dbf16a39aa5e9a3ded568bb545c8c2727396d063bbaf847df05b08cd96"},
+    {file = "black-24.4.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:257d724c2c9b1660f353b36c802ccece186a30accc7742c176d29c146df6e474"},
+    {file = "black-24.4.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:bdde6f877a18f24844e381d45e9947a49e97933573ac9d4345399be37621e26c"},
+    {file = "black-24.4.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e151054aa00bad1f4e1f04919542885f89f5f7d086b8a59e5000e6c616896ffb"},
+    {file = "black-24.4.2-cp311-cp311-win_amd64.whl", hash = "sha256:7e122b1c4fb252fd85df3ca93578732b4749d9be076593076ef4d07a0233c3e1"},
+    {file = "black-24.4.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:accf49e151c8ed2c0cdc528691838afd217c50412534e876a19270fea1e28e2d"},
+    {file = "black-24.4.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:88c57dc656038f1ab9f92b3eb5335ee9b021412feaa46330d5eba4e51fe49b04"},
+    {file = "black-24.4.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:be8bef99eb46d5021bf053114442914baeb3649a89dc5f3a555c88737e5e98fc"},
+    {file = "black-24.4.2-cp312-cp312-win_amd64.whl", hash = "sha256:415e686e87dbbe6f4cd5ef0fbf764af7b89f9057b97c908742b6008cc554b9c0"},
+    {file = "black-24.4.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:bf10f7310db693bb62692609b397e8d67257c55f949abde4c67f9cc574492cc7"},
+    {file = "black-24.4.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:98e123f1d5cfd42f886624d84464f7756f60ff6eab89ae845210631714f6db94"},
+    {file = "black-24.4.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:48a85f2cb5e6799a9ef05347b476cce6c182d6c71ee36925a6c194d074336ef8"},
+    {file = "black-24.4.2-cp38-cp38-win_amd64.whl", hash = "sha256:b1530ae42e9d6d5b670a34db49a94115a64596bc77710b1d05e9801e62ca0a7c"},
+    {file = "black-24.4.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:37aae07b029fa0174d39daf02748b379399b909652a806e5708199bd93899da1"},
+    {file = "black-24.4.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:da33a1a5e49c4122ccdfd56cd021ff1ebc4a1ec4e2d01594fef9b6f267a9e741"},
+    {file = "black-24.4.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ef703f83fc32e131e9bcc0a5094cfe85599e7109f896fe8bc96cc402f3eb4b6e"},
+    {file = "black-24.4.2-cp39-cp39-win_amd64.whl", hash = "sha256:b9176b9832e84308818a99a561e90aa479e73c523b3f77afd07913380ae2eab7"},
+    {file = "black-24.4.2-py3-none-any.whl", hash = "sha256:d36ed1124bb81b32f8614555b34cc4259c3fbc7eec17870e8ff8ded335b58d8c"},
+    {file = "black-24.4.2.tar.gz", hash = "sha256:c872b53057f000085da66a19c55d68f6f8ddcac2642392ad3a355878406fbd4d"},
 ]
 
 [package.dependencies]
@@ -108,89 +93,74 @@ uvloop = ["uvloop (>=0.15.2)"]
 
 [[package]]
 name = "certifi"
-version = "2024.8.30"
+version = "2024.7.4"
 description = "Python package for providing Mozilla's CA Bundle."
 optional = false
 python-versions = ">=3.6"
 files = [
-    {file = "certifi-2024.8.30-py3-none-any.whl", hash = "sha256:922820b53db7a7257ffbda3f597266d435245903d80737e34f8a45ff3e3230d8"},
-    {file = "certifi-2024.8.30.tar.gz", hash = "sha256:bec941d2aa8195e248a60b31ff9f0558284cf01a52591ceda73ea9afffd69fd9"},
+    {file = "certifi-2024.7.4-py3-none-any.whl", hash = "sha256:c198e21b1289c2ab85ee4e67bb4b4ef3ead0892059901a8d5b622f24a1101e90"},
+    {file = "certifi-2024.7.4.tar.gz", hash = "sha256:5a1e7645bc0ec61a09e26c36f6106dd4cf40c6db3a1fb6352b0244e7fb057c7b"},
 ]
 
 [[package]]
 name = "cffi"
-version = "1.17.1"
+version = "1.16.0"
 description = "Foreign Function Interface for Python calling C code."
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "cffi-1.17.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:df8b1c11f177bc2313ec4b2d46baec87a5f3e71fc8b45dab2ee7cae86d9aba14"},
-    {file = "cffi-1.17.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8f2cdc858323644ab277e9bb925ad72ae0e67f69e804f4898c070998d50b1a67"},
-    {file = "cffi-1.17.1-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:edae79245293e15384b51f88b00613ba9f7198016a5948b5dddf4917d4d26382"},
-    {file = "cffi-1.17.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45398b671ac6d70e67da8e4224a065cec6a93541bb7aebe1b198a61b58c7b702"},
-    {file = "cffi-1.17.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ad9413ccdeda48c5afdae7e4fa2192157e991ff761e7ab8fdd8926f40b160cc3"},
-    {file = "cffi-1.17.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5da5719280082ac6bd9aa7becb3938dc9f9cbd57fac7d2871717b1feb0902ab6"},
-    {file = "cffi-1.17.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2bb1a08b8008b281856e5971307cc386a8e9c5b625ac297e853d36da6efe9c17"},
-    {file = "cffi-1.17.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:045d61c734659cc045141be4bae381a41d89b741f795af1dd018bfb532fd0df8"},
-    {file = "cffi-1.17.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:6883e737d7d9e4899a8a695e00ec36bd4e5e4f18fabe0aca0efe0a4b44cdb13e"},
-    {file = "cffi-1.17.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:6b8b4a92e1c65048ff98cfe1f735ef8f1ceb72e3d5f0c25fdb12087a23da22be"},
-    {file = "cffi-1.17.1-cp310-cp310-win32.whl", hash = "sha256:c9c3d058ebabb74db66e431095118094d06abf53284d9c81f27300d0e0d8bc7c"},
-    {file = "cffi-1.17.1-cp310-cp310-win_amd64.whl", hash = "sha256:0f048dcf80db46f0098ccac01132761580d28e28bc0f78ae0d58048063317e15"},
-    {file = "cffi-1.17.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a45e3c6913c5b87b3ff120dcdc03f6131fa0065027d0ed7ee6190736a74cd401"},
-    {file = "cffi-1.17.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:30c5e0cb5ae493c04c8b42916e52ca38079f1b235c2f8ae5f4527b963c401caf"},
-    {file = "cffi-1.17.1-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f75c7ab1f9e4aca5414ed4d8e5c0e303a34f4421f8a0d47a4d019ceff0ab6af4"},
-    {file = "cffi-1.17.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1ed2dd2972641495a3ec98445e09766f077aee98a1c896dcb4ad0d303628e41"},
-    {file = "cffi-1.17.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:46bf43160c1a35f7ec506d254e5c890f3c03648a4dbac12d624e4490a7046cd1"},
-    {file = "cffi-1.17.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a24ed04c8ffd54b0729c07cee15a81d964e6fee0e3d4d342a27b020d22959dc6"},
-    {file = "cffi-1.17.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:610faea79c43e44c71e1ec53a554553fa22321b65fae24889706c0a84d4ad86d"},
-    {file = "cffi-1.17.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:a9b15d491f3ad5d692e11f6b71f7857e7835eb677955c00cc0aefcd0669adaf6"},
-    {file = "cffi-1.17.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:de2ea4b5833625383e464549fec1bc395c1bdeeb5f25c4a3a82b5a8c756ec22f"},
-    {file = "cffi-1.17.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:fc48c783f9c87e60831201f2cce7f3b2e4846bf4d8728eabe54d60700b318a0b"},
-    {file = "cffi-1.17.1-cp311-cp311-win32.whl", hash = "sha256:85a950a4ac9c359340d5963966e3e0a94a676bd6245a4b55bc43949eee26a655"},
-    {file = "cffi-1.17.1-cp311-cp311-win_amd64.whl", hash = "sha256:caaf0640ef5f5517f49bc275eca1406b0ffa6aa184892812030f04c2abf589a0"},
-    {file = "cffi-1.17.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:805b4371bf7197c329fcb3ead37e710d1bca9da5d583f5073b799d5c5bd1eee4"},
-    {file = "cffi-1.17.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:733e99bc2df47476e3848417c5a4540522f234dfd4ef3ab7fafdf555b082ec0c"},
-    {file = "cffi-1.17.1-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1257bdabf294dceb59f5e70c64a3e2f462c30c7ad68092d01bbbfb1c16b1ba36"},
-    {file = "cffi-1.17.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:da95af8214998d77a98cc14e3a3bd00aa191526343078b530ceb0bd710fb48a5"},
-    {file = "cffi-1.17.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d63afe322132c194cf832bfec0dc69a99fb9bb6bbd550f161a49e9e855cc78ff"},
-    {file = "cffi-1.17.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f79fc4fc25f1c8698ff97788206bb3c2598949bfe0fef03d299eb1b5356ada99"},
-    {file = "cffi-1.17.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b62ce867176a75d03a665bad002af8e6d54644fad99a3c70905c543130e39d93"},
-    {file = "cffi-1.17.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:386c8bf53c502fff58903061338ce4f4950cbdcb23e2902d86c0f722b786bbe3"},
-    {file = "cffi-1.17.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4ceb10419a9adf4460ea14cfd6bc43d08701f0835e979bf821052f1805850fe8"},
-    {file = "cffi-1.17.1-cp312-cp312-win32.whl", hash = "sha256:a08d7e755f8ed21095a310a693525137cfe756ce62d066e53f502a83dc550f65"},
-    {file = "cffi-1.17.1-cp312-cp312-win_amd64.whl", hash = "sha256:51392eae71afec0d0c8fb1a53b204dbb3bcabcb3c9b807eedf3e1e6ccf2de903"},
-    {file = "cffi-1.17.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f3a2b4222ce6b60e2e8b337bb9596923045681d71e5a082783484d845390938e"},
-    {file = "cffi-1.17.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:0984a4925a435b1da406122d4d7968dd861c1385afe3b45ba82b750f229811e2"},
-    {file = "cffi-1.17.1-cp313-cp313-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d01b12eeeb4427d3110de311e1774046ad344f5b1a7403101878976ecd7a10f3"},
-    {file = "cffi-1.17.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:706510fe141c86a69c8ddc029c7910003a17353970cff3b904ff0686a5927683"},
-    {file = "cffi-1.17.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:de55b766c7aa2e2a3092c51e0483d700341182f08e67c63630d5b6f200bb28e5"},
-    {file = "cffi-1.17.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c59d6e989d07460165cc5ad3c61f9fd8f1b4796eacbd81cee78957842b834af4"},
-    {file = "cffi-1.17.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd398dbc6773384a17fe0d3e7eeb8d1a21c2200473ee6806bb5e6a8e62bb73dd"},
-    {file = "cffi-1.17.1-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:3edc8d958eb099c634dace3c7e16560ae474aa3803a5df240542b305d14e14ed"},
-    {file = "cffi-1.17.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:72e72408cad3d5419375fc87d289076ee319835bdfa2caad331e377589aebba9"},
-    {file = "cffi-1.17.1-cp313-cp313-win32.whl", hash = "sha256:e03eab0a8677fa80d646b5ddece1cbeaf556c313dcfac435ba11f107ba117b5d"},
-    {file = "cffi-1.17.1-cp313-cp313-win_amd64.whl", hash = "sha256:f6a16c31041f09ead72d69f583767292f750d24913dadacf5756b966aacb3f1a"},
-    {file = "cffi-1.17.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:636062ea65bd0195bc012fea9321aca499c0504409f413dc88af450b57ffd03b"},
-    {file = "cffi-1.17.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c7eac2ef9b63c79431bc4b25f1cd649d7f061a28808cbc6c47b534bd789ef964"},
-    {file = "cffi-1.17.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e221cf152cff04059d011ee126477f0d9588303eb57e88923578ace7baad17f9"},
-    {file = "cffi-1.17.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:31000ec67d4221a71bd3f67df918b1f88f676f1c3b535a7eb473255fdc0b83fc"},
-    {file = "cffi-1.17.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6f17be4345073b0a7b8ea599688f692ac3ef23ce28e5df79c04de519dbc4912c"},
-    {file = "cffi-1.17.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e2b1fac190ae3ebfe37b979cc1ce69c81f4e4fe5746bb401dca63a9062cdaf1"},
-    {file = "cffi-1.17.1-cp38-cp38-win32.whl", hash = "sha256:7596d6620d3fa590f677e9ee430df2958d2d6d6de2feeae5b20e82c00b76fbf8"},
-    {file = "cffi-1.17.1-cp38-cp38-win_amd64.whl", hash = "sha256:78122be759c3f8a014ce010908ae03364d00a1f81ab5c7f4a7a5120607ea56e1"},
-    {file = "cffi-1.17.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b2ab587605f4ba0bf81dc0cb08a41bd1c0a5906bd59243d56bad7668a6fc6c16"},
-    {file = "cffi-1.17.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:28b16024becceed8c6dfbc75629e27788d8a3f9030691a1dbf9821a128b22c36"},
-    {file = "cffi-1.17.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1d599671f396c4723d016dbddb72fe8e0397082b0a77a4fab8028923bec050e8"},
-    {file = "cffi-1.17.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ca74b8dbe6e8e8263c0ffd60277de77dcee6c837a3d0881d8c1ead7268c9e576"},
-    {file = "cffi-1.17.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f7f5baafcc48261359e14bcd6d9bff6d4b28d9103847c9e136694cb0501aef87"},
-    {file = "cffi-1.17.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:98e3969bcff97cae1b2def8ba499ea3d6f31ddfdb7635374834cf89a1a08ecf0"},
-    {file = "cffi-1.17.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cdf5ce3acdfd1661132f2a9c19cac174758dc2352bfe37d98aa7512c6b7178b3"},
-    {file = "cffi-1.17.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:9755e4345d1ec879e3849e62222a18c7174d65a6a92d5b346b1863912168b595"},
-    {file = "cffi-1.17.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:f1e22e8c4419538cb197e4dd60acc919d7696e5ef98ee4da4e01d3f8cfa4cc5a"},
-    {file = "cffi-1.17.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:c03e868a0b3bc35839ba98e74211ed2b05d2119be4e8a0f224fba9384f1fe02e"},
-    {file = "cffi-1.17.1-cp39-cp39-win32.whl", hash = "sha256:e31ae45bc2e29f6b2abd0de1cc3b9d5205aa847cafaecb8af1476a609a2f6eb7"},
-    {file = "cffi-1.17.1-cp39-cp39-win_amd64.whl", hash = "sha256:d016c76bdd850f3c626af19b0542c9677ba156e4ee4fccfdd7848803533ef662"},
-    {file = "cffi-1.17.1.tar.gz", hash = "sha256:1c39c6016c32bc48dd54561950ebd6836e1670f2ae46128f67cf49e789c52824"},
+    {file = "cffi-1.16.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6b3d6606d369fc1da4fd8c357d026317fbb9c9b75d36dc16e90e84c26854b088"},
+    {file = "cffi-1.16.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ac0f5edd2360eea2f1daa9e26a41db02dd4b0451b48f7c318e217ee092a213e9"},
+    {file = "cffi-1.16.0-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7e61e3e4fa664a8588aa25c883eab612a188c725755afff6289454d6362b9673"},
+    {file = "cffi-1.16.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a72e8961a86d19bdb45851d8f1f08b041ea37d2bd8d4fd19903bc3083d80c896"},
+    {file = "cffi-1.16.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5b50bf3f55561dac5438f8e70bfcdfd74543fd60df5fa5f62d94e5867deca684"},
+    {file = "cffi-1.16.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7651c50c8c5ef7bdb41108b7b8c5a83013bfaa8a935590c5d74627c047a583c7"},
+    {file = "cffi-1.16.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e4108df7fe9b707191e55f33efbcb2d81928e10cea45527879a4749cbe472614"},
+    {file = "cffi-1.16.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:32c68ef735dbe5857c810328cb2481e24722a59a2003018885514d4c09af9743"},
+    {file = "cffi-1.16.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:673739cb539f8cdaa07d92d02efa93c9ccf87e345b9a0b556e3ecc666718468d"},
+    {file = "cffi-1.16.0-cp310-cp310-win32.whl", hash = "sha256:9f90389693731ff1f659e55c7d1640e2ec43ff725cc61b04b2f9c6d8d017df6a"},
+    {file = "cffi-1.16.0-cp310-cp310-win_amd64.whl", hash = "sha256:e6024675e67af929088fda399b2094574609396b1decb609c55fa58b028a32a1"},
+    {file = "cffi-1.16.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b84834d0cf97e7d27dd5b7f3aca7b6e9263c56308ab9dc8aae9784abb774d404"},
+    {file = "cffi-1.16.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1b8ebc27c014c59692bb2664c7d13ce7a6e9a629be20e54e7271fa696ff2b417"},
+    {file = "cffi-1.16.0-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ee07e47c12890ef248766a6e55bd38ebfb2bb8edd4142d56db91b21ea68b7627"},
+    {file = "cffi-1.16.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8a9d3ebe49f084ad71f9269834ceccbf398253c9fac910c4fd7053ff1386936"},
+    {file = "cffi-1.16.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e70f54f1796669ef691ca07d046cd81a29cb4deb1e5f942003f401c0c4a2695d"},
+    {file = "cffi-1.16.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5bf44d66cdf9e893637896c7faa22298baebcd18d1ddb6d2626a6e39793a1d56"},
+    {file = "cffi-1.16.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7b78010e7b97fef4bee1e896df8a4bbb6712b7f05b7ef630f9d1da00f6444d2e"},
+    {file = "cffi-1.16.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:c6a164aa47843fb1b01e941d385aab7215563bb8816d80ff3a363a9f8448a8dc"},
+    {file = "cffi-1.16.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e09f3ff613345df5e8c3667da1d918f9149bd623cd9070c983c013792a9a62eb"},
+    {file = "cffi-1.16.0-cp311-cp311-win32.whl", hash = "sha256:2c56b361916f390cd758a57f2e16233eb4f64bcbeee88a4881ea90fca14dc6ab"},
+    {file = "cffi-1.16.0-cp311-cp311-win_amd64.whl", hash = "sha256:db8e577c19c0fda0beb7e0d4e09e0ba74b1e4c092e0e40bfa12fe05b6f6d75ba"},
+    {file = "cffi-1.16.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:fa3a0128b152627161ce47201262d3140edb5a5c3da88d73a1b790a959126956"},
+    {file = "cffi-1.16.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:68e7c44931cc171c54ccb702482e9fc723192e88d25a0e133edd7aff8fcd1f6e"},
+    {file = "cffi-1.16.0-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:abd808f9c129ba2beda4cfc53bde801e5bcf9d6e0f22f095e45327c038bfe68e"},
+    {file = "cffi-1.16.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:88e2b3c14bdb32e440be531ade29d3c50a1a59cd4e51b1dd8b0865c54ea5d2e2"},
+    {file = "cffi-1.16.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fcc8eb6d5902bb1cf6dc4f187ee3ea80a1eba0a89aba40a5cb20a5087d961357"},
+    {file = "cffi-1.16.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b7be2d771cdba2942e13215c4e340bfd76398e9227ad10402a8767ab1865d2e6"},
+    {file = "cffi-1.16.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e715596e683d2ce000574bae5d07bd522c781a822866c20495e52520564f0969"},
+    {file = "cffi-1.16.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:2d92b25dbf6cae33f65005baf472d2c245c050b1ce709cc4588cdcdd5495b520"},
+    {file = "cffi-1.16.0-cp312-cp312-win32.whl", hash = "sha256:b2ca4e77f9f47c55c194982e10f058db063937845bb2b7a86c84a6cfe0aefa8b"},
+    {file = "cffi-1.16.0-cp312-cp312-win_amd64.whl", hash = "sha256:68678abf380b42ce21a5f2abde8efee05c114c2fdb2e9eef2efdb0257fba1235"},
+    {file = "cffi-1.16.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0c9ef6ff37e974b73c25eecc13952c55bceed9112be2d9d938ded8e856138bcc"},
+    {file = "cffi-1.16.0-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a09582f178759ee8128d9270cd1344154fd473bb77d94ce0aeb2a93ebf0feaf0"},
+    {file = "cffi-1.16.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e760191dd42581e023a68b758769e2da259b5d52e3103c6060ddc02c9edb8d7b"},
+    {file = "cffi-1.16.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:80876338e19c951fdfed6198e70bc88f1c9758b94578d5a7c4c91a87af3cf31c"},
+    {file = "cffi-1.16.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a6a14b17d7e17fa0d207ac08642c8820f84f25ce17a442fd15e27ea18d67c59b"},
+    {file = "cffi-1.16.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6602bc8dc6f3a9e02b6c22c4fc1e47aa50f8f8e6d3f78a5e16ac33ef5fefa324"},
+    {file = "cffi-1.16.0-cp38-cp38-win32.whl", hash = "sha256:131fd094d1065b19540c3d72594260f118b231090295d8c34e19a7bbcf2e860a"},
+    {file = "cffi-1.16.0-cp38-cp38-win_amd64.whl", hash = "sha256:31d13b0f99e0836b7ff893d37af07366ebc90b678b6664c955b54561fc36ef36"},
+    {file = "cffi-1.16.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:582215a0e9adbe0e379761260553ba11c58943e4bbe9c36430c4ca6ac74b15ed"},
+    {file = "cffi-1.16.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b29ebffcf550f9da55bec9e02ad430c992a87e5f512cd63388abb76f1036d8d2"},
+    {file = "cffi-1.16.0-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dc9b18bf40cc75f66f40a7379f6a9513244fe33c0e8aa72e2d56b0196a7ef872"},
+    {file = "cffi-1.16.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9cb4a35b3642fc5c005a6755a5d17c6c8b6bcb6981baf81cea8bfbc8903e8ba8"},
+    {file = "cffi-1.16.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b86851a328eedc692acf81fb05444bdf1891747c25af7529e39ddafaf68a4f3f"},
+    {file = "cffi-1.16.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c0f31130ebc2d37cdd8e44605fb5fa7ad59049298b3f745c74fa74c62fbfcfc4"},
+    {file = "cffi-1.16.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f8e709127c6c77446a8c0a8c8bf3c8ee706a06cd44b1e827c3e6a2ee6b8c098"},
+    {file = "cffi-1.16.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:748dcd1e3d3d7cd5443ef03ce8685043294ad6bd7c02a38d1bd367cfd968e000"},
+    {file = "cffi-1.16.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8895613bcc094d4a1b2dbe179d88d7fb4a15cee43c052e8885783fac397d91fe"},
+    {file = "cffi-1.16.0-cp39-cp39-win32.whl", hash = "sha256:ed86a35631f7bfbb28e108dd96773b9d5a6ce4811cf6ea468bb6a359b256b1e4"},
+    {file = "cffi-1.16.0-cp39-cp39-win_amd64.whl", hash = "sha256:3686dffb02459559c74dd3d81748269ffb0eb027c39a6fc99502de37d501faa8"},
+    {file = "cffi-1.16.0.tar.gz", hash = "sha256:bcb3ef43e58665bbda2fb198698fcae6776483e0c4a631aa5647806c25e02cc0"},
 ]
 
 [package.dependencies]
@@ -448,13 +418,13 @@ files = [
 
 [[package]]
 name = "exceptiongroup"
-version = "1.2.2"
+version = "1.2.1"
 description = "Backport of PEP 654 (exception groups)"
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "exceptiongroup-1.2.2-py3-none-any.whl", hash = "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b"},
-    {file = "exceptiongroup-1.2.2.tar.gz", hash = "sha256:47c2edf7c6738fafb49fd34290706d1a1a2f4d1c6df275526b62cbb4aa5393cc"},
+    {file = "exceptiongroup-1.2.1-py3-none-any.whl", hash = "sha256:5258b9ed329c5bbdd31a309f53cbfb0b155341807f6ff7606a1e801a891b29ad"},
+    {file = "exceptiongroup-1.2.1.tar.gz", hash = "sha256:a4785e48b045528f5bfe627b6ad554ff32def154f42372786903b7abcfe1aa16"},
 ]
 
 [package.extras]
@@ -462,29 +432,29 @@ test = ["pytest (>=6)"]
 
 [[package]]
 name = "filelock"
-version = "3.16.1"
+version = "3.15.4"
 description = "A platform independent file lock."
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "filelock-3.16.1-py3-none-any.whl", hash = "sha256:2082e5703d51fbf98ea75855d9d5527e33d8ff23099bec374a134febee6946b0"},
-    {file = "filelock-3.16.1.tar.gz", hash = "sha256:c249fbfcd5db47e5e2d6d62198e565475ee65e4831e2561c8e313fa7eb961435"},
+    {file = "filelock-3.15.4-py3-none-any.whl", hash = "sha256:6ca1fffae96225dab4c6eaf1c4f4f28cd2568d3ec2a44e15a08520504de468e7"},
+    {file = "filelock-3.15.4.tar.gz", hash = "sha256:2207938cbc1844345cb01a5a95524dae30f0ce089eba5b00378295a17e3e90cb"},
 ]
 
 [package.extras]
-docs = ["furo (>=2024.8.6)", "sphinx (>=8.0.2)", "sphinx-autodoc-typehints (>=2.4.1)"]
-testing = ["covdefaults (>=2.3)", "coverage (>=7.6.1)", "diff-cover (>=9.2)", "pytest (>=8.3.3)", "pytest-asyncio (>=0.24)", "pytest-cov (>=5)", "pytest-mock (>=3.14)", "pytest-timeout (>=2.3.1)", "virtualenv (>=20.26.4)"]
-typing = ["typing-extensions (>=4.12.2)"]
+docs = ["furo (>=2023.9.10)", "sphinx (>=7.2.6)", "sphinx-autodoc-typehints (>=1.25.2)"]
+testing = ["covdefaults (>=2.3)", "coverage (>=7.3.2)", "diff-cover (>=8.0.1)", "pytest (>=7.4.3)", "pytest-asyncio (>=0.21)", "pytest-cov (>=4.1)", "pytest-mock (>=3.12)", "pytest-timeout (>=2.2)", "virtualenv (>=20.26.2)"]
+typing = ["typing-extensions (>=4.8)"]
 
 [[package]]
 name = "flake8"
-version = "7.1.1"
+version = "7.1.0"
 description = "the modular source code checker: pep8 pyflakes and co"
 optional = false
 python-versions = ">=3.8.1"
 files = [
-    {file = "flake8-7.1.1-py2.py3-none-any.whl", hash = "sha256:597477df7860daa5aa0fdd84bf5208a043ab96b8e96ab708770ae0364dd03213"},
-    {file = "flake8-7.1.1.tar.gz", hash = "sha256:049d058491e228e03e67b390f311bbf88fce2dbaa8fa673e7aea87b7198b8d38"},
+    {file = "flake8-7.1.0-py2.py3-none-any.whl", hash = "sha256:2e416edcc62471a64cea09353f4e7bdba32aeb079b6e360554c659a122b1bc6a"},
+    {file = "flake8-7.1.0.tar.gz", hash = "sha256:48a07b626b55236e0fb4784ee69a465fbf59d79eec1f5b4785c3d3bc57d17aa5"},
 ]
 
 [package.dependencies]
@@ -551,13 +521,13 @@ files = [
 
 [[package]]
 name = "identify"
-version = "2.6.1"
+version = "2.6.0"
 description = "File identification library for Python"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "identify-2.6.1-py2.py3-none-any.whl", hash = "sha256:53863bcac7caf8d2ed85bd20312ea5dcfc22226800f6d6881f232d861db5a8f0"},
-    {file = "identify-2.6.1.tar.gz", hash = "sha256:91478c5fb7c3aac5ff7bf9b4344f803843dc586832d5f110d672b19aa1984c98"},
+    {file = "identify-2.6.0-py2.py3-none-any.whl", hash = "sha256:e79ae4406387a9d300332b5fd366d8994f1525e8414984e1a59e058b2eda2dd0"},
+    {file = "identify-2.6.0.tar.gz", hash = "sha256:cb171c685bdc31bcc4c1734698736a7d5b6c8bf2e0c15117f4d469c8640ae5cf"},
 ]
 
 [package.extras]
@@ -565,40 +535,33 @@ license = ["ukkonen"]
 
 [[package]]
 name = "idna"
-version = "3.10"
+version = "3.7"
 description = "Internationalized Domain Names in Applications (IDNA)"
 optional = false
-python-versions = ">=3.6"
+python-versions = ">=3.5"
 files = [
-    {file = "idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3"},
-    {file = "idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9"},
+    {file = "idna-3.7-py3-none-any.whl", hash = "sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0"},
+    {file = "idna-3.7.tar.gz", hash = "sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc"},
 ]
 
-[package.extras]
-all = ["flake8 (>=7.1.1)", "mypy (>=1.11.2)", "pytest (>=8.3.2)", "ruff (>=0.6.2)"]
-
 [[package]]
 name = "importlib-metadata"
-version = "8.5.0"
+version = "8.0.0"
 description = "Read metadata from Python packages"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "importlib_metadata-8.5.0-py3-none-any.whl", hash = "sha256:45e54197d28b7a7f1559e60b95e7c567032b602131fbd588f1497f47880aa68b"},
-    {file = "importlib_metadata-8.5.0.tar.gz", hash = "sha256:71522656f0abace1d072b9e5481a48f07c138e00f079c38c8f883823f9c26bd7"},
+    {file = "importlib_metadata-8.0.0-py3-none-any.whl", hash = "sha256:15584cf2b1bf449d98ff8a6ff1abef57bf20f3ac6454f431736cd3e660921b2f"},
+    {file = "importlib_metadata-8.0.0.tar.gz", hash = "sha256:188bd24e4c346d3f0a933f275c2fec67050326a856b9a359881d7c2a697e8812"},
 ]
 
 [package.dependencies]
-zipp = ">=3.20"
+zipp = ">=0.5"
 
 [package.extras]
-check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)"]
-cover = ["pytest-cov"]
 doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"]
-enabler = ["pytest-enabler (>=2.2)"]
 perf = ["ipython"]
-test = ["flufl.flake8", "importlib-resources (>=1.3)", "jaraco.test (>=5.4)", "packaging", "pyfakefs", "pytest (>=6,!=8.1.*)", "pytest-perf (>=0.9.2)"]
-type = ["pytest-mypy"]
+test = ["flufl.flake8", "importlib-resources (>=1.3)", "jaraco.test (>=5.4)", "packaging", "pyfakefs", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy", "pytest-perf (>=0.9.2)", "pytest-ruff (>=0.2.1)"]
 
 [[package]]
 name = "iniconfig"
@@ -654,42 +617,6 @@ more-itertools = "*"
 docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"]
 testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy", "pytest-ruff (>=0.2.1)"]
 
-[[package]]
-name = "jaraco-context"
-version = "6.0.1"
-description = "Useful decorators and context managers"
-optional = false
-python-versions = ">=3.8"
-files = [
-    {file = "jaraco.context-6.0.1-py3-none-any.whl", hash = "sha256:f797fc481b490edb305122c9181830a3a5b76d84ef6d1aef2fb9b47ab956f9e4"},
-    {file = "jaraco_context-6.0.1.tar.gz", hash = "sha256:9bae4ea555cf0b14938dc0aee7c9f32ed303aa20a3b73e7dc80111628792d1b3"},
-]
-
-[package.dependencies]
-"backports.tarfile" = {version = "*", markers = "python_version < \"3.12\""}
-
-[package.extras]
-doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"]
-test = ["portend", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy", "pytest-ruff (>=0.2.1)"]
-
-[[package]]
-name = "jaraco-functools"
-version = "4.0.2"
-description = "Functools like those found in stdlib"
-optional = false
-python-versions = ">=3.8"
-files = [
-    {file = "jaraco.functools-4.0.2-py3-none-any.whl", hash = "sha256:c9d16a3ed4ccb5a889ad8e0b7a343401ee5b2a71cee6ed192d3f68bc351e94e3"},
-    {file = "jaraco_functools-4.0.2.tar.gz", hash = "sha256:3460c74cd0d32bf82b9576bbb3527c4364d5b27a21f5158a62aed6c4b42e23f5"},
-]
-
-[package.dependencies]
-more-itertools = "*"
-
-[package.extras]
-doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"]
-test = ["jaraco.classes", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy", "pytest-ruff (>=0.2.1)"]
-
 [[package]]
 name = "jeepney"
 version = "0.8.0"
@@ -747,13 +674,13 @@ requests = ">=2.31.0,<3.0.0"
 
 [[package]]
 name = "jsondiff"
-version = "2.2.1"
+version = "2.1.1"
 description = "Diff JSON and JSON-like structures in Python"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "jsondiff-2.2.1-py3-none-any.whl", hash = "sha256:b1f0f7e2421881848b1d556d541ac01a91680cfcc14f51a9b62cdf4da0e56722"},
-    {file = "jsondiff-2.2.1.tar.gz", hash = "sha256:658d162c8a86ba86de26303cd86a7b37e1b2c1ec98b569a60e2ca6180545f7fe"},
+    {file = "jsondiff-2.1.1-py3-none-any.whl", hash = "sha256:ffab5bc00237c2c9f48a4b07fff7bf7df13e4b98f9585bd00b6e6e5f371a98fc"},
+    {file = "jsondiff-2.1.1.tar.gz", hash = "sha256:c7dfd4f8c9307500a536e9b93492b2c1ba62dac2b3c5189aa6e37d63b427b4d8"},
 ]
 
 [package.dependencies]
@@ -810,32 +737,26 @@ referencing = ">=0.31.0"
 
 [[package]]
 name = "keyring"
-version = "25.4.1"
+version = "24.3.1"
 description = "Store and access your passwords safely."
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "keyring-25.4.1-py3-none-any.whl", hash = "sha256:5426f817cf7f6f007ba5ec722b1bcad95a75b27d780343772ad76b17cb47b0bf"},
-    {file = "keyring-25.4.1.tar.gz", hash = "sha256:b07ebc55f3e8ed86ac81dd31ef14e81ace9dd9c3d4b5d77a6e9a2016d0d71a1b"},
+    {file = "keyring-24.3.1-py3-none-any.whl", hash = "sha256:df38a4d7419a6a60fea5cef1e45a948a3e8430dd12ad88b0f423c5c143906218"},
+    {file = "keyring-24.3.1.tar.gz", hash = "sha256:c3327b6ffafc0e8befbdb597cacdb4928ffe5c1212f7645f186e6d9957a898db"},
 ]
 
 [package.dependencies]
 importlib-metadata = {version = ">=4.11.4", markers = "python_version < \"3.12\""}
 "jaraco.classes" = "*"
-"jaraco.context" = "*"
-"jaraco.functools" = "*"
 jeepney = {version = ">=0.4.2", markers = "sys_platform == \"linux\""}
 pywin32-ctypes = {version = ">=0.2.0", markers = "sys_platform == \"win32\""}
 SecretStorage = {version = ">=3.2", markers = "sys_platform == \"linux\""}
 
 [package.extras]
-check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)"]
 completion = ["shtab (>=1.1.0)"]
-cover = ["pytest-cov"]
-doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"]
-enabler = ["pytest-enabler (>=2.2)"]
-test = ["pyfakefs", "pytest (>=6,!=8.1.*)"]
-type = ["pygobject-stubs", "pytest-mypy", "shtab", "types-pywin32"]
+docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (<7.2.5)", "sphinx (>=3.5)", "sphinx-lint"]
+testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy", "pytest-ruff (>=0.2.1)"]
 
 [[package]]
 name = "markdown2"
@@ -925,13 +846,13 @@ files = [
 
 [[package]]
 name = "marshmallow"
-version = "3.22.0"
+version = "3.21.3"
 description = "A lightweight library for converting complex datatypes to and from native Python datatypes."
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "marshmallow-3.22.0-py3-none-any.whl", hash = "sha256:71a2dce49ef901c3f97ed296ae5051135fd3febd2bf43afe0ae9a82143a494d9"},
-    {file = "marshmallow-3.22.0.tar.gz", hash = "sha256:4972f529104a220bb8637d595aa4c9762afbe7f7a77d82dc58c1615d70c5823e"},
+    {file = "marshmallow-3.21.3-py3-none-any.whl", hash = "sha256:86ce7fb914aa865001a4b2092c4c2872d13bc347f3d42673272cabfdbad386f1"},
+    {file = "marshmallow-3.21.3.tar.gz", hash = "sha256:4f57c5e050a54d66361e826f94fba213eb10b67b2fdb02c3e0343ce207ba1662"},
 ]
 
 [package.dependencies]
@@ -939,7 +860,7 @@ packaging = ">=17.0"
 
 [package.extras]
 dev = ["marshmallow[tests]", "pre-commit (>=3.5,<4.0)", "tox"]
-docs = ["alabaster (==1.0.0)", "autodocsumm (==0.2.13)", "sphinx (==8.0.2)", "sphinx-issues (==4.1.0)", "sphinx-version-warning (==1.1.2)"]
+docs = ["alabaster (==0.7.16)", "autodocsumm (==0.2.12)", "sphinx (==7.3.7)", "sphinx-issues (==4.1.0)", "sphinx-version-warning (==1.1.2)"]
 tests = ["pytest", "pytz", "simplejson"]
 
 [[package]]
@@ -969,55 +890,55 @@ files = [
 
 [[package]]
 name = "more-itertools"
-version = "10.5.0"
+version = "10.3.0"
 description = "More routines for operating on iterables, beyond itertools"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "more-itertools-10.5.0.tar.gz", hash = "sha256:5482bfef7849c25dc3c6dd53a6173ae4795da2a41a80faea6700d9f5846c5da6"},
-    {file = "more_itertools-10.5.0-py3-none-any.whl", hash = "sha256:037b0d3203ce90cca8ab1defbbdac29d5f993fc20131f3664dc8d6acfa872aef"},
+    {file = "more-itertools-10.3.0.tar.gz", hash = "sha256:e5d93ef411224fbcef366a6e8ddc4c5781bc6359d43412a65dd5964e46111463"},
+    {file = "more_itertools-10.3.0-py3-none-any.whl", hash = "sha256:ea6a02e24a9161e51faad17a8782b92a0df82c12c1c8886fec7f0c3fa1a1b320"},
 ]
 
 [[package]]
 name = "mypy"
-version = "1.11.2"
+version = "1.10.1"
 description = "Optional static typing for Python"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "mypy-1.11.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d42a6dd818ffce7be66cce644f1dff482f1d97c53ca70908dff0b9ddc120b77a"},
-    {file = "mypy-1.11.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:801780c56d1cdb896eacd5619a83e427ce436d86a3bdf9112527f24a66618fef"},
-    {file = "mypy-1.11.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:41ea707d036a5307ac674ea172875f40c9d55c5394f888b168033177fce47383"},
-    {file = "mypy-1.11.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:6e658bd2d20565ea86da7d91331b0eed6d2eee22dc031579e6297f3e12c758c8"},
-    {file = "mypy-1.11.2-cp310-cp310-win_amd64.whl", hash = "sha256:478db5f5036817fe45adb7332d927daa62417159d49783041338921dcf646fc7"},
-    {file = "mypy-1.11.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:75746e06d5fa1e91bfd5432448d00d34593b52e7e91a187d981d08d1f33d4385"},
-    {file = "mypy-1.11.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a976775ab2256aadc6add633d44f100a2517d2388906ec4f13231fafbb0eccca"},
-    {file = "mypy-1.11.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:cd953f221ac1379050a8a646585a29574488974f79d8082cedef62744f0a0104"},
-    {file = "mypy-1.11.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:57555a7715c0a34421013144a33d280e73c08df70f3a18a552938587ce9274f4"},
-    {file = "mypy-1.11.2-cp311-cp311-win_amd64.whl", hash = "sha256:36383a4fcbad95f2657642a07ba22ff797de26277158f1cc7bd234821468b1b6"},
-    {file = "mypy-1.11.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:e8960dbbbf36906c5c0b7f4fbf2f0c7ffb20f4898e6a879fcf56a41a08b0d318"},
-    {file = "mypy-1.11.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:06d26c277962f3fb50e13044674aa10553981ae514288cb7d0a738f495550b36"},
-    {file = "mypy-1.11.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6e7184632d89d677973a14d00ae4d03214c8bc301ceefcdaf5c474866814c987"},
-    {file = "mypy-1.11.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:3a66169b92452f72117e2da3a576087025449018afc2d8e9bfe5ffab865709ca"},
-    {file = "mypy-1.11.2-cp312-cp312-win_amd64.whl", hash = "sha256:969ea3ef09617aff826885a22ece0ddef69d95852cdad2f60c8bb06bf1f71f70"},
-    {file = "mypy-1.11.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:37c7fa6121c1cdfcaac97ce3d3b5588e847aa79b580c1e922bb5d5d2902df19b"},
-    {file = "mypy-1.11.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4a8a53bc3ffbd161b5b2a4fff2f0f1e23a33b0168f1c0778ec70e1a3d66deb86"},
-    {file = "mypy-1.11.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2ff93107f01968ed834f4256bc1fc4475e2fecf6c661260066a985b52741ddce"},
-    {file = "mypy-1.11.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:edb91dded4df17eae4537668b23f0ff6baf3707683734b6a818d5b9d0c0c31a1"},
-    {file = "mypy-1.11.2-cp38-cp38-win_amd64.whl", hash = "sha256:ee23de8530d99b6db0573c4ef4bd8f39a2a6f9b60655bf7a1357e585a3486f2b"},
-    {file = "mypy-1.11.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:801ca29f43d5acce85f8e999b1e431fb479cb02d0e11deb7d2abb56bdaf24fd6"},
-    {file = "mypy-1.11.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:af8d155170fcf87a2afb55b35dc1a0ac21df4431e7d96717621962e4b9192e70"},
-    {file = "mypy-1.11.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f7821776e5c4286b6a13138cc935e2e9b6fde05e081bdebf5cdb2bb97c9df81d"},
-    {file = "mypy-1.11.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:539c570477a96a4e6fb718b8d5c3e0c0eba1f485df13f86d2970c91f0673148d"},
-    {file = "mypy-1.11.2-cp39-cp39-win_amd64.whl", hash = "sha256:3f14cd3d386ac4d05c5a39a51b84387403dadbd936e17cb35882134d4f8f0d24"},
-    {file = "mypy-1.11.2-py3-none-any.whl", hash = "sha256:b499bc07dbdcd3de92b0a8b29fdf592c111276f6a12fe29c30f6c417dd546d12"},
-    {file = "mypy-1.11.2.tar.gz", hash = "sha256:7f9993ad3e0ffdc95c2a14b66dee63729f021968bff8ad911867579c65d13a79"},
+    {file = "mypy-1.10.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e36f229acfe250dc660790840916eb49726c928e8ce10fbdf90715090fe4ae02"},
+    {file = "mypy-1.10.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:51a46974340baaa4145363b9e051812a2446cf583dfaeba124af966fa44593f7"},
+    {file = "mypy-1.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:901c89c2d67bba57aaaca91ccdb659aa3a312de67f23b9dfb059727cce2e2e0a"},
+    {file = "mypy-1.10.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:0cd62192a4a32b77ceb31272d9e74d23cd88c8060c34d1d3622db3267679a5d9"},
+    {file = "mypy-1.10.1-cp310-cp310-win_amd64.whl", hash = "sha256:a2cbc68cb9e943ac0814c13e2452d2046c2f2b23ff0278e26599224cf164e78d"},
+    {file = "mypy-1.10.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:bd6f629b67bb43dc0d9211ee98b96d8dabc97b1ad38b9b25f5e4c4d7569a0c6a"},
+    {file = "mypy-1.10.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a1bbb3a6f5ff319d2b9d40b4080d46cd639abe3516d5a62c070cf0114a457d84"},
+    {file = "mypy-1.10.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b8edd4e9bbbc9d7b79502eb9592cab808585516ae1bcc1446eb9122656c6066f"},
+    {file = "mypy-1.10.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:6166a88b15f1759f94a46fa474c7b1b05d134b1b61fca627dd7335454cc9aa6b"},
+    {file = "mypy-1.10.1-cp311-cp311-win_amd64.whl", hash = "sha256:5bb9cd11c01c8606a9d0b83ffa91d0b236a0e91bc4126d9ba9ce62906ada868e"},
+    {file = "mypy-1.10.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:d8681909f7b44d0b7b86e653ca152d6dff0eb5eb41694e163c6092124f8246d7"},
+    {file = "mypy-1.10.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:378c03f53f10bbdd55ca94e46ec3ba255279706a6aacaecac52ad248f98205d3"},
+    {file = "mypy-1.10.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6bacf8f3a3d7d849f40ca6caea5c055122efe70e81480c8328ad29c55c69e93e"},
+    {file = "mypy-1.10.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:701b5f71413f1e9855566a34d6e9d12624e9e0a8818a5704d74d6b0402e66c04"},
+    {file = "mypy-1.10.1-cp312-cp312-win_amd64.whl", hash = "sha256:3c4c2992f6ea46ff7fce0072642cfb62af7a2484efe69017ed8b095f7b39ef31"},
+    {file = "mypy-1.10.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:604282c886497645ffb87b8f35a57ec773a4a2721161e709a4422c1636ddde5c"},
+    {file = "mypy-1.10.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:37fd87cab83f09842653f08de066ee68f1182b9b5282e4634cdb4b407266bade"},
+    {file = "mypy-1.10.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8addf6313777dbb92e9564c5d32ec122bf2c6c39d683ea64de6a1fd98b90fe37"},
+    {file = "mypy-1.10.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:5cc3ca0a244eb9a5249c7c583ad9a7e881aa5d7b73c35652296ddcdb33b2b9c7"},
+    {file = "mypy-1.10.1-cp38-cp38-win_amd64.whl", hash = "sha256:1b3a2ffce52cc4dbaeee4df762f20a2905aa171ef157b82192f2e2f368eec05d"},
+    {file = "mypy-1.10.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:fe85ed6836165d52ae8b88f99527d3d1b2362e0cb90b005409b8bed90e9059b3"},
+    {file = "mypy-1.10.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c2ae450d60d7d020d67ab440c6e3fae375809988119817214440033f26ddf7bf"},
+    {file = "mypy-1.10.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6be84c06e6abd72f960ba9a71561c14137a583093ffcf9bbfaf5e613d63fa531"},
+    {file = "mypy-1.10.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:2189ff1e39db399f08205e22a797383613ce1cb0cb3b13d8bcf0170e45b96cc3"},
+    {file = "mypy-1.10.1-cp39-cp39-win_amd64.whl", hash = "sha256:97a131ee36ac37ce9581f4220311247ab6cba896b4395b9c87af0675a13a755f"},
+    {file = "mypy-1.10.1-py3-none-any.whl", hash = "sha256:71d8ac0b906354ebda8ef1673e5fde785936ac1f29ff6987c7483cfbd5a4235a"},
+    {file = "mypy-1.10.1.tar.gz", hash = "sha256:1f8f492d7db9e3593ef42d4f115f04e556130f2819ad33ab84551403e97dd4c0"},
 ]
 
 [package.dependencies]
 mypy-extensions = ">=1.0.0"
 tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""}
-typing-extensions = ">=4.6.0"
+typing-extensions = ">=4.1.0"
 
 [package.extras]
 dmypy = ["psutil (>=4.0)"]
@@ -1201,60 +1122,47 @@ files = [
 
 [[package]]
 name = "pandas"
-version = "2.2.3"
+version = "2.2.2"
 description = "Powerful data structures for data analysis, time series, and statistics"
 optional = false
 python-versions = ">=3.9"
 files = [
-    {file = "pandas-2.2.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1948ddde24197a0f7add2bdc4ca83bf2b1ef84a1bc8ccffd95eda17fd836ecb5"},
-    {file = "pandas-2.2.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:381175499d3802cde0eabbaf6324cce0c4f5d52ca6f8c377c29ad442f50f6348"},
-    {file = "pandas-2.2.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d9c45366def9a3dd85a6454c0e7908f2b3b8e9c138f5dc38fed7ce720d8453ed"},
-    {file = "pandas-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:86976a1c5b25ae3f8ccae3a5306e443569ee3c3faf444dfd0f41cda24667ad57"},
-    {file = "pandas-2.2.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:b8661b0238a69d7aafe156b7fa86c44b881387509653fdf857bebc5e4008ad42"},
-    {file = "pandas-2.2.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:37e0aced3e8f539eccf2e099f65cdb9c8aa85109b0be6e93e2baff94264bdc6f"},
-    {file = "pandas-2.2.3-cp310-cp310-win_amd64.whl", hash = "sha256:56534ce0746a58afaf7942ba4863e0ef81c9c50d3f0ae93e9497d6a41a057645"},
-    {file = "pandas-2.2.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:66108071e1b935240e74525006034333f98bcdb87ea116de573a6a0dccb6c039"},
-    {file = "pandas-2.2.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7c2875855b0ff77b2a64a0365e24455d9990730d6431b9e0ee18ad8acee13dbd"},
-    {file = "pandas-2.2.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd8d0c3be0515c12fed0bdbae072551c8b54b7192c7b1fda0ba56059a0179698"},
-    {file = "pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c124333816c3a9b03fbeef3a9f230ba9a737e9e5bb4060aa2107a86cc0a497fc"},
-    {file = "pandas-2.2.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:63cc132e40a2e084cf01adf0775b15ac515ba905d7dcca47e9a251819c575ef3"},
-    {file = "pandas-2.2.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:29401dbfa9ad77319367d36940cd8a0b3a11aba16063e39632d98b0e931ddf32"},
-    {file = "pandas-2.2.3-cp311-cp311-win_amd64.whl", hash = "sha256:3fc6873a41186404dad67245896a6e440baacc92f5b716ccd1bc9ed2995ab2c5"},
-    {file = "pandas-2.2.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b1d432e8d08679a40e2a6d8b2f9770a5c21793a6f9f47fdd52c5ce1948a5a8a9"},
-    {file = "pandas-2.2.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a5a1595fe639f5988ba6a8e5bc9649af3baf26df3998a0abe56c02609392e0a4"},
-    {file = "pandas-2.2.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5de54125a92bb4d1c051c0659e6fcb75256bf799a732a87184e5ea503965bce3"},
-    {file = "pandas-2.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fffb8ae78d8af97f849404f21411c95062db1496aeb3e56f146f0355c9989319"},
-    {file = "pandas-2.2.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6dfcb5ee8d4d50c06a51c2fffa6cff6272098ad6540aed1a76d15fb9318194d8"},
-    {file = "pandas-2.2.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:062309c1b9ea12a50e8ce661145c6aab431b1e99530d3cd60640e255778bd43a"},
-    {file = "pandas-2.2.3-cp312-cp312-win_amd64.whl", hash = "sha256:59ef3764d0fe818125a5097d2ae867ca3fa64df032331b7e0917cf5d7bf66b13"},
-    {file = "pandas-2.2.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f00d1345d84d8c86a63e476bb4955e46458b304b9575dcf71102b5c705320015"},
-    {file = "pandas-2.2.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3508d914817e153ad359d7e069d752cdd736a247c322d932eb89e6bc84217f28"},
-    {file = "pandas-2.2.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:22a9d949bfc9a502d320aa04e5d02feab689d61da4e7764b62c30b991c42c5f0"},
-    {file = "pandas-2.2.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3a255b2c19987fbbe62a9dfd6cff7ff2aa9ccab3fc75218fd4b7530f01efa24"},
-    {file = "pandas-2.2.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:800250ecdadb6d9c78eae4990da62743b857b470883fa27f652db8bdde7f6659"},
-    {file = "pandas-2.2.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6374c452ff3ec675a8f46fd9ab25c4ad0ba590b71cf0656f8b6daa5202bca3fb"},
-    {file = "pandas-2.2.3-cp313-cp313-win_amd64.whl", hash = "sha256:61c5ad4043f791b61dd4752191d9f07f0ae412515d59ba8f005832a532f8736d"},
-    {file = "pandas-2.2.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:3b71f27954685ee685317063bf13c7709a7ba74fc996b84fc6821c59b0f06468"},
-    {file = "pandas-2.2.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:38cf8125c40dae9d5acc10fa66af8ea6fdf760b2714ee482ca691fc66e6fcb18"},
-    {file = "pandas-2.2.3-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ba96630bc17c875161df3818780af30e43be9b166ce51c9a18c1feae342906c2"},
-    {file = "pandas-2.2.3-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1db71525a1538b30142094edb9adc10be3f3e176748cd7acc2240c2f2e5aa3a4"},
-    {file = "pandas-2.2.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:15c0e1e02e93116177d29ff83e8b1619c93ddc9c49083f237d4312337a61165d"},
-    {file = "pandas-2.2.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:ad5b65698ab28ed8d7f18790a0dc58005c7629f227be9ecc1072aa74c0c1d43a"},
-    {file = "pandas-2.2.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:bc6b93f9b966093cb0fd62ff1a7e4c09e6d546ad7c1de191767baffc57628f39"},
-    {file = "pandas-2.2.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:5dbca4c1acd72e8eeef4753eeca07de9b1db4f398669d5994086f788a5d7cc30"},
-    {file = "pandas-2.2.3-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8cd6d7cc958a3910f934ea8dbdf17b2364827bb4dafc38ce6eef6bb3d65ff09c"},
-    {file = "pandas-2.2.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:99df71520d25fade9db7c1076ac94eb994f4d2673ef2aa2e86ee039b6746d20c"},
-    {file = "pandas-2.2.3-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:31d0ced62d4ea3e231a9f228366919a5ea0b07440d9d4dac345376fd8e1477ea"},
-    {file = "pandas-2.2.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:7eee9e7cea6adf3e3d24e304ac6b8300646e2a5d1cd3a3c2abed9101b0846761"},
-    {file = "pandas-2.2.3-cp39-cp39-win_amd64.whl", hash = "sha256:4850ba03528b6dd51d6c5d273c46f183f39a9baf3f0143e566b89450965b105e"},
-    {file = "pandas-2.2.3.tar.gz", hash = "sha256:4f18ba62b61d7e192368b84517265a99b4d7ee8912f8708660fb4a366cc82667"},
+    {file = "pandas-2.2.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:90c6fca2acf139569e74e8781709dccb6fe25940488755716d1d354d6bc58bce"},
+    {file = "pandas-2.2.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c7adfc142dac335d8c1e0dcbd37eb8617eac386596eb9e1a1b77791cf2498238"},
+    {file = "pandas-2.2.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4abfe0be0d7221be4f12552995e58723c7422c80a659da13ca382697de830c08"},
+    {file = "pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8635c16bf3d99040fdf3ca3db669a7250ddf49c55dc4aa8fe0ae0fa8d6dcc1f0"},
+    {file = "pandas-2.2.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:40ae1dffb3967a52203105a077415a86044a2bea011b5f321c6aa64b379a3f51"},
+    {file = "pandas-2.2.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8e5a0b00e1e56a842f922e7fae8ae4077aee4af0acb5ae3622bd4b4c30aedf99"},
+    {file = "pandas-2.2.2-cp310-cp310-win_amd64.whl", hash = "sha256:ddf818e4e6c7c6f4f7c8a12709696d193976b591cc7dc50588d3d1a6b5dc8772"},
+    {file = "pandas-2.2.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:696039430f7a562b74fa45f540aca068ea85fa34c244d0deee539cb6d70aa288"},
+    {file = "pandas-2.2.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8e90497254aacacbc4ea6ae5e7a8cd75629d6ad2b30025a4a8b09aa4faf55151"},
+    {file = "pandas-2.2.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:58b84b91b0b9f4bafac2a0ac55002280c094dfc6402402332c0913a59654ab2b"},
+    {file = "pandas-2.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d2123dc9ad6a814bcdea0f099885276b31b24f7edf40f6cdbc0912672e22eee"},
+    {file = "pandas-2.2.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:2925720037f06e89af896c70bca73459d7e6a4be96f9de79e2d440bd499fe0db"},
+    {file = "pandas-2.2.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:0cace394b6ea70c01ca1595f839cf193df35d1575986e484ad35c4aeae7266c1"},
+    {file = "pandas-2.2.2-cp311-cp311-win_amd64.whl", hash = "sha256:873d13d177501a28b2756375d59816c365e42ed8417b41665f346289adc68d24"},
+    {file = "pandas-2.2.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:9dfde2a0ddef507a631dc9dc4af6a9489d5e2e740e226ad426a05cabfbd7c8ef"},
+    {file = "pandas-2.2.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:e9b79011ff7a0f4b1d6da6a61aa1aa604fb312d6647de5bad20013682d1429ce"},
+    {file = "pandas-2.2.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1cb51fe389360f3b5a4d57dbd2848a5f033350336ca3b340d1c53a1fad33bcad"},
+    {file = "pandas-2.2.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eee3a87076c0756de40b05c5e9a6069c035ba43e8dd71c379e68cab2c20f16ad"},
+    {file = "pandas-2.2.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:3e374f59e440d4ab45ca2fffde54b81ac3834cf5ae2cdfa69c90bc03bde04d76"},
+    {file = "pandas-2.2.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:43498c0bdb43d55cb162cdc8c06fac328ccb5d2eabe3cadeb3529ae6f0517c32"},
+    {file = "pandas-2.2.2-cp312-cp312-win_amd64.whl", hash = "sha256:d187d355ecec3629624fccb01d104da7d7f391db0311145817525281e2804d23"},
+    {file = "pandas-2.2.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:0ca6377b8fca51815f382bd0b697a0814c8bda55115678cbc94c30aacbb6eff2"},
+    {file = "pandas-2.2.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9057e6aa78a584bc93a13f0a9bf7e753a5e9770a30b4d758b8d5f2a62a9433cd"},
+    {file = "pandas-2.2.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:001910ad31abc7bf06f49dcc903755d2f7f3a9186c0c040b827e522e9cef0863"},
+    {file = "pandas-2.2.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:66b479b0bd07204e37583c191535505410daa8df638fd8e75ae1b383851fe921"},
+    {file = "pandas-2.2.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:a77e9d1c386196879aa5eb712e77461aaee433e54c68cf253053a73b7e49c33a"},
+    {file = "pandas-2.2.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:92fd6b027924a7e178ac202cfbe25e53368db90d56872d20ffae94b96c7acc57"},
+    {file = "pandas-2.2.2-cp39-cp39-win_amd64.whl", hash = "sha256:640cef9aa381b60e296db324337a554aeeb883ead99dc8f6c18e81a93942f5f4"},
+    {file = "pandas-2.2.2.tar.gz", hash = "sha256:9e79019aba43cb4fda9e4d983f8e88ca0373adbb697ae9c6c43093218de28b54"},
 ]
 
 [package.dependencies]
 numpy = [
-    {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
     {version = ">=1.22.4", markers = "python_version < \"3.11\""},
     {version = ">=1.23.2", markers = "python_version == \"3.11\""},
+    {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
 ]
 python-dateutil = ">=2.8.2"
 pytz = ">=2020.1"
@@ -1326,19 +1234,19 @@ testing = ["pytest", "pytest-cov", "wheel"]
 
 [[package]]
 name = "platformdirs"
-version = "4.3.6"
+version = "4.2.2"
 description = "A small Python package for determining appropriate platform-specific dirs, e.g. a `user data dir`."
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "platformdirs-4.3.6-py3-none-any.whl", hash = "sha256:73e575e1408ab8103900836b97580d5307456908a03e92031bab39e4554cc3fb"},
-    {file = "platformdirs-4.3.6.tar.gz", hash = "sha256:357fb2acbc885b0419afd3ce3ed34564c13c9b95c89360cd9563f73aa5e2b907"},
+    {file = "platformdirs-4.2.2-py3-none-any.whl", hash = "sha256:2d7a1657e36a80ea911db832a8a6ece5ee53d8de21edd5cc5879af6530b1bfee"},
+    {file = "platformdirs-4.2.2.tar.gz", hash = "sha256:38b7b51f512eed9e84a22788b4bce1de17c0adb134d6becb09836e37d8654cd3"},
 ]
 
 [package.extras]
-docs = ["furo (>=2024.8.6)", "proselint (>=0.14)", "sphinx (>=8.0.2)", "sphinx-autodoc-typehints (>=2.4)"]
-test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=8.3.2)", "pytest-cov (>=5)", "pytest-mock (>=3.14)"]
-type = ["mypy (>=1.11.2)"]
+docs = ["furo (>=2023.9.10)", "proselint (>=0.13)", "sphinx (>=7.2.6)", "sphinx-autodoc-typehints (>=1.25.2)"]
+test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.4.3)", "pytest-cov (>=4.1)", "pytest-mock (>=3.12)"]
+type = ["mypy (>=1.8)"]
 
 [[package]]
 name = "pluggy"
@@ -1357,13 +1265,13 @@ testing = ["pytest", "pytest-benchmark"]
 
 [[package]]
 name = "pre-commit"
-version = "3.8.0"
+version = "3.7.1"
 description = "A framework for managing and maintaining multi-language pre-commit hooks."
 optional = false
 python-versions = ">=3.9"
 files = [
-    {file = "pre_commit-3.8.0-py2.py3-none-any.whl", hash = "sha256:9a90a53bf82fdd8778d58085faf8d83df56e40dfe18f45b19446e26bf1b3a63f"},
-    {file = "pre_commit-3.8.0.tar.gz", hash = "sha256:8bb6494d4a20423842e198980c9ecf9f96607a07ea29549e180eef9ae80fe7af"},
+    {file = "pre_commit-3.7.1-py2.py3-none-any.whl", hash = "sha256:fae36fd1d7ad7d6a5a1c0b0d5adb2ed1a3bda5a21bf6c3e5372073d7a11cd4c5"},
+    {file = "pre_commit-3.7.1.tar.gz", hash = "sha256:8ca3ad567bc78a4972a3f1a477e94a79d4597e8140a6e0b651c5e33899c3654a"},
 ]
 
 [package.dependencies]
@@ -1375,13 +1283,13 @@ virtualenv = ">=20.10.0"
 
 [[package]]
 name = "pycodestyle"
-version = "2.12.1"
+version = "2.12.0"
 description = "Python style guide checker"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "pycodestyle-2.12.1-py2.py3-none-any.whl", hash = "sha256:46f0fb92069a7c28ab7bb558f05bfc0110dac69a0cd23c61ea0040283a9d78b3"},
-    {file = "pycodestyle-2.12.1.tar.gz", hash = "sha256:6838eae08bbce4f6accd5d5572075c63626a15ee3e6f842df996bf62f6d73521"},
+    {file = "pycodestyle-2.12.0-py2.py3-none-any.whl", hash = "sha256:949a39f6b86c3e1515ba1787c2022131d165a8ad271b11370a8819aa070269e4"},
+    {file = "pycodestyle-2.12.0.tar.gz", hash = "sha256:442f950141b4f43df752dd303511ffded3a04c2b6fb7f65980574f0c31e6e79c"},
 ]
 
 [[package]]
@@ -1397,150 +1305,127 @@ files = [
 
 [[package]]
 name = "pydantic"
-version = "2.9.2"
+version = "2.8.2"
 description = "Data validation using Python type hints"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "pydantic-2.9.2-py3-none-any.whl", hash = "sha256:f048cec7b26778210e28a0459867920654d48e5e62db0958433636cde4254f12"},
-    {file = "pydantic-2.9.2.tar.gz", hash = "sha256:d155cef71265d1e9807ed1c32b4c8deec042a44a50a4188b25ac67ecd81a9c0f"},
+    {file = "pydantic-2.8.2-py3-none-any.whl", hash = "sha256:73ee9fddd406dc318b885c7a2eab8a6472b68b8fb5ba8150949fc3db939f23c8"},
+    {file = "pydantic-2.8.2.tar.gz", hash = "sha256:6f62c13d067b0755ad1c21a34bdd06c0c12625a22b0fc09c6b149816604f7c2a"},
 ]
 
 [package.dependencies]
-annotated-types = ">=0.6.0"
-pydantic-core = "2.23.4"
+annotated-types = ">=0.4.0"
+pydantic-core = "2.20.1"
 typing-extensions = [
-    {version = ">=4.12.2", markers = "python_version >= \"3.13\""},
     {version = ">=4.6.1", markers = "python_version < \"3.13\""},
+    {version = ">=4.12.2", markers = "python_version >= \"3.13\""},
 ]
 
 [package.extras]
 email = ["email-validator (>=2.0.0)"]
-timezone = ["tzdata"]
 
 [[package]]
 name = "pydantic-core"
-version = "2.23.4"
+version = "2.20.1"
 description = "Core functionality for Pydantic validation and serialization"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "pydantic_core-2.23.4-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:b10bd51f823d891193d4717448fab065733958bdb6a6b351967bd349d48d5c9b"},
-    {file = "pydantic_core-2.23.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:4fc714bdbfb534f94034efaa6eadd74e5b93c8fa6315565a222f7b6f42ca1166"},
-    {file = "pydantic_core-2.23.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:63e46b3169866bd62849936de036f901a9356e36376079b05efa83caeaa02ceb"},
-    {file = "pydantic_core-2.23.4-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ed1a53de42fbe34853ba90513cea21673481cd81ed1be739f7f2efb931b24916"},
-    {file = "pydantic_core-2.23.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cfdd16ab5e59fc31b5e906d1a3f666571abc367598e3e02c83403acabc092e07"},
-    {file = "pydantic_core-2.23.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:255a8ef062cbf6674450e668482456abac99a5583bbafb73f9ad469540a3a232"},
-    {file = "pydantic_core-2.23.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4a7cd62e831afe623fbb7aabbb4fe583212115b3ef38a9f6b71869ba644624a2"},
-    {file = "pydantic_core-2.23.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f09e2ff1f17c2b51f2bc76d1cc33da96298f0a036a137f5440ab3ec5360b624f"},
-    {file = "pydantic_core-2.23.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:e38e63e6f3d1cec5a27e0afe90a085af8b6806ee208b33030e65b6516353f1a3"},
-    {file = "pydantic_core-2.23.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:0dbd8dbed2085ed23b5c04afa29d8fd2771674223135dc9bc937f3c09284d071"},
-    {file = "pydantic_core-2.23.4-cp310-none-win32.whl", hash = "sha256:6531b7ca5f951d663c339002e91aaebda765ec7d61b7d1e3991051906ddde119"},
-    {file = "pydantic_core-2.23.4-cp310-none-win_amd64.whl", hash = "sha256:7c9129eb40958b3d4500fa2467e6a83356b3b61bfff1b414c7361d9220f9ae8f"},
-    {file = "pydantic_core-2.23.4-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:77733e3892bb0a7fa797826361ce8a9184d25c8dffaec60b7ffe928153680ba8"},
-    {file = "pydantic_core-2.23.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1b84d168f6c48fabd1f2027a3d1bdfe62f92cade1fb273a5d68e621da0e44e6d"},
-    {file = "pydantic_core-2.23.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:df49e7a0861a8c36d089c1ed57d308623d60416dab2647a4a17fe050ba85de0e"},
-    {file = "pydantic_core-2.23.4-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ff02b6d461a6de369f07ec15e465a88895f3223eb75073ffea56b84d9331f607"},
-    {file = "pydantic_core-2.23.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:996a38a83508c54c78a5f41456b0103c30508fed9abcad0a59b876d7398f25fd"},
-    {file = "pydantic_core-2.23.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d97683ddee4723ae8c95d1eddac7c192e8c552da0c73a925a89fa8649bf13eea"},
-    {file = "pydantic_core-2.23.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:216f9b2d7713eb98cb83c80b9c794de1f6b7e3145eef40400c62e86cee5f4e1e"},
-    {file = "pydantic_core-2.23.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6f783e0ec4803c787bcea93e13e9932edab72068f68ecffdf86a99fd5918878b"},
-    {file = "pydantic_core-2.23.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:d0776dea117cf5272382634bd2a5c1b6eb16767c223c6a5317cd3e2a757c61a0"},
-    {file = "pydantic_core-2.23.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d5f7a395a8cf1621939692dba2a6b6a830efa6b3cee787d82c7de1ad2930de64"},
-    {file = "pydantic_core-2.23.4-cp311-none-win32.whl", hash = "sha256:74b9127ffea03643e998e0c5ad9bd3811d3dac8c676e47db17b0ee7c3c3bf35f"},
-    {file = "pydantic_core-2.23.4-cp311-none-win_amd64.whl", hash = "sha256:98d134c954828488b153d88ba1f34e14259284f256180ce659e8d83e9c05eaa3"},
-    {file = "pydantic_core-2.23.4-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:f3e0da4ebaef65158d4dfd7d3678aad692f7666877df0002b8a522cdf088f231"},
-    {file = "pydantic_core-2.23.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f69a8e0b033b747bb3e36a44e7732f0c99f7edd5cea723d45bc0d6e95377ffee"},
-    {file = "pydantic_core-2.23.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:723314c1d51722ab28bfcd5240d858512ffd3116449c557a1336cbe3919beb87"},
-    {file = "pydantic_core-2.23.4-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:bb2802e667b7051a1bebbfe93684841cc9351004e2badbd6411bf357ab8d5ac8"},
-    {file = "pydantic_core-2.23.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d18ca8148bebe1b0a382a27a8ee60350091a6ddaf475fa05ef50dc35b5df6327"},
-    {file = "pydantic_core-2.23.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:33e3d65a85a2a4a0dc3b092b938a4062b1a05f3a9abde65ea93b233bca0e03f2"},
-    {file = "pydantic_core-2.23.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:128585782e5bfa515c590ccee4b727fb76925dd04a98864182b22e89a4e6ed36"},
-    {file = "pydantic_core-2.23.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:68665f4c17edcceecc112dfed5dbe6f92261fb9d6054b47d01bf6371a6196126"},
-    {file = "pydantic_core-2.23.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:20152074317d9bed6b7a95ade3b7d6054845d70584216160860425f4fbd5ee9e"},
-    {file = "pydantic_core-2.23.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:9261d3ce84fa1d38ed649c3638feefeae23d32ba9182963e465d58d62203bd24"},
-    {file = "pydantic_core-2.23.4-cp312-none-win32.whl", hash = "sha256:4ba762ed58e8d68657fc1281e9bb72e1c3e79cc5d464be146e260c541ec12d84"},
-    {file = "pydantic_core-2.23.4-cp312-none-win_amd64.whl", hash = "sha256:97df63000f4fea395b2824da80e169731088656d1818a11b95f3b173747b6cd9"},
-    {file = "pydantic_core-2.23.4-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:7530e201d10d7d14abce4fb54cfe5b94a0aefc87da539d0346a484ead376c3cc"},
-    {file = "pydantic_core-2.23.4-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:df933278128ea1cd77772673c73954e53a1c95a4fdf41eef97c2b779271bd0bd"},
-    {file = "pydantic_core-2.23.4-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0cb3da3fd1b6a5d0279a01877713dbda118a2a4fc6f0d821a57da2e464793f05"},
-    {file = "pydantic_core-2.23.4-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:42c6dcb030aefb668a2b7009c85b27f90e51e6a3b4d5c9bc4c57631292015b0d"},
-    {file = "pydantic_core-2.23.4-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:696dd8d674d6ce621ab9d45b205df149399e4bb9aa34102c970b721554828510"},
-    {file = "pydantic_core-2.23.4-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2971bb5ffe72cc0f555c13e19b23c85b654dd2a8f7ab493c262071377bfce9f6"},
-    {file = "pydantic_core-2.23.4-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8394d940e5d400d04cad4f75c0598665cbb81aecefaca82ca85bd28264af7f9b"},
-    {file = "pydantic_core-2.23.4-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:0dff76e0602ca7d4cdaacc1ac4c005e0ce0dcfe095d5b5259163a80d3a10d327"},
-    {file = "pydantic_core-2.23.4-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:7d32706badfe136888bdea71c0def994644e09fff0bfe47441deaed8e96fdbc6"},
-    {file = "pydantic_core-2.23.4-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:ed541d70698978a20eb63d8c5d72f2cc6d7079d9d90f6b50bad07826f1320f5f"},
-    {file = "pydantic_core-2.23.4-cp313-none-win32.whl", hash = "sha256:3d5639516376dce1940ea36edf408c554475369f5da2abd45d44621cb616f769"},
-    {file = "pydantic_core-2.23.4-cp313-none-win_amd64.whl", hash = "sha256:5a1504ad17ba4210df3a045132a7baeeba5a200e930f57512ee02909fc5c4cb5"},
-    {file = "pydantic_core-2.23.4-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:d4488a93b071c04dc20f5cecc3631fc78b9789dd72483ba15d423b5b3689b555"},
-    {file = "pydantic_core-2.23.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:81965a16b675b35e1d09dd14df53f190f9129c0202356ed44ab2728b1c905658"},
-    {file = "pydantic_core-2.23.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4ffa2ebd4c8530079140dd2d7f794a9d9a73cbb8e9d59ffe24c63436efa8f271"},
-    {file = "pydantic_core-2.23.4-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:61817945f2fe7d166e75fbfb28004034b48e44878177fc54d81688e7b85a3665"},
-    {file = "pydantic_core-2.23.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:29d2c342c4bc01b88402d60189f3df065fb0dda3654744d5a165a5288a657368"},
-    {file = "pydantic_core-2.23.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5e11661ce0fd30a6790e8bcdf263b9ec5988e95e63cf901972107efc49218b13"},
-    {file = "pydantic_core-2.23.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9d18368b137c6295db49ce7218b1a9ba15c5bc254c96d7c9f9e924a9bc7825ad"},
-    {file = "pydantic_core-2.23.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:ec4e55f79b1c4ffb2eecd8a0cfba9955a2588497d96851f4c8f99aa4a1d39b12"},
-    {file = "pydantic_core-2.23.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:374a5e5049eda9e0a44c696c7ade3ff355f06b1fe0bb945ea3cac2bc336478a2"},
-    {file = "pydantic_core-2.23.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:5c364564d17da23db1106787675fc7af45f2f7b58b4173bfdd105564e132e6fb"},
-    {file = "pydantic_core-2.23.4-cp38-none-win32.whl", hash = "sha256:d7a80d21d613eec45e3d41eb22f8f94ddc758a6c4720842dc74c0581f54993d6"},
-    {file = "pydantic_core-2.23.4-cp38-none-win_amd64.whl", hash = "sha256:5f5ff8d839f4566a474a969508fe1c5e59c31c80d9e140566f9a37bba7b8d556"},
-    {file = "pydantic_core-2.23.4-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:a4fa4fc04dff799089689f4fd502ce7d59de529fc2f40a2c8836886c03e0175a"},
-    {file = "pydantic_core-2.23.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:0a7df63886be5e270da67e0966cf4afbae86069501d35c8c1b3b6c168f42cb36"},
-    {file = "pydantic_core-2.23.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dcedcd19a557e182628afa1d553c3895a9f825b936415d0dbd3cd0bbcfd29b4b"},
-    {file = "pydantic_core-2.23.4-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5f54b118ce5de9ac21c363d9b3caa6c800341e8c47a508787e5868c6b79c9323"},
-    {file = "pydantic_core-2.23.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:86d2f57d3e1379a9525c5ab067b27dbb8a0642fb5d454e17a9ac434f9ce523e3"},
-    {file = "pydantic_core-2.23.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:de6d1d1b9e5101508cb37ab0d972357cac5235f5c6533d1071964c47139257df"},
-    {file = "pydantic_core-2.23.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1278e0d324f6908e872730c9102b0112477a7f7cf88b308e4fc36ce1bdb6d58c"},
-    {file = "pydantic_core-2.23.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9a6b5099eeec78827553827f4c6b8615978bb4b6a88e5d9b93eddf8bb6790f55"},
-    {file = "pydantic_core-2.23.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:e55541f756f9b3ee346b840103f32779c695a19826a4c442b7954550a0972040"},
-    {file = "pydantic_core-2.23.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a5c7ba8ffb6d6f8f2ab08743be203654bb1aaa8c9dcb09f82ddd34eadb695605"},
-    {file = "pydantic_core-2.23.4-cp39-none-win32.whl", hash = "sha256:37b0fe330e4a58d3c58b24d91d1eb102aeec675a3db4c292ec3928ecd892a9a6"},
-    {file = "pydantic_core-2.23.4-cp39-none-win_amd64.whl", hash = "sha256:1498bec4c05c9c787bde9125cfdcc63a41004ff167f495063191b863399b1a29"},
-    {file = "pydantic_core-2.23.4-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:f455ee30a9d61d3e1a15abd5068827773d6e4dc513e795f380cdd59932c782d5"},
-    {file = "pydantic_core-2.23.4-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:1e90d2e3bd2c3863d48525d297cd143fe541be8bbf6f579504b9712cb6b643ec"},
-    {file = "pydantic_core-2.23.4-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2e203fdf807ac7e12ab59ca2bfcabb38c7cf0b33c41efeb00f8e5da1d86af480"},
-    {file = "pydantic_core-2.23.4-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e08277a400de01bc72436a0ccd02bdf596631411f592ad985dcee21445bd0068"},
-    {file = "pydantic_core-2.23.4-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f220b0eea5965dec25480b6333c788fb72ce5f9129e8759ef876a1d805d00801"},
-    {file = "pydantic_core-2.23.4-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:d06b0c8da4f16d1d1e352134427cb194a0a6e19ad5db9161bf32b2113409e728"},
-    {file = "pydantic_core-2.23.4-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:ba1a0996f6c2773bd83e63f18914c1de3c9dd26d55f4ac302a7efe93fb8e7433"},
-    {file = "pydantic_core-2.23.4-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:9a5bce9d23aac8f0cf0836ecfc033896aa8443b501c58d0602dbfd5bd5b37753"},
-    {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:78ddaaa81421a29574a682b3179d4cf9e6d405a09b99d93ddcf7e5239c742e21"},
-    {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:883a91b5dd7d26492ff2f04f40fbb652de40fcc0afe07e8129e8ae779c2110eb"},
-    {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:88ad334a15b32a791ea935af224b9de1bf99bcd62fabf745d5f3442199d86d59"},
-    {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:233710f069d251feb12a56da21e14cca67994eab08362207785cf8c598e74577"},
-    {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:19442362866a753485ba5e4be408964644dd6a09123d9416c54cd49171f50744"},
-    {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:624e278a7d29b6445e4e813af92af37820fafb6dcc55c012c834f9e26f9aaaef"},
-    {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:f5ef8f42bec47f21d07668a043f077d507e5bf4e668d5c6dfe6aaba89de1a5b8"},
-    {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:aea443fffa9fbe3af1a9ba721a87f926fe548d32cab71d188a6ede77d0ff244e"},
-    {file = "pydantic_core-2.23.4.tar.gz", hash = "sha256:2584f7cf844ac4d970fba483a717dbe10c1c1c96a969bf65d61ffe94df1b2863"},
+    {file = "pydantic_core-2.20.1-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:3acae97ffd19bf091c72df4d726d552c473f3576409b2a7ca36b2f535ffff4a3"},
+    {file = "pydantic_core-2.20.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:41f4c96227a67a013e7de5ff8f20fb496ce573893b7f4f2707d065907bffdbd6"},
+    {file = "pydantic_core-2.20.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5f239eb799a2081495ea659d8d4a43a8f42cd1fe9ff2e7e436295c38a10c286a"},
+    {file = "pydantic_core-2.20.1-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:53e431da3fc53360db73eedf6f7124d1076e1b4ee4276b36fb25514544ceb4a3"},
+    {file = "pydantic_core-2.20.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f1f62b2413c3a0e846c3b838b2ecd6c7a19ec6793b2a522745b0869e37ab5bc1"},
+    {file = "pydantic_core-2.20.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5d41e6daee2813ecceea8eda38062d69e280b39df793f5a942fa515b8ed67953"},
+    {file = "pydantic_core-2.20.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3d482efec8b7dc6bfaedc0f166b2ce349df0011f5d2f1f25537ced4cfc34fd98"},
+    {file = "pydantic_core-2.20.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:e93e1a4b4b33daed65d781a57a522ff153dcf748dee70b40c7258c5861e1768a"},
+    {file = "pydantic_core-2.20.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:e7c4ea22b6739b162c9ecaaa41d718dfad48a244909fe7ef4b54c0b530effc5a"},
+    {file = "pydantic_core-2.20.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:4f2790949cf385d985a31984907fecb3896999329103df4e4983a4a41e13e840"},
+    {file = "pydantic_core-2.20.1-cp310-none-win32.whl", hash = "sha256:5e999ba8dd90e93d57410c5e67ebb67ffcaadcea0ad973240fdfd3a135506250"},
+    {file = "pydantic_core-2.20.1-cp310-none-win_amd64.whl", hash = "sha256:512ecfbefef6dac7bc5eaaf46177b2de58cdf7acac8793fe033b24ece0b9566c"},
+    {file = "pydantic_core-2.20.1-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:d2a8fa9d6d6f891f3deec72f5cc668e6f66b188ab14bb1ab52422fe8e644f312"},
+    {file = "pydantic_core-2.20.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:175873691124f3d0da55aeea1d90660a6ea7a3cfea137c38afa0a5ffabe37b88"},
+    {file = "pydantic_core-2.20.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:37eee5b638f0e0dcd18d21f59b679686bbd18917b87db0193ae36f9c23c355fc"},
+    {file = "pydantic_core-2.20.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:25e9185e2d06c16ee438ed39bf62935ec436474a6ac4f9358524220f1b236e43"},
+    {file = "pydantic_core-2.20.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:150906b40ff188a3260cbee25380e7494ee85048584998c1e66df0c7a11c17a6"},
+    {file = "pydantic_core-2.20.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8ad4aeb3e9a97286573c03df758fc7627aecdd02f1da04516a86dc159bf70121"},
+    {file = "pydantic_core-2.20.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d3f3ed29cd9f978c604708511a1f9c2fdcb6c38b9aae36a51905b8811ee5cbf1"},
+    {file = "pydantic_core-2.20.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b0dae11d8f5ded51699c74d9548dcc5938e0804cc8298ec0aa0da95c21fff57b"},
+    {file = "pydantic_core-2.20.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:faa6b09ee09433b87992fb5a2859efd1c264ddc37280d2dd5db502126d0e7f27"},
+    {file = "pydantic_core-2.20.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:9dc1b507c12eb0481d071f3c1808f0529ad41dc415d0ca11f7ebfc666e66a18b"},
+    {file = "pydantic_core-2.20.1-cp311-none-win32.whl", hash = "sha256:fa2fddcb7107e0d1808086ca306dcade7df60a13a6c347a7acf1ec139aa6789a"},
+    {file = "pydantic_core-2.20.1-cp311-none-win_amd64.whl", hash = "sha256:40a783fb7ee353c50bd3853e626f15677ea527ae556429453685ae32280c19c2"},
+    {file = "pydantic_core-2.20.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:595ba5be69b35777474fa07f80fc260ea71255656191adb22a8c53aba4479231"},
+    {file = "pydantic_core-2.20.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a4f55095ad087474999ee28d3398bae183a66be4823f753cd7d67dd0153427c9"},
+    {file = "pydantic_core-2.20.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f9aa05d09ecf4c75157197f27cdc9cfaeb7c5f15021c6373932bf3e124af029f"},
+    {file = "pydantic_core-2.20.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e97fdf088d4b31ff4ba35db26d9cc472ac7ef4a2ff2badeabf8d727b3377fc52"},
+    {file = "pydantic_core-2.20.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bc633a9fe1eb87e250b5c57d389cf28998e4292336926b0b6cdaee353f89a237"},
+    {file = "pydantic_core-2.20.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d573faf8eb7e6b1cbbcb4f5b247c60ca8be39fe2c674495df0eb4318303137fe"},
+    {file = "pydantic_core-2.20.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:26dc97754b57d2fd00ac2b24dfa341abffc380b823211994c4efac7f13b9e90e"},
+    {file = "pydantic_core-2.20.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:33499e85e739a4b60c9dac710c20a08dc73cb3240c9a0e22325e671b27b70d24"},
+    {file = "pydantic_core-2.20.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:bebb4d6715c814597f85297c332297c6ce81e29436125ca59d1159b07f423eb1"},
+    {file = "pydantic_core-2.20.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:516d9227919612425c8ef1c9b869bbbee249bc91912c8aaffb66116c0b447ebd"},
+    {file = "pydantic_core-2.20.1-cp312-none-win32.whl", hash = "sha256:469f29f9093c9d834432034d33f5fe45699e664f12a13bf38c04967ce233d688"},
+    {file = "pydantic_core-2.20.1-cp312-none-win_amd64.whl", hash = "sha256:035ede2e16da7281041f0e626459bcae33ed998cca6a0a007a5ebb73414ac72d"},
+    {file = "pydantic_core-2.20.1-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:0827505a5c87e8aa285dc31e9ec7f4a17c81a813d45f70b1d9164e03a813a686"},
+    {file = "pydantic_core-2.20.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:19c0fa39fa154e7e0b7f82f88ef85faa2a4c23cc65aae2f5aea625e3c13c735a"},
+    {file = "pydantic_core-2.20.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4aa223cd1e36b642092c326d694d8bf59b71ddddc94cdb752bbbb1c5c91d833b"},
+    {file = "pydantic_core-2.20.1-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c336a6d235522a62fef872c6295a42ecb0c4e1d0f1a3e500fe949415761b8a19"},
+    {file = "pydantic_core-2.20.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7eb6a0587eded33aeefea9f916899d42b1799b7b14b8f8ff2753c0ac1741edac"},
+    {file = "pydantic_core-2.20.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:70c8daf4faca8da5a6d655f9af86faf6ec2e1768f4b8b9d0226c02f3d6209703"},
+    {file = "pydantic_core-2.20.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e9fa4c9bf273ca41f940bceb86922a7667cd5bf90e95dbb157cbb8441008482c"},
+    {file = "pydantic_core-2.20.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:11b71d67b4725e7e2a9f6e9c0ac1239bbc0c48cce3dc59f98635efc57d6dac83"},
+    {file = "pydantic_core-2.20.1-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:270755f15174fb983890c49881e93f8f1b80f0b5e3a3cc1394a255706cabd203"},
+    {file = "pydantic_core-2.20.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:c81131869240e3e568916ef4c307f8b99583efaa60a8112ef27a366eefba8ef0"},
+    {file = "pydantic_core-2.20.1-cp313-none-win32.whl", hash = "sha256:b91ced227c41aa29c672814f50dbb05ec93536abf8f43cd14ec9521ea09afe4e"},
+    {file = "pydantic_core-2.20.1-cp313-none-win_amd64.whl", hash = "sha256:65db0f2eefcaad1a3950f498aabb4875c8890438bc80b19362cf633b87a8ab20"},
+    {file = "pydantic_core-2.20.1-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:4745f4ac52cc6686390c40eaa01d48b18997cb130833154801a442323cc78f91"},
+    {file = "pydantic_core-2.20.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:a8ad4c766d3f33ba8fd692f9aa297c9058970530a32c728a2c4bfd2616d3358b"},
+    {file = "pydantic_core-2.20.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:41e81317dd6a0127cabce83c0c9c3fbecceae981c8391e6f1dec88a77c8a569a"},
+    {file = "pydantic_core-2.20.1-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:04024d270cf63f586ad41fff13fde4311c4fc13ea74676962c876d9577bcc78f"},
+    {file = "pydantic_core-2.20.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:eaad4ff2de1c3823fddf82f41121bdf453d922e9a238642b1dedb33c4e4f98ad"},
+    {file = "pydantic_core-2.20.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:26ab812fa0c845df815e506be30337e2df27e88399b985d0bb4e3ecfe72df31c"},
+    {file = "pydantic_core-2.20.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3c5ebac750d9d5f2706654c638c041635c385596caf68f81342011ddfa1e5598"},
+    {file = "pydantic_core-2.20.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2aafc5a503855ea5885559eae883978c9b6d8c8993d67766ee73d82e841300dd"},
+    {file = "pydantic_core-2.20.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:4868f6bd7c9d98904b748a2653031fc9c2f85b6237009d475b1008bfaeb0a5aa"},
+    {file = "pydantic_core-2.20.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:aa2f457b4af386254372dfa78a2eda2563680d982422641a85f271c859df1987"},
+    {file = "pydantic_core-2.20.1-cp38-none-win32.whl", hash = "sha256:225b67a1f6d602de0ce7f6c1c3ae89a4aa25d3de9be857999e9124f15dab486a"},
+    {file = "pydantic_core-2.20.1-cp38-none-win_amd64.whl", hash = "sha256:6b507132dcfc0dea440cce23ee2182c0ce7aba7054576efc65634f080dbe9434"},
+    {file = "pydantic_core-2.20.1-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:b03f7941783b4c4a26051846dea594628b38f6940a2fdc0df00b221aed39314c"},
+    {file = "pydantic_core-2.20.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1eedfeb6089ed3fad42e81a67755846ad4dcc14d73698c120a82e4ccf0f1f9f6"},
+    {file = "pydantic_core-2.20.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:635fee4e041ab9c479e31edda27fcf966ea9614fff1317e280d99eb3e5ab6fe2"},
+    {file = "pydantic_core-2.20.1-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:77bf3ac639c1ff567ae3b47f8d4cc3dc20f9966a2a6dd2311dcc055d3d04fb8a"},
+    {file = "pydantic_core-2.20.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7ed1b0132f24beeec5a78b67d9388656d03e6a7c837394f99257e2d55b461611"},
+    {file = "pydantic_core-2.20.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c6514f963b023aeee506678a1cf821fe31159b925c4b76fe2afa94cc70b3222b"},
+    {file = "pydantic_core-2.20.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:10d4204d8ca33146e761c79f83cc861df20e7ae9f6487ca290a97702daf56006"},
+    {file = "pydantic_core-2.20.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2d036c7187b9422ae5b262badb87a20a49eb6c5238b2004e96d4da1231badef1"},
+    {file = "pydantic_core-2.20.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:9ebfef07dbe1d93efb94b4700f2d278494e9162565a54f124c404a5656d7ff09"},
+    {file = "pydantic_core-2.20.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:6b9d9bb600328a1ce523ab4f454859e9d439150abb0906c5a1983c146580ebab"},
+    {file = "pydantic_core-2.20.1-cp39-none-win32.whl", hash = "sha256:784c1214cb6dd1e3b15dd8b91b9a53852aed16671cc3fbe4786f4f1db07089e2"},
+    {file = "pydantic_core-2.20.1-cp39-none-win_amd64.whl", hash = "sha256:d2fe69c5434391727efa54b47a1e7986bb0186e72a41b203df8f5b0a19a4f669"},
+    {file = "pydantic_core-2.20.1-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:a45f84b09ac9c3d35dfcf6a27fd0634d30d183205230a0ebe8373a0e8cfa0906"},
+    {file = "pydantic_core-2.20.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:d02a72df14dfdbaf228424573a07af10637bd490f0901cee872c4f434a735b94"},
+    {file = "pydantic_core-2.20.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d2b27e6af28f07e2f195552b37d7d66b150adbaa39a6d327766ffd695799780f"},
+    {file = "pydantic_core-2.20.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:084659fac3c83fd674596612aeff6041a18402f1e1bc19ca39e417d554468482"},
+    {file = "pydantic_core-2.20.1-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:242b8feb3c493ab78be289c034a1f659e8826e2233786e36f2893a950a719bb6"},
+    {file = "pydantic_core-2.20.1-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:38cf1c40a921d05c5edc61a785c0ddb4bed67827069f535d794ce6bcded919fc"},
+    {file = "pydantic_core-2.20.1-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:e0bbdd76ce9aa5d4209d65f2b27fc6e5ef1312ae6c5333c26db3f5ade53a1e99"},
+    {file = "pydantic_core-2.20.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:254ec27fdb5b1ee60684f91683be95e5133c994cc54e86a0b0963afa25c8f8a6"},
+    {file = "pydantic_core-2.20.1-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:407653af5617f0757261ae249d3fba09504d7a71ab36ac057c938572d1bc9331"},
+    {file = "pydantic_core-2.20.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:c693e916709c2465b02ca0ad7b387c4f8423d1db7b4649c551f27a529181c5ad"},
+    {file = "pydantic_core-2.20.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5b5ff4911aea936a47d9376fd3ab17e970cc543d1b68921886e7f64bd28308d1"},
+    {file = "pydantic_core-2.20.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:177f55a886d74f1808763976ac4efd29b7ed15c69f4d838bbd74d9d09cf6fa86"},
+    {file = "pydantic_core-2.20.1-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:964faa8a861d2664f0c7ab0c181af0bea66098b1919439815ca8803ef136fc4e"},
+    {file = "pydantic_core-2.20.1-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:4dd484681c15e6b9a977c785a345d3e378d72678fd5f1f3c0509608da24f2ac0"},
+    {file = "pydantic_core-2.20.1-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:f6d6cff3538391e8486a431569b77921adfcdef14eb18fbf19b7c0a5294d4e6a"},
+    {file = "pydantic_core-2.20.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a6d511cc297ff0883bc3708b465ff82d7560193169a8b93260f74ecb0a5e08a7"},
+    {file = "pydantic_core-2.20.1.tar.gz", hash = "sha256:26ca695eeee5f9f1aeeb211ffc12f10bcb6f71e2989988fda61dabd65db878d4"},
 ]
 
 [package.dependencies]
 typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0"
 
-[[package]]
-name = "pydantic-extra-types"
-version = "2.9.0"
-description = "Extra Pydantic types."
-optional = false
-python-versions = ">=3.8"
-files = [
-    {file = "pydantic_extra_types-2.9.0-py3-none-any.whl", hash = "sha256:f0bb975508572ba7bf3390b7337807588463b7248587e69f43b1ad7c797530d0"},
-    {file = "pydantic_extra_types-2.9.0.tar.gz", hash = "sha256:e061c01636188743bb69f368dcd391f327b8cfbfede2fe1cbb1211b06601ba3b"},
-]
-
-[package.dependencies]
-pydantic = ">=2.5.2"
-
-[package.extras]
-all = ["pendulum (>=3.0.0,<4.0.0)", "phonenumbers (>=8,<9)", "pycountry (>=23)", "python-ulid (>=1,<2)", "python-ulid (>=1,<3)", "pytz (>=2024.1)", "semver (>=3.0.2)", "tzdata (>=2024.1)"]
-pendulum = ["pendulum (>=3.0.0,<4.0.0)"]
-phonenumbers = ["phonenumbers (>=8,<9)"]
-pycountry = ["pycountry (>=23)"]
-python-ulid = ["python-ulid (>=1,<2)", "python-ulid (>=1,<3)"]
-semver = ["semver (>=3.0.2)"]
-
 [[package]]
 name = "pydocstyle"
 version = "6.3.0"
@@ -1671,86 +1556,84 @@ test = ["coverage (>=5,<6)", "mock (==1.3.0)", "pytest (>=7,<8)", "pytest-mock (
 
 [[package]]
 name = "pytz"
-version = "2024.2"
+version = "2024.1"
 description = "World timezone definitions, modern and historical"
 optional = false
 python-versions = "*"
 files = [
-    {file = "pytz-2024.2-py2.py3-none-any.whl", hash = "sha256:31c7c1817eb7fae7ca4b8c7ee50c72f93aa2dd863de768e1ef4245d426aa0725"},
-    {file = "pytz-2024.2.tar.gz", hash = "sha256:2aa355083c50a0f93fa581709deac0c9ad65cca8a9e9beac660adcbd493c798a"},
+    {file = "pytz-2024.1-py2.py3-none-any.whl", hash = "sha256:328171f4e3623139da4983451950b28e95ac706e13f3f2630a879749e7a8b319"},
+    {file = "pytz-2024.1.tar.gz", hash = "sha256:2a29735ea9c18baf14b448846bde5a48030ed267578472d8955cd0e7443a9812"},
 ]
 
 [[package]]
 name = "pywin32-ctypes"
-version = "0.2.3"
+version = "0.2.2"
 description = "A (partial) reimplementation of pywin32 using ctypes/cffi"
 optional = false
 python-versions = ">=3.6"
 files = [
-    {file = "pywin32-ctypes-0.2.3.tar.gz", hash = "sha256:d162dc04946d704503b2edc4d55f3dba5c1d539ead017afa00142c38b9885755"},
-    {file = "pywin32_ctypes-0.2.3-py3-none-any.whl", hash = "sha256:8a1513379d709975552d202d942d9837758905c8d01eb82b8bcc30918929e7b8"},
+    {file = "pywin32-ctypes-0.2.2.tar.gz", hash = "sha256:3426e063bdd5fd4df74a14fa3cf80a0b42845a87e1d1e81f6549f9daec593a60"},
+    {file = "pywin32_ctypes-0.2.2-py3-none-any.whl", hash = "sha256:bf490a1a709baf35d688fe0ecf980ed4de11d2b3e37b51e5442587a75d9957e7"},
 ]
 
 [[package]]
 name = "pyyaml"
-version = "6.0.2"
+version = "6.0.1"
 description = "YAML parser and emitter for Python"
 optional = false
-python-versions = ">=3.8"
+python-versions = ">=3.6"
 files = [
-    {file = "PyYAML-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086"},
-    {file = "PyYAML-6.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf"},
-    {file = "PyYAML-6.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8824b5a04a04a047e72eea5cec3bc266db09e35de6bdfe34c9436ac5ee27d237"},
-    {file = "PyYAML-6.0.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7c36280e6fb8385e520936c3cb3b8042851904eba0e58d277dca80a5cfed590b"},
-    {file = "PyYAML-6.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec031d5d2feb36d1d1a24380e4db6d43695f3748343d99434e6f5f9156aaa2ed"},
-    {file = "PyYAML-6.0.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:936d68689298c36b53b29f23c6dbb74de12b4ac12ca6cfe0e047bedceea56180"},
-    {file = "PyYAML-6.0.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:23502f431948090f597378482b4812b0caae32c22213aecf3b55325e049a6c68"},
-    {file = "PyYAML-6.0.2-cp310-cp310-win32.whl", hash = "sha256:2e99c6826ffa974fe6e27cdb5ed0021786b03fc98e5ee3c5bfe1fd5015f42b99"},
-    {file = "PyYAML-6.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:a4d3091415f010369ae4ed1fc6b79def9416358877534caf6a0fdd2146c87a3e"},
-    {file = "PyYAML-6.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:cc1c1159b3d456576af7a3e4d1ba7e6924cb39de8f67111c735f6fc832082774"},
-    {file = "PyYAML-6.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1e2120ef853f59c7419231f3bf4e7021f1b936f6ebd222406c3b60212205d2ee"},
-    {file = "PyYAML-6.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5d225db5a45f21e78dd9358e58a98702a0302f2659a3c6cd320564b75b86f47c"},
-    {file = "PyYAML-6.0.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5ac9328ec4831237bec75defaf839f7d4564be1e6b25ac710bd1a96321cc8317"},
-    {file = "PyYAML-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ad2a3decf9aaba3d29c8f537ac4b243e36bef957511b4766cb0057d32b0be85"},
-    {file = "PyYAML-6.0.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:ff3824dc5261f50c9b0dfb3be22b4567a6f938ccce4587b38952d85fd9e9afe4"},
-    {file = "PyYAML-6.0.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:797b4f722ffa07cc8d62053e4cff1486fa6dc094105d13fea7b1de7d8bf71c9e"},
-    {file = "PyYAML-6.0.2-cp311-cp311-win32.whl", hash = "sha256:11d8f3dd2b9c1207dcaf2ee0bbbfd5991f571186ec9cc78427ba5bd32afae4b5"},
-    {file = "PyYAML-6.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:e10ce637b18caea04431ce14fabcf5c64a1c61ec9c56b071a4b7ca131ca52d44"},
-    {file = "PyYAML-6.0.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:c70c95198c015b85feafc136515252a261a84561b7b1d51e3384e0655ddf25ab"},
-    {file = "PyYAML-6.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ce826d6ef20b1bc864f0a68340c8b3287705cae2f8b4b1d932177dcc76721725"},
-    {file = "PyYAML-6.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f71ea527786de97d1a0cc0eacd1defc0985dcf6b3f17bb77dcfc8c34bec4dc5"},
-    {file = "PyYAML-6.0.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9b22676e8097e9e22e36d6b7bda33190d0d400f345f23d4065d48f4ca7ae0425"},
-    {file = "PyYAML-6.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80bab7bfc629882493af4aa31a4cfa43a4c57c83813253626916b8c7ada83476"},
-    {file = "PyYAML-6.0.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:0833f8694549e586547b576dcfaba4a6b55b9e96098b36cdc7ebefe667dfed48"},
-    {file = "PyYAML-6.0.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8b9c7197f7cb2738065c481a0461e50ad02f18c78cd75775628afb4d7137fb3b"},
-    {file = "PyYAML-6.0.2-cp312-cp312-win32.whl", hash = "sha256:ef6107725bd54b262d6dedcc2af448a266975032bc85ef0172c5f059da6325b4"},
-    {file = "PyYAML-6.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:7e7401d0de89a9a855c839bc697c079a4af81cf878373abd7dc625847d25cbd8"},
-    {file = "PyYAML-6.0.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:efdca5630322a10774e8e98e1af481aad470dd62c3170801852d752aa7a783ba"},
-    {file = "PyYAML-6.0.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:50187695423ffe49e2deacb8cd10510bc361faac997de9efef88badc3bb9e2d1"},
-    {file = "PyYAML-6.0.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0ffe8360bab4910ef1b9e87fb812d8bc0a308b0d0eef8c8f44e0254ab3b07133"},
-    {file = "PyYAML-6.0.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:17e311b6c678207928d649faa7cb0d7b4c26a0ba73d41e99c4fff6b6c3276484"},
-    {file = "PyYAML-6.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:70b189594dbe54f75ab3a1acec5f1e3faa7e8cf2f1e08d9b561cb41b845f69d5"},
-    {file = "PyYAML-6.0.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:41e4e3953a79407c794916fa277a82531dd93aad34e29c2a514c2c0c5fe971cc"},
-    {file = "PyYAML-6.0.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:68ccc6023a3400877818152ad9a1033e3db8625d899c72eacb5a668902e4d652"},
-    {file = "PyYAML-6.0.2-cp313-cp313-win32.whl", hash = "sha256:bc2fa7c6b47d6bc618dd7fb02ef6fdedb1090ec036abab80d4681424b84c1183"},
-    {file = "PyYAML-6.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:8388ee1976c416731879ac16da0aff3f63b286ffdd57cdeb95f3f2e085687563"},
-    {file = "PyYAML-6.0.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:24471b829b3bf607e04e88d79542a9d48bb037c2267d7927a874e6c205ca7e9a"},
-    {file = "PyYAML-6.0.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7fded462629cfa4b685c5416b949ebad6cec74af5e2d42905d41e257e0869f5"},
-    {file = "PyYAML-6.0.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d84a1718ee396f54f3a086ea0a66d8e552b2ab2017ef8b420e92edbc841c352d"},
-    {file = "PyYAML-6.0.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9056c1ecd25795207ad294bcf39f2db3d845767be0ea6e6a34d856f006006083"},
-    {file = "PyYAML-6.0.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:82d09873e40955485746739bcb8b4586983670466c23382c19cffecbf1fd8706"},
-    {file = "PyYAML-6.0.2-cp38-cp38-win32.whl", hash = "sha256:43fa96a3ca0d6b1812e01ced1044a003533c47f6ee8aca31724f78e93ccc089a"},
-    {file = "PyYAML-6.0.2-cp38-cp38-win_amd64.whl", hash = "sha256:01179a4a8559ab5de078078f37e5c1a30d76bb88519906844fd7bdea1b7729ff"},
-    {file = "PyYAML-6.0.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:688ba32a1cffef67fd2e9398a2efebaea461578b0923624778664cc1c914db5d"},
-    {file = "PyYAML-6.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a8786accb172bd8afb8be14490a16625cbc387036876ab6ba70912730faf8e1f"},
-    {file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8e03406cac8513435335dbab54c0d385e4a49e4945d2909a581c83647ca0290"},
-    {file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f753120cb8181e736c57ef7636e83f31b9c0d1722c516f7e86cf15b7aa57ff12"},
-    {file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3b1fdb9dc17f5a7677423d508ab4f243a726dea51fa5e70992e59a7411c89d19"},
-    {file = "PyYAML-6.0.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:0b69e4ce7a131fe56b7e4d770c67429700908fc0752af059838b1cfb41960e4e"},
-    {file = "PyYAML-6.0.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a9f8c2e67970f13b16084e04f134610fd1d374bf477b17ec1599185cf611d725"},
-    {file = "PyYAML-6.0.2-cp39-cp39-win32.whl", hash = "sha256:6395c297d42274772abc367baaa79683958044e5d3835486c16da75d2a694631"},
-    {file = "PyYAML-6.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:39693e1f8320ae4f43943590b49779ffb98acb81f788220ea932a6b6c51004d8"},
-    {file = "pyyaml-6.0.2.tar.gz", hash = "sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e"},
+    {file = "PyYAML-6.0.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d858aa552c999bc8a8d57426ed01e40bef403cd8ccdd0fc5f6f04a00414cac2a"},
+    {file = "PyYAML-6.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:fd66fc5d0da6d9815ba2cebeb4205f95818ff4b79c3ebe268e75d961704af52f"},
+    {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"},
+    {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"},
+    {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"},
+    {file = "PyYAML-6.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290"},
+    {file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"},
+    {file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"},
+    {file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"},
+    {file = "PyYAML-6.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f003ed9ad21d6a4713f0a9b5a7a0a79e08dd0f221aff4525a2be4c346ee60aab"},
+    {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"},
+    {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"},
+    {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"},
+    {file = "PyYAML-6.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b"},
+    {file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"},
+    {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
+    {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
+    {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
+    {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"},
+    {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
+    {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
+    {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
+    {file = "PyYAML-6.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df"},
+    {file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"},
+    {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"},
+    {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"},
+    {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:afd7e57eddb1a54f0f1a974bc4391af8bcce0b444685d936840f125cf046d5bd"},
+    {file = "PyYAML-6.0.1-cp36-cp36m-win32.whl", hash = "sha256:fca0e3a251908a499833aa292323f32437106001d436eca0e6e7833256674585"},
+    {file = "PyYAML-6.0.1-cp36-cp36m-win_amd64.whl", hash = "sha256:f22ac1c3cac4dbc50079e965eba2c1058622631e526bd9afd45fedd49ba781fa"},
+    {file = "PyYAML-6.0.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:b1275ad35a5d18c62a7220633c913e1b42d44b46ee12554e5fd39c70a243d6a3"},
+    {file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:18aeb1bf9a78867dc38b259769503436b7c72f7a1f1f4c93ff9a17de54319b27"},
+    {file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:596106435fa6ad000c2991a98fa58eeb8656ef2325d7e158344fb33864ed87e3"},
+    {file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:baa90d3f661d43131ca170712d903e6295d1f7a0f595074f151c0aed377c9b9c"},
+    {file = "PyYAML-6.0.1-cp37-cp37m-win32.whl", hash = "sha256:9046c58c4395dff28dd494285c82ba00b546adfc7ef001486fbf0324bc174fba"},
+    {file = "PyYAML-6.0.1-cp37-cp37m-win_amd64.whl", hash = "sha256:4fb147e7a67ef577a588a0e2c17b6db51dda102c71de36f8549b6816a96e1867"},
+    {file = "PyYAML-6.0.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1d4c7e777c441b20e32f52bd377e0c409713e8bb1386e1099c2415f26e479595"},
+    {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"},
+    {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"},
+    {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"},
+    {file = "PyYAML-6.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6"},
+    {file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"},
+    {file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"},
+    {file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"},
+    {file = "PyYAML-6.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c8098ddcc2a85b61647b2590f825f3db38891662cfc2fc776415143f599bb859"},
+    {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"},
+    {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"},
+    {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"},
+    {file = "PyYAML-6.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5"},
+    {file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"},
+    {file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"},
+    {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"},
 ]
 
 [[package]]
@@ -1838,114 +1721,110 @@ idna2008 = ["idna"]
 
 [[package]]
 name = "rpds-py"
-version = "0.20.0"
+version = "0.19.0"
 description = "Python bindings to Rust's persistent data structures (rpds)"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "rpds_py-0.20.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:3ad0fda1635f8439cde85c700f964b23ed5fc2d28016b32b9ee5fe30da5c84e2"},
-    {file = "rpds_py-0.20.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9bb4a0d90fdb03437c109a17eade42dfbf6190408f29b2744114d11586611d6f"},
-    {file = "rpds_py-0.20.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c6377e647bbfd0a0b159fe557f2c6c602c159fc752fa316572f012fc0bf67150"},
-    {file = "rpds_py-0.20.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:eb851b7df9dda52dc1415ebee12362047ce771fc36914586b2e9fcbd7d293b3e"},
-    {file = "rpds_py-0.20.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1e0f80b739e5a8f54837be5d5c924483996b603d5502bfff79bf33da06164ee2"},
-    {file = "rpds_py-0.20.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5a8c94dad2e45324fc74dce25e1645d4d14df9a4e54a30fa0ae8bad9a63928e3"},
-    {file = "rpds_py-0.20.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f8e604fe73ba048c06085beaf51147eaec7df856824bfe7b98657cf436623daf"},
-    {file = "rpds_py-0.20.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:df3de6b7726b52966edf29663e57306b23ef775faf0ac01a3e9f4012a24a4140"},
-    {file = "rpds_py-0.20.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:cf258ede5bc22a45c8e726b29835b9303c285ab46fc7c3a4cc770736b5304c9f"},
-    {file = "rpds_py-0.20.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:55fea87029cded5df854ca7e192ec7bdb7ecd1d9a3f63d5c4eb09148acf4a7ce"},
-    {file = "rpds_py-0.20.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:ae94bd0b2f02c28e199e9bc51485d0c5601f58780636185660f86bf80c89af94"},
-    {file = "rpds_py-0.20.0-cp310-none-win32.whl", hash = "sha256:28527c685f237c05445efec62426d285e47a58fb05ba0090a4340b73ecda6dee"},
-    {file = "rpds_py-0.20.0-cp310-none-win_amd64.whl", hash = "sha256:238a2d5b1cad28cdc6ed15faf93a998336eb041c4e440dd7f902528b8891b399"},
-    {file = "rpds_py-0.20.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:ac2f4f7a98934c2ed6505aead07b979e6f999389f16b714448fb39bbaa86a489"},
-    {file = "rpds_py-0.20.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:220002c1b846db9afd83371d08d239fdc865e8f8c5795bbaec20916a76db3318"},
-    {file = "rpds_py-0.20.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8d7919548df3f25374a1f5d01fbcd38dacab338ef5f33e044744b5c36729c8db"},
-    {file = "rpds_py-0.20.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:758406267907b3781beee0f0edfe4a179fbd97c0be2e9b1154d7f0a1279cf8e5"},
-    {file = "rpds_py-0.20.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3d61339e9f84a3f0767b1995adfb171a0d00a1185192718a17af6e124728e0f5"},
-    {file = "rpds_py-0.20.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1259c7b3705ac0a0bd38197565a5d603218591d3f6cee6e614e380b6ba61c6f6"},
-    {file = "rpds_py-0.20.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5c1dc0f53856b9cc9a0ccca0a7cc61d3d20a7088201c0937f3f4048c1718a209"},
-    {file = "rpds_py-0.20.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:7e60cb630f674a31f0368ed32b2a6b4331b8350d67de53c0359992444b116dd3"},
-    {file = "rpds_py-0.20.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:dbe982f38565bb50cb7fb061ebf762c2f254ca3d8c20d4006878766e84266272"},
-    {file = "rpds_py-0.20.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:514b3293b64187172bc77c8fb0cdae26981618021053b30d8371c3a902d4d5ad"},
-    {file = "rpds_py-0.20.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:d0a26ffe9d4dd35e4dfdd1e71f46401cff0181c75ac174711ccff0459135fa58"},
-    {file = "rpds_py-0.20.0-cp311-none-win32.whl", hash = "sha256:89c19a494bf3ad08c1da49445cc5d13d8fefc265f48ee7e7556839acdacf69d0"},
-    {file = "rpds_py-0.20.0-cp311-none-win_amd64.whl", hash = "sha256:c638144ce971df84650d3ed0096e2ae7af8e62ecbbb7b201c8935c370df00a2c"},
-    {file = "rpds_py-0.20.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:a84ab91cbe7aab97f7446652d0ed37d35b68a465aeef8fc41932a9d7eee2c1a6"},
-    {file = "rpds_py-0.20.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:56e27147a5a4c2c21633ff8475d185734c0e4befd1c989b5b95a5d0db699b21b"},
-    {file = "rpds_py-0.20.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2580b0c34583b85efec8c5c5ec9edf2dfe817330cc882ee972ae650e7b5ef739"},
-    {file = "rpds_py-0.20.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b80d4a7900cf6b66bb9cee5c352b2d708e29e5a37fe9bf784fa97fc11504bf6c"},
-    {file = "rpds_py-0.20.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:50eccbf054e62a7b2209b28dc7a22d6254860209d6753e6b78cfaeb0075d7bee"},
-    {file = "rpds_py-0.20.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:49a8063ea4296b3a7e81a5dfb8f7b2d73f0b1c20c2af401fb0cdf22e14711a96"},
-    {file = "rpds_py-0.20.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ea438162a9fcbee3ecf36c23e6c68237479f89f962f82dae83dc15feeceb37e4"},
-    {file = "rpds_py-0.20.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:18d7585c463087bddcfa74c2ba267339f14f2515158ac4db30b1f9cbdb62c8ef"},
-    {file = "rpds_py-0.20.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:d4c7d1a051eeb39f5c9547e82ea27cbcc28338482242e3e0b7768033cb083821"},
-    {file = "rpds_py-0.20.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:e4df1e3b3bec320790f699890d41c59d250f6beda159ea3c44c3f5bac1976940"},
-    {file = "rpds_py-0.20.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:2cf126d33a91ee6eedc7f3197b53e87a2acdac63602c0f03a02dd69e4b138174"},
-    {file = "rpds_py-0.20.0-cp312-none-win32.whl", hash = "sha256:8bc7690f7caee50b04a79bf017a8d020c1f48c2a1077ffe172abec59870f1139"},
-    {file = "rpds_py-0.20.0-cp312-none-win_amd64.whl", hash = "sha256:0e13e6952ef264c40587d510ad676a988df19adea20444c2b295e536457bc585"},
-    {file = "rpds_py-0.20.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:aa9a0521aeca7d4941499a73ad7d4f8ffa3d1affc50b9ea11d992cd7eff18a29"},
-    {file = "rpds_py-0.20.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:4a1f1d51eccb7e6c32ae89243cb352389228ea62f89cd80823ea7dd1b98e0b91"},
-    {file = "rpds_py-0.20.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8a86a9b96070674fc88b6f9f71a97d2c1d3e5165574615d1f9168ecba4cecb24"},
-    {file = "rpds_py-0.20.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:6c8ef2ebf76df43f5750b46851ed1cdf8f109d7787ca40035fe19fbdc1acc5a7"},
-    {file = "rpds_py-0.20.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b74b25f024b421d5859d156750ea9a65651793d51b76a2e9238c05c9d5f203a9"},
-    {file = "rpds_py-0.20.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:57eb94a8c16ab08fef6404301c38318e2c5a32216bf5de453e2714c964c125c8"},
-    {file = "rpds_py-0.20.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e1940dae14e715e2e02dfd5b0f64a52e8374a517a1e531ad9412319dc3ac7879"},
-    {file = "rpds_py-0.20.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d20277fd62e1b992a50c43f13fbe13277a31f8c9f70d59759c88f644d66c619f"},
-    {file = "rpds_py-0.20.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:06db23d43f26478303e954c34c75182356ca9aa7797d22c5345b16871ab9c45c"},
-    {file = "rpds_py-0.20.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:b2a5db5397d82fa847e4c624b0c98fe59d2d9b7cf0ce6de09e4d2e80f8f5b3f2"},
-    {file = "rpds_py-0.20.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5a35df9f5548fd79cb2f52d27182108c3e6641a4feb0f39067911bf2adaa3e57"},
-    {file = "rpds_py-0.20.0-cp313-none-win32.whl", hash = "sha256:fd2d84f40633bc475ef2d5490b9c19543fbf18596dcb1b291e3a12ea5d722f7a"},
-    {file = "rpds_py-0.20.0-cp313-none-win_amd64.whl", hash = "sha256:9bc2d153989e3216b0559251b0c260cfd168ec78b1fac33dd485750a228db5a2"},
-    {file = "rpds_py-0.20.0-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:f2fbf7db2012d4876fb0d66b5b9ba6591197b0f165db8d99371d976546472a24"},
-    {file = "rpds_py-0.20.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:1e5f3cd7397c8f86c8cc72d5a791071431c108edd79872cdd96e00abd8497d29"},
-    {file = "rpds_py-0.20.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ce9845054c13696f7af7f2b353e6b4f676dab1b4b215d7fe5e05c6f8bb06f965"},
-    {file = "rpds_py-0.20.0-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c3e130fd0ec56cb76eb49ef52faead8ff09d13f4527e9b0c400307ff72b408e1"},
-    {file = "rpds_py-0.20.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4b16aa0107ecb512b568244ef461f27697164d9a68d8b35090e9b0c1c8b27752"},
-    {file = "rpds_py-0.20.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:aa7f429242aae2947246587d2964fad750b79e8c233a2367f71b554e9447949c"},
-    {file = "rpds_py-0.20.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:af0fc424a5842a11e28956e69395fbbeab2c97c42253169d87e90aac2886d751"},
-    {file = "rpds_py-0.20.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b8c00a3b1e70c1d3891f0db1b05292747f0dbcfb49c43f9244d04c70fbc40eb8"},
-    {file = "rpds_py-0.20.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:40ce74fc86ee4645d0a225498d091d8bc61f39b709ebef8204cb8b5a464d3c0e"},
-    {file = "rpds_py-0.20.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:4fe84294c7019456e56d93e8ababdad5a329cd25975be749c3f5f558abb48253"},
-    {file = "rpds_py-0.20.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:338ca4539aad4ce70a656e5187a3a31c5204f261aef9f6ab50e50bcdffaf050a"},
-    {file = "rpds_py-0.20.0-cp38-none-win32.whl", hash = "sha256:54b43a2b07db18314669092bb2de584524d1ef414588780261e31e85846c26a5"},
-    {file = "rpds_py-0.20.0-cp38-none-win_amd64.whl", hash = "sha256:a1862d2d7ce1674cffa6d186d53ca95c6e17ed2b06b3f4c476173565c862d232"},
-    {file = "rpds_py-0.20.0-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:3fde368e9140312b6e8b6c09fb9f8c8c2f00999d1823403ae90cc00480221b22"},
-    {file = "rpds_py-0.20.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9824fb430c9cf9af743cf7aaf6707bf14323fb51ee74425c380f4c846ea70789"},
-    {file = "rpds_py-0.20.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:11ef6ce74616342888b69878d45e9f779b95d4bd48b382a229fe624a409b72c5"},
-    {file = "rpds_py-0.20.0-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c52d3f2f82b763a24ef52f5d24358553e8403ce05f893b5347098014f2d9eff2"},
-    {file = "rpds_py-0.20.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9d35cef91e59ebbeaa45214861874bc6f19eb35de96db73e467a8358d701a96c"},
-    {file = "rpds_py-0.20.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d72278a30111e5b5525c1dd96120d9e958464316f55adb030433ea905866f4de"},
-    {file = "rpds_py-0.20.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b4c29cbbba378759ac5786730d1c3cb4ec6f8ababf5c42a9ce303dc4b3d08cda"},
-    {file = "rpds_py-0.20.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6632f2d04f15d1bd6fe0eedd3b86d9061b836ddca4c03d5cf5c7e9e6b7c14580"},
-    {file = "rpds_py-0.20.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:d0b67d87bb45ed1cd020e8fbf2307d449b68abc45402fe1a4ac9e46c3c8b192b"},
-    {file = "rpds_py-0.20.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:ec31a99ca63bf3cd7f1a5ac9fe95c5e2d060d3c768a09bc1d16e235840861420"},
-    {file = "rpds_py-0.20.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:22e6c9976e38f4d8c4a63bd8a8edac5307dffd3ee7e6026d97f3cc3a2dc02a0b"},
-    {file = "rpds_py-0.20.0-cp39-none-win32.whl", hash = "sha256:569b3ea770c2717b730b61998b6c54996adee3cef69fc28d444f3e7920313cf7"},
-    {file = "rpds_py-0.20.0-cp39-none-win_amd64.whl", hash = "sha256:e6900ecdd50ce0facf703f7a00df12374b74bbc8ad9fe0f6559947fb20f82364"},
-    {file = "rpds_py-0.20.0-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:617c7357272c67696fd052811e352ac54ed1d9b49ab370261a80d3b6ce385045"},
-    {file = "rpds_py-0.20.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:9426133526f69fcaba6e42146b4e12d6bc6c839b8b555097020e2b78ce908dcc"},
-    {file = "rpds_py-0.20.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:deb62214c42a261cb3eb04d474f7155279c1a8a8c30ac89b7dcb1721d92c3c02"},
-    {file = "rpds_py-0.20.0-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:fcaeb7b57f1a1e071ebd748984359fef83ecb026325b9d4ca847c95bc7311c92"},
-    {file = "rpds_py-0.20.0-pp310-pypy310_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d454b8749b4bd70dd0a79f428731ee263fa6995f83ccb8bada706e8d1d3ff89d"},
-    {file = "rpds_py-0.20.0-pp310-pypy310_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d807dc2051abe041b6649681dce568f8e10668e3c1c6543ebae58f2d7e617855"},
-    {file = "rpds_py-0.20.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c3c20f0ddeb6e29126d45f89206b8291352b8c5b44384e78a6499d68b52ae511"},
-    {file = "rpds_py-0.20.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b7f19250ceef892adf27f0399b9e5afad019288e9be756d6919cb58892129f51"},
-    {file = "rpds_py-0.20.0-pp310-pypy310_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:4f1ed4749a08379555cebf4650453f14452eaa9c43d0a95c49db50c18b7da075"},
-    {file = "rpds_py-0.20.0-pp310-pypy310_pp73-musllinux_1_2_i686.whl", hash = "sha256:dcedf0b42bcb4cfff4101d7771a10532415a6106062f005ab97d1d0ab5681c60"},
-    {file = "rpds_py-0.20.0-pp310-pypy310_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:39ed0d010457a78f54090fafb5d108501b5aa5604cc22408fc1c0c77eac14344"},
-    {file = "rpds_py-0.20.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:bb273176be34a746bdac0b0d7e4e2c467323d13640b736c4c477881a3220a989"},
-    {file = "rpds_py-0.20.0-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:f918a1a130a6dfe1d7fe0f105064141342e7dd1611f2e6a21cd2f5c8cb1cfb3e"},
-    {file = "rpds_py-0.20.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:f60012a73aa396be721558caa3a6fd49b3dd0033d1675c6d59c4502e870fcf0c"},
-    {file = "rpds_py-0.20.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3d2b1ad682a3dfda2a4e8ad8572f3100f95fad98cb99faf37ff0ddfe9cbf9d03"},
-    {file = "rpds_py-0.20.0-pp39-pypy39_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:614fdafe9f5f19c63ea02817fa4861c606a59a604a77c8cdef5aa01d28b97921"},
-    {file = "rpds_py-0.20.0-pp39-pypy39_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fa518bcd7600c584bf42e6617ee8132869e877db2f76bcdc281ec6a4113a53ab"},
-    {file = "rpds_py-0.20.0-pp39-pypy39_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f0475242f447cc6cb8a9dd486d68b2ef7fbee84427124c232bff5f63b1fe11e5"},
-    {file = "rpds_py-0.20.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f90a4cd061914a60bd51c68bcb4357086991bd0bb93d8aa66a6da7701370708f"},
-    {file = "rpds_py-0.20.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:def7400461c3a3f26e49078302e1c1b38f6752342c77e3cf72ce91ca69fb1bc1"},
-    {file = "rpds_py-0.20.0-pp39-pypy39_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:65794e4048ee837494aea3c21a28ad5fc080994dfba5b036cf84de37f7ad5074"},
-    {file = "rpds_py-0.20.0-pp39-pypy39_pp73-musllinux_1_2_i686.whl", hash = "sha256:faefcc78f53a88f3076b7f8be0a8f8d35133a3ecf7f3770895c25f8813460f08"},
-    {file = "rpds_py-0.20.0-pp39-pypy39_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:5b4f105deeffa28bbcdff6c49b34e74903139afa690e35d2d9e3c2c2fba18cec"},
-    {file = "rpds_py-0.20.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:fdfc3a892927458d98f3d55428ae46b921d1f7543b89382fdb483f5640daaec8"},
-    {file = "rpds_py-0.20.0.tar.gz", hash = "sha256:d72a210824facfdaf8768cf2d7ca25a042c30320b3020de2fa04640920d4e121"},
+    {file = "rpds_py-0.19.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:fb37bd599f031f1a6fb9e58ec62864ccf3ad549cf14bac527dbfa97123edcca4"},
+    {file = "rpds_py-0.19.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3384d278df99ec2c6acf701d067147320b864ef6727405d6470838476e44d9e8"},
+    {file = "rpds_py-0.19.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e54548e0be3ac117595408fd4ca0ac9278fde89829b0b518be92863b17ff67a2"},
+    {file = "rpds_py-0.19.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8eb488ef928cdbc05a27245e52de73c0d7c72a34240ef4d9893fdf65a8c1a955"},
+    {file = "rpds_py-0.19.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a5da93debdfe27b2bfc69eefb592e1831d957b9535e0943a0ee8b97996de21b5"},
+    {file = "rpds_py-0.19.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:79e205c70afddd41f6ee79a8656aec738492a550247a7af697d5bd1aee14f766"},
+    {file = "rpds_py-0.19.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:959179efb3e4a27610e8d54d667c02a9feaa86bbabaf63efa7faa4dfa780d4f1"},
+    {file = "rpds_py-0.19.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a6e605bb9edcf010f54f8b6a590dd23a4b40a8cb141255eec2a03db249bc915b"},
+    {file = "rpds_py-0.19.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:9133d75dc119a61d1a0ded38fb9ba40a00ef41697cc07adb6ae098c875195a3f"},
+    {file = "rpds_py-0.19.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:dd36b712d35e757e28bf2f40a71e8f8a2d43c8b026d881aa0c617b450d6865c9"},
+    {file = "rpds_py-0.19.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:354f3a91718489912f2e0fc331c24eaaf6a4565c080e00fbedb6015857c00582"},
+    {file = "rpds_py-0.19.0-cp310-none-win32.whl", hash = "sha256:ebcbf356bf5c51afc3290e491d3722b26aaf5b6af3c1c7f6a1b757828a46e336"},
+    {file = "rpds_py-0.19.0-cp310-none-win_amd64.whl", hash = "sha256:75a6076289b2df6c8ecb9d13ff79ae0cad1d5fb40af377a5021016d58cd691ec"},
+    {file = "rpds_py-0.19.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:6d45080095e585f8c5097897313def60caa2046da202cdb17a01f147fb263b81"},
+    {file = "rpds_py-0.19.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c5c9581019c96f865483d031691a5ff1cc455feb4d84fc6920a5ffc48a794d8a"},
+    {file = "rpds_py-0.19.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1540d807364c84516417115c38f0119dfec5ea5c0dd9a25332dea60b1d26fc4d"},
+    {file = "rpds_py-0.19.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9e65489222b410f79711dc3d2d5003d2757e30874096b2008d50329ea4d0f88c"},
+    {file = "rpds_py-0.19.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9da6f400eeb8c36f72ef6646ea530d6d175a4f77ff2ed8dfd6352842274c1d8b"},
+    {file = "rpds_py-0.19.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:37f46bb11858717e0efa7893c0f7055c43b44c103e40e69442db5061cb26ed34"},
+    {file = "rpds_py-0.19.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:071d4adc734de562bd11d43bd134330fb6249769b2f66b9310dab7460f4bf714"},
+    {file = "rpds_py-0.19.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9625367c8955e4319049113ea4f8fee0c6c1145192d57946c6ffcd8fe8bf48dd"},
+    {file = "rpds_py-0.19.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:e19509145275d46bc4d1e16af0b57a12d227c8253655a46bbd5ec317e941279d"},
+    {file = "rpds_py-0.19.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:4d438e4c020d8c39961deaf58f6913b1bf8832d9b6f62ec35bd93e97807e9cbc"},
+    {file = "rpds_py-0.19.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:90bf55d9d139e5d127193170f38c584ed3c79e16638890d2e36f23aa1630b952"},
+    {file = "rpds_py-0.19.0-cp311-none-win32.whl", hash = "sha256:8d6ad132b1bc13d05ffe5b85e7a01a3998bf3a6302ba594b28d61b8c2cf13aaf"},
+    {file = "rpds_py-0.19.0-cp311-none-win_amd64.whl", hash = "sha256:7ec72df7354e6b7f6eb2a17fa6901350018c3a9ad78e48d7b2b54d0412539a67"},
+    {file = "rpds_py-0.19.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:5095a7c838a8647c32aa37c3a460d2c48debff7fc26e1136aee60100a8cd8f68"},
+    {file = "rpds_py-0.19.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6f2f78ef14077e08856e788fa482107aa602636c16c25bdf59c22ea525a785e9"},
+    {file = "rpds_py-0.19.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b7cc6cb44f8636fbf4a934ca72f3e786ba3c9f9ba4f4d74611e7da80684e48d2"},
+    {file = "rpds_py-0.19.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:cf902878b4af334a09de7a45badbff0389e7cf8dc2e4dcf5f07125d0b7c2656d"},
+    {file = "rpds_py-0.19.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:688aa6b8aa724db1596514751ffb767766e02e5c4a87486ab36b8e1ebc1aedac"},
+    {file = "rpds_py-0.19.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:57dbc9167d48e355e2569346b5aa4077f29bf86389c924df25c0a8b9124461fb"},
+    {file = "rpds_py-0.19.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3b4cf5a9497874822341c2ebe0d5850fed392034caadc0bad134ab6822c0925b"},
+    {file = "rpds_py-0.19.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:8a790d235b9d39c70a466200d506bb33a98e2ee374a9b4eec7a8ac64c2c261fa"},
+    {file = "rpds_py-0.19.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1d16089dfa58719c98a1c06f2daceba6d8e3fb9b5d7931af4a990a3c486241cb"},
+    {file = "rpds_py-0.19.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:bc9128e74fe94650367fe23f37074f121b9f796cabbd2f928f13e9661837296d"},
+    {file = "rpds_py-0.19.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c8f77e661ffd96ff104bebf7d0f3255b02aa5d5b28326f5408d6284c4a8b3248"},
+    {file = "rpds_py-0.19.0-cp312-none-win32.whl", hash = "sha256:5f83689a38e76969327e9b682be5521d87a0c9e5a2e187d2bc6be4765f0d4600"},
+    {file = "rpds_py-0.19.0-cp312-none-win_amd64.whl", hash = "sha256:06925c50f86da0596b9c3c64c3837b2481337b83ef3519e5db2701df695453a4"},
+    {file = "rpds_py-0.19.0-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:52e466bea6f8f3a44b1234570244b1cff45150f59a4acae3fcc5fd700c2993ca"},
+    {file = "rpds_py-0.19.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:e21cc693045fda7f745c790cb687958161ce172ffe3c5719ca1764e752237d16"},
+    {file = "rpds_py-0.19.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b31f059878eb1f5da8b2fd82480cc18bed8dcd7fb8fe68370e2e6285fa86da6"},
+    {file = "rpds_py-0.19.0-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1dd46f309e953927dd018567d6a9e2fb84783963650171f6c5fe7e5c41fd5666"},
+    {file = "rpds_py-0.19.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:34a01a4490e170376cd79258b7f755fa13b1a6c3667e872c8e35051ae857a92b"},
+    {file = "rpds_py-0.19.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bcf426a8c38eb57f7bf28932e68425ba86def6e756a5b8cb4731d8e62e4e0223"},
+    {file = "rpds_py-0.19.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f68eea5df6347d3f1378ce992d86b2af16ad7ff4dcb4a19ccdc23dea901b87fb"},
+    {file = "rpds_py-0.19.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:dab8d921b55a28287733263c0e4c7db11b3ee22aee158a4de09f13c93283c62d"},
+    {file = "rpds_py-0.19.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:6fe87efd7f47266dfc42fe76dae89060038f1d9cb911f89ae7e5084148d1cc08"},
+    {file = "rpds_py-0.19.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:535d4b52524a961d220875688159277f0e9eeeda0ac45e766092bfb54437543f"},
+    {file = "rpds_py-0.19.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:8b1a94b8afc154fbe36978a511a1f155f9bd97664e4f1f7a374d72e180ceb0ae"},
+    {file = "rpds_py-0.19.0-cp38-none-win32.whl", hash = "sha256:7c98298a15d6b90c8f6e3caa6457f4f022423caa5fa1a1ca7a5e9e512bdb77a4"},
+    {file = "rpds_py-0.19.0-cp38-none-win_amd64.whl", hash = "sha256:b0da31853ab6e58a11db3205729133ce0df26e6804e93079dee095be3d681dc1"},
+    {file = "rpds_py-0.19.0-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:5039e3cef7b3e7a060de468a4a60a60a1f31786da94c6cb054e7a3c75906111c"},
+    {file = "rpds_py-0.19.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ab1932ca6cb8c7499a4d87cb21ccc0d3326f172cfb6a64021a889b591bb3045c"},
+    {file = "rpds_py-0.19.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f2afd2164a1e85226fcb6a1da77a5c8896c18bfe08e82e8ceced5181c42d2179"},
+    {file = "rpds_py-0.19.0-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b1c30841f5040de47a0046c243fc1b44ddc87d1b12435a43b8edff7e7cb1e0d0"},
+    {file = "rpds_py-0.19.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f757f359f30ec7dcebca662a6bd46d1098f8b9fb1fcd661a9e13f2e8ce343ba1"},
+    {file = "rpds_py-0.19.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:15e65395a59d2e0e96caf8ee5389ffb4604e980479c32742936ddd7ade914b22"},
+    {file = "rpds_py-0.19.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cb0f6eb3a320f24b94d177e62f4074ff438f2ad9d27e75a46221904ef21a7b05"},
+    {file = "rpds_py-0.19.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b228e693a2559888790936e20f5f88b6e9f8162c681830eda303bad7517b4d5a"},
+    {file = "rpds_py-0.19.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:2575efaa5d949c9f4e2cdbe7d805d02122c16065bfb8d95c129372d65a291a0b"},
+    {file = "rpds_py-0.19.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:5c872814b77a4e84afa293a1bee08c14daed1068b2bb1cc312edbf020bbbca2b"},
+    {file = "rpds_py-0.19.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:850720e1b383df199b8433a20e02b25b72f0fded28bc03c5bd79e2ce7ef050be"},
+    {file = "rpds_py-0.19.0-cp39-none-win32.whl", hash = "sha256:ce84a7efa5af9f54c0aa7692c45861c1667080814286cacb9958c07fc50294fb"},
+    {file = "rpds_py-0.19.0-cp39-none-win_amd64.whl", hash = "sha256:1c26da90b8d06227d7769f34915913911222d24ce08c0ab2d60b354e2d9c7aff"},
+    {file = "rpds_py-0.19.0-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:75969cf900d7be665ccb1622a9aba225cf386bbc9c3bcfeeab9f62b5048f4a07"},
+    {file = "rpds_py-0.19.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:8445f23f13339da640d1be8e44e5baf4af97e396882ebbf1692aecd67f67c479"},
+    {file = "rpds_py-0.19.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5a7c1062ef8aea3eda149f08120f10795835fc1c8bc6ad948fb9652a113ca55"},
+    {file = "rpds_py-0.19.0-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:462b0c18fbb48fdbf980914a02ee38c423a25fcc4cf40f66bacc95a2d2d73bc8"},
+    {file = "rpds_py-0.19.0-pp310-pypy310_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3208f9aea18991ac7f2b39721e947bbd752a1abbe79ad90d9b6a84a74d44409b"},
+    {file = "rpds_py-0.19.0-pp310-pypy310_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c3444fe52b82f122d8a99bf66777aed6b858d392b12f4c317da19f8234db4533"},
+    {file = "rpds_py-0.19.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:88cb4bac7185a9f0168d38c01d7a00addece9822a52870eee26b8d5b61409213"},
+    {file = "rpds_py-0.19.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6b130bd4163c93798a6b9bb96be64a7c43e1cec81126ffa7ffaa106e1fc5cef5"},
+    {file = "rpds_py-0.19.0-pp310-pypy310_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:a707b158b4410aefb6b054715545bbb21aaa5d5d0080217290131c49c2124a6e"},
+    {file = "rpds_py-0.19.0-pp310-pypy310_pp73-musllinux_1_2_i686.whl", hash = "sha256:dc9ac4659456bde7c567107556ab065801622396b435a3ff213daef27b495388"},
+    {file = "rpds_py-0.19.0-pp310-pypy310_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:81ea573aa46d3b6b3d890cd3c0ad82105985e6058a4baed03cf92518081eec8c"},
+    {file = "rpds_py-0.19.0-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:3f148c3f47f7f29a79c38cc5d020edcb5ca780020fab94dbc21f9af95c463581"},
+    {file = "rpds_py-0.19.0-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:b0906357f90784a66e89ae3eadc2654f36c580a7d65cf63e6a616e4aec3a81be"},
+    {file = "rpds_py-0.19.0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f629ecc2db6a4736b5ba95a8347b0089240d69ad14ac364f557d52ad68cf94b0"},
+    {file = "rpds_py-0.19.0-pp38-pypy38_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c6feacd1d178c30e5bc37184526e56740342fd2aa6371a28367bad7908d454fc"},
+    {file = "rpds_py-0.19.0-pp38-pypy38_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ae8b6068ee374fdfab63689be0963333aa83b0815ead5d8648389a8ded593378"},
+    {file = "rpds_py-0.19.0-pp38-pypy38_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:78d57546bad81e0da13263e4c9ce30e96dcbe720dbff5ada08d2600a3502e526"},
+    {file = "rpds_py-0.19.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a8b6683a37338818646af718c9ca2a07f89787551057fae57c4ec0446dc6224b"},
+    {file = "rpds_py-0.19.0-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:e8481b946792415adc07410420d6fc65a352b45d347b78fec45d8f8f0d7496f0"},
+    {file = "rpds_py-0.19.0-pp38-pypy38_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:bec35eb20792ea64c3c57891bc3ca0bedb2884fbac2c8249d9b731447ecde4fa"},
+    {file = "rpds_py-0.19.0-pp38-pypy38_pp73-musllinux_1_2_i686.whl", hash = "sha256:aa5476c3e3a402c37779e95f7b4048db2cb5b0ed0b9d006983965e93f40fe05a"},
+    {file = "rpds_py-0.19.0-pp38-pypy38_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:19d02c45f2507b489fd4df7b827940f1420480b3e2e471e952af4d44a1ea8e34"},
+    {file = "rpds_py-0.19.0-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:a3e2fd14c5d49ee1da322672375963f19f32b3d5953f0615b175ff7b9d38daed"},
+    {file = "rpds_py-0.19.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:93a91c2640645303e874eada51f4f33351b84b351a689d470f8108d0e0694210"},
+    {file = "rpds_py-0.19.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e5b9fc03bf76a94065299d4a2ecd8dfbae4ae8e2e8098bbfa6ab6413ca267709"},
+    {file = "rpds_py-0.19.0-pp39-pypy39_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5a4b07cdf3f84310c08c1de2c12ddadbb7a77568bcb16e95489f9c81074322ed"},
+    {file = "rpds_py-0.19.0-pp39-pypy39_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ba0ed0dc6763d8bd6e5de5cf0d746d28e706a10b615ea382ac0ab17bb7388633"},
+    {file = "rpds_py-0.19.0-pp39-pypy39_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:474bc83233abdcf2124ed3f66230a1c8435896046caa4b0b5ab6013c640803cc"},
+    {file = "rpds_py-0.19.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:329c719d31362355a96b435f4653e3b4b061fcc9eba9f91dd40804ca637d914e"},
+    {file = "rpds_py-0.19.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:ef9101f3f7b59043a34f1dccbb385ca760467590951952d6701df0da9893ca0c"},
+    {file = "rpds_py-0.19.0-pp39-pypy39_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:0121803b0f424ee2109d6e1f27db45b166ebaa4b32ff47d6aa225642636cd834"},
+    {file = "rpds_py-0.19.0-pp39-pypy39_pp73-musllinux_1_2_i686.whl", hash = "sha256:8344127403dea42f5970adccf6c5957a71a47f522171fafaf4c6ddb41b61703a"},
+    {file = "rpds_py-0.19.0-pp39-pypy39_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:443cec402ddd650bb2b885113e1dcedb22b1175c6be223b14246a714b61cd521"},
+    {file = "rpds_py-0.19.0.tar.gz", hash = "sha256:4fdc9afadbeb393b4bbbad75481e0ea78e4469f2e1d713a90811700830b553a9"},
 ]
 
 [[package]]
@@ -2034,24 +1913,24 @@ files = [
 
 [[package]]
 name = "tomlkit"
-version = "0.13.2"
+version = "0.13.0"
 description = "Style preserving TOML library"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "tomlkit-0.13.2-py3-none-any.whl", hash = "sha256:7a974427f6e119197f670fbbbeae7bef749a6c14e793db934baefc1b5f03efde"},
-    {file = "tomlkit-0.13.2.tar.gz", hash = "sha256:fff5fe59a87295b278abd31bec92c15d9bc4a06885ab12bcea52c71119392e79"},
+    {file = "tomlkit-0.13.0-py3-none-any.whl", hash = "sha256:7075d3042d03b80f603482d69bf0c8f345c2b30e41699fd8883227f89972b264"},
+    {file = "tomlkit-0.13.0.tar.gz", hash = "sha256:08ad192699734149f5b97b45f1f18dad7eb1b6d16bc72ad0c2335772650d7b72"},
 ]
 
 [[package]]
 name = "tqdm"
-version = "4.66.5"
+version = "4.66.4"
 description = "Fast, Extensible Progress Meter"
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "tqdm-4.66.5-py3-none-any.whl", hash = "sha256:90279a3770753eafc9194a0364852159802111925aa30eb3f9d85b0e805ac7cd"},
-    {file = "tqdm-4.66.5.tar.gz", hash = "sha256:e1020aef2e5096702d8a025ac7d16b1577279c9d63f8375b63083e9a5f0fcbad"},
+    {file = "tqdm-4.66.4-py3-none-any.whl", hash = "sha256:b75ca56b413b030bc3f00af51fd2c1a1a5eac6a0c1cca83cbb37a5c52abce644"},
+    {file = "tqdm-4.66.4.tar.gz", hash = "sha256:e4d936c9de8727928f3be6079590e97d9abfe8d39a590be678eb5919ffc186bb"},
 ]
 
 [package.dependencies]
@@ -2125,24 +2004,24 @@ typing-extensions = ">=3.7.4"
 
 [[package]]
 name = "tzdata"
-version = "2024.2"
+version = "2024.1"
 description = "Provider of IANA time zone data"
 optional = false
 python-versions = ">=2"
 files = [
-    {file = "tzdata-2024.2-py2.py3-none-any.whl", hash = "sha256:a48093786cdcde33cad18c2555e8532f34422074448fbc874186f0abd79565cd"},
-    {file = "tzdata-2024.2.tar.gz", hash = "sha256:7d85cc416e9382e69095b7bdf4afd9e3880418a2413feec7069d533d6b4e31cc"},
+    {file = "tzdata-2024.1-py2.py3-none-any.whl", hash = "sha256:9068bc196136463f5245e51efda838afa15aaeca9903f49050dfa2679db4d252"},
+    {file = "tzdata-2024.1.tar.gz", hash = "sha256:2674120f8d891909751c38abcdfd386ac0a5a1127954fbc332af6b5ceae07efd"},
 ]
 
 [[package]]
 name = "urllib3"
-version = "2.2.3"
+version = "2.2.2"
 description = "HTTP library with thread-safe connection pooling, file post, and more."
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "urllib3-2.2.3-py3-none-any.whl", hash = "sha256:ca899ca043dcb1bafa3e262d73aa25c465bfb49e0bd9dd5d59f1d0acba2f8fac"},
-    {file = "urllib3-2.2.3.tar.gz", hash = "sha256:e7d814a81dad81e6caf2ec9fdedb284ecc9c73076b62654547cc64ccdcae26e9"},
+    {file = "urllib3-2.2.2-py3-none-any.whl", hash = "sha256:a448b2f64d686155468037e1ace9f2d2199776e17f0a46610480d311f73e3472"},
+    {file = "urllib3-2.2.2.tar.gz", hash = "sha256:dd505485549a7a552833da5e6063639d0d177c04f23bc3864e41e5dc5f612168"},
 ]
 
 [package.extras]
@@ -2153,13 +2032,13 @@ zstd = ["zstandard (>=0.18.0)"]
 
 [[package]]
 name = "virtualenv"
-version = "20.26.5"
+version = "20.26.3"
 description = "Virtual Python Environment builder"
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "virtualenv-20.26.5-py3-none-any.whl", hash = "sha256:4f3ac17b81fba3ce3bd6f4ead2749a72da5929c01774948e243db9ba41df4ff6"},
-    {file = "virtualenv-20.26.5.tar.gz", hash = "sha256:ce489cac131aa58f4b25e321d6d186171f78e6cb13fafbf32a840cee67733ff4"},
+    {file = "virtualenv-20.26.3-py3-none-any.whl", hash = "sha256:8cc4a31139e796e9a7de2cd5cf2489de1217193116a8fd42328f1bd65f434589"},
+    {file = "virtualenv-20.26.3.tar.gz", hash = "sha256:4c43a2a236279d9ea36a0d76f98d84bd6ca94ac4e0f4a3b9d46d05e10fea542a"},
 ]
 
 [package.dependencies]
@@ -2173,13 +2052,13 @@ test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess
 
 [[package]]
 name = "wheel"
-version = "0.44.0"
+version = "0.43.0"
 description = "A built-package format for Python"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "wheel-0.44.0-py3-none-any.whl", hash = "sha256:2376a90c98cc337d18623527a97c31797bd02bad0033d41547043a1cbfbe448f"},
-    {file = "wheel-0.44.0.tar.gz", hash = "sha256:a29c3f2817e95ab89aa4660681ad547c0e9547f20e75b0562fe7723c9a2a9d49"},
+    {file = "wheel-0.43.0-py3-none-any.whl", hash = "sha256:55c570405f142630c6b9f72fe09d9b67cf1477fcf543ae5b8dcb1f5b7377da81"},
+    {file = "wheel-0.43.0.tar.gz", hash = "sha256:465ef92c69fa5c5da2d1cf8ac40559a8c940886afcef87dcf14b9470862f1d85"},
 ]
 
 [package.extras]
@@ -2187,24 +2066,20 @@ test = ["pytest (>=6.0.0)", "setuptools (>=65)"]
 
 [[package]]
 name = "zipp"
-version = "3.20.2"
+version = "3.19.2"
 description = "Backport of pathlib-compatible object wrapper for zip files"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "zipp-3.20.2-py3-none-any.whl", hash = "sha256:a817ac80d6cf4b23bf7f2828b7cabf326f15a001bea8b1f9b49631780ba28350"},
-    {file = "zipp-3.20.2.tar.gz", hash = "sha256:bc9eb26f4506fda01b81bcde0ca78103b6e62f991b381fec825435c836edbc29"},
+    {file = "zipp-3.19.2-py3-none-any.whl", hash = "sha256:f091755f667055f2d02b32c53771a7a6c8b47e1fdbc4b72a8b9072b3eef8015c"},
+    {file = "zipp-3.19.2.tar.gz", hash = "sha256:bf1dcf6450f873a13e952a29504887c89e6de7506209e5b1bcc3460135d4de19"},
 ]
 
 [package.extras]
-check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)"]
-cover = ["pytest-cov"]
 doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"]
-enabler = ["pytest-enabler (>=2.2)"]
-test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-ignore-flaky"]
-type = ["pytest-mypy"]
+test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy", "pytest-ruff (>=0.2.1)"]
 
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "452024ee94c190f635889484d209b5c588dfbdd9161a6c23fd95b3872359ad86"
+content-hash = "2256d7b264ca3af01e83a71107252a8f9cc57abcbe73bf1e4b6bebd33906cf9e"
diff --git a/pyproject.toml b/pyproject.toml
index c80e884..f05dcc0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -53,7 +53,6 @@ jsonref = "^1.1.0"
 json-schema-for-humans = "^1.0.0"
 tabulate = "^0.9.0"
 pandas = "^2.2.2"
-pydantic-extra-types = "^2.9.0"
 
 [tool.poetry.group.dev.dependencies]
 black = "^24.4.2"
@@ -120,7 +119,6 @@ module = [
     "pandas.*",
     "requests.*",
     "tabulate.*",
-    "yaml.*"
 ]
 ignore_missing_imports = true
 

From acb1cdcd448a516ef853bc52b4601499c7c3b02e Mon Sep 17 00:00:00 2001
From: Christoph Auer <cau@zurich.ibm.com>
Date: Wed, 25 Sep 2024 13:53:43 +0200
Subject: [PATCH 17/34] Cleanup

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
---
 docling_core/utils/ds_generate_jsonschema.py | 4 ++--
 pyproject.toml                               | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/docling_core/utils/ds_generate_jsonschema.py b/docling_core/utils/ds_generate_jsonschema.py
index 8bdf5d6..67acf19 100644
--- a/docling_core/utils/ds_generate_jsonschema.py
+++ b/docling_core/utils/ds_generate_jsonschema.py
@@ -6,7 +6,7 @@
 """Generate the JSON Schema of pydantic models and export them to files.
 
 Example:
-    python docling_core/utils/ds_generate_jsonschema.py legacy.base.TableCell
+    python docling_core/utils/ds_generate_jsonschema.py doc.base.TableCell
 
 """
 import argparse
@@ -48,7 +48,7 @@ def main() -> None:
     """Print the JSON Schema of a model."""
     argparser = argparse.ArgumentParser()
     argparser.add_argument(
-        "class_ref", help="Class reference, e.g., legacy.base.TableCell"
+        "class_ref", help="Class reference, e.g., doc.base.TableCell"
     )
     args = argparser.parse_args()
 
diff --git a/pyproject.toml b/pyproject.toml
index f05dcc0..3d353b7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -119,6 +119,7 @@ module = [
     "pandas.*",
     "requests.*",
     "tabulate.*",
+    "yaml.*",
 ]
 ignore_missing_imports = true
 

From ce0b7ee64750944e530d03a1cf22a75636fa2775 Mon Sep 17 00:00:00 2001
From: Christoph Auer <cau@zurich.ibm.com>
Date: Wed, 25 Sep 2024 15:50:51 +0200
Subject: [PATCH 18/34] Several improvements and cleanup

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
---
 docling_core/types/doc/base.py                   |  1 -
 docling_core/types/experimental/document.py      | 16 ++--------------
 .../experimental/2206.01062.experimental.yaml    |  1 +
 test/data/experimental/dummy_doc.yaml            |  1 +
 test/test_docling_doc.py                         | 12 ++++++------
 5 files changed, 10 insertions(+), 21 deletions(-)

diff --git a/docling_core/types/doc/base.py b/docling_core/types/doc/base.py
index fa04399..432e6f6 100644
--- a/docling_core/types/doc/base.py
+++ b/docling_core/types/doc/base.py
@@ -440,7 +440,6 @@ def export_to_document_tokens(
     ):
         """Export text element to document tokens format."""
         body = f"<{self.obj_type}>"
-        # body = f"<{self.name}>"
 
         assert DocumentToken.is_known_token(
             body
diff --git a/docling_core/types/experimental/document.py b/docling_core/types/experimental/document.py
index a944deb..ac5e1b3 100644
--- a/docling_core/types/experimental/document.py
+++ b/docling_core/types/experimental/document.py
@@ -73,9 +73,7 @@ class BaseTableData(BaseModel):  # TBD
     @property
     def grid(
         self,
-    ) -> List[
-        List[TableCell]
-    ]:  # TODO compute grid representation on the fly from table_cells
+    ) -> List[List[TableCell]]:
         """grid."""
         # Initialise empty table data grid (only empty cells)
         table_data = [
@@ -110,6 +108,7 @@ def grid(
 class FileInfo(BaseModel):
     """FileInfo."""
 
+    filename: str
     document_hash: str
 
 
@@ -264,7 +263,6 @@ def export_to_document_tokens(
 
         """
         body = f"<{self.label.value}>"
-        # body = f"<{self.name}>"
 
         assert DocumentToken.is_known_token(
             body
@@ -610,11 +608,6 @@ class DoclingDocument(DocumentTrees):
 
     pages: Dict[int, PageItem] = {}  # empty as default
 
-    # def add_furniture_group(self, name: str):
-    #    group = GroupItem(name=name)
-    #    self.furniture.children.append(group)
-    #    return group
-
     def add_group(
         self,
         label: Optional[GroupLabel] = None,
@@ -790,11 +783,6 @@ def num_pages(self):
         """num_pages."""
         return len(self.pages.values())
 
-    def build_page_trees(self):
-        """build_page_trees."""
-        # TODO: For every PageItem, update the furniture and body trees
-        # from the main doc.
-
     def iterate_elements(
         self,
         root: Optional[NodeItem] = None,
diff --git a/test/data/experimental/2206.01062.experimental.yaml b/test/data/experimental/2206.01062.experimental.yaml
index efd8bb1..1d72988 100644
--- a/test/data/experimental/2206.01062.experimental.yaml
+++ b/test/data/experimental/2206.01062.experimental.yaml
@@ -212,6 +212,7 @@ figures:
   references: []
 file_info:
   document_hash: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc
+  filename: "2206.01062.pdf"
 furniture:
   children: []
   dloc: '#/furniture'
diff --git a/test/data/experimental/dummy_doc.yaml b/test/data/experimental/dummy_doc.yaml
index 8b0116d..720a357 100644
--- a/test/data/experimental/dummy_doc.yaml
+++ b/test/data/experimental/dummy_doc.yaml
@@ -3,6 +3,7 @@
 description: { } # DescriptionType - TBD
 file_info: # FileInfo type
   document_hash: e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5
+  filename: dummy_doc
 
 # Root element for any headers, footers, framing, navigation elements, all other non-body text, type GroupItem
 furniture:
diff --git a/test/test_docling_doc.py b/test/test_docling_doc.py
index 51501f2..5818d6e 100644
--- a/test/test_docling_doc.py
+++ b/test/test_docling_doc.py
@@ -91,10 +91,10 @@ def _test_export_methods(doc):
 
 
 def _construct_doc() -> DoclingDocument:
-    doc = DoclingDocument(description={}, file_info=FileInfo(document_hash="xyz"))
+    doc = DoclingDocument(description={}, file_info=FileInfo(filename="dummy", document_hash="xyz"))
     # group, heading, paragraph, table, figure, title, list, provenance
-    doc.add_paragraph(label="text", text="Author 1\nAffiliation 1")
-    doc.add_paragraph(label="text", text="Author 2\nAffiliation 2")
+    doc.add_paragraph(label=DocItemLabel.TEXT, text="Author 1\nAffiliation 1")
+    doc.add_paragraph(label=DocItemLabel.TEXT, text="Author 2\nAffiliation 2")
     chapter1 = doc.add_group(
         label=GroupLabel.CHAPTER, name="Introduction"
     )  # can be done if such information is present, or ommitted.
@@ -106,13 +106,13 @@ def _construct_doc() -> DoclingDocument:
     )
     doc.add_paragraph(
         parent=chapter1,
-        label="text",
+        label=DocItemLabel.TEXT,
         text="This paper introduces the biggest invention ever made. ...",
     )
     mylist = doc.add_group(parent=chapter1, label=GroupLabel.LIST)
     doc.add_paragraph(
         parent=mylist,
-        label="list_item",
+        label=DocItemLabel.LIST_ITEM,
         text="Cooks your favourite meal before you know you want it.",
     )
     doc.add_paragraph(
@@ -193,7 +193,7 @@ def _construct_doc() -> DoclingDocument:
     table_el = BaseTableData(num_rows=3, num_cols=3, table_cells=table_cells)
     doc.add_table(data=table_el)
     fig_caption = doc.add_paragraph(
-        label="caption", text="This is the caption of figure 1."
+        label=DocItemLabel.CAPTION, text="This is the caption of figure 1."
     )
     doc.add_figure(data=BaseFigureData(), caption=fig_caption.get_ref())
     return doc

From c986ea5eabd11b017b79e7494fdafd084be55ffd Mon Sep 17 00:00:00 2001
From: Christoph Auer <cau@zurich.ibm.com>
Date: Wed, 25 Sep 2024 15:54:22 +0200
Subject: [PATCH 19/34] Format fixes

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
---
 test/test_docling_doc.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/test/test_docling_doc.py b/test/test_docling_doc.py
index 5818d6e..54c78ec 100644
--- a/test/test_docling_doc.py
+++ b/test/test_docling_doc.py
@@ -91,7 +91,9 @@ def _test_export_methods(doc):
 
 
 def _construct_doc() -> DoclingDocument:
-    doc = DoclingDocument(description={}, file_info=FileInfo(filename="dummy", document_hash="xyz"))
+    doc = DoclingDocument(
+        description={}, file_info=FileInfo(filename="dummy", document_hash="xyz")
+    )
     # group, heading, paragraph, table, figure, title, list, provenance
     doc.add_paragraph(label=DocItemLabel.TEXT, text="Author 1\nAffiliation 1")
     doc.add_paragraph(label=DocItemLabel.TEXT, text="Author 2\nAffiliation 2")

From 23894064986057dd3e40dfbba3b0327cdecc95ca Mon Sep 17 00:00:00 2001
From: Christoph Auer <cau@zurich.ibm.com>
Date: Fri, 27 Sep 2024 11:29:56 +0200
Subject: [PATCH 20/34] Big redesign for usage of hashes, several other fixes

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
---
 docling_core/types/experimental/__init__.py   |    25 +
 docling_core/types/experimental/document.py   |   241 +-
 docling_core/types/experimental/labels.py     |     8 +-
 pyproject.toml                                |     5 +
 .../experimental/2206.01062.experimental.yaml | 10370 +++++++++++++++-
 test/data/experimental/dummy_doc.yaml         |    41 +-
 test/test_docling_doc.py                      |    10 +-
 7 files changed, 10287 insertions(+), 413 deletions(-)

diff --git a/docling_core/types/experimental/__init__.py b/docling_core/types/experimental/__init__.py
index 79fe213..fdad4f0 100644
--- a/docling_core/types/experimental/__init__.py
+++ b/docling_core/types/experimental/__init__.py
@@ -4,3 +4,28 @@
 #
 
 """Package for models defined by the Document type."""
+
+from .base import BoundingBox, CoordOrigin, Size
+from .document import (
+    BaseFigureData,
+    BaseTableData,
+    DescriptionItem,
+    DocItem,
+    DoclingDocument,
+    DocumentOrigin,
+    DocumentTrees,
+    FigureItem,
+    FloatingItem,
+    GroupItem,
+    ImageRef,
+    KeyValueItem,
+    NodeItem,
+    PageItem,
+    ProvenanceItem,
+    RefItem,
+    Section,
+    TableCell,
+    TableItem,
+    TextItem,
+)
+from .labels import DocItemLabel, GroupLabel, TableCellLabel
diff --git a/docling_core/types/experimental/document.py b/docling_core/types/experimental/document.py
index ac5e1b3..db343cc 100644
--- a/docling_core/types/experimental/document.py
+++ b/docling_core/types/experimental/document.py
@@ -1,6 +1,8 @@
 """Models for the Docling Document data type."""
 
 import hashlib
+import json
+import mimetypes
 import typing
 from typing import Any, Dict, List, Optional, Tuple, Union
 
@@ -11,12 +13,14 @@
     ConfigDict,
     Field,
     computed_field,
+    field_serializer,
+    field_validator,
     model_validator,
 )
 from tabulate import tabulate
 
 from docling_core.types.doc.tokens import DocumentToken
-from docling_core.types.experimental.base import BoundingBox, Size
+from docling_core.types.experimental import BoundingBox, Size
 from docling_core.types.experimental.labels import DocItemLabel, GroupLabel
 
 Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
@@ -105,11 +109,43 @@ def grid(
         return table_data
 
 
-class FileInfo(BaseModel):
-    """FileInfo."""
+class DocumentOrigin(BaseModel):
+    """DocumentOrigin."""
 
-    filename: str
-    document_hash: str
+    mimetype: str  # the mimetype of the original file
+    binary_hash: Uint64  # the binary hash of the original file.
+    # TODO: Provide a utility method to generate this hash
+
+    filename: str  # The name of the original file, including extension, without path.
+    # Could stem from filesystem, source URI, Content-Disposition header, ...
+
+    uri: Optional[AnyUrl] = (
+        None  # any possible reference to a source file,
+        # from any file handler protocol (e.g. https://, file://, s3://)
+    )
+
+    @field_validator("binary_hash", mode="before")
+    @classmethod
+    def parse_hex_string(cls, value):
+        """parse_hex_string."""
+        if isinstance(value, str):
+            try:
+                # Convert hex string to an integer
+                hash_int = Uint64(value, 16)
+                # Mask to fit within 64 bits (unsigned)
+                return hash_int & 0xFFFFFFFFFFFFFFFF
+            except ValueError:
+                raise ValueError(f"Invalid sha256 hexdigest: {value}")
+        return value  # If already an int, return it as is.
+
+    @field_validator("mimetype")
+    @classmethod
+    def validate_mimetype(cls, v):
+        """validate_mimetype."""
+        # Check if the provided MIME type is valid using mimetypes module
+        if v not in mimetypes.types_map.values():
+            raise ValueError(f"'{v}' is not a valid MIME type")
+        return v
 
 
 class RefItem(BaseModel):
@@ -147,11 +183,20 @@ def resolve(self, doc: "DoclingDocument"):
 class ImageRef(BaseModel):
     """ImageRef."""
 
-    format: str  # png, etc.
-    dpi: int  # ...
+    mimetype: str
+    dpi: int
     size: Size
     uri: AnyUrl
 
+    @field_validator("mimetype")
+    @classmethod
+    def validate_mimetype(cls, v):
+        """validate_mimetype."""
+        # Check if the provided MIME type is valid using mimetypes module
+        if v not in mimetypes.types_map.values():
+            raise ValueError(f"'{v}' is not a valid MIME type")
+        return v
+
 
 class ProvenanceItem(BaseModel):
     """ProvenanceItem."""
@@ -164,27 +209,14 @@ class ProvenanceItem(BaseModel):
 class NodeItem(BaseModel):
     """NodeItem."""
 
-    dloc: str  # format spec ({document_hash}{json-path})
+    self_ref: str  # format spec: json-path
     parent: Optional[RefItem] = None
     children: List[RefItem] = []
+    hash: Uint64 = 0
 
     def get_ref(self):
         """get_ref."""
-        return RefItem(cref=f"#{self.dloc.split('#')[1]}")
-
-    @computed_field  # type: ignore
-    @property
-    def hash(self) -> Uint64:  # TODO align with hasher on deepsearch-glm
-        """hash."""
-        if not len(self.dloc):
-            return 0
-        hash_object = hashlib.sha256(self.dloc.encode("utf-8"))
-
-        # Convert the hash to an integer
-        hash_int = int.from_bytes(hash_object.digest(), "big")
-
-        # Mask it to fit within 64 bits
-        return Uint64(hash_int & 0xFFFFFFFFFFFFFFFF)  # 64-bit unsigned integer mask
+        return RefItem(cref=self.self_ref)
 
 
 class GroupItem(NodeItem):  # Container type, can't be a leaf node
@@ -264,9 +296,10 @@ def export_to_document_tokens(
         """
         body = f"<{self.label.value}>"
 
-        assert DocumentToken.is_known_token(
-            body
-        ), f"failed DocumentToken.is_known_token({body})"
+        # TODO: This must be done through an explicit mapping.
+        # assert DocumentToken.is_known_token(
+        #    body
+        # ), f"failed DocumentToken.is_known_token({body})"
 
         if add_location:
             body += self.get_location_tokens(
@@ -280,7 +313,7 @@ def export_to_document_tokens(
         if add_content and self.text is not None:
             body += self.text.strip()
 
-        body += f"</{self.label}>{new_line}"
+        body += f"</{self.label.value}>{new_line}"
 
         return body
 
@@ -577,9 +610,9 @@ class DocumentTrees(BaseModel):
     """DocumentTrees."""
 
     furniture: GroupItem = GroupItem(
-        name="_root_", dloc="#/furniture"
+        name="_root_", self_ref="#/furniture"
     )  # List[RefItem] = []
-    body: GroupItem = GroupItem(name="_root_", dloc="#/body")  # List[RefItem] = []
+    body: GroupItem = GroupItem(name="_root_", self_ref="#/body")  # List[RefItem] = []
 
 
 class PageItem(BaseModel):
@@ -587,18 +620,31 @@ class PageItem(BaseModel):
 
     # A page carries separate root items for furniture and body,
     # only referencing items on the page
-    hash: str  # page hash
+    hash: Uint64 = (
+        0  # dummy default, correct value ensured through
+        # field_serializer on DoclingDocument
+    )
     size: Size
     image: Optional[ImageRef] = None
     page_no: int
 
 
+class DescriptionItem(BaseModel):
+    """DescriptionItem."""
+
+
 class DoclingDocument(DocumentTrees):
     """DoclingDocument."""
 
-    version: str = "0.0.1"  # = SemanticVersion(version="0.0.1")
-    description: Any
-    file_info: FileInfo
+    version: str = "0.1.0"  # use SemanticVersion type instead
+    description: DescriptionItem
+    name: str  # The working name of this document, without extensions
+    # (could be taken from originating doc, or just "Untitled 1")
+    origin: Optional[DocumentOrigin] = (
+        None  # DoclingDocuments may specify an origin (converted to DoclingDocument).
+        # This is optional, e.g. a DoclingDocument could also be entirely
+        # generated from synthetic data.
+    )
 
     groups: List[GroupItem] = []
     texts: List[TextItem] = []
@@ -608,6 +654,41 @@ class DoclingDocument(DocumentTrees):
 
     pages: Dict[int, PageItem] = {}  # empty as default
 
+    def _compute_hash(self, obj):
+        hash_object = hashlib.sha256(obj.encode("utf-8"))
+        # Convert the hash to an integer
+        hash_int = int.from_bytes(hash_object.digest(), "big")
+        # Mask it to fit within 64 bits
+        return Uint64(hash_int & 0xFFFFFFFFFFFFFFFF)  # 64-bit unsigned integer mask
+
+    @computed_field
+    def hash(self) -> Uint64:
+        """hash."""
+        # Get a dictionary representation of the model, excluding the computed field.
+        # explicitly include fields to be sure the hash is stable.
+        # Must not include hash itself or the pages.
+        model_dict = self.model_dump(
+            mode="json",
+            by_alias=True,
+            include={
+                "version",
+                "name",
+                "description",
+                "origin",
+                "groups",
+                "texts",
+                "figures",
+                "tables",
+                "key_value_items",
+                # "furniture",
+                # "body",
+            },
+        )
+
+        json_string = json.dumps(model_dict, sort_keys=True)
+
+        return self._compute_hash(json_string)
+
     def add_group(
         self,
         label: Optional[GroupLabel] = None,
@@ -626,12 +707,11 @@ def add_group(
 
         group_index = len(self.groups)
         cref = f"#/groups/{group_index}"
-        dloc = f"{self.file_info.document_hash}{cref}"
 
-        group = GroupItem(dloc=dloc, parent=parent.get_ref())
-        if name:
+        group = GroupItem(self_ref=cref, parent=parent.get_ref())
+        if name is not None:
             group.name = name
-        if label:
+        if label is not None:
             group.label = label
 
         self.groups.append(group)
@@ -666,12 +746,11 @@ def add_paragraph(
 
         text_index = len(self.texts)
         cref = f"#/texts/{text_index}"
-        dloc = f"{self.file_info.document_hash}{cref}"
         text_item = item_cls(
             label=label,
             text=text,
             orig=orig,
-            dloc=dloc,
+            self_ref=cref,
             parent=parent.get_ref(),
         )
         if prov:
@@ -703,10 +782,9 @@ def add_table(
 
         table_index = len(self.tables)
         cref = f"#/tables/{table_index}"
-        dloc = f"{self.file_info.document_hash}{cref}"
 
         tbl_item = TableItem(
-            label=DocItemLabel.TABLE, data=data, dloc=dloc, parent=parent.get_ref()
+            label=DocItemLabel.TABLE, data=data, self_ref=cref, parent=parent.get_ref()
         )
         if prov:
             tbl_item.prov.append(prov)
@@ -739,10 +817,12 @@ def add_figure(
 
         figure_index = len(self.figures)
         cref = f"#/figures/{figure_index}"
-        dloc = f"{self.file_info.document_hash}{cref}"
 
         fig_item = FigureItem(
-            label=DocItemLabel.PICTURE, data=data, dloc=dloc, parent=parent.get_ref()
+            label=DocItemLabel.PICTURE,
+            data=data,
+            self_ref=cref,
+            parent=parent.get_ref(),
         )
         if prov:
             fig_item.prov.append(prov)
@@ -789,7 +869,7 @@ def iterate_elements(
         with_groups: bool = False,
         traverse_figures: bool = True,
         page_no: Optional[int] = None,
-        _level=0,  # fixed parameter, carries through the node nesting level
+        _level: int = 0,  # fixed parameter, carries through the node nesting level
     ) -> typing.Iterable[Tuple[NodeItem, int]]:  # tuple of node and level
         """iterate_elements.
 
@@ -839,14 +919,13 @@ def export_to_markdown(
         delim: str = "\n\n",
         from_element: int = 0,
         to_element: Optional[int] = None,
-        labels: list[str] = [
-            "title",
-            "subtitle-level-1",
-            "paragraph",
-            "caption",
-            "table",
-            "Text",
-            "text",
+        labels: list[DocItemLabel] = [
+            DocItemLabel.TITLE,
+            DocItemLabel.SECTION_HEADER,
+            DocItemLabel.PARAGRAPH,
+            DocItemLabel.CAPTION,
+            DocItemLabel.TABLE,
+            DocItemLabel.TEXT,
         ],
         strict_text: bool = False,
     ) -> str:
@@ -867,7 +946,7 @@ def export_to_markdown(
         :param delim: str:  (Default value = "\n\n")
         :param from_element: int:  (Default value = 0)
         :param to_element: Optional[int]:  (Default value = None)
-        :param labels: list[str]:  (Default value = ["title")
+        :param labels: list[DocItemLabel]
         :param "subtitle-level-1":
         :param "paragraph":
         :param "caption":
@@ -966,15 +1045,13 @@ def export_to_document_tokens(
         delim: str = "\n\n",
         from_element: int = 0,
         to_element: Optional[int] = None,
-        labels: list[str] = [
-            "title",
-            "subtitle-level-1",
-            "Section-header" "paragraph",
-            "caption",
-            "table",
-            "figure",
-            "text",
-            "Text",
+        labels: list[DocItemLabel] = [
+            DocItemLabel.TITLE,
+            DocItemLabel.SECTION_HEADER,
+            DocItemLabel.PARAGRAPH,
+            DocItemLabel.CAPTION,
+            DocItemLabel.TABLE,
+            DocItemLabel.TEXT,
         ],
         xsize: int = 100,
         ysize: int = 100,
@@ -994,7 +1071,7 @@ def export_to_document_tokens(
         :param delim: str:  (Default value = "\n\n")
         :param from_element: int:  (Default value = 0)
         :param to_element: Optional[int]:  (Default value = None)
-        :param labels: list[str]:  (Default value = ["title")
+        :param labels: list[DocItemLabel]
         :param "subtitle-level-1":
         :param "Section-header" "paragraph":
         :param "caption":
@@ -1092,7 +1169,7 @@ def export_to_document_tokens(
 
         return doctags
 
-    def add_page(self, page_no: int, size: Size, hash: str) -> PageItem:
+    def add_page(self, page_no: int, size: Size) -> PageItem:
         """add_page.
 
         :param page_no: int:
@@ -1100,7 +1177,41 @@ def add_page(self, page_no: int, size: Size, hash: str) -> PageItem:
         :param hash: str:
 
         """
-        pitem = PageItem(page_no=page_no, size=size, hash=hash)
+        pitem = PageItem(page_no=page_no, size=size, hash=page_no)
 
         self.pages[page_no] = pitem
         return pitem
+
+    @field_serializer("body", "furniture", mode="wrap")
+    def serialize_tree(self, value: NodeItem, handler):
+        """serialize_tree."""
+        for node, level in self.iterate_elements(root=value, with_groups=True):
+            node.hash = self._derive_hash(node.self_ref)
+
+        return handler(value)
+
+    @field_serializer("pages", mode="wrap")
+    def serialize_pages(self, pages: Dict[int, PageItem], handler):
+        """serialize_pages."""
+        for page in pages.values():
+            page.hash = self._derive_hash(str(page.page_no))
+
+        return handler(pages)
+
+    def update_hashes(self):
+        """update_hashes."""
+        # Updates the hashes on all elements, based on the computed document hash
+        for node, level in self.iterate_elements(root=self.body, with_groups=True):
+            node.hash = self._derive_hash(node.self_ref)
+
+        for node, level in self.iterate_elements(root=self.furniture, with_groups=True):
+            node.hash = self._derive_hash(node.self_ref)
+
+        for page in self.pages.values():
+            page.hash = self._derive_hash(str(page.page_no))
+
+    def _derive_hash(self, data: str) -> Uint64:
+        doc_hash = self.hash
+        combined = f"{doc_hash}{data}"
+
+        return self._compute_hash(combined)
diff --git a/docling_core/types/experimental/labels.py b/docling_core/types/experimental/labels.py
index 99abfca..117ccd1 100644
--- a/docling_core/types/experimental/labels.py
+++ b/docling_core/types/experimental/labels.py
@@ -29,8 +29,6 @@ class DocItemLabel(str, Enum):
     PARAGRAPH = "paragraph"  # explicitly a paragraph and not arbitrary text
     REFERENCE = "reference"
 
-    # To be completed...
-
 
 class GroupLabel(str, Enum):
     """GroupLabel."""
@@ -39,8 +37,8 @@ class GroupLabel(str, Enum):
     LIST = "list"  # group label for list container (not the list-items)
     CHAPTER = "chapter"
     SECTION = "section"
-
-    # ...
+    SHEET = "sheet"
+    SLIDE = "slide"
 
 
 class TableCellLabel(str, Enum):
@@ -48,5 +46,5 @@ class TableCellLabel(str, Enum):
 
     COLUMN_HEADER = "col_header"
     ROW_HEADER = "row_header"
-    SECTION = "row_section"
+    ROW_SECTION = "row_section"
     BODY = "body"
diff --git a/pyproject.toml b/pyproject.toml
index 3d353b7..b5bf7a9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -96,11 +96,16 @@ include_trailing_comma = true
 
 [tool.autoflake]
 in-place = true
+ignore-init-module-imports = true
 remove-all-unused-imports = true
 remove-unused-variables = true
 expand-star-imports = true
 recursive = true
 
+[tool.flake8]
+per-file-ignores = "__init__.py:F401"
+classmethod-decorators = "classmethod,validator"
+
 [tool.mypy]
 pretty = true
 # strict = true
diff --git a/test/data/experimental/2206.01062.experimental.yaml b/test/data/experimental/2206.01062.experimental.yaml
index 1d72988..88449bb 100644
--- a/test/data/experimental/2206.01062.experimental.yaml
+++ b/test/data/experimental/2206.01062.experimental.yaml
@@ -138,149 +138,203 @@ body:
   - $ref: '#/texts/125'
   - $ref: '#/texts/126'
   - $ref: '#/texts/127'
-  dloc: '#/body'
-  hash: 1876595454579351028
+  hash: 2982977287550829877
   label: unspecified
   name: _root_
   parent: null
+  self_ref: '#/body'
 description: {}
 figures:
 - captions:
   - $ref: '#/texts/12'
   children: []
   data: {}
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/figures/0
   footnotes: []
-  hash: 3823827261264467155
+  hash: 8845849056743015509
   image: null
   label: picture
   parent:
     $ref: '#/body'
-  prov: []
+  prov:
+  - bbox:
+      b: 266.1221618652344
+      coord_origin: BOTTOMLEFT
+      l: 324.3027038574219
+      r: 554.91796875
+      t: 543.5838623046875
+    charspan:
+    - 0
+    - 84
+    page_no: 1
   references: []
+  self_ref: '#/figures/0'
 - captions:
   - $ref: '#/texts/39'
   children: []
   data: {}
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/figures/1
   footnotes: []
-  hash: 2717789230650946439
+  hash: 9157218593054708372
   image: null
   label: picture
   parent:
     $ref: '#/body'
-  prov: []
+  prov:
+  - bbox:
+      b: 569.726806640625
+      coord_origin: BOTTOMLEFT
+      l: 88.16680145263672
+      r: 264.2818298339844
+      t: 698.8894653320312
+    charspan:
+    - 0
+    - 69
+    page_no: 3
   references: []
+  self_ref: '#/figures/1'
 - captions:
   - $ref: '#/texts/70'
   children: []
   data: {}
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/figures/2
   footnotes: []
-  hash: 11874686886604579344
+  hash: 2268600251493203652
   image: null
   label: picture
   parent:
     $ref: '#/body'
-  prov: []
+  prov:
+  - bbox:
+      b: 331.43994140625
+      coord_origin: BOTTOMLEFT
+      l: 315.8857116699219
+      r: 559.6527709960938
+      t: 707.0224609375
+    charspan:
+    - 0
+    - 130
+    page_no: 5
   references: []
+  self_ref: '#/figures/2'
 - captions:
   - $ref: '#/texts/76'
   children: []
   data: {}
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/figures/3
   footnotes: []
-  hash: 13157758373214615403
+  hash: 15280179124146488233
   image: null
   label: picture
   parent:
     $ref: '#/body'
-  prov: []
+  prov:
+  - bbox:
+      b: 531.372314453125
+      coord_origin: BOTTOMLEFT
+      l: 322.7086486816406
+      r: 553.7246704101562
+      t: 701.6975708007812
+    charspan:
+    - 0
+    - 71
+    page_no: 6
   references: []
+  self_ref: '#/figures/3'
 - captions:
   - $ref: '#/texts/117'
   children: []
   data: {}
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/figures/4
   footnotes: []
-  hash: 3241646916892239195
+  hash: 7078061312183845001
   image: null
   label: picture
   parent:
     $ref: '#/body'
-  prov: []
+  prov:
+  - bbox:
+      b: 343.73516845703125
+      coord_origin: BOTTOMLEFT
+      l: 53.59891891479492
+      r: 554.9424438476562
+      t: 708.443115234375
+    charspan:
+    - 0
+    - 188
+    page_no: 9
   references: []
-file_info:
-  document_hash: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc
-  filename: "2206.01062.pdf"
+  self_ref: '#/figures/4'
 furniture:
   children: []
-  dloc: '#/furniture'
-  hash: 5280524054814059340
+  hash: 2030260901333211352
   label: unspecified
   name: _root_
   parent: null
+  self_ref: '#/furniture'
 groups: []
+hash: 17981205059156515073
 key_value_items: []
+name: '2206.01062'
+origin:
+  binary_hash: 7156212269791437020
+  filename: 2206.01062.pdf
+  mimetype: application/pdf
+  uri: null
 pages:
   '1':
-    hash: 3c76b6d3fd82865e42c51d5cbd7d1a9996dba7902643b919acc581e866b92716
+    hash: 8509969582596715807
     image: null
     page_no: 1
     size:
       height: 792.0
       width: 612.0
   '2':
-    hash: 5ccfaddd314d3712cbabc857c8c0f33d1268341ce37b27089857cbf09f0522d4
+    hash: 8946042279011020565
     image: null
     page_no: 2
     size:
       height: 792.0
       width: 612.0
   '3':
-    hash: d2dc51ad0a01ee9486ffe248649ee1cd10ce35773de8e4b21abf30d310f4fc26
+    hash: 11952309765409111665
     image: null
     page_no: 3
     size:
       height: 792.0
       width: 612.0
   '4':
-    hash: 310121977375f8f1106412189943bd70f121629b2b4d35394077233dedbfb041
+    hash: 16141549366384907945
     image: null
     page_no: 4
     size:
       height: 792.0
       width: 612.0
   '5':
-    hash: 09fa72b602eb0640669844acabc17ef494802a4a9188aeaaf0e0131c496e6951
+    hash: 13731695325243987934
     image: null
     page_no: 5
     size:
       height: 792.0
       width: 612.0
   '6':
-    hash: ec3fa60f136f3d9f5fa790ab27f5d1c14e5622573c52377b909b591d0be0ea44
+    hash: 16994899611641034686
     image: null
     page_no: 6
     size:
       height: 792.0
       width: 612.0
   '7':
-    hash: ec1bc56fe581ce95615b1fab11c3ba8fc89662acf2f53446decd380a155b06dd
+    hash: 5935321345165759586
     image: null
     page_no: 7
     size:
       height: 792.0
       width: 612.0
   '8':
-    hash: fbd2b06876dddc19ee08e0a9751d978c03e6943b74bedf1d83d6528cd4f8954d
+    hash: 12407706083782784507
     image: null
     page_no: 8
     size:
       height: 792.0
       width: 612.0
   '9':
-    hash: 6cfa4eb4410fa9972da289dbf8d8cc585d317a192e1214c778ddd7768e98f311
+    hash: 14545812751042780836
     image: null
     page_no: 9
     size:
@@ -291,13 +345,3620 @@ tables:
   - $ref: '#/texts/50'
   children: []
   data:
-    grid: []
-    num_cols: 0
-    num_rows: 0
-    table_cells: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/tables/0
+    grid:
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 1
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 0
+        text: ''
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 1
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 0
+        text: ''
+      - bbox: null
+        col_span: 3
+        column_header: false
+        end_col_offset_idx: 5
+        end_row_offset_idx: 1
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 0
+        text: '% of Total'
+      - bbox: null
+        col_span: 3
+        column_header: false
+        end_col_offset_idx: 5
+        end_row_offset_idx: 1
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 0
+        text: '% of Total'
+      - bbox: null
+        col_span: 3
+        column_header: false
+        end_col_offset_idx: 5
+        end_row_offset_idx: 1
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 0
+        text: '% of Total'
+      - bbox: null
+        col_span: 7
+        column_header: false
+        end_col_offset_idx: 12
+        end_row_offset_idx: 1
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 5
+        start_row_offset_idx: 0
+        text: triple inter-annotator mAP @ 0.5-0.95 (%)
+      - bbox: null
+        col_span: 7
+        column_header: false
+        end_col_offset_idx: 12
+        end_row_offset_idx: 1
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 5
+        start_row_offset_idx: 0
+        text: triple inter-annotator mAP @ 0.5-0.95 (%)
+      - bbox: null
+        col_span: 7
+        column_header: false
+        end_col_offset_idx: 12
+        end_row_offset_idx: 1
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 5
+        start_row_offset_idx: 0
+        text: triple inter-annotator mAP @ 0.5-0.95 (%)
+      - bbox: null
+        col_span: 7
+        column_header: false
+        end_col_offset_idx: 12
+        end_row_offset_idx: 1
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 5
+        start_row_offset_idx: 0
+        text: triple inter-annotator mAP @ 0.5-0.95 (%)
+      - bbox: null
+        col_span: 7
+        column_header: false
+        end_col_offset_idx: 12
+        end_row_offset_idx: 1
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 5
+        start_row_offset_idx: 0
+        text: triple inter-annotator mAP @ 0.5-0.95 (%)
+      - bbox: null
+        col_span: 7
+        column_header: false
+        end_col_offset_idx: 12
+        end_row_offset_idx: 1
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 5
+        start_row_offset_idx: 0
+        text: triple inter-annotator mAP @ 0.5-0.95 (%)
+      - bbox: null
+        col_span: 7
+        column_header: false
+        end_col_offset_idx: 12
+        end_row_offset_idx: 1
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 5
+        start_row_offset_idx: 0
+        text: triple inter-annotator mAP @ 0.5-0.95 (%)
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 2
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 1
+        text: class label
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 2
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 1
+        text: Count
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 2
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 1
+        text: Train
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 2
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 1
+        text: Test
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 5
+        end_row_offset_idx: 2
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 4
+        start_row_offset_idx: 1
+        text: Val
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 6
+        end_row_offset_idx: 2
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 5
+        start_row_offset_idx: 1
+        text: All
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 7
+        end_row_offset_idx: 2
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 6
+        start_row_offset_idx: 1
+        text: Fin
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 8
+        end_row_offset_idx: 2
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 7
+        start_row_offset_idx: 1
+        text: Man
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 9
+        end_row_offset_idx: 2
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 8
+        start_row_offset_idx: 1
+        text: Sci
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 10
+        end_row_offset_idx: 2
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 9
+        start_row_offset_idx: 1
+        text: Law
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 11
+        end_row_offset_idx: 2
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 10
+        start_row_offset_idx: 1
+        text: Pat
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 12
+        end_row_offset_idx: 2
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 11
+        start_row_offset_idx: 1
+        text: Ten
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 3
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 2
+        text: Caption
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 3
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 2
+        text: '22524'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 3
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 2
+        text: '2.04'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 3
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 2
+        text: '1.77'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 5
+        end_row_offset_idx: 3
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 4
+        start_row_offset_idx: 2
+        text: '2.32'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 6
+        end_row_offset_idx: 3
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 5
+        start_row_offset_idx: 2
+        text: 84-89
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 7
+        end_row_offset_idx: 3
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 6
+        start_row_offset_idx: 2
+        text: 40-61
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 8
+        end_row_offset_idx: 3
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 7
+        start_row_offset_idx: 2
+        text: 86-92
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 9
+        end_row_offset_idx: 3
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 8
+        start_row_offset_idx: 2
+        text: 94-99
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 10
+        end_row_offset_idx: 3
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 9
+        start_row_offset_idx: 2
+        text: 95-99
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 11
+        end_row_offset_idx: 3
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 10
+        start_row_offset_idx: 2
+        text: 69-78
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 12
+        end_row_offset_idx: 3
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 11
+        start_row_offset_idx: 2
+        text: n/a
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 4
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 3
+        text: Footnote
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 4
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 3
+        text: '6318'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 4
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 3
+        text: '0.60'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 4
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 3
+        text: '0.31'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 5
+        end_row_offset_idx: 4
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 4
+        start_row_offset_idx: 3
+        text: '0.58'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 6
+        end_row_offset_idx: 4
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 5
+        start_row_offset_idx: 3
+        text: 83-91
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 7
+        end_row_offset_idx: 4
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 6
+        start_row_offset_idx: 3
+        text: n/a
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 8
+        end_row_offset_idx: 4
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 7
+        start_row_offset_idx: 3
+        text: '100'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 9
+        end_row_offset_idx: 4
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 8
+        start_row_offset_idx: 3
+        text: 62-88
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 10
+        end_row_offset_idx: 4
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 9
+        start_row_offset_idx: 3
+        text: 85-94
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 11
+        end_row_offset_idx: 4
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 10
+        start_row_offset_idx: 3
+        text: n/a
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 12
+        end_row_offset_idx: 4
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 11
+        start_row_offset_idx: 3
+        text: 82-97
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 5
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 4
+        text: Formula
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 5
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 4
+        text: '25027'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 5
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 4
+        text: '2.25'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 5
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 4
+        text: '1.90'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 5
+        end_row_offset_idx: 5
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 4
+        start_row_offset_idx: 4
+        text: '2.96'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 6
+        end_row_offset_idx: 5
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 5
+        start_row_offset_idx: 4
+        text: 83-85
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 7
+        end_row_offset_idx: 5
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 6
+        start_row_offset_idx: 4
+        text: n/a
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 8
+        end_row_offset_idx: 5
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 7
+        start_row_offset_idx: 4
+        text: n/a
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 9
+        end_row_offset_idx: 5
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 8
+        start_row_offset_idx: 4
+        text: 84-87
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 10
+        end_row_offset_idx: 5
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 9
+        start_row_offset_idx: 4
+        text: 86-96
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 11
+        end_row_offset_idx: 5
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 10
+        start_row_offset_idx: 4
+        text: n/a
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 12
+        end_row_offset_idx: 5
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 11
+        start_row_offset_idx: 4
+        text: n/a
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 6
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 5
+        text: List-item
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 6
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 5
+        text: '185660'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 6
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 5
+        text: '17.19'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 6
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 5
+        text: '13.34'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 5
+        end_row_offset_idx: 6
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 4
+        start_row_offset_idx: 5
+        text: '15.82'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 6
+        end_row_offset_idx: 6
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 5
+        start_row_offset_idx: 5
+        text: 87-88
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 7
+        end_row_offset_idx: 6
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 6
+        start_row_offset_idx: 5
+        text: 74-83
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 8
+        end_row_offset_idx: 6
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 7
+        start_row_offset_idx: 5
+        text: 90-92
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 9
+        end_row_offset_idx: 6
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 8
+        start_row_offset_idx: 5
+        text: 97-97
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 10
+        end_row_offset_idx: 6
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 9
+        start_row_offset_idx: 5
+        text: 81-85
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 11
+        end_row_offset_idx: 6
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 10
+        start_row_offset_idx: 5
+        text: 75-88
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 12
+        end_row_offset_idx: 6
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 11
+        start_row_offset_idx: 5
+        text: 93-95
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 7
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 6
+        text: Page-footer
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 7
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 6
+        text: '70878'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 7
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 6
+        text: '6.51'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 7
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 6
+        text: '5.58'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 5
+        end_row_offset_idx: 7
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 4
+        start_row_offset_idx: 6
+        text: '6.00'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 6
+        end_row_offset_idx: 7
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 5
+        start_row_offset_idx: 6
+        text: 93-94
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 7
+        end_row_offset_idx: 7
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 6
+        start_row_offset_idx: 6
+        text: 88-90
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 8
+        end_row_offset_idx: 7
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 7
+        start_row_offset_idx: 6
+        text: 95-96
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 9
+        end_row_offset_idx: 7
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 8
+        start_row_offset_idx: 6
+        text: '100'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 10
+        end_row_offset_idx: 7
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 9
+        start_row_offset_idx: 6
+        text: 92-97
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 11
+        end_row_offset_idx: 7
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 10
+        start_row_offset_idx: 6
+        text: '100'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 12
+        end_row_offset_idx: 7
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 11
+        start_row_offset_idx: 6
+        text: 96-98
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 8
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 7
+        text: Page-header
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 8
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 7
+        text: '58022'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 8
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 7
+        text: '5.10'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 8
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 7
+        text: '6.70'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 5
+        end_row_offset_idx: 8
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 4
+        start_row_offset_idx: 7
+        text: '5.06'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 6
+        end_row_offset_idx: 8
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 5
+        start_row_offset_idx: 7
+        text: 85-89
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 7
+        end_row_offset_idx: 8
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 6
+        start_row_offset_idx: 7
+        text: 66-76
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 8
+        end_row_offset_idx: 8
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 7
+        start_row_offset_idx: 7
+        text: 90-94
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 9
+        end_row_offset_idx: 8
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 8
+        start_row_offset_idx: 7
+        text: 98-100
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 10
+        end_row_offset_idx: 8
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 9
+        start_row_offset_idx: 7
+        text: 91-92
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 11
+        end_row_offset_idx: 8
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 10
+        start_row_offset_idx: 7
+        text: 97-99
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 12
+        end_row_offset_idx: 8
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 11
+        start_row_offset_idx: 7
+        text: 81-86
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 9
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 8
+        text: Picture
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 9
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 8
+        text: '45976'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 9
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 8
+        text: '4.21'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 9
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 8
+        text: '2.78'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 5
+        end_row_offset_idx: 9
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 4
+        start_row_offset_idx: 8
+        text: '5.31'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 6
+        end_row_offset_idx: 9
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 5
+        start_row_offset_idx: 8
+        text: 69-71
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 7
+        end_row_offset_idx: 9
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 6
+        start_row_offset_idx: 8
+        text: 56-59
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 8
+        end_row_offset_idx: 9
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 7
+        start_row_offset_idx: 8
+        text: 82-86
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 9
+        end_row_offset_idx: 9
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 8
+        start_row_offset_idx: 8
+        text: 69-82
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 10
+        end_row_offset_idx: 9
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 9
+        start_row_offset_idx: 8
+        text: 80-95
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 11
+        end_row_offset_idx: 9
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 10
+        start_row_offset_idx: 8
+        text: 66-71
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 12
+        end_row_offset_idx: 9
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 11
+        start_row_offset_idx: 8
+        text: 59-76
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 10
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 9
+        text: Section-header
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 10
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 9
+        text: '142884'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 10
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 9
+        text: '12.60'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 10
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 9
+        text: '15.77'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 5
+        end_row_offset_idx: 10
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 4
+        start_row_offset_idx: 9
+        text: '12.85'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 6
+        end_row_offset_idx: 10
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 5
+        start_row_offset_idx: 9
+        text: 83-84
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 7
+        end_row_offset_idx: 10
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 6
+        start_row_offset_idx: 9
+        text: 76-81
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 8
+        end_row_offset_idx: 10
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 7
+        start_row_offset_idx: 9
+        text: 90-92
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 9
+        end_row_offset_idx: 10
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 8
+        start_row_offset_idx: 9
+        text: 94-95
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 10
+        end_row_offset_idx: 10
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 9
+        start_row_offset_idx: 9
+        text: 87-94
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 11
+        end_row_offset_idx: 10
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 10
+        start_row_offset_idx: 9
+        text: 69-73
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 12
+        end_row_offset_idx: 10
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 11
+        start_row_offset_idx: 9
+        text: 78-86
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 11
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 10
+        text: Table
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 11
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 10
+        text: '34733'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 11
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 10
+        text: '3.20'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 11
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 10
+        text: '2.27'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 5
+        end_row_offset_idx: 11
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 4
+        start_row_offset_idx: 10
+        text: '3.60'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 6
+        end_row_offset_idx: 11
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 5
+        start_row_offset_idx: 10
+        text: 77-81
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 7
+        end_row_offset_idx: 11
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 6
+        start_row_offset_idx: 10
+        text: 75-80
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 8
+        end_row_offset_idx: 11
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 7
+        start_row_offset_idx: 10
+        text: 83-86
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 9
+        end_row_offset_idx: 11
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 8
+        start_row_offset_idx: 10
+        text: 98-99
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 10
+        end_row_offset_idx: 11
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 9
+        start_row_offset_idx: 10
+        text: 58-80
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 11
+        end_row_offset_idx: 11
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 10
+        start_row_offset_idx: 10
+        text: 79-84
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 12
+        end_row_offset_idx: 11
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 11
+        start_row_offset_idx: 10
+        text: 70-85
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 12
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 11
+        text: Text
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 12
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 11
+        text: '510377'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 12
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 11
+        text: '45.82'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 12
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 11
+        text: '49.28'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 5
+        end_row_offset_idx: 12
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 4
+        start_row_offset_idx: 11
+        text: '45.00'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 6
+        end_row_offset_idx: 12
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 5
+        start_row_offset_idx: 11
+        text: 84-86
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 7
+        end_row_offset_idx: 12
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 6
+        start_row_offset_idx: 11
+        text: 81-86
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 8
+        end_row_offset_idx: 12
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 7
+        start_row_offset_idx: 11
+        text: 88-93
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 9
+        end_row_offset_idx: 12
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 8
+        start_row_offset_idx: 11
+        text: 89-93
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 10
+        end_row_offset_idx: 12
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 9
+        start_row_offset_idx: 11
+        text: 87-92
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 11
+        end_row_offset_idx: 12
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 10
+        start_row_offset_idx: 11
+        text: 71-79
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 12
+        end_row_offset_idx: 12
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 11
+        start_row_offset_idx: 11
+        text: 87-95
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 13
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 12
+        text: Title
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 13
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 12
+        text: '5071'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 13
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 12
+        text: '0.47'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 13
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 12
+        text: '0.30'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 5
+        end_row_offset_idx: 13
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 4
+        start_row_offset_idx: 12
+        text: '0.50'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 6
+        end_row_offset_idx: 13
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 5
+        start_row_offset_idx: 12
+        text: 60-72
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 7
+        end_row_offset_idx: 13
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 6
+        start_row_offset_idx: 12
+        text: 24-63
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 8
+        end_row_offset_idx: 13
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 7
+        start_row_offset_idx: 12
+        text: 50-63
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 9
+        end_row_offset_idx: 13
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 8
+        start_row_offset_idx: 12
+        text: 94-100
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 10
+        end_row_offset_idx: 13
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 9
+        start_row_offset_idx: 12
+        text: 82-96
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 11
+        end_row_offset_idx: 13
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 10
+        start_row_offset_idx: 12
+        text: 68-79
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 12
+        end_row_offset_idx: 13
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 11
+        start_row_offset_idx: 12
+        text: 24-56
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 14
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 13
+        text: Total
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 14
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 13
+        text: '1107470'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 14
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 13
+        text: '941123'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 14
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 13
+        text: '99816'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 5
+        end_row_offset_idx: 14
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 4
+        start_row_offset_idx: 13
+        text: '66531'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 6
+        end_row_offset_idx: 14
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 5
+        start_row_offset_idx: 13
+        text: 82-83
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 7
+        end_row_offset_idx: 14
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 6
+        start_row_offset_idx: 13
+        text: 71-74
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 8
+        end_row_offset_idx: 14
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 7
+        start_row_offset_idx: 13
+        text: 79-81
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 9
+        end_row_offset_idx: 14
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 8
+        start_row_offset_idx: 13
+        text: 89-94
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 10
+        end_row_offset_idx: 14
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 9
+        start_row_offset_idx: 13
+        text: 86-91
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 11
+        end_row_offset_idx: 14
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 10
+        start_row_offset_idx: 13
+        text: 71-76
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 12
+        end_row_offset_idx: 14
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 11
+        start_row_offset_idx: 13
+        text: 68-85
+    num_cols: 12
+    num_rows: 14
+    table_cells:
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 1
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 0
+      text: ''
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 1
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 0
+      text: ''
+    - bbox: null
+      col_span: 3
+      column_header: false
+      end_col_offset_idx: 5
+      end_row_offset_idx: 1
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 0
+      text: '% of Total'
+    - bbox: null
+      col_span: 7
+      column_header: false
+      end_col_offset_idx: 12
+      end_row_offset_idx: 1
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 5
+      start_row_offset_idx: 0
+      text: triple inter-annotator mAP @ 0.5-0.95 (%)
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 2
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 1
+      text: class label
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 2
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 1
+      text: Count
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 2
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 1
+      text: Train
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 2
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 1
+      text: Test
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 5
+      end_row_offset_idx: 2
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 4
+      start_row_offset_idx: 1
+      text: Val
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 6
+      end_row_offset_idx: 2
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 5
+      start_row_offset_idx: 1
+      text: All
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 7
+      end_row_offset_idx: 2
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 6
+      start_row_offset_idx: 1
+      text: Fin
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 8
+      end_row_offset_idx: 2
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 7
+      start_row_offset_idx: 1
+      text: Man
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 9
+      end_row_offset_idx: 2
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 8
+      start_row_offset_idx: 1
+      text: Sci
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 10
+      end_row_offset_idx: 2
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 9
+      start_row_offset_idx: 1
+      text: Law
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 11
+      end_row_offset_idx: 2
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 10
+      start_row_offset_idx: 1
+      text: Pat
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 12
+      end_row_offset_idx: 2
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 11
+      start_row_offset_idx: 1
+      text: Ten
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 3
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 2
+      text: Caption
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 3
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 2
+      text: '22524'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 3
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 2
+      text: '2.04'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 3
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 2
+      text: '1.77'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 5
+      end_row_offset_idx: 3
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 4
+      start_row_offset_idx: 2
+      text: '2.32'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 6
+      end_row_offset_idx: 3
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 5
+      start_row_offset_idx: 2
+      text: 84-89
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 7
+      end_row_offset_idx: 3
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 6
+      start_row_offset_idx: 2
+      text: 40-61
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 8
+      end_row_offset_idx: 3
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 7
+      start_row_offset_idx: 2
+      text: 86-92
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 9
+      end_row_offset_idx: 3
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 8
+      start_row_offset_idx: 2
+      text: 94-99
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 10
+      end_row_offset_idx: 3
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 9
+      start_row_offset_idx: 2
+      text: 95-99
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 11
+      end_row_offset_idx: 3
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 10
+      start_row_offset_idx: 2
+      text: 69-78
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 12
+      end_row_offset_idx: 3
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 11
+      start_row_offset_idx: 2
+      text: n/a
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 4
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 3
+      text: Footnote
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 4
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 3
+      text: '6318'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 4
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 3
+      text: '0.60'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 4
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 3
+      text: '0.31'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 5
+      end_row_offset_idx: 4
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 4
+      start_row_offset_idx: 3
+      text: '0.58'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 6
+      end_row_offset_idx: 4
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 5
+      start_row_offset_idx: 3
+      text: 83-91
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 7
+      end_row_offset_idx: 4
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 6
+      start_row_offset_idx: 3
+      text: n/a
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 8
+      end_row_offset_idx: 4
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 7
+      start_row_offset_idx: 3
+      text: '100'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 9
+      end_row_offset_idx: 4
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 8
+      start_row_offset_idx: 3
+      text: 62-88
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 10
+      end_row_offset_idx: 4
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 9
+      start_row_offset_idx: 3
+      text: 85-94
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 11
+      end_row_offset_idx: 4
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 10
+      start_row_offset_idx: 3
+      text: n/a
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 12
+      end_row_offset_idx: 4
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 11
+      start_row_offset_idx: 3
+      text: 82-97
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 5
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 4
+      text: Formula
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 5
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 4
+      text: '25027'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 5
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 4
+      text: '2.25'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 5
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 4
+      text: '1.90'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 5
+      end_row_offset_idx: 5
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 4
+      start_row_offset_idx: 4
+      text: '2.96'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 6
+      end_row_offset_idx: 5
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 5
+      start_row_offset_idx: 4
+      text: 83-85
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 7
+      end_row_offset_idx: 5
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 6
+      start_row_offset_idx: 4
+      text: n/a
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 8
+      end_row_offset_idx: 5
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 7
+      start_row_offset_idx: 4
+      text: n/a
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 9
+      end_row_offset_idx: 5
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 8
+      start_row_offset_idx: 4
+      text: 84-87
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 10
+      end_row_offset_idx: 5
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 9
+      start_row_offset_idx: 4
+      text: 86-96
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 11
+      end_row_offset_idx: 5
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 10
+      start_row_offset_idx: 4
+      text: n/a
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 12
+      end_row_offset_idx: 5
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 11
+      start_row_offset_idx: 4
+      text: n/a
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 6
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 5
+      text: List-item
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 6
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 5
+      text: '185660'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 6
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 5
+      text: '17.19'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 6
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 5
+      text: '13.34'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 5
+      end_row_offset_idx: 6
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 4
+      start_row_offset_idx: 5
+      text: '15.82'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 6
+      end_row_offset_idx: 6
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 5
+      start_row_offset_idx: 5
+      text: 87-88
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 7
+      end_row_offset_idx: 6
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 6
+      start_row_offset_idx: 5
+      text: 74-83
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 8
+      end_row_offset_idx: 6
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 7
+      start_row_offset_idx: 5
+      text: 90-92
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 9
+      end_row_offset_idx: 6
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 8
+      start_row_offset_idx: 5
+      text: 97-97
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 10
+      end_row_offset_idx: 6
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 9
+      start_row_offset_idx: 5
+      text: 81-85
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 11
+      end_row_offset_idx: 6
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 10
+      start_row_offset_idx: 5
+      text: 75-88
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 12
+      end_row_offset_idx: 6
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 11
+      start_row_offset_idx: 5
+      text: 93-95
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 7
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 6
+      text: Page-footer
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 7
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 6
+      text: '70878'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 7
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 6
+      text: '6.51'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 7
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 6
+      text: '5.58'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 5
+      end_row_offset_idx: 7
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 4
+      start_row_offset_idx: 6
+      text: '6.00'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 6
+      end_row_offset_idx: 7
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 5
+      start_row_offset_idx: 6
+      text: 93-94
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 7
+      end_row_offset_idx: 7
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 6
+      start_row_offset_idx: 6
+      text: 88-90
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 8
+      end_row_offset_idx: 7
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 7
+      start_row_offset_idx: 6
+      text: 95-96
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 9
+      end_row_offset_idx: 7
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 8
+      start_row_offset_idx: 6
+      text: '100'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 10
+      end_row_offset_idx: 7
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 9
+      start_row_offset_idx: 6
+      text: 92-97
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 11
+      end_row_offset_idx: 7
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 10
+      start_row_offset_idx: 6
+      text: '100'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 12
+      end_row_offset_idx: 7
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 11
+      start_row_offset_idx: 6
+      text: 96-98
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 8
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 7
+      text: Page-header
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 8
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 7
+      text: '58022'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 8
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 7
+      text: '5.10'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 8
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 7
+      text: '6.70'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 5
+      end_row_offset_idx: 8
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 4
+      start_row_offset_idx: 7
+      text: '5.06'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 6
+      end_row_offset_idx: 8
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 5
+      start_row_offset_idx: 7
+      text: 85-89
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 7
+      end_row_offset_idx: 8
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 6
+      start_row_offset_idx: 7
+      text: 66-76
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 8
+      end_row_offset_idx: 8
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 7
+      start_row_offset_idx: 7
+      text: 90-94
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 9
+      end_row_offset_idx: 8
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 8
+      start_row_offset_idx: 7
+      text: 98-100
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 10
+      end_row_offset_idx: 8
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 9
+      start_row_offset_idx: 7
+      text: 91-92
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 11
+      end_row_offset_idx: 8
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 10
+      start_row_offset_idx: 7
+      text: 97-99
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 12
+      end_row_offset_idx: 8
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 11
+      start_row_offset_idx: 7
+      text: 81-86
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 9
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 8
+      text: Picture
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 9
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 8
+      text: '45976'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 9
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 8
+      text: '4.21'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 9
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 8
+      text: '2.78'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 5
+      end_row_offset_idx: 9
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 4
+      start_row_offset_idx: 8
+      text: '5.31'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 6
+      end_row_offset_idx: 9
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 5
+      start_row_offset_idx: 8
+      text: 69-71
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 7
+      end_row_offset_idx: 9
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 6
+      start_row_offset_idx: 8
+      text: 56-59
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 8
+      end_row_offset_idx: 9
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 7
+      start_row_offset_idx: 8
+      text: 82-86
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 9
+      end_row_offset_idx: 9
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 8
+      start_row_offset_idx: 8
+      text: 69-82
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 10
+      end_row_offset_idx: 9
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 9
+      start_row_offset_idx: 8
+      text: 80-95
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 11
+      end_row_offset_idx: 9
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 10
+      start_row_offset_idx: 8
+      text: 66-71
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 12
+      end_row_offset_idx: 9
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 11
+      start_row_offset_idx: 8
+      text: 59-76
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 10
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 9
+      text: Section-header
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 10
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 9
+      text: '142884'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 10
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 9
+      text: '12.60'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 10
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 9
+      text: '15.77'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 5
+      end_row_offset_idx: 10
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 4
+      start_row_offset_idx: 9
+      text: '12.85'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 6
+      end_row_offset_idx: 10
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 5
+      start_row_offset_idx: 9
+      text: 83-84
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 7
+      end_row_offset_idx: 10
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 6
+      start_row_offset_idx: 9
+      text: 76-81
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 8
+      end_row_offset_idx: 10
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 7
+      start_row_offset_idx: 9
+      text: 90-92
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 9
+      end_row_offset_idx: 10
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 8
+      start_row_offset_idx: 9
+      text: 94-95
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 10
+      end_row_offset_idx: 10
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 9
+      start_row_offset_idx: 9
+      text: 87-94
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 11
+      end_row_offset_idx: 10
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 10
+      start_row_offset_idx: 9
+      text: 69-73
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 12
+      end_row_offset_idx: 10
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 11
+      start_row_offset_idx: 9
+      text: 78-86
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 11
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 10
+      text: Table
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 11
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 10
+      text: '34733'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 11
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 10
+      text: '3.20'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 11
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 10
+      text: '2.27'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 5
+      end_row_offset_idx: 11
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 4
+      start_row_offset_idx: 10
+      text: '3.60'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 6
+      end_row_offset_idx: 11
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 5
+      start_row_offset_idx: 10
+      text: 77-81
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 7
+      end_row_offset_idx: 11
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 6
+      start_row_offset_idx: 10
+      text: 75-80
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 8
+      end_row_offset_idx: 11
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 7
+      start_row_offset_idx: 10
+      text: 83-86
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 9
+      end_row_offset_idx: 11
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 8
+      start_row_offset_idx: 10
+      text: 98-99
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 10
+      end_row_offset_idx: 11
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 9
+      start_row_offset_idx: 10
+      text: 58-80
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 11
+      end_row_offset_idx: 11
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 10
+      start_row_offset_idx: 10
+      text: 79-84
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 12
+      end_row_offset_idx: 11
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 11
+      start_row_offset_idx: 10
+      text: 70-85
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 12
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 11
+      text: Text
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 12
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 11
+      text: '510377'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 12
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 11
+      text: '45.82'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 12
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 11
+      text: '49.28'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 5
+      end_row_offset_idx: 12
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 4
+      start_row_offset_idx: 11
+      text: '45.00'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 6
+      end_row_offset_idx: 12
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 5
+      start_row_offset_idx: 11
+      text: 84-86
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 7
+      end_row_offset_idx: 12
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 6
+      start_row_offset_idx: 11
+      text: 81-86
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 8
+      end_row_offset_idx: 12
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 7
+      start_row_offset_idx: 11
+      text: 88-93
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 9
+      end_row_offset_idx: 12
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 8
+      start_row_offset_idx: 11
+      text: 89-93
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 10
+      end_row_offset_idx: 12
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 9
+      start_row_offset_idx: 11
+      text: 87-92
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 11
+      end_row_offset_idx: 12
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 10
+      start_row_offset_idx: 11
+      text: 71-79
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 12
+      end_row_offset_idx: 12
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 11
+      start_row_offset_idx: 11
+      text: 87-95
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 13
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 12
+      text: Title
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 13
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 12
+      text: '5071'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 13
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 12
+      text: '0.47'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 13
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 12
+      text: '0.30'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 5
+      end_row_offset_idx: 13
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 4
+      start_row_offset_idx: 12
+      text: '0.50'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 6
+      end_row_offset_idx: 13
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 5
+      start_row_offset_idx: 12
+      text: 60-72
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 7
+      end_row_offset_idx: 13
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 6
+      start_row_offset_idx: 12
+      text: 24-63
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 8
+      end_row_offset_idx: 13
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 7
+      start_row_offset_idx: 12
+      text: 50-63
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 9
+      end_row_offset_idx: 13
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 8
+      start_row_offset_idx: 12
+      text: 94-100
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 10
+      end_row_offset_idx: 13
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 9
+      start_row_offset_idx: 12
+      text: 82-96
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 11
+      end_row_offset_idx: 13
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 10
+      start_row_offset_idx: 12
+      text: 68-79
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 12
+      end_row_offset_idx: 13
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 11
+      start_row_offset_idx: 12
+      text: 24-56
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 14
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 13
+      text: Total
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 14
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 13
+      text: '1107470'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 14
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 13
+      text: '941123'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 14
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 13
+      text: '99816'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 5
+      end_row_offset_idx: 14
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 4
+      start_row_offset_idx: 13
+      text: '66531'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 6
+      end_row_offset_idx: 14
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 5
+      start_row_offset_idx: 13
+      text: 82-83
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 7
+      end_row_offset_idx: 14
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 6
+      start_row_offset_idx: 13
+      text: 71-74
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 8
+      end_row_offset_idx: 14
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 7
+      start_row_offset_idx: 13
+      text: 79-81
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 9
+      end_row_offset_idx: 14
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 8
+      start_row_offset_idx: 13
+      text: 89-94
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 10
+      end_row_offset_idx: 14
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 9
+      start_row_offset_idx: 13
+      text: 86-91
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 11
+      end_row_offset_idx: 14
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 10
+      start_row_offset_idx: 13
+      text: 71-76
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 12
+      end_row_offset_idx: 14
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 11
+      start_row_offset_idx: 13
+      text: 68-85
   footnotes: []
-  hash: 14148577749296175318
+  hash: 7038790146519691597
   image: null
   label: table
   parent:
@@ -314,17 +3975,1843 @@ tables:
     - 0
     page_no: 4
   references: []
+  self_ref: '#/tables/0'
 - captions:
   - $ref: '#/texts/73'
   children: []
   data:
-    grid: []
-    num_cols: 0
-    num_rows: 0
-    table_cells: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/tables/1
+    grid:
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 1
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 0
+        text: ''
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 2
+        row_header: false
+        row_section: false
+        row_span: 2
+        start_col_offset_idx: 1
+        start_row_offset_idx: 0
+        text: human
+      - bbox: null
+        col_span: 2
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 1
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 0
+        text: MRCNN
+      - bbox: null
+        col_span: 2
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 1
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 0
+        text: MRCNN
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 5
+        end_row_offset_idx: 1
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 4
+        start_row_offset_idx: 0
+        text: FRCNN
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 6
+        end_row_offset_idx: 1
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 5
+        start_row_offset_idx: 0
+        text: YOLO
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 2
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 1
+        text: ''
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 2
+        row_header: false
+        row_section: false
+        row_span: 2
+        start_col_offset_idx: 1
+        start_row_offset_idx: 0
+        text: human
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 2
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 1
+        text: R50
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 2
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 1
+        text: R101
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 5
+        end_row_offset_idx: 2
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 4
+        start_row_offset_idx: 1
+        text: R101
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 6
+        end_row_offset_idx: 2
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 5
+        start_row_offset_idx: 1
+        text: v5x6
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 3
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 2
+        text: Caption
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 3
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 2
+        text: 84-89
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 3
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 2
+        text: '68.4'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 3
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 2
+        text: '71.5'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 5
+        end_row_offset_idx: 3
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 4
+        start_row_offset_idx: 2
+        text: '70.1'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 6
+        end_row_offset_idx: 3
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 5
+        start_row_offset_idx: 2
+        text: '77.7'
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 4
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 3
+        text: Footnote
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 4
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 3
+        text: 83-91
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 4
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 3
+        text: '70.9'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 4
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 3
+        text: '71.8'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 5
+        end_row_offset_idx: 4
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 4
+        start_row_offset_idx: 3
+        text: '73.7'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 6
+        end_row_offset_idx: 4
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 5
+        start_row_offset_idx: 3
+        text: '77.2'
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 5
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 4
+        text: Formula
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 5
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 4
+        text: 83-85
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 5
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 4
+        text: '60.1'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 5
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 4
+        text: '63.4'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 5
+        end_row_offset_idx: 5
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 4
+        start_row_offset_idx: 4
+        text: '63.5'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 6
+        end_row_offset_idx: 5
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 5
+        start_row_offset_idx: 4
+        text: '66.2'
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 6
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 5
+        text: List-item
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 6
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 5
+        text: 87-88
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 6
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 5
+        text: '81.2'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 6
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 5
+        text: '80.8'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 5
+        end_row_offset_idx: 6
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 4
+        start_row_offset_idx: 5
+        text: '81.0'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 6
+        end_row_offset_idx: 6
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 5
+        start_row_offset_idx: 5
+        text: '86.2'
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 7
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 6
+        text: Page-footer
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 7
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 6
+        text: 93-94
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 7
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 6
+        text: '61.6'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 7
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 6
+        text: '59.3'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 5
+        end_row_offset_idx: 7
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 4
+        start_row_offset_idx: 6
+        text: '58.9'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 6
+        end_row_offset_idx: 7
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 5
+        start_row_offset_idx: 6
+        text: '61.1'
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 8
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 7
+        text: Page-header
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 8
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 7
+        text: 85-89
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 8
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 7
+        text: '71.9'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 8
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 7
+        text: '70.0'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 5
+        end_row_offset_idx: 8
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 4
+        start_row_offset_idx: 7
+        text: '72.0'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 6
+        end_row_offset_idx: 8
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 5
+        start_row_offset_idx: 7
+        text: '67.9'
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 9
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 8
+        text: Picture
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 9
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 8
+        text: 69-71
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 9
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 8
+        text: '71.7'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 9
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 8
+        text: '72.7'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 5
+        end_row_offset_idx: 9
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 4
+        start_row_offset_idx: 8
+        text: '72.0'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 6
+        end_row_offset_idx: 9
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 5
+        start_row_offset_idx: 8
+        text: '77.1'
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 10
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 9
+        text: Section-header
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 10
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 9
+        text: 83-84
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 10
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 9
+        text: '67.6'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 10
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 9
+        text: '69.3'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 5
+        end_row_offset_idx: 10
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 4
+        start_row_offset_idx: 9
+        text: '68.4'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 6
+        end_row_offset_idx: 10
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 5
+        start_row_offset_idx: 9
+        text: '74.6'
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 11
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 10
+        text: Table
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 11
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 10
+        text: 77-81
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 11
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 10
+        text: '82.2'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 11
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 10
+        text: '82.9'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 5
+        end_row_offset_idx: 11
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 4
+        start_row_offset_idx: 10
+        text: '82.2'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 6
+        end_row_offset_idx: 11
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 5
+        start_row_offset_idx: 10
+        text: '86.3'
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 12
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 11
+        text: Text
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 12
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 11
+        text: 84-86
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 12
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 11
+        text: '84.6'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 12
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 11
+        text: '85.8'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 5
+        end_row_offset_idx: 12
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 4
+        start_row_offset_idx: 11
+        text: '85.4'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 6
+        end_row_offset_idx: 12
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 5
+        start_row_offset_idx: 11
+        text: '88.1'
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 13
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 12
+        text: Title
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 13
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 12
+        text: 60-72
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 13
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 12
+        text: '76.7'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 13
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 12
+        text: '80.4'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 5
+        end_row_offset_idx: 13
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 4
+        start_row_offset_idx: 12
+        text: '79.9'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 6
+        end_row_offset_idx: 13
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 5
+        start_row_offset_idx: 12
+        text: '82.7'
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 14
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 13
+        text: All
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 14
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 13
+        text: 82-83
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 14
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 13
+        text: '72.4'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 14
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 13
+        text: '73.5'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 5
+        end_row_offset_idx: 14
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 4
+        start_row_offset_idx: 13
+        text: '73.4'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 6
+        end_row_offset_idx: 14
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 5
+        start_row_offset_idx: 13
+        text: '76.8'
+    num_cols: 6
+    num_rows: 14
+    table_cells:
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 1
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 0
+      text: ''
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 2
+      row_header: false
+      row_section: false
+      row_span: 2
+      start_col_offset_idx: 1
+      start_row_offset_idx: 0
+      text: human
+    - bbox: null
+      col_span: 2
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 1
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 0
+      text: MRCNN
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 5
+      end_row_offset_idx: 1
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 4
+      start_row_offset_idx: 0
+      text: FRCNN
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 6
+      end_row_offset_idx: 1
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 5
+      start_row_offset_idx: 0
+      text: YOLO
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 2
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 1
+      text: ''
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 2
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 1
+      text: R50
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 2
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 1
+      text: R101
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 5
+      end_row_offset_idx: 2
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 4
+      start_row_offset_idx: 1
+      text: R101
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 6
+      end_row_offset_idx: 2
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 5
+      start_row_offset_idx: 1
+      text: v5x6
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 3
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 2
+      text: Caption
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 3
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 2
+      text: 84-89
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 3
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 2
+      text: '68.4'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 3
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 2
+      text: '71.5'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 5
+      end_row_offset_idx: 3
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 4
+      start_row_offset_idx: 2
+      text: '70.1'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 6
+      end_row_offset_idx: 3
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 5
+      start_row_offset_idx: 2
+      text: '77.7'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 4
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 3
+      text: Footnote
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 4
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 3
+      text: 83-91
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 4
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 3
+      text: '70.9'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 4
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 3
+      text: '71.8'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 5
+      end_row_offset_idx: 4
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 4
+      start_row_offset_idx: 3
+      text: '73.7'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 6
+      end_row_offset_idx: 4
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 5
+      start_row_offset_idx: 3
+      text: '77.2'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 5
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 4
+      text: Formula
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 5
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 4
+      text: 83-85
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 5
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 4
+      text: '60.1'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 5
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 4
+      text: '63.4'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 5
+      end_row_offset_idx: 5
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 4
+      start_row_offset_idx: 4
+      text: '63.5'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 6
+      end_row_offset_idx: 5
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 5
+      start_row_offset_idx: 4
+      text: '66.2'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 6
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 5
+      text: List-item
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 6
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 5
+      text: 87-88
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 6
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 5
+      text: '81.2'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 6
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 5
+      text: '80.8'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 5
+      end_row_offset_idx: 6
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 4
+      start_row_offset_idx: 5
+      text: '81.0'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 6
+      end_row_offset_idx: 6
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 5
+      start_row_offset_idx: 5
+      text: '86.2'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 7
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 6
+      text: Page-footer
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 7
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 6
+      text: 93-94
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 7
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 6
+      text: '61.6'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 7
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 6
+      text: '59.3'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 5
+      end_row_offset_idx: 7
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 4
+      start_row_offset_idx: 6
+      text: '58.9'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 6
+      end_row_offset_idx: 7
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 5
+      start_row_offset_idx: 6
+      text: '61.1'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 8
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 7
+      text: Page-header
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 8
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 7
+      text: 85-89
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 8
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 7
+      text: '71.9'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 8
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 7
+      text: '70.0'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 5
+      end_row_offset_idx: 8
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 4
+      start_row_offset_idx: 7
+      text: '72.0'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 6
+      end_row_offset_idx: 8
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 5
+      start_row_offset_idx: 7
+      text: '67.9'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 9
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 8
+      text: Picture
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 9
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 8
+      text: 69-71
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 9
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 8
+      text: '71.7'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 9
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 8
+      text: '72.7'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 5
+      end_row_offset_idx: 9
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 4
+      start_row_offset_idx: 8
+      text: '72.0'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 6
+      end_row_offset_idx: 9
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 5
+      start_row_offset_idx: 8
+      text: '77.1'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 10
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 9
+      text: Section-header
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 10
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 9
+      text: 83-84
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 10
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 9
+      text: '67.6'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 10
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 9
+      text: '69.3'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 5
+      end_row_offset_idx: 10
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 4
+      start_row_offset_idx: 9
+      text: '68.4'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 6
+      end_row_offset_idx: 10
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 5
+      start_row_offset_idx: 9
+      text: '74.6'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 11
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 10
+      text: Table
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 11
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 10
+      text: 77-81
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 11
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 10
+      text: '82.2'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 11
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 10
+      text: '82.9'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 5
+      end_row_offset_idx: 11
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 4
+      start_row_offset_idx: 10
+      text: '82.2'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 6
+      end_row_offset_idx: 11
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 5
+      start_row_offset_idx: 10
+      text: '86.3'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 12
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 11
+      text: Text
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 12
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 11
+      text: 84-86
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 12
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 11
+      text: '84.6'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 12
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 11
+      text: '85.8'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 5
+      end_row_offset_idx: 12
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 4
+      start_row_offset_idx: 11
+      text: '85.4'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 6
+      end_row_offset_idx: 12
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 5
+      start_row_offset_idx: 11
+      text: '88.1'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 13
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 12
+      text: Title
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 13
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 12
+      text: 60-72
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 13
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 12
+      text: '76.7'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 13
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 12
+      text: '80.4'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 5
+      end_row_offset_idx: 13
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 4
+      start_row_offset_idx: 12
+      text: '79.9'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 6
+      end_row_offset_idx: 13
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 5
+      start_row_offset_idx: 12
+      text: '82.7'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 14
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 13
+      text: All
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 14
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 13
+      text: 82-83
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 14
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 13
+      text: '72.4'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 14
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 13
+      text: '73.5'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 5
+      end_row_offset_idx: 14
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 4
+      start_row_offset_idx: 13
+      text: '73.4'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 6
+      end_row_offset_idx: 14
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 5
+      start_row_offset_idx: 13
+      text: '76.8'
   footnotes: []
-  hash: 17333450552515386005
+  hash: 12013044154325944789
   image: null
   label: table
   parent:
@@ -341,17 +5828,1447 @@ tables:
     - 0
     page_no: 6
   references: []
+  self_ref: '#/tables/1'
 - captions:
   - $ref: '#/texts/82'
   children: []
   data:
-    grid: []
-    num_cols: 0
-    num_rows: 0
-    table_cells: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/tables/2
+    grid:
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 1
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 0
+        text: Class-count
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 1
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 0
+        text: '11'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 1
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 0
+        text: '6'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 1
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 0
+        text: '5'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 5
+        end_row_offset_idx: 1
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 4
+        start_row_offset_idx: 0
+        text: '4'
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 2
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 1
+        text: Caption
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 2
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 1
+        text: '68'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 2
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 1
+        text: Text
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 2
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 1
+        text: Text
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 5
+        end_row_offset_idx: 2
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 4
+        start_row_offset_idx: 1
+        text: Text
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 3
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 2
+        text: Footnote
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 3
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 2
+        text: '71'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 3
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 2
+        text: Text
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 3
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 2
+        text: Text
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 5
+        end_row_offset_idx: 3
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 4
+        start_row_offset_idx: 2
+        text: Text
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 4
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 3
+        text: Formula
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 4
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 3
+        text: '60'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 4
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 3
+        text: Text
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 4
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 3
+        text: Text
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 5
+        end_row_offset_idx: 4
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 4
+        start_row_offset_idx: 3
+        text: Text
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 5
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 4
+        text: List-item
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 5
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 4
+        text: '81'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 5
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 4
+        text: Text
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 5
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 4
+        text: '82'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 5
+        end_row_offset_idx: 5
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 4
+        start_row_offset_idx: 4
+        text: Text
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 6
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 5
+        text: Page-footer
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 6
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 5
+        text: '62'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 6
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 5
+        text: '62'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 6
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 5
+        text: '-'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 5
+        end_row_offset_idx: 6
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 4
+        start_row_offset_idx: 5
+        text: '-'
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 7
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 6
+        text: Page-header
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 7
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 6
+        text: '72'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 7
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 6
+        text: '68'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 7
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 6
+        text: '-'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 5
+        end_row_offset_idx: 7
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 4
+        start_row_offset_idx: 6
+        text: '-'
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 8
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 7
+        text: Picture
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 8
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 7
+        text: '72'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 8
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 7
+        text: '72'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 8
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 7
+        text: '72'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 5
+        end_row_offset_idx: 8
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 4
+        start_row_offset_idx: 7
+        text: '72'
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 9
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 8
+        text: Section-header
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 9
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 8
+        text: '68'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 9
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 8
+        text: '67'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 9
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 8
+        text: '69'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 5
+        end_row_offset_idx: 9
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 4
+        start_row_offset_idx: 8
+        text: '68'
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 10
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 9
+        text: Table
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 10
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 9
+        text: '82'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 10
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 9
+        text: '83'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 10
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 9
+        text: '82'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 5
+        end_row_offset_idx: 10
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 4
+        start_row_offset_idx: 9
+        text: '82'
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 11
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 10
+        text: Text
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 11
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 10
+        text: '85'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 11
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 10
+        text: '84'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 11
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 10
+        text: '84'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 5
+        end_row_offset_idx: 11
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 4
+        start_row_offset_idx: 10
+        text: '84'
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 12
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 11
+        text: Title
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 12
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 11
+        text: '77'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 12
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 11
+        text: Sec.-h.
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 12
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 11
+        text: Sec.-h.
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 5
+        end_row_offset_idx: 12
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 4
+        start_row_offset_idx: 11
+        text: Sec.-h.
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 13
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 12
+        text: Overall
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 13
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 12
+        text: '72'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 13
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 12
+        text: '73'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 13
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 12
+        text: '78'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 5
+        end_row_offset_idx: 13
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 4
+        start_row_offset_idx: 12
+        text: '77'
+    num_cols: 5
+    num_rows: 13
+    table_cells:
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 1
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 0
+      text: Class-count
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 1
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 0
+      text: '11'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 1
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 0
+      text: '6'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 1
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 0
+      text: '5'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 5
+      end_row_offset_idx: 1
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 4
+      start_row_offset_idx: 0
+      text: '4'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 2
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 1
+      text: Caption
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 2
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 1
+      text: '68'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 2
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 1
+      text: Text
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 2
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 1
+      text: Text
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 5
+      end_row_offset_idx: 2
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 4
+      start_row_offset_idx: 1
+      text: Text
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 3
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 2
+      text: Footnote
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 3
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 2
+      text: '71'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 3
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 2
+      text: Text
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 3
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 2
+      text: Text
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 5
+      end_row_offset_idx: 3
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 4
+      start_row_offset_idx: 2
+      text: Text
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 4
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 3
+      text: Formula
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 4
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 3
+      text: '60'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 4
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 3
+      text: Text
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 4
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 3
+      text: Text
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 5
+      end_row_offset_idx: 4
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 4
+      start_row_offset_idx: 3
+      text: Text
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 5
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 4
+      text: List-item
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 5
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 4
+      text: '81'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 5
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 4
+      text: Text
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 5
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 4
+      text: '82'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 5
+      end_row_offset_idx: 5
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 4
+      start_row_offset_idx: 4
+      text: Text
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 6
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 5
+      text: Page-footer
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 6
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 5
+      text: '62'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 6
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 5
+      text: '62'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 6
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 5
+      text: '-'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 5
+      end_row_offset_idx: 6
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 4
+      start_row_offset_idx: 5
+      text: '-'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 7
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 6
+      text: Page-header
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 7
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 6
+      text: '72'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 7
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 6
+      text: '68'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 7
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 6
+      text: '-'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 5
+      end_row_offset_idx: 7
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 4
+      start_row_offset_idx: 6
+      text: '-'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 8
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 7
+      text: Picture
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 8
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 7
+      text: '72'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 8
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 7
+      text: '72'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 8
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 7
+      text: '72'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 5
+      end_row_offset_idx: 8
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 4
+      start_row_offset_idx: 7
+      text: '72'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 9
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 8
+      text: Section-header
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 9
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 8
+      text: '68'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 9
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 8
+      text: '67'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 9
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 8
+      text: '69'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 5
+      end_row_offset_idx: 9
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 4
+      start_row_offset_idx: 8
+      text: '68'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 10
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 9
+      text: Table
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 10
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 9
+      text: '82'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 10
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 9
+      text: '83'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 10
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 9
+      text: '82'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 5
+      end_row_offset_idx: 10
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 4
+      start_row_offset_idx: 9
+      text: '82'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 11
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 10
+      text: Text
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 11
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 10
+      text: '85'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 11
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 10
+      text: '84'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 11
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 10
+      text: '84'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 5
+      end_row_offset_idx: 11
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 4
+      start_row_offset_idx: 10
+      text: '84'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 12
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 11
+      text: Title
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 12
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 11
+      text: '77'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 12
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 11
+      text: Sec.-h.
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 12
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 11
+      text: Sec.-h.
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 5
+      end_row_offset_idx: 12
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 4
+      start_row_offset_idx: 11
+      text: Sec.-h.
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 13
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 12
+      text: Overall
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 13
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 12
+      text: '72'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 13
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 12
+      text: '73'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 13
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 12
+      text: '78'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 5
+      end_row_offset_idx: 13
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 4
+      start_row_offset_idx: 12
+      text: '77'
   footnotes: []
-  hash: 16080913497667217474
+  hash: 123696755438675010
   image: null
   label: table
   parent:
@@ -368,17 +7285,1535 @@ tables:
     - 0
     page_no: 7
   references: []
+  self_ref: '#/tables/2'
 - captions:
   - $ref: '#/texts/87'
   children: []
   data:
-    grid: []
-    num_cols: 0
-    num_rows: 0
-    table_cells: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/tables/3
+    grid:
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 1
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 0
+        text: Class-count
+      - bbox: null
+        col_span: 2
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 1
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 0
+        text: '11'
+      - bbox: null
+        col_span: 2
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 1
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 0
+        text: '11'
+      - bbox: null
+        col_span: 2
+        column_header: false
+        end_col_offset_idx: 5
+        end_row_offset_idx: 1
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 0
+        text: '5'
+      - bbox: null
+        col_span: 2
+        column_header: false
+        end_col_offset_idx: 5
+        end_row_offset_idx: 1
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 0
+        text: '5'
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 2
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 1
+        text: Split
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 2
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 1
+        text: Doc
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 2
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 1
+        text: Page
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 2
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 1
+        text: Doc
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 5
+        end_row_offset_idx: 2
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 4
+        start_row_offset_idx: 1
+        text: Page
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 3
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 2
+        text: Caption
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 3
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 2
+        text: '68'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 3
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 2
+        text: '83'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 3
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 2
+        text: ''
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 5
+        end_row_offset_idx: 3
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 4
+        start_row_offset_idx: 2
+        text: ''
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 4
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 3
+        text: Footnote
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 4
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 3
+        text: '71'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 4
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 3
+        text: '84'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 4
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 3
+        text: ''
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 5
+        end_row_offset_idx: 4
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 4
+        start_row_offset_idx: 3
+        text: ''
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 5
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 4
+        text: Formula
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 5
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 4
+        text: '60'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 5
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 4
+        text: '66'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 5
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 4
+        text: ''
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 5
+        end_row_offset_idx: 5
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 4
+        start_row_offset_idx: 4
+        text: ''
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 6
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 5
+        text: List-item
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 6
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 5
+        text: '81'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 6
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 5
+        text: '88'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 6
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 5
+        text: '82'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 5
+        end_row_offset_idx: 6
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 4
+        start_row_offset_idx: 5
+        text: '88'
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 7
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 6
+        text: Page-footer
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 7
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 6
+        text: '62'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 7
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 6
+        text: '89'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 7
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 6
+        text: ''
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 5
+        end_row_offset_idx: 7
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 4
+        start_row_offset_idx: 6
+        text: ''
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 8
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 7
+        text: Page-header
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 8
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 7
+        text: '72'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 8
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 7
+        text: '90'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 8
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 7
+        text: ''
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 5
+        end_row_offset_idx: 8
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 4
+        start_row_offset_idx: 7
+        text: ''
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 9
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 8
+        text: Picture
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 9
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 8
+        text: '72'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 9
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 8
+        text: '82'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 9
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 8
+        text: '72'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 5
+        end_row_offset_idx: 9
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 4
+        start_row_offset_idx: 8
+        text: '82'
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 10
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 9
+        text: Section-header
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 10
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 9
+        text: '68'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 10
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 9
+        text: '83'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 10
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 9
+        text: '69'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 5
+        end_row_offset_idx: 10
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 4
+        start_row_offset_idx: 9
+        text: '83'
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 11
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 10
+        text: Table
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 11
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 10
+        text: '82'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 11
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 10
+        text: '89'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 11
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 10
+        text: '82'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 5
+        end_row_offset_idx: 11
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 4
+        start_row_offset_idx: 10
+        text: '90'
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 12
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 11
+        text: Text
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 12
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 11
+        text: '85'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 12
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 11
+        text: '91'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 12
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 11
+        text: '84'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 5
+        end_row_offset_idx: 12
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 4
+        start_row_offset_idx: 11
+        text: '90'
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 13
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 12
+        text: Title
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 13
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 12
+        text: '77'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 13
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 12
+        text: '81'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 13
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 12
+        text: ''
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 5
+        end_row_offset_idx: 13
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 4
+        start_row_offset_idx: 12
+        text: ''
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 14
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 13
+        text: All
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 14
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 13
+        text: '72'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 14
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 13
+        text: '84'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 14
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 13
+        text: '78'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 5
+        end_row_offset_idx: 14
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 4
+        start_row_offset_idx: 13
+        text: '87'
+    num_cols: 5
+    num_rows: 14
+    table_cells:
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 1
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 0
+      text: Class-count
+    - bbox: null
+      col_span: 2
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 1
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 0
+      text: '11'
+    - bbox: null
+      col_span: 2
+      column_header: false
+      end_col_offset_idx: 5
+      end_row_offset_idx: 1
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 0
+      text: '5'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 2
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 1
+      text: Split
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 2
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 1
+      text: Doc
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 2
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 1
+      text: Page
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 2
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 1
+      text: Doc
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 5
+      end_row_offset_idx: 2
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 4
+      start_row_offset_idx: 1
+      text: Page
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 3
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 2
+      text: Caption
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 3
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 2
+      text: '68'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 3
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 2
+      text: '83'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 3
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 2
+      text: ''
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 5
+      end_row_offset_idx: 3
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 4
+      start_row_offset_idx: 2
+      text: ''
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 4
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 3
+      text: Footnote
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 4
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 3
+      text: '71'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 4
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 3
+      text: '84'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 4
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 3
+      text: ''
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 5
+      end_row_offset_idx: 4
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 4
+      start_row_offset_idx: 3
+      text: ''
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 5
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 4
+      text: Formula
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 5
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 4
+      text: '60'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 5
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 4
+      text: '66'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 5
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 4
+      text: ''
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 5
+      end_row_offset_idx: 5
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 4
+      start_row_offset_idx: 4
+      text: ''
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 6
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 5
+      text: List-item
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 6
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 5
+      text: '81'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 6
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 5
+      text: '88'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 6
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 5
+      text: '82'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 5
+      end_row_offset_idx: 6
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 4
+      start_row_offset_idx: 5
+      text: '88'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 7
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 6
+      text: Page-footer
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 7
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 6
+      text: '62'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 7
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 6
+      text: '89'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 7
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 6
+      text: ''
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 5
+      end_row_offset_idx: 7
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 4
+      start_row_offset_idx: 6
+      text: ''
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 8
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 7
+      text: Page-header
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 8
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 7
+      text: '72'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 8
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 7
+      text: '90'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 8
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 7
+      text: ''
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 5
+      end_row_offset_idx: 8
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 4
+      start_row_offset_idx: 7
+      text: ''
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 9
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 8
+      text: Picture
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 9
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 8
+      text: '72'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 9
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 8
+      text: '82'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 9
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 8
+      text: '72'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 5
+      end_row_offset_idx: 9
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 4
+      start_row_offset_idx: 8
+      text: '82'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 10
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 9
+      text: Section-header
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 10
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 9
+      text: '68'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 10
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 9
+      text: '83'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 10
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 9
+      text: '69'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 5
+      end_row_offset_idx: 10
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 4
+      start_row_offset_idx: 9
+      text: '83'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 11
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 10
+      text: Table
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 11
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 10
+      text: '82'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 11
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 10
+      text: '89'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 11
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 10
+      text: '82'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 5
+      end_row_offset_idx: 11
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 4
+      start_row_offset_idx: 10
+      text: '90'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 12
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 11
+      text: Text
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 12
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 11
+      text: '85'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 12
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 11
+      text: '91'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 12
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 11
+      text: '84'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 5
+      end_row_offset_idx: 12
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 4
+      start_row_offset_idx: 11
+      text: '90'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 13
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 12
+      text: Title
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 13
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 12
+      text: '77'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 13
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 12
+      text: '81'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 13
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 12
+      text: ''
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 5
+      end_row_offset_idx: 13
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 4
+      start_row_offset_idx: 12
+      text: ''
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 14
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 13
+      text: All
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 14
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 13
+      text: '72'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 14
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 13
+      text: '84'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 14
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 13
+      text: '78'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 5
+      end_row_offset_idx: 14
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 4
+      start_row_offset_idx: 13
+      text: '87'
   footnotes: []
-  hash: 7071974284449481758
+  hash: 1302023060946254192
   image: null
   label: table
   parent:
@@ -395,17 +8830,1315 @@ tables:
     - 0
     page_no: 7
   references: []
+  self_ref: '#/tables/3'
 - captions:
   - $ref: '#/texts/93'
   children: []
   data:
-    grid: []
-    num_cols: 0
-    num_rows: 0
-    table_cells: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/tables/4
+    grid:
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 1
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 0
+        text: ''
+      - bbox: null
+        col_span: 3
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 1
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 0
+        text: Testing on
+      - bbox: null
+        col_span: 3
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 1
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 0
+        text: Testing on
+      - bbox: null
+        col_span: 3
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 1
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 0
+        text: Testing on
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 2
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 1
+        text: labels
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 2
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 1
+        text: PLN
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 2
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 1
+        text: DB
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 2
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 1
+        text: DLN
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 3
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 2
+        text: Figure
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 3
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 2
+        text: '96'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 3
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 2
+        text: '43'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 3
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 2
+        text: '23'
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 4
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 3
+        text: Sec-header
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 4
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 3
+        text: '87'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 4
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 3
+        text: '-'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 4
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 3
+        text: '32'
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 5
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 4
+        text: Table
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 5
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 4
+        text: '95'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 5
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 4
+        text: '24'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 5
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 4
+        text: '49'
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 6
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 5
+        text: Text
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 6
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 5
+        text: '96'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 6
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 5
+        text: '-'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 6
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 5
+        text: '42'
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 7
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 6
+        text: total
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 7
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 6
+        text: '93'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 7
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 6
+        text: '34'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 7
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 6
+        text: '30'
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 8
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 7
+        text: Figure
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 8
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 7
+        text: '77'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 8
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 7
+        text: '71'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 8
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 7
+        text: '31'
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 9
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 8
+        text: Table
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 9
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 8
+        text: '19'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 9
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 8
+        text: '65'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 9
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 8
+        text: '22'
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 10
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 9
+        text: total
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 10
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 9
+        text: '48'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 10
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 9
+        text: '68'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 10
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 9
+        text: '27'
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 11
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 10
+        text: Figure
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 11
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 10
+        text: '67'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 11
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 10
+        text: '51'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 11
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 10
+        text: '72'
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 12
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 11
+        text: Sec-header
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 12
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 11
+        text: '53'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 12
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 11
+        text: '-'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 12
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 11
+        text: '68'
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 13
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 12
+        text: Table
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 13
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 12
+        text: '87'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 13
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 12
+        text: '43'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 13
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 12
+        text: '82'
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 14
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 13
+        text: Text
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 14
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 13
+        text: '77'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 14
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 13
+        text: '-'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 14
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 13
+        text: '84'
+    - - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 1
+        end_row_offset_idx: 15
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 0
+        start_row_offset_idx: 14
+        text: total
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 2
+        end_row_offset_idx: 15
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 1
+        start_row_offset_idx: 14
+        text: '59'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 3
+        end_row_offset_idx: 15
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 2
+        start_row_offset_idx: 14
+        text: '47'
+      - bbox: null
+        col_span: 1
+        column_header: false
+        end_col_offset_idx: 4
+        end_row_offset_idx: 15
+        row_header: false
+        row_section: false
+        row_span: 1
+        start_col_offset_idx: 3
+        start_row_offset_idx: 14
+        text: '78'
+    num_cols: 4
+    num_rows: 15
+    table_cells:
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 1
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 0
+      text: ''
+    - bbox: null
+      col_span: 3
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 1
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 0
+      text: Testing on
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 2
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 1
+      text: labels
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 2
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 1
+      text: PLN
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 2
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 1
+      text: DB
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 2
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 1
+      text: DLN
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 3
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 2
+      text: Figure
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 3
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 2
+      text: '96'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 3
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 2
+      text: '43'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 3
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 2
+      text: '23'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 4
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 3
+      text: Sec-header
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 4
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 3
+      text: '87'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 4
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 3
+      text: '-'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 4
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 3
+      text: '32'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 5
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 4
+      text: Table
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 5
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 4
+      text: '95'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 5
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 4
+      text: '24'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 5
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 4
+      text: '49'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 6
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 5
+      text: Text
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 6
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 5
+      text: '96'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 6
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 5
+      text: '-'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 6
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 5
+      text: '42'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 7
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 6
+      text: total
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 7
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 6
+      text: '93'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 7
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 6
+      text: '34'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 7
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 6
+      text: '30'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 8
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 7
+      text: Figure
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 8
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 7
+      text: '77'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 8
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 7
+      text: '71'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 8
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 7
+      text: '31'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 9
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 8
+      text: Table
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 9
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 8
+      text: '19'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 9
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 8
+      text: '65'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 9
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 8
+      text: '22'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 10
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 9
+      text: total
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 10
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 9
+      text: '48'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 10
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 9
+      text: '68'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 10
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 9
+      text: '27'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 11
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 10
+      text: Figure
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 11
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 10
+      text: '67'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 11
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 10
+      text: '51'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 11
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 10
+      text: '72'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 12
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 11
+      text: Sec-header
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 12
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 11
+      text: '53'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 12
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 11
+      text: '-'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 12
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 11
+      text: '68'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 13
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 12
+      text: Table
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 13
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 12
+      text: '87'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 13
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 12
+      text: '43'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 13
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 12
+      text: '82'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 14
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 13
+      text: Text
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 14
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 13
+      text: '77'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 14
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 13
+      text: '-'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 14
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 13
+      text: '84'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 15
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 14
+      text: total
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 15
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 14
+      text: '59'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 15
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 14
+      text: '47'
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 15
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 14
+      text: '78'
   footnotes: []
-  hash: 8754037299649738038
+  hash: 14690562278911182124
   image: null
   label: table
   parent:
@@ -422,10 +10155,10 @@ tables:
     - 0
     page_no: 8
   references: []
+  self_ref: '#/tables/4'
 texts:
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/0
-  hash: 5801389470470321019
+  hash: 5522753358710955051
   label: section_header
   orig: 'DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis'
   parent:
@@ -441,10 +10174,10 @@ texts:
     - 0
     - 71
     page_no: 1
+  self_ref: '#/texts/0'
   text: 'DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis'
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/1
-  hash: 8511179082257553176
+  hash: 8271824637245472778
   label: text
   orig: Birgit Pfitzmann IBM Research Rueschlikon, Switzerland bpf@zurich.ibm.com
   parent:
@@ -460,10 +10193,10 @@ texts:
     - 0
     - 73
     page_no: 1
+  self_ref: '#/texts/1'
   text: Birgit Pfitzmann IBM Research Rueschlikon, Switzerland bpf@zurich.ibm.com
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/2
-  hash: 8279137503716887272
+  hash: 8306016912873407413
   label: text
   orig: Christoph Auer IBM Research Rueschlikon, Switzerland cau@zurich.ibm.com
   parent:
@@ -479,10 +10212,10 @@ texts:
     - 0
     - 71
     page_no: 1
+  self_ref: '#/texts/2'
   text: Christoph Auer IBM Research Rueschlikon, Switzerland cau@zurich.ibm.com
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/3
-  hash: 16452346600845753706
+  hash: 18359905356795742945
   label: text
   orig: Michele Dolfi IBM Research Rueschlikon, Switzerland dol@zurich.ibm.com
   parent:
@@ -498,10 +10231,10 @@ texts:
     - 0
     - 70
     page_no: 1
+  self_ref: '#/texts/3'
   text: Michele Dolfi IBM Research Rueschlikon, Switzerland dol@zurich.ibm.com
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/4
-  hash: 5753518757297767565
+  hash: 13640485470030436649
   label: text
   orig: Ahmed S. Nassar IBM Research Rueschlikon, Switzerland ahn@zurich.ibm.com
   parent:
@@ -517,10 +10250,10 @@ texts:
     - 0
     - 72
     page_no: 1
+  self_ref: '#/texts/4'
   text: Ahmed S. Nassar IBM Research Rueschlikon, Switzerland ahn@zurich.ibm.com
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/5
-  hash: 400399309987224909
+  hash: 18369908591937398930
   label: text
   orig: Peter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com
   parent:
@@ -536,10 +10269,10 @@ texts:
     - 0
     - 68
     page_no: 1
+  self_ref: '#/texts/5'
   text: Peter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/6
-  hash: 7981313731349902307
+  hash: 9138081927775942786
   label: section_header
   orig: ABSTRACT
   parent:
@@ -555,10 +10288,10 @@ texts:
     - 0
     - 8
     page_no: 1
+  self_ref: '#/texts/6'
   text: ABSTRACT
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/7
-  hash: 18185954695676845569
+  hash: 14128818639649693389
   label: text
   orig: Accurate document layout analysis is a key requirement for highquality PDF
     document conversion. With the recent availability of public, large ground-truth
@@ -593,6 +10326,7 @@ texts:
     - 0
     - 1595
     page_no: 1
+  self_ref: '#/texts/7'
   text: Accurate document layout analysis is a key requirement for highquality PDF
     document conversion. With the recent availability of public, large ground-truth
     datasets such as PubLayNet and DocBank, deep-learning models have proven to be
@@ -614,8 +10348,7 @@ texts:
     that layout predictions of the DocLayNettrained models are more robust and thus
     the preferred choice for general-purpose document-layout analysis.
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/8
-  hash: 17745163365450621279
+  hash: 13652067706470412099
   label: section_header
   orig: CCS CONCEPTS
   parent:
@@ -631,10 +10364,10 @@ texts:
     - 0
     - 12
     page_no: 1
+  self_ref: '#/texts/8'
   text: CCS CONCEPTS
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/9
-  hash: 12529050007388097730
+  hash: 14921674925616302289
   label: text
   orig: "\xB7 Information systems \u2192 Document structure ; \xB7 Applied computing\
     \ \u2192 Document analysis ; \xB7 Computing methodologies \u2192 Machine learning\
@@ -652,12 +10385,12 @@ texts:
     - 0
     - 170
     page_no: 1
+  self_ref: '#/texts/9'
   text: "\xB7 Information systems \u2192 Document structure ; \xB7 Applied computing\
     \ \u2192 Document analysis ; \xB7 Computing methodologies \u2192 Machine learning\
     \ ; Computer vision ; Object detection ;"
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/10
-  hash: 11569407347437572994
+  hash: 2534709576924392255
   label: text
   orig: Permission to make digital or hard copies of part or all of this work for
     personal or classroom use is granted without fee provided that copies are not
@@ -677,14 +10410,14 @@ texts:
     - 0
     - 397
     page_no: 1
+  self_ref: '#/texts/10'
   text: Permission to make digital or hard copies of part or all of this work for
     personal or classroom use is granted without fee provided that copies are not
     made or distributed for profit or commercial advantage and that copies bear this
     notice and the full citation on the first page. Copyrights for third-party components
     of this work must be honored. For all other uses, contact the owner/author(s).
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/11
-  hash: 13344389659304761998
+  hash: 1842005346019600268
   label: text
   orig: "KDD '22, August 14-18, 2022, Washington, DC, USA \xA9 2022 Copyright held\
     \ by the owner/author(s). ACM ISBN 978-1-4503-9385-0/22/08. https://doi.org/10.1145/3534678.3539043"
@@ -701,11 +10434,11 @@ texts:
     - 0
     - 168
     page_no: 1
+  self_ref: '#/texts/11'
   text: "KDD '22, August 14-18, 2022, Washington, DC, USA \xA9 2022 Copyright held\
     \ by the owner/author(s). ACM ISBN 978-1-4503-9385-0/22/08. https://doi.org/10.1145/3534678.3539043"
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/12
-  hash: 3162927929825665449
+  hash: 773478380528488902
   label: caption
   orig: 'Figure 1: Four examples of complex page layouts across different document
     categories'
@@ -722,11 +10455,11 @@ texts:
     - 0
     - 84
     page_no: 1
+  self_ref: '#/texts/12'
   text: 'Figure 1: Four examples of complex page layouts across different document
     categories'
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/13
-  hash: 13011367304084404613
+  hash: 10700740223146899046
   label: section_header
   orig: KEYWORDS
   parent:
@@ -742,10 +10475,10 @@ texts:
     - 0
     - 8
     page_no: 1
+  self_ref: '#/texts/13'
   text: KEYWORDS
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/14
-  hash: 16726456449567869739
+  hash: 17170823355966624845
   label: text
   orig: PDF document conversion, layout segmentation, object-detection, data set,
     Machine Learning
@@ -762,11 +10495,11 @@ texts:
     - 0
     - 90
     page_no: 1
+  self_ref: '#/texts/14'
   text: PDF document conversion, layout segmentation, object-detection, data set,
     Machine Learning
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/15
-  hash: 5718230321549514887
+  hash: 10573589843195006501
   label: section_header
   orig: 'ACM Reference Format:'
   parent:
@@ -782,10 +10515,10 @@ texts:
     - 0
     - 21
     page_no: 1
+  self_ref: '#/texts/15'
   text: 'ACM Reference Format:'
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/16
-  hash: 17635312130661974579
+  hash: 14872842244838520232
   label: text
   orig: 'Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter
     Staar. 2022. DocLayNet: A Large Human-Annotated Dataset for DocumentLayout Analysis.
@@ -805,14 +10538,14 @@ texts:
     - 0
     - 374
     page_no: 1
+  self_ref: '#/texts/16'
   text: 'Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter
     Staar. 2022. DocLayNet: A Large Human-Annotated Dataset for DocumentLayout Analysis.
     In Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data
     Mining (KDD ''22), August 14-18, 2022, Washington, DC, USA. ACM, New York, NY,
     USA, 9 pages. https://doi.org/10.1145/ 3534678.3539043'
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/17
-  hash: 5293186016864745982
+  hash: 8886316323367582203
   label: page_header
   orig: "KDD \u201922, August 14-18, 2022, Washington, DC, USA Birgit Pfitzmann, Christoph\
     \ Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar"
@@ -829,11 +10562,11 @@ texts:
     - 0
     - 130
     page_no: 2
+  self_ref: '#/texts/17'
   text: "KDD \u201922, August 14-18, 2022, Washington, DC, USA Birgit Pfitzmann, Christoph\
     \ Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar"
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/18
-  hash: 5428450824043951937
+  hash: 14036706213632695213
   label: section_header
   orig: 1 INTRODUCTION
   parent:
@@ -849,10 +10582,10 @@ texts:
     - 0
     - 14
     page_no: 2
+  self_ref: '#/texts/18'
   text: 1 INTRODUCTION
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/19
-  hash: 15747194476520587400
+  hash: 16763060144345832331
   label: text
   orig: Despite the substantial improvements achieved with machine-learning (ML) approaches
     and deep neural networks in recent years, document conversion remains a challenging
@@ -876,6 +10609,7 @@ texts:
     - 0
     - 702
     page_no: 2
+  self_ref: '#/texts/19'
   text: Despite the substantial improvements achieved with machine-learning (ML) approaches
     and deep neural networks in recent years, document conversion remains a challenging
     problem, as demonstrated by the numerous public competitions held on this topic
@@ -886,8 +10620,7 @@ texts:
     [5]. To highlight the variability in document layouts, we show a few example documents
     from the DocLayNet dataset in Figure 1.
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/20
-  hash: 9815825593984971365
+  hash: 6179967949039028227
   label: text
   orig: 'A key problem in the process of document conversion is to understand the
     structure of a single document page, i.e. which segments of text should be grouped
@@ -922,6 +10655,7 @@ texts:
     - 0
     - 1580
     page_no: 2
+  self_ref: '#/texts/20'
   text: 'A key problem in the process of document conversion is to understand the
     structure of a single document page, i.e. which segments of text should be grouped
     together in a unit. To train models for this task, there are currently two large
@@ -943,8 +10677,7 @@ texts:
     more artistic or free-style layouts, we see sub-par prediction quality from these
     models, which we demonstrate in Section 5.'
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/21
-  hash: 562391038162260731
+  hash: 5928701079679201823
   label: text
   orig: 'In this paper, we present the DocLayNet dataset. It provides pageby-page
     layout annotation ground-truth using bounding-boxes for 11 distinct class labels
@@ -965,6 +10698,7 @@ texts:
     - 0
     - 462
     page_no: 2
+  self_ref: '#/texts/21'
   text: 'In this paper, we present the DocLayNet dataset. It provides pageby-page
     layout annotation ground-truth using bounding-boxes for 11 distinct class labels
     on 80863 unique document pages, of which a fraction carry double- or triple-annotations.
@@ -972,8 +10706,7 @@ texts:
     available to the public 1 in order to stimulate the document-layout analysis community.
     It distinguishes itself in the following aspects:'
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/22
-  hash: 1687994490476660946
+  hash: 7285805654125410296
   label: list_item
   orig: '(1) Human Annotation : In contrast to PubLayNet and DocBank, we relied on
     human annotation instead of automation approaches to generate the data set.'
@@ -990,11 +10723,11 @@ texts:
     - 0
     - 149
     page_no: 2
+  self_ref: '#/texts/22'
   text: '(1) Human Annotation : In contrast to PubLayNet and DocBank, we relied on
     human annotation instead of automation approaches to generate the data set.'
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/23
-  hash: 8977022680477147526
+  hash: 12390464209458256957
   label: list_item
   orig: '(2) Large Layout Variability : We include diverse and complex layouts from
     a large variety of public sources.'
@@ -1011,11 +10744,11 @@ texts:
     - 0
     - 109
     page_no: 2
+  self_ref: '#/texts/23'
   text: '(2) Large Layout Variability : We include diverse and complex layouts from
     a large variety of public sources.'
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/24
-  hash: 2133234466113940345
+  hash: 11081073573051959825
   label: list_item
   orig: '(3) Detailed Label Set : We define 11 class labels to distinguish layout
     features in high detail. PubLayNet provides 5 labels; DocBank provides 13, although
@@ -1033,12 +10766,12 @@ texts:
     - 0
     - 180
     page_no: 2
+  self_ref: '#/texts/24'
   text: '(3) Detailed Label Set : We define 11 class labels to distinguish layout
     features in high detail. PubLayNet provides 5 labels; DocBank provides 13, although
     not a superset of ours.'
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/25
-  hash: 15972412295294232993
+  hash: 970302989994738402
   label: list_item
   orig: '(4) Redundant Annotations : A fraction of the pages in the DocLayNet data
     set carry more than one human annotation.'
@@ -1055,11 +10788,11 @@ texts:
     - 0
     - 115
     page_no: 2
+  self_ref: '#/texts/25'
   text: '(4) Redundant Annotations : A fraction of the pages in the DocLayNet data
     set carry more than one human annotation.'
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/26
-  hash: 338444530349878300
+  hash: 9624797869966588960
   label: footnote
   orig: $^{1}$https://developer.ibm.com/exchanges/data/all/doclaynet
   parent:
@@ -1075,10 +10808,10 @@ texts:
     - 0
     - 60
     page_no: 2
+  self_ref: '#/texts/26'
   text: $^{1}$https://developer.ibm.com/exchanges/data/all/doclaynet
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/27
-  hash: 3617521057052249807
+  hash: 15746426817481687610
   label: text
   orig: This enables experimentation with annotation uncertainty and quality control
     analysis.
@@ -1095,11 +10828,11 @@ texts:
     - 0
     - 86
     page_no: 2
+  self_ref: '#/texts/27'
   text: This enables experimentation with annotation uncertainty and quality control
     analysis.
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/28
-  hash: 14788267481324200655
+  hash: 16075660714767766450
   label: list_item
   orig: '(5) Pre-defined Train-, Test- & Validation-set : Like DocBank, we provide
     fixed train-, test- & validation-sets to ensure proportional representation of
@@ -1118,13 +10851,13 @@ texts:
     - 0
     - 280
     page_no: 2
+  self_ref: '#/texts/28'
   text: '(5) Pre-defined Train-, Test- & Validation-set : Like DocBank, we provide
     fixed train-, test- & validation-sets to ensure proportional representation of
     the class-labels. Further, we prevent leakage of unique layouts across sets, which
     has a large effect on model accuracy scores.'
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/29
-  hash: 18119595765995049833
+  hash: 12188049037985059716
   label: text
   orig: All aspects outlined above are detailed in Section 3. In Section 4, we will
     elaborate on how we designed and executed this large-scale human annotation campaign.
@@ -1143,13 +10876,13 @@ texts:
     - 0
     - 297
     page_no: 2
+  self_ref: '#/texts/29'
   text: All aspects outlined above are detailed in Section 3. In Section 4, we will
     elaborate on how we designed and executed this large-scale human annotation campaign.
     We will also share key insights and lessons learned that might prove helpful for
     other parties planning to set up annotation campaigns.
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/30
-  hash: 2634162194000949275
+  hash: 12509950877710825568
   label: text
   orig: In Section 5, we will present baseline accuracy numbers for a variety of object
     detection methods (Faster R-CNN, Mask R-CNN and YOLOv5) trained on DocLayNet.
@@ -1171,6 +10904,7 @@ texts:
     - 0
     - 506
     page_no: 2
+  self_ref: '#/texts/30'
   text: In Section 5, we will present baseline accuracy numbers for a variety of object
     detection methods (Faster R-CNN, Mask R-CNN and YOLOv5) trained on DocLayNet.
     We further show how the model performance is impacted by varying the DocLayNet
@@ -1179,8 +10913,7 @@ texts:
     and DocLayNet and demonstrate that a model trained on DocLayNet provides overall
     more robust layout recovery.
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/31
-  hash: 12785294041260556899
+  hash: 1443485254974715578
   label: section_header
   orig: 2 RELATED WORK
   parent:
@@ -1196,10 +10929,10 @@ texts:
     - 0
     - 14
     page_no: 2
+  self_ref: '#/texts/31'
   text: 2 RELATED WORK
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/32
-  hash: 15532515360198720027
+  hash: 8122071803335578615
   label: text
   orig: While early approaches in document-layout analysis used rulebased algorithms
     and heuristics [8], the problem is lately addressed with deep learning methods.
@@ -1223,6 +10956,7 @@ texts:
     - 0
     - 655
     page_no: 2
+  self_ref: '#/texts/32'
   text: While early approaches in document-layout analysis used rulebased algorithms
     and heuristics [8], the problem is lately addressed with deep learning methods.
     The most common approach is to leverage object detection models [9-15]. In the
@@ -1233,8 +10967,7 @@ texts:
     such as PubLayNet [6] and DocBank provide their data in the commonly accepted
     COCO format [16].
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/33
-  hash: 7759908539731899164
+  hash: 842592930694523879
   label: text
   orig: Lately, new types of ML models for document-layout analysis have emerged in
     the community [18-21]. These models do not approach the problem of layout analysis
@@ -1256,6 +10989,7 @@ texts:
     - 0
     - 500
     page_no: 2
+  self_ref: '#/texts/33'
   text: Lately, new types of ML models for document-layout analysis have emerged in
     the community [18-21]. These models do not approach the problem of layout analysis
     purely based on an image representation of the page, as computer vision methods
@@ -1264,8 +10998,7 @@ texts:
     a broadly accepted data format which links geometric and textual features has
     yet to establish.
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/34
-  hash: 13224162835784204794
+  hash: 10305639277644886816
   label: section_header
   orig: 3 THE DOCLAYNET DATASET
   parent:
@@ -1281,10 +11014,10 @@ texts:
     - 0
     - 23
     page_no: 2
+  self_ref: '#/texts/34'
   text: 3 THE DOCLAYNET DATASET
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/35
-  hash: 13840788721079437184
+  hash: 13253192885407465025
   label: text
   orig: DocLayNet contains 80863 PDF pages. Among these, 7059 carry two instances
     of human annotations, and 1591 carry three. This amounts to 91104 total annotation
@@ -1306,6 +11039,7 @@ texts:
     - 0
     - 522
     page_no: 2
+  self_ref: '#/texts/35'
   text: DocLayNet contains 80863 PDF pages. Among these, 7059 carry two instances
     of human annotations, and 1591 carry three. This amounts to 91104 total annotation
     instances. The annotations provide layout information in the shape of labeled,
@@ -1314,8 +11048,7 @@ texts:
     , Section-header , Table , Text , and Title . Our reasoning for picking this particular
     label set is detailed in Section 4.
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/36
-  hash: 8382469735566893423
+  hash: 4009549806736392409
   label: text
   orig: In addition to open intellectual property constraints for the source documents,
     we required that the documents in DocLayNet adhere to a few conditions. Firstly,
@@ -1333,12 +11066,12 @@ texts:
     - 0
     - 186
     page_no: 2
+  self_ref: '#/texts/36'
   text: In addition to open intellectual property constraints for the source documents,
     we required that the documents in DocLayNet adhere to a few conditions. Firstly,
     we kept scanned documents
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/37
-  hash: 15463014254960213695
+  hash: 17470228760559609425
   label: page_header
   orig: 'DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis'
   parent:
@@ -1354,10 +11087,10 @@ texts:
     - 0
     - 71
     page_no: 3
+  self_ref: '#/texts/37'
   text: 'DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis'
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/38
-  hash: 202003194997475932
+  hash: 17964098560616781264
   label: page_header
   orig: "KDD \u201922, August 14-18, 2022, Washington, DC, USA"
   parent:
@@ -1373,10 +11106,10 @@ texts:
     - 0
     - 48
     page_no: 3
+  self_ref: '#/texts/38'
   text: "KDD \u201922, August 14-18, 2022, Washington, DC, USA"
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/39
-  hash: 16454164006377695992
+  hash: 2152771201290039654
   label: caption
   orig: 'Figure 2: Distribution of DocLayNet pages across document categories.'
   parent:
@@ -1392,10 +11125,10 @@ texts:
     - 0
     - 513
     page_no: 3
+  self_ref: '#/texts/39'
   text: 'Figure 2: Distribution of DocLayNet pages across document categories.'
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/40
-  hash: 17429156214159736783
+  hash: 18326974418691293350
   label: text
   orig: The pages in DocLayNet can be grouped into six distinct categories, namely
     Financial Reports , Manuals , Scientific Articles , Laws & Regulations , Patents
@@ -1421,6 +11154,7 @@ texts:
     - 0
     - 810
     page_no: 3
+  self_ref: '#/texts/40'
   text: The pages in DocLayNet can be grouped into six distinct categories, namely
     Financial Reports , Manuals , Scientific Articles , Laws & Regulations , Patents
     and Government Tenders . Each document category was sourced from various repositories.
@@ -1433,8 +11167,7 @@ texts:
     Figure 2, we show the document categories contained in DocLayNet with their respective
     sizes.
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/41
-  hash: 4835758972077135061
+  hash: 17331677447037592116
   label: text
   orig: We did not control the document selection with regard to language. The vast
     majority of documents contained in DocLayNet (close to 95%) are published in English
@@ -1456,6 +11189,7 @@ texts:
     - 0
     - 535
     page_no: 3
+  self_ref: '#/texts/41'
   text: We did not control the document selection with regard to language. The vast
     majority of documents contained in DocLayNet (close to 95%) are published in English
     language. However, DocLayNet also contains a number of documents in other languages
@@ -1464,8 +11198,7 @@ texts:
     detection and segmentation models, it might prove challenging for layout analysis
     methods which exploit textual features.
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/42
-  hash: 6442074878702101187
+  hash: 8050499577265375805
   label: text
   orig: To ensure that future benchmarks in the document-layout analysis community
     can be easily compared, we have split up DocLayNet into pre-defined train-, test-
@@ -1485,14 +11218,14 @@ texts:
     - 0
     - 413
     page_no: 3
+  self_ref: '#/texts/42'
   text: To ensure that future benchmarks in the document-layout analysis community
     can be easily compared, we have split up DocLayNet into pre-defined train-, test-
     and validation-sets. In this way, we can avoid spurious variations in the evaluation
     scores due to random splitting in train-, test- and validation-sets. We also ensured
     that less frequent labels are represented in train and test sets in equal proportions.
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/43
-  hash: 13873304636238013732
+  hash: 12951529389829894886
   label: footnote
   orig: $^{2}$e.g. AAPL from https://www.annualreports.com/
   parent:
@@ -1508,10 +11241,10 @@ texts:
     - 0
     - 51
     page_no: 3
+  self_ref: '#/texts/43'
   text: $^{2}$e.g. AAPL from https://www.annualreports.com/
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/44
-  hash: 6259711523792429489
+  hash: 18123465042552214371
   label: text
   orig: Table 1 shows the overall frequency and distribution of the labels among the
     different sets. Importantly, we ensure that subsets are only split on full-document
@@ -1532,6 +11265,7 @@ texts:
     - 0
     - 435
     page_no: 3
+  self_ref: '#/texts/44'
   text: Table 1 shows the overall frequency and distribution of the labels among the
     different sets. Importantly, we ensure that subsets are only split on full-document
     boundaries. This avoids that pages of the same document are spread over train,
@@ -1539,8 +11273,7 @@ texts:
     and lead to overestimation of their prediction accuracy. We will show the impact
     of this decision in Section 5.
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/45
-  hash: 9126253445878309540
+  hash: 14073537799186996310
   label: text
   orig: "In order to accommodate the different types of models currently in use by\
     \ the community, we provide DocLayNet in an augmented COCO format [16]. This entails\
@@ -1564,6 +11297,7 @@ texts:
     - 0
     - 645
     page_no: 3
+  self_ref: '#/texts/45'
   text: "In order to accommodate the different types of models currently in use by\
     \ the community, we provide DocLayNet in an augmented COCO format [16]. This entails\
     \ the standard COCO ground-truth file (in JSON format) with the associated page\
@@ -1574,8 +11308,7 @@ texts:
     \ (in JSON). All additional files are linked to the primary page images by their\
     \ matching filenames."
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/46
-  hash: 17722516482300246985
+  hash: 18386447901530976124
   label: text
   orig: Despite being cost-intense and far less scalable than automation, human annotation
     has several benefits over automated groundtruth generation. The first and most
@@ -1613,6 +11346,7 @@ texts:
     - 0
     - 1854
     page_no: 3
+  self_ref: '#/texts/46'
   text: Despite being cost-intense and far less scalable than automation, human annotation
     has several benefits over automated groundtruth generation. The first and most
     obvious reason to leverage human annotations is the freedom to annotate any type
@@ -1637,8 +11371,7 @@ texts:
     (see Table 1). On the flip side, achieving high annotation consistency proved
     to be a key challenge in human annotation, as we outline in Section 4.
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/47
-  hash: 8217803899333050095
+  hash: 4988296339029657054
   label: section_header
   orig: 4 ANNOTATION CAMPAIGN
   parent:
@@ -1654,10 +11387,10 @@ texts:
     - 0
     - 21
     page_no: 3
+  self_ref: '#/texts/47'
   text: 4 ANNOTATION CAMPAIGN
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/48
-  hash: 11051011402111064878
+  hash: 9049409901183805554
   label: text
   orig: The annotation campaign was carried out in four phases. In phase one, we identified
     and prepared the data sources for annotation. In phase two, we determined the
@@ -1678,6 +11411,7 @@ texts:
     - 0
     - 457
     page_no: 3
+  self_ref: '#/texts/48'
   text: The annotation campaign was carried out in four phases. In phase one, we identified
     and prepared the data sources for annotation. In phase two, we determined the
     class labels and how annotations should be done on the documents in order to obtain
@@ -1685,8 +11419,7 @@ texts:
     and exhaustive experiments. In phase three, we trained the annotation staff and
     performed exams for quality assurance. In phase four,
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/49
-  hash: 6768525952307611424
+  hash: 5565740545948543009
   label: page_header
   orig: "KDD \u201922, August 14-18, 2022, Washington, DC, USA Birgit Pfitzmann, Christoph\
     \ Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar"
@@ -1703,11 +11436,11 @@ texts:
     - 0
     - 130
     page_no: 4
+  self_ref: '#/texts/49'
   text: "KDD \u201922, August 14-18, 2022, Washington, DC, USA Birgit Pfitzmann, Christoph\
     \ Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar"
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/50
-  hash: 5520931533029632037
+  hash: 7404461851293909485
   label: caption
   orig: ''
   parent:
@@ -1723,10 +11456,10 @@ texts:
     - 0
     - 0
     page_no: 4
+  self_ref: '#/texts/50'
   text: ''
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/51
-  hash: 10610193690990616567
+  hash: 11276398568228129142
   label: text
   orig: we distributed the annotation workload and performed continuous quality controls.
     Phase one and two required a small team of experts only. For phases three and
@@ -1744,12 +11477,12 @@ texts:
     - 0
     - 231
     page_no: 4
+  self_ref: '#/texts/51'
   text: we distributed the annotation workload and performed continuous quality controls.
     Phase one and two required a small team of experts only. For phases three and
     four, a group of 40 dedicated annotators were assembled and supervised.
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/52
-  hash: 8450678124529756923
+  hash: 5214138199614338209
   label: text
   orig: 'Phase 1: Data selection and preparation. Our inclusion criteria for documents
     were described in Section 3. A large effort went into ensuring that all documents
@@ -1767,12 +11500,12 @@ texts:
     - 0
     - 193
     page_no: 4
+  self_ref: '#/texts/52'
   text: 'Phase 1: Data selection and preparation. Our inclusion criteria for documents
     were described in Section 3. A large effort went into ensuring that all documents
     are free to use. The data sources'
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/53
-  hash: 12151724778915504838
+  hash: 1652349545620087391
   label: text
   orig: include publication repositories such as arXiv$^{3}$, government offices,
     company websites as well as data directory services for financial reports and
@@ -1792,14 +11525,14 @@ texts:
     - 0
     - 376
     page_no: 4
+  self_ref: '#/texts/53'
   text: include publication repositories such as arXiv$^{3}$, government offices,
     company websites as well as data directory services for financial reports and
     patents. Scanned documents were excluded wherever possible because they can be
     rotated or skewed. This would not allow us to perform annotation with rectangular
     bounding-boxes and therefore complicate the annotation process.
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/54
-  hash: 15887983992023577324
+  hash: 16847780312916319884
   label: text
   orig: Preparation work included uploading and parsing the sourced PDF documents
     in the Corpus Conversion Service (CCS) [22], a cloud-native platform which provides
@@ -1824,6 +11557,7 @@ texts:
     - 0
     - 746
     page_no: 4
+  self_ref: '#/texts/54'
   text: Preparation work included uploading and parsing the sourced PDF documents
     in the Corpus Conversion Service (CCS) [22], a cloud-native platform which provides
     a visual annotation interface and allows for dataset inspection and analysis.
@@ -1835,8 +11569,7 @@ texts:
     detection models from PubLayNet, which helped us estimate how many figures and
     tables a given page contains.
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/55
-  hash: 11975880209884411763
+  hash: 853856630909111423
   label: text
   orig: 'Phase 2: Label selection and guideline. We reviewed the collected documents
     and identified the most common structural features they exhibit. This was achieved
@@ -1866,6 +11599,7 @@ texts:
     - 0
     - 1159
     page_no: 4
+  self_ref: '#/texts/55'
   text: 'Phase 2: Label selection and guideline. We reviewed the collected documents
     and identified the most common structural features they exhibit. This was achieved
     by identifying recurrent layout elements and lead us to the definition of 11 distinct
@@ -1882,8 +11616,7 @@ texts:
     Labels such as Author and Affiliation , as seen in DocBank, are often only distinguishable
     by discriminating on'
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/56
-  hash: 723840717012406728
+  hash: 10820126744691405163
   label: footnote
   orig: $^{3}$https://arxiv.org/
   parent:
@@ -1899,10 +11632,10 @@ texts:
     - 0
     - 24
     page_no: 4
+  self_ref: '#/texts/56'
   text: $^{3}$https://arxiv.org/
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/57
-  hash: 15020658425504633198
+  hash: 15137025955302266836
   label: page_header
   orig: 'DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis'
   parent:
@@ -1918,10 +11651,10 @@ texts:
     - 0
     - 71
     page_no: 5
+  self_ref: '#/texts/57'
   text: 'DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis'
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/58
-  hash: 17688098678887076514
+  hash: 16594789809220859729
   label: page_header
   orig: "KDD \u201922, August 14-18, 2022, Washington, DC, USA"
   parent:
@@ -1937,10 +11670,10 @@ texts:
     - 0
     - 48
     page_no: 5
+  self_ref: '#/texts/58'
   text: "KDD \u201922, August 14-18, 2022, Washington, DC, USA"
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/59
-  hash: 938373213925944417
+  hash: 13348689006806549459
   label: text
   orig: the textual content of an element, which goes beyond visual layout recognition,
     in particular outside the Scientific Articles category.
@@ -1957,11 +11690,11 @@ texts:
     - 0
     - 135
     page_no: 5
+  self_ref: '#/texts/59'
   text: the textual content of an element, which goes beyond visual layout recognition,
     in particular outside the Scientific Articles category.
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/60
-  hash: 11375379645979730878
+  hash: 2899629868160691152
   label: text
   orig: At first sight, the task of visual document-layout interpretation appears
     intuitive enough to obtain plausible annotations in most cases. However, during
@@ -1987,6 +11720,7 @@ texts:
     - 0
     - 812
     page_no: 5
+  self_ref: '#/texts/60'
   text: At first sight, the task of visual document-layout interpretation appears
     intuitive enough to obtain plausible annotations in most cases. However, during
     early trial-runs in the core team, we observed many cases in which annotators
@@ -1999,8 +11733,7 @@ texts:
     in Figure 4 multiple examples of plausible but inconsistent annotations on the
     same pages.
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/61
-  hash: 9627588927681567008
+  hash: 9530250172263586436
   label: text
   orig: 'Obviously, this inconsistency in annotations is not desirable for datasets
     which are intended to be used for model training. To minimise these inconsistencies,
@@ -2021,6 +11754,7 @@ texts:
     - 0
     - 465
     page_no: 5
+  self_ref: '#/texts/61'
   text: 'Obviously, this inconsistency in annotations is not desirable for datasets
     which are intended to be used for model training. To minimise these inconsistencies,
     we created a detailed annotation guideline. While perfect consistency across 40
@@ -2028,8 +11762,7 @@ texts:
     in annotation consistency after the introduction of our annotation guideline.
     A few selected, non-trivial highlights of the guideline are:'
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/62
-  hash: 5251173547193772936
+  hash: 16012774257828431546
   label: list_item
   orig: (1) Every list-item is an individual object instance with class label List-item
     . This definition is different from PubLayNet and DocBank, where all list-items
@@ -2047,12 +11780,12 @@ texts:
     - 0
     - 202
     page_no: 5
+  self_ref: '#/texts/62'
   text: (1) Every list-item is an individual object instance with class label List-item
     . This definition is different from PubLayNet and DocBank, where all list-items
     are grouped together into one List object.
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/63
-  hash: 4202075218951637034
+  hash: 15871134704762394430
   label: list_item
   orig: (2) A List-item is a paragraph with hanging indentation. Singleline elements
     can qualify as List-item if the neighbour elements expose hanging indentation.
@@ -2070,12 +11803,12 @@ texts:
     - 0
     - 208
     page_no: 5
+  self_ref: '#/texts/63'
   text: (2) A List-item is a paragraph with hanging indentation. Singleline elements
     can qualify as List-item if the neighbour elements expose hanging indentation.
     Bullet or enumeration symbols are not a requirement.
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/64
-  hash: 1780046845976491258
+  hash: 17395883987622603550
   label: list_item
   orig: (3) For every Caption , there must be exactly one corresponding Picture or
     Table .
@@ -2092,11 +11825,11 @@ texts:
     - 0
     - 82
     page_no: 5
+  self_ref: '#/texts/64'
   text: (3) For every Caption , there must be exactly one corresponding Picture or
     Table .
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/65
-  hash: 3653862969821232020
+  hash: 7087799182536849687
   label: list_item
   orig: (4) Connected sub-pictures are grouped together in one Picture object.
   parent:
@@ -2112,10 +11845,10 @@ texts:
     - 0
     - 70
     page_no: 5
+  self_ref: '#/texts/65'
   text: (4) Connected sub-pictures are grouped together in one Picture object.
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/66
-  hash: 5448053117976841193
+  hash: 8990831048276827416
   label: list_item
   orig: (5) Formula numbers are included in a Formula object.
   parent:
@@ -2131,10 +11864,10 @@ texts:
     - 0
     - 53
     page_no: 5
+  self_ref: '#/texts/66'
   text: (5) Formula numbers are included in a Formula object.
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/67
-  hash: 5907142507865067888
+  hash: 15080088485878212709
   label: list_item
   orig: (6) Emphasised text (e.g. in italic or bold) at the beginning of a paragraph
     is not considered a Section-header , unless it appears exclusively on its own
@@ -2152,12 +11885,12 @@ texts:
     - 0
     - 160
     page_no: 5
+  self_ref: '#/texts/67'
   text: (6) Emphasised text (e.g. in italic or bold) at the beginning of a paragraph
     is not considered a Section-header , unless it appears exclusively on its own
     line.
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/68
-  hash: 13967274008596264343
+  hash: 340695852381472969
   label: text
   orig: The complete annotation guideline is over 100 pages long and a detailed description
     is obviously out of scope for this paper. Nevertheless, it will be made publicly
@@ -2175,12 +11908,12 @@ texts:
     - 0
     - 221
     page_no: 5
+  self_ref: '#/texts/68'
   text: The complete annotation guideline is over 100 pages long and a detailed description
     is obviously out of scope for this paper. Nevertheless, it will be made publicly
     available alongside with DocLayNet for future reference.
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/69
-  hash: 889554423716143140
+  hash: 18427360413523808725
   label: text
   orig: 'Phase 3: Training. After a first trial with a small group of people, we realised
     that providing the annotation guideline and a set of random practice pages did
@@ -2205,6 +11938,7 @@ texts:
     - 0
     - 792
     page_no: 5
+  self_ref: '#/texts/69'
   text: 'Phase 3: Training. After a first trial with a small group of people, we realised
     that providing the annotation guideline and a set of random practice pages did
     not yield the desired quality level for layout annotation. Therefore we prepared
@@ -2216,8 +11950,7 @@ texts:
     the reference. Only after passing two exam levels with high annotation quality,
     staff were admitted into the production phase. Practice iterations'
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/70
-  hash: 15729892622341582110
+  hash: 17039526178394070759
   label: caption
   orig: 'Figure 4: Examples of plausible annotation alternatives for the same page.
     Criteria in our annotation guideline can resolve cases '
@@ -2234,11 +11967,11 @@ texts:
     - 0
     - 130
     page_no: 6
+  self_ref: '#/texts/70'
   text: 'Figure 4: Examples of plausible annotation alternatives for the same page.
     Criteria in our annotation guideline can resolve cases '
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/71
-  hash: 14428809639626034083
+  hash: 9171696979434376961
   label: text
   orig: were carried out over a timeframe of 12 weeks, after which 8 of the 40 initially
     allocated annotators did not pass the bar.
@@ -2255,11 +11988,11 @@ texts:
     - 0
     - 123
     page_no: 5
+  self_ref: '#/texts/71'
   text: were carried out over a timeframe of 12 weeks, after which 8 of the 40 initially
     allocated annotators did not pass the bar.
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/72
-  hash: 15056578085083744975
+  hash: 258948183867512950
   label: text
   orig: 'Phase 4: Production annotation. The previously selected 80K pages were annotated
     with the defined 11 class labels by 32 annotators. This production phase took
@@ -2287,6 +12020,7 @@ texts:
     - 0
     - 987
     page_no: 5
+  self_ref: '#/texts/72'
   text: 'Phase 4: Production annotation. The previously selected 80K pages were annotated
     with the defined 11 class labels by 32 annotators. This production phase took
     around three months to complete. All annotations were created online through CCS,
@@ -2301,8 +12035,7 @@ texts:
     which could skew the numbers of the inter-annotator agreement (see Table 1). We
     wanted'
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/73
-  hash: 2641059782471010186
+  hash: 6581136044412754340
   label: caption
   orig: 'Table 2: Prediction performance (mAP@0.5-0.95) of object detection networks
     on DocLayNet test set. The MRCNN (Mask R-CNN) and FRCNN (Faster R-CNN) models
@@ -2324,6 +12057,7 @@ texts:
     - 0
     - 584
     page_no: 6
+  self_ref: '#/texts/73'
   text: 'Table 2: Prediction performance (mAP@0.5-0.95) of object detection networks
     on DocLayNet test set. The MRCNN (Mask R-CNN) and FRCNN (Faster R-CNN) models
     with ResNet-50 or ResNet-101 backbone were trained based on the network architectures
@@ -2332,8 +12066,7 @@ texts:
     [13]. All models were initialised using pre-trained weights from the COCO 2017
     dataset.'
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/74
-  hash: 2611073847515650604
+  hash: 7321418918830647717
   label: text
   orig: to avoid this at any cost in order to have clear, unbiased baseline numbers
     for human document-layout annotation. Third, we introduced the feature of snapping
@@ -2364,6 +12097,7 @@ texts:
     - 0
     - 1252
     page_no: 6
+  self_ref: '#/texts/74'
   text: to avoid this at any cost in order to have clear, unbiased baseline numbers
     for human document-layout annotation. Third, we introduced the feature of snapping
     boxes around text segments to obtain a pixel-accurate annotation and again reduce
@@ -2381,8 +12115,7 @@ texts:
     annotation staff managed to annotate a single page in a typical timeframe of 20s
     to 60s, depending on its complexity.
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/75
-  hash: 19275708379815350
+  hash: 1563151945244295808
   label: section_header
   orig: 5 EXPERIMENTS
   parent:
@@ -2398,10 +12131,10 @@ texts:
     - 0
     - 13
     page_no: 6
+  self_ref: '#/texts/75'
   text: 5 EXPERIMENTS
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/76
-  hash: 12611643145785449119
+  hash: 17481407849465975419
   label: caption
   orig: 'Figure 5: Prediction performance (mAP@0.5-0.95) of a Mask R-CNN network'
   parent:
@@ -2417,10 +12150,10 @@ texts:
     - 0
     - 71
     page_no: 7
+  self_ref: '#/texts/76'
   text: 'Figure 5: Prediction performance (mAP@0.5-0.95) of a Mask R-CNN network'
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/77
-  hash: 10405449111938146973
+  hash: 7803676745991559807
   label: text
   orig: paper and leave the detailed evaluation of more recent methods mentioned in
     Section 2 for future work.
@@ -2437,11 +12170,11 @@ texts:
     - 0
     - 102
     page_no: 6
+  self_ref: '#/texts/77'
   text: paper and leave the detailed evaluation of more recent methods mentioned in
     Section 2 for future work.
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/78
-  hash: 5970556147693056683
+  hash: 17701866649923828991
   label: text
   orig: In this section, we will present several aspects related to the performance
     of object detection models on DocLayNet. Similarly as in PubLayNet, we will evaluate
@@ -2461,14 +12194,14 @@ texts:
     - 0
     - 397
     page_no: 6
+  self_ref: '#/texts/78'
   text: In this section, we will present several aspects related to the performance
     of object detection models on DocLayNet. Similarly as in PubLayNet, we will evaluate
     the quality of their predictions using mean average precision (mAP) with 10 overlaps
     that range from 0.5 to 0.95 in steps of 0.05 (mAP@0.5-0.95). These scores are
     computed by leveraging the evaluation code provided by the COCO API [16].
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/79
-  hash: 7797862272567426572
+  hash: 13749192044324312506
   label: section_header
   orig: Baselines for Object Detection
   parent:
@@ -2484,10 +12217,10 @@ texts:
     - 0
     - 30
     page_no: 6
+  self_ref: '#/texts/79'
   text: Baselines for Object Detection
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/80
-  hash: 7611035121604324850
+  hash: 14253474500249711401
   label: text
   orig: "In Table 2, we present baseline experiments (given in mAP) on Mask R-CNN\
     \ [12], Faster R-CNN [11], and YOLOv5 [13]. Both training and evaluation were\
@@ -2517,6 +12250,7 @@ texts:
     - 0
     - 1146
     page_no: 6
+  self_ref: '#/texts/80'
   text: "In Table 2, we present baseline experiments (given in mAP) on Mask R-CNN\
     \ [12], Faster R-CNN [11], and YOLOv5 [13]. Both training and evaluation were\
     \ performed on RGB images with dimensions of 1025 \xD7 1025 pixels. For training,\
@@ -2533,8 +12267,7 @@ texts:
     \ . This is not entirely surprising, as Text , Table and Picture are abundant\
     \ and the most visually distinctive in a document."
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/81
-  hash: 4524736109232879114
+  hash: 15112326022753491828
   label: page_header
   orig: "KDD \u201922, August 14-18, 2022, Washington, DC, USA"
   parent:
@@ -2550,10 +12283,10 @@ texts:
     - 0
     - 48
     page_no: 7
+  self_ref: '#/texts/81'
   text: "KDD \u201922, August 14-18, 2022, Washington, DC, USA"
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/82
-  hash: 8640149219266946286
+  hash: 4483721399241932553
   label: caption
   orig: 'Table 3: Performance of a Mask R-CNN R50 network in mAP@0.5-0.95 scores trained
     on DocLayNet with different class label sets. The reduced label sets were obtained
@@ -2571,12 +12304,12 @@ texts:
     - 0
     - 189
     page_no: 7
+  self_ref: '#/texts/82'
   text: 'Table 3: Performance of a Mask R-CNN R50 network in mAP@0.5-0.95 scores trained
     on DocLayNet with different class label sets. The reduced label sets were obtained
     by either down-mapping or '
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/83
-  hash: 6812192561276511295
+  hash: 12787012541434343355
   label: section_header
   orig: Learning Curve
   parent:
@@ -2592,10 +12325,10 @@ texts:
     - 0
     - 14
     page_no: 7
+  self_ref: '#/texts/83'
   text: Learning Curve
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/84
-  hash: 5745526209173602420
+  hash: 10649518406512702432
   label: text
   orig: One of the fundamental questions related to any dataset is if it is "large
     enough". To answer this question for DocLayNet, we performed a data ablation study
@@ -2625,6 +12358,7 @@ texts:
     - 0
     - 1157
     page_no: 7
+  self_ref: '#/texts/84'
   text: One of the fundamental questions related to any dataset is if it is "large
     enough". To answer this question for DocLayNet, we performed a data ablation study
     in which we evaluated a Mask R-CNN model trained on increasing fractions of the
@@ -2641,8 +12375,7 @@ texts:
     in Section 3), data augmentation methods [23], or the addition of more document
     categories and styles.
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/85
-  hash: 7824280854281589640
+  hash: 15793718183219261273
   label: section_header
   orig: Impact of Class Labels
   parent:
@@ -2658,10 +12391,10 @@ texts:
     - 0
     - 22
     page_no: 7
+  self_ref: '#/texts/85'
   text: Impact of Class Labels
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/86
-  hash: 17522119297822048539
+  hash: 3813128593852817568
   label: text
   orig: "The choice and number of labels can have a significant effect on the overall\
     \ model performance. Since PubLayNet, DocBank and DocLayNet all have different\
@@ -2688,6 +12421,7 @@ texts:
     - 0
     - 910
     page_no: 7
+  self_ref: '#/texts/86'
   text: "The choice and number of labels can have a significant effect on the overall\
     \ model performance. Since PubLayNet, DocBank and DocLayNet all have different\
     \ label sets, it is of particular interest to understand and quantify this influence\
@@ -2701,8 +12435,7 @@ texts:
     \ labels respectively. The set of 5 labels contains the same labels as PubLayNet.\
     \ However, due to the different definition of"
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/87
-  hash: 12736595303563933946
+  hash: 17259116100352000220
   label: caption
   orig: 'Table 4: Performance of a Mask R-CNN R50 network with document-wise and page-wise
     split for different label sets. Naive page-wise '
@@ -2719,11 +12452,11 @@ texts:
     - 0
     - 130
     page_no: 7
+  self_ref: '#/texts/87'
   text: 'Table 4: Performance of a Mask R-CNN R50 network with document-wise and page-wise
     split for different label sets. Naive page-wise '
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/88
-  hash: 7783869837125225
+  hash: 1606264416890747238
   label: text
   orig: lists in PubLayNet (grouped list-items) versus DocLayNet (separate list-items),
     the label set of size 4 is the closest to PubLayNet, in the assumption that the
@@ -2744,6 +12477,7 @@ texts:
     - 0
     - 469
     page_no: 7
+  self_ref: '#/texts/88'
   text: lists in PubLayNet (grouped list-items) versus DocLayNet (separate list-items),
     the label set of size 4 is the closest to PubLayNet, in the assumption that the
     List is down-mapped to Text in PubLayNet. The results in Table 3 show that the
@@ -2751,8 +12485,7 @@ texts:
     when other classes are merged into them. The overall macro-average improves by
     around 5%, in particular when Page-footer and Page-header are excluded.
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/89
-  hash: 5117058535300881242
+  hash: 9916492842404420210
   label: section_header
   orig: Impact of Document Split in Train and Test Set
   parent:
@@ -2768,10 +12501,10 @@ texts:
     - 0
     - 46
     page_no: 7
+  self_ref: '#/texts/89'
   text: Impact of Document Split in Train and Test Set
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/90
-  hash: 1569991188631703948
+  hash: 7367861061956826313
   label: text
   orig: "Many documents in DocLayNet have a unique styling. In order to avoid overfitting\
     \ on a particular style, we have split the train-, test- and validation-sets of\
@@ -2797,6 +12530,7 @@ texts:
     - 0
     - 852
     page_no: 7
+  self_ref: '#/texts/90'
   text: "Many documents in DocLayNet have a unique styling. In order to avoid overfitting\
     \ on a particular style, we have split the train-, test- and validation-sets of\
     \ DocLayNet on document boundaries, i.e. every document contributes pages to only\
@@ -2809,8 +12543,7 @@ texts:
     \ Thus, random page-wise splitting of DocLayNet can easily lead to accidental\
     \ overestimation of model performance and should be avoided."
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/91
-  hash: 16424003151594388576
+  hash: 12266316638387602552
   label: section_header
   orig: Dataset Comparison
   parent:
@@ -2826,10 +12559,10 @@ texts:
     - 0
     - 18
     page_no: 7
+  self_ref: '#/texts/91'
   text: Dataset Comparison
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/92
-  hash: 3914983503582730759
+  hash: 11873954123712452037
   label: text
   orig: Throughout this paper, we claim that DocLayNet's wider variety of document
     layouts leads to more robust layout detection models. In Table 5, we provide evidence
@@ -2851,6 +12584,7 @@ texts:
     - 0
     - 521
     page_no: 7
+  self_ref: '#/texts/92'
   text: Throughout this paper, we claim that DocLayNet's wider variety of document
     layouts leads to more robust layout detection models. In Table 5, we provide evidence
     for that. We trained models on each of the available datasets (PubLayNet, DocBank
@@ -2859,8 +12593,7 @@ texts:
     possible. Hence, we focussed on the common labels among the datasets. Between
     PubLayNet and DocLayNet, these are Picture ,
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/93
-  hash: 1407046376659880848
+  hash: 351624657815269469
   label: caption
   orig: 'Table 5: Prediction Performance (mAP@0.5-0.95) of a Mask R-CNN R50 network
     across the PubLayNet, DocBank & DocLayNet data-sets. By evaluating on common label
@@ -2879,13 +12612,13 @@ texts:
     - 0
     - 573
     page_no: 8
+  self_ref: '#/texts/93'
   text: 'Table 5: Prediction Performance (mAP@0.5-0.95) of a Mask R-CNN R50 network
     across the PubLayNet, DocBank & DocLayNet data-sets. By evaluating on common label
     classes of each dataset, we observe that the DocLayNet-trained model has much
     less pronounced variations in performance across all datasets.'
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/94
-  hash: 908797690688183444
+  hash: 9365044532577791157
   label: text
   orig: Section-header , Table and Text . Before training, we either mapped or excluded
     DocLayNet's other labels as specified in table 3, and also PubLayNet's List to
@@ -2904,13 +12637,13 @@ texts:
     - 0
     - 295
     page_no: 8
+  self_ref: '#/texts/94'
   text: Section-header , Table and Text . Before training, we either mapped or excluded
     DocLayNet's other labels as specified in table 3, and also PubLayNet's List to
     Text . Note that the different clustering of lists (by list-element vs. whole
     list objects) naturally decreases the mAP score for Text .
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/95
-  hash: 10214562574915474626
+  hash: 10416729032588567700
   label: text
   orig: For comparison of DocBank with DocLayNet, we trained only on Picture and Table
     clusters of each dataset. We had to exclude Text because successive paragraphs
@@ -2935,6 +12668,7 @@ texts:
     - 0
     - 793
     page_no: 8
+  self_ref: '#/texts/95'
   text: For comparison of DocBank with DocLayNet, we trained only on Picture and Table
     clusters of each dataset. We had to exclude Text because successive paragraphs
     are often grouped together into a single object in DocBank. This paragraph grouping
@@ -2946,8 +12680,7 @@ texts:
     Thus we conclude that DocLayNet trained models are overall more robust and will
     produce better results for challenging, unseen layouts.
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/96
-  hash: 13986119087538501170
+  hash: 2679590577170717304
   label: section_header
   orig: Example Predictions
   parent:
@@ -2963,10 +12696,10 @@ texts:
     - 0
     - 19
     page_no: 8
+  self_ref: '#/texts/96'
   text: Example Predictions
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/97
-  hash: 1038155047615801598
+  hash: 16470836271801010735
   label: text
   orig: To conclude this section, we illustrate the quality of layout predictions
     one can expect from DocLayNet-trained models by providing a selection of examples
@@ -2987,6 +12720,7 @@ texts:
     - 0
     - 481
     page_no: 8
+  self_ref: '#/texts/97'
   text: To conclude this section, we illustrate the quality of layout predictions
     one can expect from DocLayNet-trained models by providing a selection of examples
     without any further post-processing applied. Figure 6 shows selected layout predictions
@@ -2994,8 +12728,7 @@ texts:
     document categories, however one can also observe mistakes such as overlapping
     clusters of different classes, or entirely missing boxes due to low confidence.
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/98
-  hash: 8801089031972856173
+  hash: 6141036410505996597
   label: section_header
   orig: 6 CONCLUSION
   parent:
@@ -3011,10 +12744,10 @@ texts:
     - 0
     - 12
     page_no: 8
+  self_ref: '#/texts/98'
   text: 6 CONCLUSION
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/99
-  hash: 15710626894768820561
+  hash: 6656340812057528524
   label: text
   orig: In this paper, we presented the DocLayNet dataset. It provides the document
     conversion and layout analysis research community a new and challenging dataset
@@ -3036,6 +12769,7 @@ texts:
     - 0
     - 507
     page_no: 8
+  self_ref: '#/texts/99'
   text: In this paper, we presented the DocLayNet dataset. It provides the document
     conversion and layout analysis research community a new and challenging dataset
     to improve and fine-tune novel ML methods on. In contrast to many other datasets,
@@ -3044,8 +12778,7 @@ texts:
     of documents outside the scientific publishing domain adds significant value in
     this respect.
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/100
-  hash: 5999203225419292280
+  hash: 5486338775911770271
   label: text
   orig: To date, there is still a significant gap between human and ML accuracy on
     the layout interpretation task, and we hope that this work will inspire the research
@@ -3063,12 +12796,12 @@ texts:
     - 0
     - 188
     page_no: 8
+  self_ref: '#/texts/100'
   text: To date, there is still a significant gap between human and ML accuracy on
     the layout interpretation task, and we hope that this work will inspire the research
     community to close that gap.
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/101
-  hash: 4445410344359338123
+  hash: 1559959727945683203
   label: section_header
   orig: REFERENCES
   parent:
@@ -3084,10 +12817,10 @@ texts:
     - 0
     - 10
     page_no: 8
+  self_ref: '#/texts/101'
   text: REFERENCES
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/102
-  hash: 16616106884325138631
+  hash: 3725067383473502802
   label: list_item
   orig: "[1] Max G\xF6bel, Tamir Hassan, Ermelinda Oro, and Giorgio Orsi. Icdar 2013\
     \ table competition. In 2013 12th International Conference on Document Analysis\
@@ -3105,12 +12838,12 @@ texts:
     - 0
     - 191
     page_no: 8
+  self_ref: '#/texts/102'
   text: "[1] Max G\xF6bel, Tamir Hassan, Ermelinda Oro, and Giorgio Orsi. Icdar 2013\
     \ table competition. In 2013 12th International Conference on Document Analysis\
     \ and Recognition , pages 1449-1453, 2013."
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/103
-  hash: 16787040176255257341
+  hash: 12754243453844555097
   label: list_item
   orig: '[2] Christian Clausner, Apostolos Antonacopoulos, and Stefan Pletschacher.
     Icdar2017 competition on recognition of documents with complex layouts rdcl2017.
@@ -3129,13 +12862,13 @@ texts:
     - 0
     - 279
     page_no: 8
+  self_ref: '#/texts/103'
   text: '[2] Christian Clausner, Apostolos Antonacopoulos, and Stefan Pletschacher.
     Icdar2017 competition on recognition of documents with complex layouts rdcl2017.
     In 2017 14th IAPR International Conference on Document Analysis and Recognition
     (ICDAR) , volume 01, pages 1404-1410, 2017.'
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/104
-  hash: 16229494543393695243
+  hash: 829063111244650808
   label: list_item
   orig: "[3] Herv\xE9 D\xE9jean, Jean-Luc Meunier, Liangcai Gao, Yilun Huang, Yu Fang,\
     \ Florian Kleber, and Eva-Maria Lang. ICDAR 2019 Competition on Table Detection\
@@ -3153,12 +12886,12 @@ texts:
     - 0
     - 213
     page_no: 8
+  self_ref: '#/texts/104'
   text: "[3] Herv\xE9 D\xE9jean, Jean-Luc Meunier, Liangcai Gao, Yilun Huang, Yu Fang,\
     \ Florian Kleber, and Eva-Maria Lang. ICDAR 2019 Competition on Table Detection\
     \ and Recognition (cTDaR), April 2019. http://sac.founderit.com/."
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/105
-  hash: 15423145939859734104
+  hash: 11422390329416293389
   label: list_item
   orig: '[4] Antonio Jimeno Yepes, Peter Zhong, and Douglas Burdick. Competition on
     scientific literature parsing. In Proceedings of the International Conference
@@ -3177,13 +12910,13 @@ texts:
     - 0
     - 251
     page_no: 8
+  self_ref: '#/texts/105'
   text: '[4] Antonio Jimeno Yepes, Peter Zhong, and Douglas Burdick. Competition on
     scientific literature parsing. In Proceedings of the International Conference
     on Document Analysis and Recognition , ICDAR, pages 605-617. LNCS 12824, SpringerVerlag,
     sep 2021.'
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/106
-  hash: 5249151387680038785
+  hash: 6413209670831350329
   label: list_item
   orig: '[5] Logan Markewich, Hao Zhang, Yubin Xing, Navid Lambert-Shirzad, Jiang
     Zhexin, Roy Lee, Zhi Li, and Seok-Bum Ko. Segmentation for document layout analysis:
@@ -3202,13 +12935,13 @@ texts:
     - 0
     - 261
     page_no: 8
+  self_ref: '#/texts/106'
   text: '[5] Logan Markewich, Hao Zhang, Yubin Xing, Navid Lambert-Shirzad, Jiang
     Zhexin, Roy Lee, Zhi Li, and Seok-Bum Ko. Segmentation for document layout analysis:
     not dead yet. International Journal on Document Analysis and Recognition (IJDAR)
     , pages 1-11, 01 2022.'
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/107
-  hash: 16511389590086473870
+  hash: 11041707939752995350
   label: list_item
   orig: '[6] Xu Zhong, Jianbin Tang, and Antonio Jimeno-Yepes. Publaynet: Largest
     dataset ever for document layout analysis. In Proceedings of the International
@@ -3227,13 +12960,13 @@ texts:
     - 0
     - 235
     page_no: 8
+  self_ref: '#/texts/107'
   text: '[6] Xu Zhong, Jianbin Tang, and Antonio Jimeno-Yepes. Publaynet: Largest
     dataset ever for document layout analysis. In Proceedings of the International
     Conference on Document Analysis and Recognition , ICDAR, pages 1015-1022, sep
     2019.'
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/108
-  hash: 5841239213590061604
+  hash: 5474854589015621307
   label: list_item
   orig: '[7] Minghao Li, Yiheng Xu, Lei Cui, Shaohan Huang, Furu Wei, Zhoujun Li,
     and Ming Zhou. Docbank: A benchmark dataset for document layout analysis. In Proceedings
@@ -3252,13 +12985,13 @@ texts:
     - 0
     - 316
     page_no: 8
+  self_ref: '#/texts/108'
   text: '[7] Minghao Li, Yiheng Xu, Lei Cui, Shaohan Huang, Furu Wei, Zhoujun Li,
     and Ming Zhou. Docbank: A benchmark dataset for document layout analysis. In Proceedings
     of the 28th International Conference on Computational Linguistics , COLING, pages
     949-960. International Committee on Computational Linguistics, dec 2020.'
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/109
-  hash: 11745041684012725305
+  hash: 5675312684168383010
   label: list_item
   orig: '[8] Riaz Ahmad, Muhammad Tanvir Afzal, and M. Qadir. Information extraction
     from pdf sources based on rule-based system using integrated formats. In SemWebEval@ESWC
@@ -3276,12 +13009,12 @@ texts:
     - 0
     - 172
     page_no: 8
+  self_ref: '#/texts/109'
   text: '[8] Riaz Ahmad, Muhammad Tanvir Afzal, and M. Qadir. Information extraction
     from pdf sources based on rule-based system using integrated formats. In SemWebEval@ESWC
     , 2016.'
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/110
-  hash: 8213734949810000799
+  hash: 1888275270300033803
   label: list_item
   orig: '[9] Ross B. Girshick, Jeff Donahue, Trevor Darrell, and Jitendra Malik. Rich
     feature hierarchies for accurate object detection and semantic segmentation. In
@@ -3300,13 +13033,13 @@ texts:
     - 0
     - 271
     page_no: 8
+  self_ref: '#/texts/110'
   text: '[9] Ross B. Girshick, Jeff Donahue, Trevor Darrell, and Jitendra Malik. Rich
     feature hierarchies for accurate object detection and semantic segmentation. In
     IEEE Conference on Computer Vision and Pattern Recognition , CVPR, pages 580-587.
     IEEE Computer Society, jun 2014.'
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/111
-  hash: 4869209929442963000
+  hash: 16377833059391204670
   label: list_item
   orig: '[10] Ross B. Girshick. Fast R-CNN. In 2015 IEEE International Conference
     on Computer Vision , ICCV, pages 1440-1448. IEEE Computer Society, dec 2015.'
@@ -3323,11 +13056,11 @@ texts:
     - 0
     - 149
     page_no: 8
+  self_ref: '#/texts/111'
   text: '[10] Ross B. Girshick. Fast R-CNN. In 2015 IEEE International Conference
     on Computer Vision , ICCV, pages 1440-1448. IEEE Computer Society, dec 2015.'
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/112
-  hash: 16420654594074141837
+  hash: 14299439328578373439
   label: list_item
   orig: '[11] Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. Faster r-cnn:
     Towards real-time object detection with region proposal networks. IEEE Transactions
@@ -3345,12 +13078,12 @@ texts:
     - 0
     - 227
     page_no: 8
+  self_ref: '#/texts/112'
   text: '[11] Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. Faster r-cnn:
     Towards real-time object detection with region proposal networks. IEEE Transactions
     on Pattern Analysis and Machine Intelligence , 39(6):1137-1149, 2017.'
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/113
-  hash: 453358893855311407
+  hash: 6277082541628795791
   label: list_item
   orig: "[12] Kaiming He, Georgia Gkioxari, Piotr Doll\xE1r, and Ross B. Girshick.\
     \ Mask R-CNN. In IEEE International Conference on Computer Vision , ICCV, pages\
@@ -3368,12 +13101,12 @@ texts:
     - 0
     - 192
     page_no: 8
+  self_ref: '#/texts/113'
   text: "[12] Kaiming He, Georgia Gkioxari, Piotr Doll\xE1r, and Ross B. Girshick.\
     \ Mask R-CNN. In IEEE International Conference on Computer Vision , ICCV, pages\
     \ 2980-2988. IEEE Computer Society, Oct 2017."
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/114
-  hash: 3393294654140361785
+  hash: 11395721618283747445
   label: list_item
   orig: '[13] Glenn Jocher, Alex Stoken, Ayush Chaurasia, Jirka Borovec, NanoCode012,
     TaoXie, Yonghye Kwon, Kalen Michael, Liu Changyu, Jiacong Fang, Abhiram V, Laughing,
@@ -3392,13 +13125,13 @@ texts:
     - 0
     - 305
     page_no: 8
+  self_ref: '#/texts/114'
   text: '[13] Glenn Jocher, Alex Stoken, Ayush Chaurasia, Jirka Borovec, NanoCode012,
     TaoXie, Yonghye Kwon, Kalen Michael, Liu Changyu, Jiacong Fang, Abhiram V, Laughing,
     tkianai, yxNONG, Piotr Skalski, Adam Hogan, Jebastin Nadar, imyhxy, Lorenzo Mammana,
     Alex Wang, Cristi Fati, Diego Montes, Jan Hajek, Laurentiu'
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/115
-  hash: 13779849536941554365
+  hash: 10645821634971805003
   label: page_header
   orig: 'DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis'
   parent:
@@ -3414,10 +13147,10 @@ texts:
     - 0
     - 71
     page_no: 9
+  self_ref: '#/texts/115'
   text: 'DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis'
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/116
-  hash: 16675011465179482522
+  hash: 16669730915702730112
   label: page_header
   orig: "KDD \u201922, August 14-18, 2022, Washington, DC, USA"
   parent:
@@ -3433,10 +13166,10 @@ texts:
     - 0
     - 48
     page_no: 9
+  self_ref: '#/texts/116'
   text: "KDD \u201922, August 14-18, 2022, Washington, DC, USA"
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/117
-  hash: 12604501010340547619
+  hash: 10284244877080556618
   label: caption
   orig: 'Figure 6: Example layout predictions on selected pages from the DocLayNet
     test-set. (A, D) exhibit favourable results on coloured backgrounds. (B, C) show
@@ -3454,12 +13187,12 @@ texts:
     - 0
     - 188
     page_no: 9
+  self_ref: '#/texts/117'
   text: 'Figure 6: Example layout predictions on selected pages from the DocLayNet
     test-set. (A, D) exhibit favourable results on coloured backgrounds. (B, C) show
     accurate list-item and paragraph '
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/118
-  hash: 15606020167439278095
+  hash: 15833845093926891587
   label: text
   orig: 'Diaconu, Mai Thanh Minh, Marc, albinxavi, fatih, oleg, and wanghao yang.
     ultralytics/yolov5: v6.0 - yolov5n nano models, roboflow integration, tensorflow
@@ -3477,12 +13210,12 @@ texts:
     - 0
     - 195
     page_no: 9
+  self_ref: '#/texts/118'
   text: 'Diaconu, Mai Thanh Minh, Marc, albinxavi, fatih, oleg, and wanghao yang.
     ultralytics/yolov5: v6.0 - yolov5n nano models, roboflow integration, tensorflow
     export, opencv dnn support, October 2021.'
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/119
-  hash: 14342144244909907366
+  hash: 17115238742689219449
   label: list_item
   orig: '[14] Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier,
     Alexander Kirillov, and Sergey Zagoruyko. End-to-end object detection with transformers.
@@ -3500,12 +13233,12 @@ texts:
     - 0
     - 190
     page_no: 9
+  self_ref: '#/texts/119'
   text: '[14] Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier,
     Alexander Kirillov, and Sergey Zagoruyko. End-to-end object detection with transformers.
     CoRR , abs/2005.12872, 2020.'
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/120
-  hash: 8955370194868803712
+  hash: 9865955364301030107
   label: list_item
   orig: '[15] Mingxing Tan, Ruoming Pang, and Quoc V. Le. Efficientdet: Scalable and
     efficient object detection. CoRR , abs/1911.09070, 2019.'
@@ -3522,11 +13255,11 @@ texts:
     - 0
     - 132
     page_no: 9
+  self_ref: '#/texts/120'
   text: '[15] Mingxing Tan, Ruoming Pang, and Quoc V. Le. Efficientdet: Scalable and
     efficient object detection. CoRR , abs/1911.09070, 2019.'
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/121
-  hash: 13212807811422473787
+  hash: 1886095681092304576
   label: list_item
   orig: "[16] Tsung-Yi Lin, Michael Maire, Serge J. Belongie, Lubomir D. Bourdev,\
     \ Ross B. Girshick, James Hays, Pietro Perona, Deva Ramanan, Piotr Doll\xE1r,\
@@ -3544,12 +13277,12 @@ texts:
     - 0
     - 219
     page_no: 9
+  self_ref: '#/texts/121'
   text: "[16] Tsung-Yi Lin, Michael Maire, Serge J. Belongie, Lubomir D. Bourdev,\
     \ Ross B. Girshick, James Hays, Pietro Perona, Deva Ramanan, Piotr Doll\xE1r,\
     \ and C. Lawrence Zitnick. Microsoft COCO: common objects in context, 2014."
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/122
-  hash: 7441487755804462640
+  hash: 725047809578653716
   label: list_item
   orig: '[17] Yuxin Wu, Alexander Kirillov, Francisco Massa, Wan-Yen Lo, and Ross
     Girshick. Detectron2, 2019.'
@@ -3566,11 +13299,11 @@ texts:
     - 0
     - 100
     page_no: 9
+  self_ref: '#/texts/122'
   text: '[17] Yuxin Wu, Alexander Kirillov, Francisco Massa, Wan-Yen Lo, and Ross
     Girshick. Detectron2, 2019.'
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/123
-  hash: 17408271425993029853
+  hash: 12769367635298110033
   label: list_item
   orig: '[18] Nikolaos Livathinos, Cesar Berrospi, Maksym Lysak, Viktor Kuropiatnyk,
     Ahmed Nassar, Andre Carvalho, Michele Dolfi, Christoph Auer, Kasper Dinkla, and
@@ -3590,14 +13323,14 @@ texts:
     - 0
     - 339
     page_no: 9
+  self_ref: '#/texts/123'
   text: '[18] Nikolaos Livathinos, Cesar Berrospi, Maksym Lysak, Viktor Kuropiatnyk,
     Ahmed Nassar, Andre Carvalho, Michele Dolfi, Christoph Auer, Kasper Dinkla, and
     Peter W. J. Staar. Robust pdf document conversion using recurrent neural networks.
     In Proceedings of the 35th Conference on Artificial Intelligence , AAAI, pages
     1513715145, feb 2021.'
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/124
-  hash: 8781691199018342705
+  hash: 3125069241793675330
   label: list_item
   orig: '[19] Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, and Ming Zhou.
     Layoutlm: Pre-training of text and layout for document image understanding. In
@@ -3617,14 +13350,14 @@ texts:
     - 0
     - 336
     page_no: 9
+  self_ref: '#/texts/124'
   text: '[19] Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, and Ming Zhou.
     Layoutlm: Pre-training of text and layout for document image understanding. In
     Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery
     and Data Mining , KDD, pages 1192-1200, New York, USA, 2020. Association for Computing
     Machinery.'
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/125
-  hash: 2159895940565677367
+  hash: 7474367108240490719
   label: list_item
   orig: '[20] Shoubin Li, Xuyan Ma, Shuaiqun Pan, Jun Hu, Lin Shi, and Qing Wang.
     Vtlayout: Fusion of visual and text features for document layout analysis, 2021.'
@@ -3641,11 +13374,11 @@ texts:
     - 0
     - 153
     page_no: 9
+  self_ref: '#/texts/125'
   text: '[20] Shoubin Li, Xuyan Ma, Shuaiqun Pan, Jun Hu, Lin Shi, and Qing Wang.
     Vtlayout: Fusion of visual and text features for document layout analysis, 2021.'
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/126
-  hash: 15008793456124101567
+  hash: 15036724265562104961
   label: list_item
   orig: '[22] Peter W J Staar, Michele Dolfi, Christoph Auer, and Costas Bekas. Corpus
     conversion service: A machine learning platform to ingest documents at scale.
@@ -3664,13 +13397,13 @@ texts:
     - 0
     - 290
     page_no: 9
+  self_ref: '#/texts/126'
   text: '[22] Peter W J Staar, Michele Dolfi, Christoph Auer, and Costas Bekas. Corpus
     conversion service: A machine learning platform to ingest documents at scale.
     In Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery
     and Data Mining , KDD, pages 774-782. ACM, 2018.'
 - children: []
-  dloc: 5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc#/texts/127
-  hash: 17416865681467935095
+  hash: 17388007058339574232
   label: list_item
   orig: '[23] Connor Shorten and Taghi M. Khoshgoftaar. A survey on image data augmentation
     for deep learning. Journal of Big Data , 6(1):60, 2019.'
@@ -3687,6 +13420,7 @@ texts:
     - 0
     - 138
     page_no: 9
+  self_ref: '#/texts/127'
   text: '[23] Connor Shorten and Taghi M. Khoshgoftaar. A survey on image data augmentation
     for deep learning. Journal of Big Data , 6(1):60, 2019.'
-version: 0.0.1
+version: 0.1.0
diff --git a/test/data/experimental/dummy_doc.yaml b/test/data/experimental/dummy_doc.yaml
index 720a357..8767349 100644
--- a/test/data/experimental/dummy_doc.yaml
+++ b/test/data/experimental/dummy_doc.yaml
@@ -1,14 +1,17 @@
 ---
 ## Document with content + optional layout info
 description: { } # DescriptionType - TBD
-file_info: # FileInfo type
-  document_hash: e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5
-  filename: dummy_doc
+name: "dummy_doc"
+origin:
+  binary_hash: 7954723514066505909
+  filename: dummy_doc.pdf
+  mimetype: application/pdf
+  uri: null
 
 # Root element for any headers, footers, framing, navigation elements, all other non-body text, type GroupItem
 furniture:
   name: "_root_"
-  dloc: "#/furniture"
+  self_ref: "#/furniture"
   parent: null # Only root elements have no parent.
   children: # only the first-level children appear here, as references (RefItem)
     - $ref: "/texts/0"
@@ -16,7 +19,7 @@ furniture:
 # Root element for anything in the document body, type GroupItem
 body:
   name: "_root_"
-  dloc: "#/body"
+  self_ref: "#/body"
   parent: null # Only root elements have no parent.
   children: # only the first-level children appear here, as references (RefItem)
     - $ref: "/texts/1"
@@ -32,7 +35,7 @@ groups: [] # The parent + children relations capture nesting and reading-order.
 texts:
   - orig: "arXiv:2206.01062v1 [cs.CV] 2 Jun 2022"
     text: "arXiv:2206.01062v1 [cs.CV] 2 Jun 2022"
-    dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/texts/0"
+    self_ref: "#/texts/0"
     hash: 132103230
     label: "page_header"
     parent:
@@ -48,8 +51,8 @@ texts:
         charspan: [ 1,423 ] # 2-tuple, references to "orig"
   - orig: "DocLayNet: A Large Human-Annotated Dataset for\nDocument-Layout Analysis"
     text: "DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis"
-    dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/texts/1"
-    hash: 2349732 # uint64 hash of dloc
+    self_ref: "#/texts/1"
+    hash: 2349732 # uint64 hash of self_ref
     label: "title"
     parent:
       $ref: "#/body"
@@ -64,7 +67,7 @@ texts:
         charspan: [ 1,423 ] # 2-tuple, references to "orig"
   - orig: "OPERATION (cont.)" # nested inside the figure
     text: "OPERATION (cont.)"
-    dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/texts/2"
+    self_ref: "#/texts/2"
     hash: 6978483
     label: "section_header"
     parent:
@@ -80,7 +83,7 @@ texts:
         charspan: [ 0,734 ]
   - orig: "Figure 1: Four examples of complex page layouts across dif-\nferent document categories" # nested inside the figure
     text: "Figure 1: Four examples of complex page layouts across different document categories"
-    dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/texts/3"
+    self_ref: "#/texts/3"
     hash: 6978483
     label: "caption"
     parent:
@@ -98,19 +101,19 @@ texts:
 
 
 tables: # All tables...
-  - dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/table/0"
+  - self_ref: "#/table/0"
     hash: 98574
     label: "table"
     parent:
       $ref: "#/body"
     children: [ ]
     image:
-      format: png
+      mimetype: image/png
       dpi: 72
       size:
         width: 231
         height: 351
-      uri: "file:///e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5/tables/0.png"
+      uri: "file:///dummy_doc/tables/0.png"
       #alternatives: base64 encoded striong
     data: # TableData Type
       table_cells: [] # flat list of TableCell type
@@ -128,7 +131,7 @@ tables: # All tables...
         charspan: [ 1,423 ] # 2-tuple, references to "orig"
 
 figures: # All figures...
-  - dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/figures/0"
+  - self_ref: "#/figures/0"
     hash: 7782482
     label: "picture"
     parent:
@@ -141,12 +144,12 @@ figures: # All figures...
       description: "...."
       # content structure?
     image:
-      format: png
+      mimetype: image/png
       dpi: 72
       size:
         width: 231
         height: 351
-      uri: "file:///e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5/figures/0.png"
+      uri: "file:///dummy_doc/figures/0.png"
       #alternatives: base64 encoded striong
     children:
       - $ref: "/texts/2" # This text element appears inside the figure, hence it is a child.
@@ -164,17 +167,17 @@ key_value_items: [ ] # All KV-items
 # We should consider this for pages
 pages: # Optional, for layout documents
   1:
-    hash: "5b0916ed3ead46e69efcddb2c932afd91d0e25ce6828c39e5617e6ee2bd0cf6e"
+    hash: 6203680922337857390
     size:
       width: 768.23
       height: 583.15
     image:
-      format: png
+      mimetype: image/png
       dpi: 144
       size:
         width: 1536
         height: 1166
-      uri: "file:///e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5/pages/1.png"
+      uri: "file:///dummy_doc/pages/1.png"
       #alternatives: base64 encoded string
     num_elements: 23
     page_no: 1
\ No newline at end of file
diff --git a/test/test_docling_doc.py b/test/test_docling_doc.py
index 54c78ec..ee4d351 100644
--- a/test/test_docling_doc.py
+++ b/test/test_docling_doc.py
@@ -3,8 +3,8 @@
 from docling_core.types.experimental.document import (
     BaseFigureData,
     BaseTableData,
+    DescriptionItem,
     DoclingDocument,
-    FileInfo,
     TableCell,
 )
 from docling_core.types.experimental.labels import DocItemLabel, GroupLabel
@@ -80,8 +80,8 @@ def _test_export_methods(doc):
     ### Iterate all elements
     doc.print_element_tree()
     ## Export stuff
-    print(doc.export_to_markdown())
-    print(doc.export_to_document_tokens())
+    doc.export_to_markdown()
+    doc.export_to_document_tokens()
     for table in doc.tables:
         table.export_to_html()
         table.export_to_dataframe()
@@ -91,9 +91,7 @@ def _test_export_methods(doc):
 
 
 def _construct_doc() -> DoclingDocument:
-    doc = DoclingDocument(
-        description={}, file_info=FileInfo(filename="dummy", document_hash="xyz")
-    )
+    doc = DoclingDocument(description=DescriptionItem(), name="Untitled 1")
     # group, heading, paragraph, table, figure, title, list, provenance
     doc.add_paragraph(label=DocItemLabel.TEXT, text="Author 1\nAffiliation 1")
     doc.add_paragraph(label=DocItemLabel.TEXT, text="Author 2\nAffiliation 2")

From 67109c427eb928c837d4334d2baed25d560ad85d Mon Sep 17 00:00:00 2001
From: Christoph Auer <cau@zurich.ibm.com>
Date: Fri, 27 Sep 2024 12:38:54 +0200
Subject: [PATCH 21/34] Fix flake8 config

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
---
 .flake8        | 3 ++-
 pyproject.toml | 4 ----
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/.flake8 b/.flake8
index b927c16..d3655a9 100644
--- a/.flake8
+++ b/.flake8
@@ -1,7 +1,8 @@
 [flake8]
+per-file-ignores = __init__.py:F401
 max-line-length = 88
 exclude = test/*
 max-complexity = 18
 docstring-convention = google
 ignore = W503,E203
-classmethod-decorators = classmethod,validator
\ No newline at end of file
+classmethod-decorators = classmethod,validator
diff --git a/pyproject.toml b/pyproject.toml
index b5bf7a9..285610c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -102,10 +102,6 @@ remove-unused-variables = true
 expand-star-imports = true
 recursive = true
 
-[tool.flake8]
-per-file-ignores = "__init__.py:F401"
-classmethod-decorators = "classmethod,validator"
-
 [tool.mypy]
 pretty = true
 # strict = true

From 23e1a52bb03725415fb436c86feb1ebd6d7b7e11 Mon Sep 17 00:00:00 2001
From: Christoph Auer <cau@zurich.ibm.com>
Date: Fri, 27 Sep 2024 15:17:40 +0200
Subject: [PATCH 22/34] Remove hash, renamings

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
---
 docling_core/types/experimental/__init__.py   |   7 +-
 docling_core/types/experimental/document.py   | 154 ++-------
 docs/Document.json                            |   6 +-
 docs/Document.md                              |   2 +-
 docs/Generic.json                             |   6 +-
 docs/Generic.md                               |   2 +-
 .../experimental/2206.01062.experimental.yaml | 310 +++++-------------
 test/data/experimental/dummy_doc.yaml         |  10 +-
 test/test_docling_doc.py                      |  11 +-
 9 files changed, 143 insertions(+), 365 deletions(-)

diff --git a/docling_core/types/experimental/__init__.py b/docling_core/types/experimental/__init__.py
index fdad4f0..af9f7ed 100644
--- a/docling_core/types/experimental/__init__.py
+++ b/docling_core/types/experimental/__init__.py
@@ -7,23 +7,22 @@
 
 from .base import BoundingBox, CoordOrigin, Size
 from .document import (
-    BaseFigureData,
+    BasePictureData,
     BaseTableData,
     DescriptionItem,
     DocItem,
     DoclingDocument,
     DocumentOrigin,
-    DocumentTrees,
-    FigureItem,
     FloatingItem,
     GroupItem,
     ImageRef,
     KeyValueItem,
     NodeItem,
     PageItem,
+    PictureItem,
     ProvenanceItem,
     RefItem,
-    Section,
+    SectionItem,
     TableCell,
     TableItem,
     TextItem,
diff --git a/docling_core/types/experimental/document.py b/docling_core/types/experimental/document.py
index db343cc..80f7a32 100644
--- a/docling_core/types/experimental/document.py
+++ b/docling_core/types/experimental/document.py
@@ -1,7 +1,6 @@
 """Models for the Docling Document data type."""
 
 import hashlib
-import json
 import mimetypes
 import typing
 from typing import Any, Dict, List, Optional, Tuple, Union
@@ -13,7 +12,6 @@
     ConfigDict,
     Field,
     computed_field,
-    field_serializer,
     field_validator,
     model_validator,
 )
@@ -27,8 +25,8 @@
 LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
 
 
-class BaseFigureData(BaseModel):  # TBD
-    """BaseFigureData."""
+class BasePictureData(BaseModel):  # TBD
+    """BasePictureData."""
 
 
 class TableCell(BaseModel):
@@ -212,7 +210,6 @@ class NodeItem(BaseModel):
     self_ref: str  # format spec: json-path
     parent: Optional[RefItem] = None
     children: List[RefItem] = []
-    hash: Uint64 = 0
 
     def get_ref(self):
         """get_ref."""
@@ -222,7 +219,7 @@ def get_ref(self):
 class GroupItem(NodeItem):  # Container type, can't be a leaf node
     """GroupItem."""
 
-    name: Optional[str] = None
+    name: str = "group"
     label: GroupLabel = GroupLabel.UNSPECIFIED
 
 
@@ -318,7 +315,7 @@ def export_to_document_tokens(
         return body
 
 
-class Section(TextItem):
+class SectionItem(TextItem):
     """Section."""
 
     level: LevelNumber = 1
@@ -333,17 +330,15 @@ class FloatingItem(DocItem):
     image: Optional[ImageRef] = None
 
 
-class FigureItem(FloatingItem):
-    """FigureItem."""
+class PictureItem(FloatingItem):
+    """PictureItem."""
 
-    data: BaseFigureData
+    data: BasePictureData
 
     def export_to_document_tokens(
         self,
         doc: "DoclingDocument",
         new_line: str = "\n",
-        page_w: float = 0.0,
-        page_h: float = 0.0,
         xsize: int = 100,
         ysize: int = 100,
         add_location: bool = True,
@@ -351,12 +346,10 @@ def export_to_document_tokens(
         add_content: bool = True,  # not used at the moment
         add_page_index: bool = True,
     ):
-        r"""Export figure to document tokens format.
+        r"""Export picture to document tokens format.
 
         :param doc: "DoclingDocument":
         :param new_line: str:  (Default value = "\n")
-        :param page_w: float:  (Default value = 0.0)
-        :param page_h: float:  (Default value = 0.0)
         :param xsize: int:  (Default value = 100)
         :param ysize: int:  (Default value = 100)
         :param add_location: bool:  (Default value = True)
@@ -603,16 +596,7 @@ class KeyValueItem(DocItem):
     """KeyValueItem."""
 
 
-ContentItem = Union[TextItem, FigureItem, TableItem, KeyValueItem]
-
-
-class DocumentTrees(BaseModel):
-    """DocumentTrees."""
-
-    furniture: GroupItem = GroupItem(
-        name="_root_", self_ref="#/furniture"
-    )  # List[RefItem] = []
-    body: GroupItem = GroupItem(name="_root_", self_ref="#/body")  # List[RefItem] = []
+ContentItem = Union[TextItem, PictureItem, TableItem, KeyValueItem]
 
 
 class PageItem(BaseModel):
@@ -620,10 +604,6 @@ class PageItem(BaseModel):
 
     # A page carries separate root items for furniture and body,
     # only referencing items on the page
-    hash: Uint64 = (
-        0  # dummy default, correct value ensured through
-        # field_serializer on DoclingDocument
-    )
     size: Size
     image: Optional[ImageRef] = None
     page_no: int
@@ -633,7 +613,7 @@ class DescriptionItem(BaseModel):
     """DescriptionItem."""
 
 
-class DoclingDocument(DocumentTrees):
+class DoclingDocument(BaseModel):
     """DoclingDocument."""
 
     version: str = "0.1.0"  # use SemanticVersion type instead
@@ -646,9 +626,14 @@ class DoclingDocument(DocumentTrees):
         # generated from synthetic data.
     )
 
+    furniture: GroupItem = GroupItem(
+        name="_root_", self_ref="#/furniture"
+    )  # List[RefItem] = []
+    body: GroupItem = GroupItem(name="_root_", self_ref="#/body")  # List[RefItem] = []
+
     groups: List[GroupItem] = []
     texts: List[TextItem] = []
-    figures: List[FigureItem] = []
+    pictures: List[PictureItem] = []
     tables: List[TableItem] = []
     key_value_items: List[KeyValueItem] = []
 
@@ -661,34 +646,6 @@ def _compute_hash(self, obj):
         # Mask it to fit within 64 bits
         return Uint64(hash_int & 0xFFFFFFFFFFFFFFFF)  # 64-bit unsigned integer mask
 
-    @computed_field
-    def hash(self) -> Uint64:
-        """hash."""
-        # Get a dictionary representation of the model, excluding the computed field.
-        # explicitly include fields to be sure the hash is stable.
-        # Must not include hash itself or the pages.
-        model_dict = self.model_dump(
-            mode="json",
-            by_alias=True,
-            include={
-                "version",
-                "name",
-                "description",
-                "origin",
-                "groups",
-                "texts",
-                "figures",
-                "tables",
-                "key_value_items",
-                # "furniture",
-                # "body",
-            },
-        )
-
-        json_string = json.dumps(model_dict, sort_keys=True)
-
-        return self._compute_hash(json_string)
-
     def add_group(
         self,
         label: Optional[GroupLabel] = None,
@@ -796,16 +753,16 @@ def add_table(
 
         return tbl_item
 
-    def add_figure(
+    def add_picture(
         self,
-        data: BaseFigureData,
+        data: BasePictureData,
         caption: Optional[Union[TextItem, RefItem]] = None,
         prov: Optional[ProvenanceItem] = None,
         parent: Optional[GroupItem] = None,
     ):
-        """add_figure.
+        """add_picture.
 
-        :param data: BaseFigureData:
+        :param data: BasePictureData:
         :param caption: Optional[Union[TextItem:
         :param RefItem]]:  (Default value = None)
         :param prov: Optional[ProvenanceItem]:  (Default value = None)
@@ -815,10 +772,10 @@ def add_figure(
         if not parent:
             parent = self.body
 
-        figure_index = len(self.figures)
-        cref = f"#/figures/{figure_index}"
+        picture_index = len(self.pictures)
+        cref = f"#/pictures/{picture_index}"
 
-        fig_item = FigureItem(
+        fig_item = PictureItem(
             label=DocItemLabel.PICTURE,
             data=data,
             self_ref=cref,
@@ -829,14 +786,13 @@ def add_figure(
         if caption:
             fig_item.captions.append(caption.get_ref())
 
-        self.figures.append(fig_item)
+        self.pictures.append(fig_item)
         parent.children.append(RefItem(cref=cref))
 
         return fig_item
 
     def add_heading(
         self,
-        label: DocItemLabel,
         text: str,
         orig: Optional[str] = None,
         level: LevelNumber = 1,
@@ -853,8 +809,13 @@ def add_heading(
         :param parent: Optional[GroupItem]:  (Default value = None)
 
         """
-        item: Section = self.add_paragraph(
-            label, text, orig, prov, parent, item_cls=Section
+        item: SectionItem = self.add_paragraph(
+            label=DocItemLabel.SECTION_HEADER,
+            text=text,
+            orig=orig,
+            prov=prov,
+            parent=parent,
+            item_cls=SectionItem,
         )
         item.level = level
         return item
@@ -867,7 +828,7 @@ def iterate_elements(
         self,
         root: Optional[NodeItem] = None,
         with_groups: bool = False,
-        traverse_figures: bool = True,
+        traverse_pictures: bool = True,
         page_no: Optional[int] = None,
         _level: int = 0,  # fixed parameter, carries through the node nesting level
     ) -> typing.Iterable[Tuple[NodeItem, int]]:  # tuple of node and level
@@ -875,7 +836,7 @@ def iterate_elements(
 
         :param root: Optional[NodeItem]:  (Default value = None)
         :param with_groups: bool:  (Default value = False)
-        :param traverse_figures: bool:  (Default value = True)
+        :param traverse_pictures: bool:  (Default value = True)
         :param page_no: Optional[int]:  (Default value = None)
         :param _level:  (Default value = 0)
         :param # fixed parameter:
@@ -901,7 +862,7 @@ def iterate_elements(
 
             if isinstance(child, NodeItem):
                 # If the child is a NodeItem, recursively traverse it
-                if not isinstance(child, FigureItem) or traverse_figures:
+                if not isinstance(child, PictureItem) or traverse_pictures:
                     yield from self.iterate_elements(
                         child, _level=_level + 1, with_groups=with_groups
                     )
@@ -1072,14 +1033,6 @@ def export_to_document_tokens(
         :param from_element: int:  (Default value = 0)
         :param to_element: Optional[int]:  (Default value = None)
         :param labels: list[DocItemLabel]
-        :param "subtitle-level-1":
-        :param "Section-header" "paragraph":
-        :param "caption":
-        :param "table":
-        :param "figure":
-        :param "text":
-        :param "Text":
-        :param ]:
         :param xsize: int:  (Default value = 100)
         :param ysize: int:  (Default value = 100)
         :param add_location: bool:  (Default value = True)
@@ -1152,7 +1105,7 @@ def export_to_document_tokens(
                     add_page_index=add_page_index,
                 )
 
-            elif isinstance(item, FigureItem) and (item_type in labels):
+            elif isinstance(item, PictureItem) and (item_type in labels):
 
                 doctags += item.export_to_document_tokens(
                     doc=self,
@@ -1174,44 +1127,9 @@ def add_page(self, page_no: int, size: Size) -> PageItem:
 
         :param page_no: int:
         :param size: Size:
-        :param hash: str:
 
         """
-        pitem = PageItem(page_no=page_no, size=size, hash=page_no)
+        pitem = PageItem(page_no=page_no, size=size)
 
         self.pages[page_no] = pitem
         return pitem
-
-    @field_serializer("body", "furniture", mode="wrap")
-    def serialize_tree(self, value: NodeItem, handler):
-        """serialize_tree."""
-        for node, level in self.iterate_elements(root=value, with_groups=True):
-            node.hash = self._derive_hash(node.self_ref)
-
-        return handler(value)
-
-    @field_serializer("pages", mode="wrap")
-    def serialize_pages(self, pages: Dict[int, PageItem], handler):
-        """serialize_pages."""
-        for page in pages.values():
-            page.hash = self._derive_hash(str(page.page_no))
-
-        return handler(pages)
-
-    def update_hashes(self):
-        """update_hashes."""
-        # Updates the hashes on all elements, based on the computed document hash
-        for node, level in self.iterate_elements(root=self.body, with_groups=True):
-            node.hash = self._derive_hash(node.self_ref)
-
-        for node, level in self.iterate_elements(root=self.furniture, with_groups=True):
-            node.hash = self._derive_hash(node.self_ref)
-
-        for page in self.pages.values():
-            page.hash = self._derive_hash(str(page.page_no))
-
-    def _derive_hash(self, data: str) -> Uint64:
-        doc_hash = self.hash
-        combined = f"{doc_hash}{data}"
-
-        return self._compute_hash(combined)
diff --git a/docs/Document.json b/docs/Document.json
index 55eda68..2580e70 100644
--- a/docs/Document.json
+++ b/docs/Document.json
@@ -323,7 +323,11 @@
           "type": "string"
         },
         "bounding_box": {
-          "$ref": "#/$defs/BoundingBoxContainer",
+          "allOf": [
+            {
+              "$ref": "#/$defs/BoundingBoxContainer"
+            }
+          ],
           "x-es-suppress": true
         },
         "prov": {
diff --git a/docs/Document.md b/docs/Document.md
index 40ee7fe..ab50400 100644
--- a/docs/Document.md
+++ b/docs/Document.md
@@ -6052,7 +6052,7 @@ Must be one of:
 | **Type**                  | `object`                                                                  |
 | **Required**              | Yes                                                                       |
 | **Additional properties** | [[Any type: allowed]](# "Additional Properties of any type are allowed.") |
-| **Defined in**            | #/$defs/BoundingBoxContainer                                              |
+| **Defined in**            |                                                                           |
 
 **Description:** Bounding box container.
 
diff --git a/docs/Generic.json b/docs/Generic.json
index c900e11..6b15029 100644
--- a/docs/Generic.json
+++ b/docs/Generic.json
@@ -58,7 +58,11 @@
       "x-es-type": "text"
     },
     "file-info": {
-      "$ref": "#/$defs/FileInfoObject",
+      "allOf": [
+        {
+          "$ref": "#/$defs/FileInfoObject"
+        }
+      ],
       "description": "Minimal identification information of the document within a collection.",
       "title": "Document information"
     }
diff --git a/docs/Generic.md b/docs/Generic.md
index 32b5066..175187f 100644
--- a/docs/Generic.md
+++ b/docs/Generic.md
@@ -75,7 +75,7 @@
 | **Type**                  | `object`                                                                  |
 | **Required**              | Yes                                                                       |
 | **Additional properties** | [[Any type: allowed]](# "Additional Properties of any type are allowed.") |
-| **Defined in**            | #/$defs/FileInfoObject                                                    |
+| **Defined in**            |                                                                           |
 
 **Description:** Minimal identification information of the document within a collection.
 
diff --git a/test/data/experimental/2206.01062.experimental.yaml b/test/data/experimental/2206.01062.experimental.yaml
index 88449bb..e927205 100644
--- a/test/data/experimental/2206.01062.experimental.yaml
+++ b/test/data/experimental/2206.01062.experimental.yaml
@@ -13,7 +13,7 @@ body:
   - $ref: '#/texts/10'
   - $ref: '#/texts/11'
   - $ref: '#/texts/12'
-  - $ref: '#/figures/0'
+  - $ref: '#/pictures/0'
   - $ref: '#/texts/13'
   - $ref: '#/texts/14'
   - $ref: '#/texts/15'
@@ -41,7 +41,7 @@ body:
   - $ref: '#/texts/37'
   - $ref: '#/texts/38'
   - $ref: '#/texts/39'
-  - $ref: '#/figures/1'
+  - $ref: '#/pictures/1'
   - $ref: '#/texts/40'
   - $ref: '#/texts/41'
   - $ref: '#/texts/42'
@@ -74,7 +74,7 @@ body:
   - $ref: '#/texts/68'
   - $ref: '#/texts/69'
   - $ref: '#/texts/70'
-  - $ref: '#/figures/2'
+  - $ref: '#/pictures/2'
   - $ref: '#/texts/71'
   - $ref: '#/texts/72'
   - $ref: '#/texts/73'
@@ -82,7 +82,7 @@ body:
   - $ref: '#/texts/74'
   - $ref: '#/texts/75'
   - $ref: '#/texts/76'
-  - $ref: '#/figures/3'
+  - $ref: '#/pictures/3'
   - $ref: '#/texts/77'
   - $ref: '#/texts/78'
   - $ref: '#/texts/79'
@@ -127,7 +127,7 @@ body:
   - $ref: '#/texts/115'
   - $ref: '#/texts/116'
   - $ref: '#/texts/117'
-  - $ref: '#/figures/4'
+  - $ref: '#/pictures/4'
   - $ref: '#/texts/118'
   - $ref: '#/texts/119'
   - $ref: '#/texts/120'
@@ -138,19 +138,86 @@ body:
   - $ref: '#/texts/125'
   - $ref: '#/texts/126'
   - $ref: '#/texts/127'
-  hash: 2982977287550829877
   label: unspecified
   name: _root_
   parent: null
   self_ref: '#/body'
 description: {}
-figures:
+furniture:
+  children: []
+  label: unspecified
+  name: _root_
+  parent: null
+  self_ref: '#/furniture'
+groups: []
+key_value_items: []
+name: '2206.01062'
+origin:
+  binary_hash: 7156212269791437020
+  filename: 2206.01062.pdf
+  mimetype: application/pdf
+  uri: null
+pages:
+  '1':
+    image: null
+    page_no: 1
+    size:
+      height: 792.0
+      width: 612.0
+  '2':
+    image: null
+    page_no: 2
+    size:
+      height: 792.0
+      width: 612.0
+  '3':
+    image: null
+    page_no: 3
+    size:
+      height: 792.0
+      width: 612.0
+  '4':
+    image: null
+    page_no: 4
+    size:
+      height: 792.0
+      width: 612.0
+  '5':
+    image: null
+    page_no: 5
+    size:
+      height: 792.0
+      width: 612.0
+  '6':
+    image: null
+    page_no: 6
+    size:
+      height: 792.0
+      width: 612.0
+  '7':
+    image: null
+    page_no: 7
+    size:
+      height: 792.0
+      width: 612.0
+  '8':
+    image: null
+    page_no: 8
+    size:
+      height: 792.0
+      width: 612.0
+  '9':
+    image: null
+    page_no: 9
+    size:
+      height: 792.0
+      width: 612.0
+pictures:
 - captions:
   - $ref: '#/texts/12'
   children: []
   data: {}
   footnotes: []
-  hash: 8845849056743015509
   image: null
   label: picture
   parent:
@@ -167,13 +234,12 @@ figures:
     - 84
     page_no: 1
   references: []
-  self_ref: '#/figures/0'
+  self_ref: '#/pictures/0'
 - captions:
   - $ref: '#/texts/39'
   children: []
   data: {}
   footnotes: []
-  hash: 9157218593054708372
   image: null
   label: picture
   parent:
@@ -190,13 +256,12 @@ figures:
     - 69
     page_no: 3
   references: []
-  self_ref: '#/figures/1'
+  self_ref: '#/pictures/1'
 - captions:
   - $ref: '#/texts/70'
   children: []
   data: {}
   footnotes: []
-  hash: 2268600251493203652
   image: null
   label: picture
   parent:
@@ -213,13 +278,12 @@ figures:
     - 130
     page_no: 5
   references: []
-  self_ref: '#/figures/2'
+  self_ref: '#/pictures/2'
 - captions:
   - $ref: '#/texts/76'
   children: []
   data: {}
   footnotes: []
-  hash: 15280179124146488233
   image: null
   label: picture
   parent:
@@ -236,13 +300,12 @@ figures:
     - 71
     page_no: 6
   references: []
-  self_ref: '#/figures/3'
+  self_ref: '#/pictures/3'
 - captions:
   - $ref: '#/texts/117'
   children: []
   data: {}
   footnotes: []
-  hash: 7078061312183845001
   image: null
   label: picture
   parent:
@@ -259,87 +322,7 @@ figures:
     - 188
     page_no: 9
   references: []
-  self_ref: '#/figures/4'
-furniture:
-  children: []
-  hash: 2030260901333211352
-  label: unspecified
-  name: _root_
-  parent: null
-  self_ref: '#/furniture'
-groups: []
-hash: 17981205059156515073
-key_value_items: []
-name: '2206.01062'
-origin:
-  binary_hash: 7156212269791437020
-  filename: 2206.01062.pdf
-  mimetype: application/pdf
-  uri: null
-pages:
-  '1':
-    hash: 8509969582596715807
-    image: null
-    page_no: 1
-    size:
-      height: 792.0
-      width: 612.0
-  '2':
-    hash: 8946042279011020565
-    image: null
-    page_no: 2
-    size:
-      height: 792.0
-      width: 612.0
-  '3':
-    hash: 11952309765409111665
-    image: null
-    page_no: 3
-    size:
-      height: 792.0
-      width: 612.0
-  '4':
-    hash: 16141549366384907945
-    image: null
-    page_no: 4
-    size:
-      height: 792.0
-      width: 612.0
-  '5':
-    hash: 13731695325243987934
-    image: null
-    page_no: 5
-    size:
-      height: 792.0
-      width: 612.0
-  '6':
-    hash: 16994899611641034686
-    image: null
-    page_no: 6
-    size:
-      height: 792.0
-      width: 612.0
-  '7':
-    hash: 5935321345165759586
-    image: null
-    page_no: 7
-    size:
-      height: 792.0
-      width: 612.0
-  '8':
-    hash: 12407706083782784507
-    image: null
-    page_no: 8
-    size:
-      height: 792.0
-      width: 612.0
-  '9':
-    hash: 14545812751042780836
-    image: null
-    page_no: 9
-    size:
-      height: 792.0
-      width: 612.0
+  self_ref: '#/pictures/4'
 tables:
 - captions:
   - $ref: '#/texts/50'
@@ -3958,7 +3941,6 @@ tables:
       start_row_offset_idx: 13
       text: 68-85
   footnotes: []
-  hash: 7038790146519691597
   image: null
   label: table
   parent:
@@ -5811,7 +5793,6 @@ tables:
       start_row_offset_idx: 13
       text: '76.8'
   footnotes: []
-  hash: 12013044154325944789
   image: null
   label: table
   parent:
@@ -7268,7 +7249,6 @@ tables:
       start_row_offset_idx: 12
       text: '77'
   footnotes: []
-  hash: 123696755438675010
   image: null
   label: table
   parent:
@@ -8813,7 +8793,6 @@ tables:
       start_row_offset_idx: 13
       text: '87'
   footnotes: []
-  hash: 1302023060946254192
   image: null
   label: table
   parent:
@@ -10138,7 +10117,6 @@ tables:
       start_row_offset_idx: 14
       text: '78'
   footnotes: []
-  hash: 14690562278911182124
   image: null
   label: table
   parent:
@@ -10158,7 +10136,6 @@ tables:
   self_ref: '#/tables/4'
 texts:
 - children: []
-  hash: 5522753358710955051
   label: section_header
   orig: 'DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis'
   parent:
@@ -10177,7 +10154,6 @@ texts:
   self_ref: '#/texts/0'
   text: 'DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis'
 - children: []
-  hash: 8271824637245472778
   label: text
   orig: Birgit Pfitzmann IBM Research Rueschlikon, Switzerland bpf@zurich.ibm.com
   parent:
@@ -10196,7 +10172,6 @@ texts:
   self_ref: '#/texts/1'
   text: Birgit Pfitzmann IBM Research Rueschlikon, Switzerland bpf@zurich.ibm.com
 - children: []
-  hash: 8306016912873407413
   label: text
   orig: Christoph Auer IBM Research Rueschlikon, Switzerland cau@zurich.ibm.com
   parent:
@@ -10215,7 +10190,6 @@ texts:
   self_ref: '#/texts/2'
   text: Christoph Auer IBM Research Rueschlikon, Switzerland cau@zurich.ibm.com
 - children: []
-  hash: 18359905356795742945
   label: text
   orig: Michele Dolfi IBM Research Rueschlikon, Switzerland dol@zurich.ibm.com
   parent:
@@ -10234,7 +10208,6 @@ texts:
   self_ref: '#/texts/3'
   text: Michele Dolfi IBM Research Rueschlikon, Switzerland dol@zurich.ibm.com
 - children: []
-  hash: 13640485470030436649
   label: text
   orig: Ahmed S. Nassar IBM Research Rueschlikon, Switzerland ahn@zurich.ibm.com
   parent:
@@ -10253,7 +10226,6 @@ texts:
   self_ref: '#/texts/4'
   text: Ahmed S. Nassar IBM Research Rueschlikon, Switzerland ahn@zurich.ibm.com
 - children: []
-  hash: 18369908591937398930
   label: text
   orig: Peter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com
   parent:
@@ -10272,7 +10244,6 @@ texts:
   self_ref: '#/texts/5'
   text: Peter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com
 - children: []
-  hash: 9138081927775942786
   label: section_header
   orig: ABSTRACT
   parent:
@@ -10291,7 +10262,6 @@ texts:
   self_ref: '#/texts/6'
   text: ABSTRACT
 - children: []
-  hash: 14128818639649693389
   label: text
   orig: Accurate document layout analysis is a key requirement for highquality PDF
     document conversion. With the recent availability of public, large ground-truth
@@ -10348,7 +10318,6 @@ texts:
     that layout predictions of the DocLayNettrained models are more robust and thus
     the preferred choice for general-purpose document-layout analysis.
 - children: []
-  hash: 13652067706470412099
   label: section_header
   orig: CCS CONCEPTS
   parent:
@@ -10367,7 +10336,6 @@ texts:
   self_ref: '#/texts/8'
   text: CCS CONCEPTS
 - children: []
-  hash: 14921674925616302289
   label: text
   orig: "\xB7 Information systems \u2192 Document structure ; \xB7 Applied computing\
     \ \u2192 Document analysis ; \xB7 Computing methodologies \u2192 Machine learning\
@@ -10390,7 +10358,6 @@ texts:
     \ \u2192 Document analysis ; \xB7 Computing methodologies \u2192 Machine learning\
     \ ; Computer vision ; Object detection ;"
 - children: []
-  hash: 2534709576924392255
   label: text
   orig: Permission to make digital or hard copies of part or all of this work for
     personal or classroom use is granted without fee provided that copies are not
@@ -10417,7 +10384,6 @@ texts:
     notice and the full citation on the first page. Copyrights for third-party components
     of this work must be honored. For all other uses, contact the owner/author(s).
 - children: []
-  hash: 1842005346019600268
   label: text
   orig: "KDD '22, August 14-18, 2022, Washington, DC, USA \xA9 2022 Copyright held\
     \ by the owner/author(s). ACM ISBN 978-1-4503-9385-0/22/08. https://doi.org/10.1145/3534678.3539043"
@@ -10438,7 +10404,6 @@ texts:
   text: "KDD '22, August 14-18, 2022, Washington, DC, USA \xA9 2022 Copyright held\
     \ by the owner/author(s). ACM ISBN 978-1-4503-9385-0/22/08. https://doi.org/10.1145/3534678.3539043"
 - children: []
-  hash: 773478380528488902
   label: caption
   orig: 'Figure 1: Four examples of complex page layouts across different document
     categories'
@@ -10459,7 +10424,6 @@ texts:
   text: 'Figure 1: Four examples of complex page layouts across different document
     categories'
 - children: []
-  hash: 10700740223146899046
   label: section_header
   orig: KEYWORDS
   parent:
@@ -10478,7 +10442,6 @@ texts:
   self_ref: '#/texts/13'
   text: KEYWORDS
 - children: []
-  hash: 17170823355966624845
   label: text
   orig: PDF document conversion, layout segmentation, object-detection, data set,
     Machine Learning
@@ -10499,7 +10462,6 @@ texts:
   text: PDF document conversion, layout segmentation, object-detection, data set,
     Machine Learning
 - children: []
-  hash: 10573589843195006501
   label: section_header
   orig: 'ACM Reference Format:'
   parent:
@@ -10518,7 +10480,6 @@ texts:
   self_ref: '#/texts/15'
   text: 'ACM Reference Format:'
 - children: []
-  hash: 14872842244838520232
   label: text
   orig: 'Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter
     Staar. 2022. DocLayNet: A Large Human-Annotated Dataset for DocumentLayout Analysis.
@@ -10545,7 +10506,6 @@ texts:
     Mining (KDD ''22), August 14-18, 2022, Washington, DC, USA. ACM, New York, NY,
     USA, 9 pages. https://doi.org/10.1145/ 3534678.3539043'
 - children: []
-  hash: 8886316323367582203
   label: page_header
   orig: "KDD \u201922, August 14-18, 2022, Washington, DC, USA Birgit Pfitzmann, Christoph\
     \ Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar"
@@ -10566,7 +10526,6 @@ texts:
   text: "KDD \u201922, August 14-18, 2022, Washington, DC, USA Birgit Pfitzmann, Christoph\
     \ Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar"
 - children: []
-  hash: 14036706213632695213
   label: section_header
   orig: 1 INTRODUCTION
   parent:
@@ -10585,7 +10544,6 @@ texts:
   self_ref: '#/texts/18'
   text: 1 INTRODUCTION
 - children: []
-  hash: 16763060144345832331
   label: text
   orig: Despite the substantial improvements achieved with machine-learning (ML) approaches
     and deep neural networks in recent years, document conversion remains a challenging
@@ -10620,7 +10578,6 @@ texts:
     [5]. To highlight the variability in document layouts, we show a few example documents
     from the DocLayNet dataset in Figure 1.
 - children: []
-  hash: 6179967949039028227
   label: text
   orig: 'A key problem in the process of document conversion is to understand the
     structure of a single document page, i.e. which segments of text should be grouped
@@ -10677,7 +10634,6 @@ texts:
     more artistic or free-style layouts, we see sub-par prediction quality from these
     models, which we demonstrate in Section 5.'
 - children: []
-  hash: 5928701079679201823
   label: text
   orig: 'In this paper, we present the DocLayNet dataset. It provides pageby-page
     layout annotation ground-truth using bounding-boxes for 11 distinct class labels
@@ -10706,7 +10662,6 @@ texts:
     available to the public 1 in order to stimulate the document-layout analysis community.
     It distinguishes itself in the following aspects:'
 - children: []
-  hash: 7285805654125410296
   label: list_item
   orig: '(1) Human Annotation : In contrast to PubLayNet and DocBank, we relied on
     human annotation instead of automation approaches to generate the data set.'
@@ -10727,7 +10682,6 @@ texts:
   text: '(1) Human Annotation : In contrast to PubLayNet and DocBank, we relied on
     human annotation instead of automation approaches to generate the data set.'
 - children: []
-  hash: 12390464209458256957
   label: list_item
   orig: '(2) Large Layout Variability : We include diverse and complex layouts from
     a large variety of public sources.'
@@ -10748,7 +10702,6 @@ texts:
   text: '(2) Large Layout Variability : We include diverse and complex layouts from
     a large variety of public sources.'
 - children: []
-  hash: 11081073573051959825
   label: list_item
   orig: '(3) Detailed Label Set : We define 11 class labels to distinguish layout
     features in high detail. PubLayNet provides 5 labels; DocBank provides 13, although
@@ -10771,7 +10724,6 @@ texts:
     features in high detail. PubLayNet provides 5 labels; DocBank provides 13, although
     not a superset of ours.'
 - children: []
-  hash: 970302989994738402
   label: list_item
   orig: '(4) Redundant Annotations : A fraction of the pages in the DocLayNet data
     set carry more than one human annotation.'
@@ -10792,7 +10744,6 @@ texts:
   text: '(4) Redundant Annotations : A fraction of the pages in the DocLayNet data
     set carry more than one human annotation.'
 - children: []
-  hash: 9624797869966588960
   label: footnote
   orig: $^{1}$https://developer.ibm.com/exchanges/data/all/doclaynet
   parent:
@@ -10811,7 +10762,6 @@ texts:
   self_ref: '#/texts/26'
   text: $^{1}$https://developer.ibm.com/exchanges/data/all/doclaynet
 - children: []
-  hash: 15746426817481687610
   label: text
   orig: This enables experimentation with annotation uncertainty and quality control
     analysis.
@@ -10832,7 +10782,6 @@ texts:
   text: This enables experimentation with annotation uncertainty and quality control
     analysis.
 - children: []
-  hash: 16075660714767766450
   label: list_item
   orig: '(5) Pre-defined Train-, Test- & Validation-set : Like DocBank, we provide
     fixed train-, test- & validation-sets to ensure proportional representation of
@@ -10857,7 +10806,6 @@ texts:
     the class-labels. Further, we prevent leakage of unique layouts across sets, which
     has a large effect on model accuracy scores.'
 - children: []
-  hash: 12188049037985059716
   label: text
   orig: All aspects outlined above are detailed in Section 3. In Section 4, we will
     elaborate on how we designed and executed this large-scale human annotation campaign.
@@ -10882,7 +10830,6 @@ texts:
     We will also share key insights and lessons learned that might prove helpful for
     other parties planning to set up annotation campaigns.
 - children: []
-  hash: 12509950877710825568
   label: text
   orig: In Section 5, we will present baseline accuracy numbers for a variety of object
     detection methods (Faster R-CNN, Mask R-CNN and YOLOv5) trained on DocLayNet.
@@ -10913,7 +10860,6 @@ texts:
     and DocLayNet and demonstrate that a model trained on DocLayNet provides overall
     more robust layout recovery.
 - children: []
-  hash: 1443485254974715578
   label: section_header
   orig: 2 RELATED WORK
   parent:
@@ -10932,7 +10878,6 @@ texts:
   self_ref: '#/texts/31'
   text: 2 RELATED WORK
 - children: []
-  hash: 8122071803335578615
   label: text
   orig: While early approaches in document-layout analysis used rulebased algorithms
     and heuristics [8], the problem is lately addressed with deep learning methods.
@@ -10967,7 +10912,6 @@ texts:
     such as PubLayNet [6] and DocBank provide their data in the commonly accepted
     COCO format [16].
 - children: []
-  hash: 842592930694523879
   label: text
   orig: Lately, new types of ML models for document-layout analysis have emerged in
     the community [18-21]. These models do not approach the problem of layout analysis
@@ -10998,7 +10942,6 @@ texts:
     a broadly accepted data format which links geometric and textual features has
     yet to establish.
 - children: []
-  hash: 10305639277644886816
   label: section_header
   orig: 3 THE DOCLAYNET DATASET
   parent:
@@ -11017,7 +10960,6 @@ texts:
   self_ref: '#/texts/34'
   text: 3 THE DOCLAYNET DATASET
 - children: []
-  hash: 13253192885407465025
   label: text
   orig: DocLayNet contains 80863 PDF pages. Among these, 7059 carry two instances
     of human annotations, and 1591 carry three. This amounts to 91104 total annotation
@@ -11048,7 +10990,6 @@ texts:
     , Section-header , Table , Text , and Title . Our reasoning for picking this particular
     label set is detailed in Section 4.
 - children: []
-  hash: 4009549806736392409
   label: text
   orig: In addition to open intellectual property constraints for the source documents,
     we required that the documents in DocLayNet adhere to a few conditions. Firstly,
@@ -11071,7 +11012,6 @@ texts:
     we required that the documents in DocLayNet adhere to a few conditions. Firstly,
     we kept scanned documents
 - children: []
-  hash: 17470228760559609425
   label: page_header
   orig: 'DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis'
   parent:
@@ -11090,7 +11030,6 @@ texts:
   self_ref: '#/texts/37'
   text: 'DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis'
 - children: []
-  hash: 17964098560616781264
   label: page_header
   orig: "KDD \u201922, August 14-18, 2022, Washington, DC, USA"
   parent:
@@ -11109,7 +11048,6 @@ texts:
   self_ref: '#/texts/38'
   text: "KDD \u201922, August 14-18, 2022, Washington, DC, USA"
 - children: []
-  hash: 2152771201290039654
   label: caption
   orig: 'Figure 2: Distribution of DocLayNet pages across document categories.'
   parent:
@@ -11128,7 +11066,6 @@ texts:
   self_ref: '#/texts/39'
   text: 'Figure 2: Distribution of DocLayNet pages across document categories.'
 - children: []
-  hash: 18326974418691293350
   label: text
   orig: The pages in DocLayNet can be grouped into six distinct categories, namely
     Financial Reports , Manuals , Scientific Articles , Laws & Regulations , Patents
@@ -11167,7 +11104,6 @@ texts:
     Figure 2, we show the document categories contained in DocLayNet with their respective
     sizes.
 - children: []
-  hash: 17331677447037592116
   label: text
   orig: We did not control the document selection with regard to language. The vast
     majority of documents contained in DocLayNet (close to 95%) are published in English
@@ -11198,7 +11134,6 @@ texts:
     detection and segmentation models, it might prove challenging for layout analysis
     methods which exploit textual features.
 - children: []
-  hash: 8050499577265375805
   label: text
   orig: To ensure that future benchmarks in the document-layout analysis community
     can be easily compared, we have split up DocLayNet into pre-defined train-, test-
@@ -11225,7 +11160,6 @@ texts:
     scores due to random splitting in train-, test- and validation-sets. We also ensured
     that less frequent labels are represented in train and test sets in equal proportions.
 - children: []
-  hash: 12951529389829894886
   label: footnote
   orig: $^{2}$e.g. AAPL from https://www.annualreports.com/
   parent:
@@ -11244,7 +11178,6 @@ texts:
   self_ref: '#/texts/43'
   text: $^{2}$e.g. AAPL from https://www.annualreports.com/
 - children: []
-  hash: 18123465042552214371
   label: text
   orig: Table 1 shows the overall frequency and distribution of the labels among the
     different sets. Importantly, we ensure that subsets are only split on full-document
@@ -11273,7 +11206,6 @@ texts:
     and lead to overestimation of their prediction accuracy. We will show the impact
     of this decision in Section 5.
 - children: []
-  hash: 14073537799186996310
   label: text
   orig: "In order to accommodate the different types of models currently in use by\
     \ the community, we provide DocLayNet in an augmented COCO format [16]. This entails\
@@ -11308,7 +11240,6 @@ texts:
     \ (in JSON). All additional files are linked to the primary page images by their\
     \ matching filenames."
 - children: []
-  hash: 18386447901530976124
   label: text
   orig: Despite being cost-intense and far less scalable than automation, human annotation
     has several benefits over automated groundtruth generation. The first and most
@@ -11371,7 +11302,6 @@ texts:
     (see Table 1). On the flip side, achieving high annotation consistency proved
     to be a key challenge in human annotation, as we outline in Section 4.
 - children: []
-  hash: 4988296339029657054
   label: section_header
   orig: 4 ANNOTATION CAMPAIGN
   parent:
@@ -11390,7 +11320,6 @@ texts:
   self_ref: '#/texts/47'
   text: 4 ANNOTATION CAMPAIGN
 - children: []
-  hash: 9049409901183805554
   label: text
   orig: The annotation campaign was carried out in four phases. In phase one, we identified
     and prepared the data sources for annotation. In phase two, we determined the
@@ -11419,7 +11348,6 @@ texts:
     and exhaustive experiments. In phase three, we trained the annotation staff and
     performed exams for quality assurance. In phase four,
 - children: []
-  hash: 5565740545948543009
   label: page_header
   orig: "KDD \u201922, August 14-18, 2022, Washington, DC, USA Birgit Pfitzmann, Christoph\
     \ Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar"
@@ -11440,7 +11368,6 @@ texts:
   text: "KDD \u201922, August 14-18, 2022, Washington, DC, USA Birgit Pfitzmann, Christoph\
     \ Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar"
 - children: []
-  hash: 7404461851293909485
   label: caption
   orig: ''
   parent:
@@ -11459,7 +11386,6 @@ texts:
   self_ref: '#/texts/50'
   text: ''
 - children: []
-  hash: 11276398568228129142
   label: text
   orig: we distributed the annotation workload and performed continuous quality controls.
     Phase one and two required a small team of experts only. For phases three and
@@ -11482,7 +11408,6 @@ texts:
     Phase one and two required a small team of experts only. For phases three and
     four, a group of 40 dedicated annotators were assembled and supervised.
 - children: []
-  hash: 5214138199614338209
   label: text
   orig: 'Phase 1: Data selection and preparation. Our inclusion criteria for documents
     were described in Section 3. A large effort went into ensuring that all documents
@@ -11505,7 +11430,6 @@ texts:
     were described in Section 3. A large effort went into ensuring that all documents
     are free to use. The data sources'
 - children: []
-  hash: 1652349545620087391
   label: text
   orig: include publication repositories such as arXiv$^{3}$, government offices,
     company websites as well as data directory services for financial reports and
@@ -11532,7 +11456,6 @@ texts:
     rotated or skewed. This would not allow us to perform annotation with rectangular
     bounding-boxes and therefore complicate the annotation process.
 - children: []
-  hash: 16847780312916319884
   label: text
   orig: Preparation work included uploading and parsing the sourced PDF documents
     in the Corpus Conversion Service (CCS) [22], a cloud-native platform which provides
@@ -11569,7 +11492,6 @@ texts:
     detection models from PubLayNet, which helped us estimate how many figures and
     tables a given page contains.
 - children: []
-  hash: 853856630909111423
   label: text
   orig: 'Phase 2: Label selection and guideline. We reviewed the collected documents
     and identified the most common structural features they exhibit. This was achieved
@@ -11616,7 +11538,6 @@ texts:
     Labels such as Author and Affiliation , as seen in DocBank, are often only distinguishable
     by discriminating on'
 - children: []
-  hash: 10820126744691405163
   label: footnote
   orig: $^{3}$https://arxiv.org/
   parent:
@@ -11635,7 +11556,6 @@ texts:
   self_ref: '#/texts/56'
   text: $^{3}$https://arxiv.org/
 - children: []
-  hash: 15137025955302266836
   label: page_header
   orig: 'DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis'
   parent:
@@ -11654,7 +11574,6 @@ texts:
   self_ref: '#/texts/57'
   text: 'DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis'
 - children: []
-  hash: 16594789809220859729
   label: page_header
   orig: "KDD \u201922, August 14-18, 2022, Washington, DC, USA"
   parent:
@@ -11673,7 +11592,6 @@ texts:
   self_ref: '#/texts/58'
   text: "KDD \u201922, August 14-18, 2022, Washington, DC, USA"
 - children: []
-  hash: 13348689006806549459
   label: text
   orig: the textual content of an element, which goes beyond visual layout recognition,
     in particular outside the Scientific Articles category.
@@ -11694,7 +11612,6 @@ texts:
   text: the textual content of an element, which goes beyond visual layout recognition,
     in particular outside the Scientific Articles category.
 - children: []
-  hash: 2899629868160691152
   label: text
   orig: At first sight, the task of visual document-layout interpretation appears
     intuitive enough to obtain plausible annotations in most cases. However, during
@@ -11733,7 +11650,6 @@ texts:
     in Figure 4 multiple examples of plausible but inconsistent annotations on the
     same pages.
 - children: []
-  hash: 9530250172263586436
   label: text
   orig: 'Obviously, this inconsistency in annotations is not desirable for datasets
     which are intended to be used for model training. To minimise these inconsistencies,
@@ -11762,7 +11678,6 @@ texts:
     in annotation consistency after the introduction of our annotation guideline.
     A few selected, non-trivial highlights of the guideline are:'
 - children: []
-  hash: 16012774257828431546
   label: list_item
   orig: (1) Every list-item is an individual object instance with class label List-item
     . This definition is different from PubLayNet and DocBank, where all list-items
@@ -11785,7 +11700,6 @@ texts:
     . This definition is different from PubLayNet and DocBank, where all list-items
     are grouped together into one List object.
 - children: []
-  hash: 15871134704762394430
   label: list_item
   orig: (2) A List-item is a paragraph with hanging indentation. Singleline elements
     can qualify as List-item if the neighbour elements expose hanging indentation.
@@ -11808,7 +11722,6 @@ texts:
     can qualify as List-item if the neighbour elements expose hanging indentation.
     Bullet or enumeration symbols are not a requirement.
 - children: []
-  hash: 17395883987622603550
   label: list_item
   orig: (3) For every Caption , there must be exactly one corresponding Picture or
     Table .
@@ -11829,7 +11742,6 @@ texts:
   text: (3) For every Caption , there must be exactly one corresponding Picture or
     Table .
 - children: []
-  hash: 7087799182536849687
   label: list_item
   orig: (4) Connected sub-pictures are grouped together in one Picture object.
   parent:
@@ -11848,7 +11760,6 @@ texts:
   self_ref: '#/texts/65'
   text: (4) Connected sub-pictures are grouped together in one Picture object.
 - children: []
-  hash: 8990831048276827416
   label: list_item
   orig: (5) Formula numbers are included in a Formula object.
   parent:
@@ -11867,7 +11778,6 @@ texts:
   self_ref: '#/texts/66'
   text: (5) Formula numbers are included in a Formula object.
 - children: []
-  hash: 15080088485878212709
   label: list_item
   orig: (6) Emphasised text (e.g. in italic or bold) at the beginning of a paragraph
     is not considered a Section-header , unless it appears exclusively on its own
@@ -11890,7 +11800,6 @@ texts:
     is not considered a Section-header , unless it appears exclusively on its own
     line.
 - children: []
-  hash: 340695852381472969
   label: text
   orig: The complete annotation guideline is over 100 pages long and a detailed description
     is obviously out of scope for this paper. Nevertheless, it will be made publicly
@@ -11913,7 +11822,6 @@ texts:
     is obviously out of scope for this paper. Nevertheless, it will be made publicly
     available alongside with DocLayNet for future reference.
 - children: []
-  hash: 18427360413523808725
   label: text
   orig: 'Phase 3: Training. After a first trial with a small group of people, we realised
     that providing the annotation guideline and a set of random practice pages did
@@ -11950,7 +11858,6 @@ texts:
     the reference. Only after passing two exam levels with high annotation quality,
     staff were admitted into the production phase. Practice iterations'
 - children: []
-  hash: 17039526178394070759
   label: caption
   orig: 'Figure 4: Examples of plausible annotation alternatives for the same page.
     Criteria in our annotation guideline can resolve cases '
@@ -11971,7 +11878,6 @@ texts:
   text: 'Figure 4: Examples of plausible annotation alternatives for the same page.
     Criteria in our annotation guideline can resolve cases '
 - children: []
-  hash: 9171696979434376961
   label: text
   orig: were carried out over a timeframe of 12 weeks, after which 8 of the 40 initially
     allocated annotators did not pass the bar.
@@ -11992,7 +11898,6 @@ texts:
   text: were carried out over a timeframe of 12 weeks, after which 8 of the 40 initially
     allocated annotators did not pass the bar.
 - children: []
-  hash: 258948183867512950
   label: text
   orig: 'Phase 4: Production annotation. The previously selected 80K pages were annotated
     with the defined 11 class labels by 32 annotators. This production phase took
@@ -12035,7 +11940,6 @@ texts:
     which could skew the numbers of the inter-annotator agreement (see Table 1). We
     wanted'
 - children: []
-  hash: 6581136044412754340
   label: caption
   orig: 'Table 2: Prediction performance (mAP@0.5-0.95) of object detection networks
     on DocLayNet test set. The MRCNN (Mask R-CNN) and FRCNN (Faster R-CNN) models
@@ -12066,7 +11970,6 @@ texts:
     [13]. All models were initialised using pre-trained weights from the COCO 2017
     dataset.'
 - children: []
-  hash: 7321418918830647717
   label: text
   orig: to avoid this at any cost in order to have clear, unbiased baseline numbers
     for human document-layout annotation. Third, we introduced the feature of snapping
@@ -12115,7 +12018,6 @@ texts:
     annotation staff managed to annotate a single page in a typical timeframe of 20s
     to 60s, depending on its complexity.
 - children: []
-  hash: 1563151945244295808
   label: section_header
   orig: 5 EXPERIMENTS
   parent:
@@ -12134,7 +12036,6 @@ texts:
   self_ref: '#/texts/75'
   text: 5 EXPERIMENTS
 - children: []
-  hash: 17481407849465975419
   label: caption
   orig: 'Figure 5: Prediction performance (mAP@0.5-0.95) of a Mask R-CNN network'
   parent:
@@ -12153,7 +12054,6 @@ texts:
   self_ref: '#/texts/76'
   text: 'Figure 5: Prediction performance (mAP@0.5-0.95) of a Mask R-CNN network'
 - children: []
-  hash: 7803676745991559807
   label: text
   orig: paper and leave the detailed evaluation of more recent methods mentioned in
     Section 2 for future work.
@@ -12174,7 +12074,6 @@ texts:
   text: paper and leave the detailed evaluation of more recent methods mentioned in
     Section 2 for future work.
 - children: []
-  hash: 17701866649923828991
   label: text
   orig: In this section, we will present several aspects related to the performance
     of object detection models on DocLayNet. Similarly as in PubLayNet, we will evaluate
@@ -12201,7 +12100,6 @@ texts:
     that range from 0.5 to 0.95 in steps of 0.05 (mAP@0.5-0.95). These scores are
     computed by leveraging the evaluation code provided by the COCO API [16].
 - children: []
-  hash: 13749192044324312506
   label: section_header
   orig: Baselines for Object Detection
   parent:
@@ -12220,7 +12118,6 @@ texts:
   self_ref: '#/texts/79'
   text: Baselines for Object Detection
 - children: []
-  hash: 14253474500249711401
   label: text
   orig: "In Table 2, we present baseline experiments (given in mAP) on Mask R-CNN\
     \ [12], Faster R-CNN [11], and YOLOv5 [13]. Both training and evaluation were\
@@ -12267,7 +12164,6 @@ texts:
     \ . This is not entirely surprising, as Text , Table and Picture are abundant\
     \ and the most visually distinctive in a document."
 - children: []
-  hash: 15112326022753491828
   label: page_header
   orig: "KDD \u201922, August 14-18, 2022, Washington, DC, USA"
   parent:
@@ -12286,7 +12182,6 @@ texts:
   self_ref: '#/texts/81'
   text: "KDD \u201922, August 14-18, 2022, Washington, DC, USA"
 - children: []
-  hash: 4483721399241932553
   label: caption
   orig: 'Table 3: Performance of a Mask R-CNN R50 network in mAP@0.5-0.95 scores trained
     on DocLayNet with different class label sets. The reduced label sets were obtained
@@ -12309,7 +12204,6 @@ texts:
     on DocLayNet with different class label sets. The reduced label sets were obtained
     by either down-mapping or '
 - children: []
-  hash: 12787012541434343355
   label: section_header
   orig: Learning Curve
   parent:
@@ -12328,7 +12222,6 @@ texts:
   self_ref: '#/texts/83'
   text: Learning Curve
 - children: []
-  hash: 10649518406512702432
   label: text
   orig: One of the fundamental questions related to any dataset is if it is "large
     enough". To answer this question for DocLayNet, we performed a data ablation study
@@ -12375,7 +12268,6 @@ texts:
     in Section 3), data augmentation methods [23], or the addition of more document
     categories and styles.
 - children: []
-  hash: 15793718183219261273
   label: section_header
   orig: Impact of Class Labels
   parent:
@@ -12394,7 +12286,6 @@ texts:
   self_ref: '#/texts/85'
   text: Impact of Class Labels
 - children: []
-  hash: 3813128593852817568
   label: text
   orig: "The choice and number of labels can have a significant effect on the overall\
     \ model performance. Since PubLayNet, DocBank and DocLayNet all have different\
@@ -12435,7 +12326,6 @@ texts:
     \ labels respectively. The set of 5 labels contains the same labels as PubLayNet.\
     \ However, due to the different definition of"
 - children: []
-  hash: 17259116100352000220
   label: caption
   orig: 'Table 4: Performance of a Mask R-CNN R50 network with document-wise and page-wise
     split for different label sets. Naive page-wise '
@@ -12456,7 +12346,6 @@ texts:
   text: 'Table 4: Performance of a Mask R-CNN R50 network with document-wise and page-wise
     split for different label sets. Naive page-wise '
 - children: []
-  hash: 1606264416890747238
   label: text
   orig: lists in PubLayNet (grouped list-items) versus DocLayNet (separate list-items),
     the label set of size 4 is the closest to PubLayNet, in the assumption that the
@@ -12485,7 +12374,6 @@ texts:
     when other classes are merged into them. The overall macro-average improves by
     around 5%, in particular when Page-footer and Page-header are excluded.
 - children: []
-  hash: 9916492842404420210
   label: section_header
   orig: Impact of Document Split in Train and Test Set
   parent:
@@ -12504,7 +12392,6 @@ texts:
   self_ref: '#/texts/89'
   text: Impact of Document Split in Train and Test Set
 - children: []
-  hash: 7367861061956826313
   label: text
   orig: "Many documents in DocLayNet have a unique styling. In order to avoid overfitting\
     \ on a particular style, we have split the train-, test- and validation-sets of\
@@ -12543,7 +12430,6 @@ texts:
     \ Thus, random page-wise splitting of DocLayNet can easily lead to accidental\
     \ overestimation of model performance and should be avoided."
 - children: []
-  hash: 12266316638387602552
   label: section_header
   orig: Dataset Comparison
   parent:
@@ -12562,7 +12448,6 @@ texts:
   self_ref: '#/texts/91'
   text: Dataset Comparison
 - children: []
-  hash: 11873954123712452037
   label: text
   orig: Throughout this paper, we claim that DocLayNet's wider variety of document
     layouts leads to more robust layout detection models. In Table 5, we provide evidence
@@ -12593,7 +12478,6 @@ texts:
     possible. Hence, we focussed on the common labels among the datasets. Between
     PubLayNet and DocLayNet, these are Picture ,
 - children: []
-  hash: 351624657815269469
   label: caption
   orig: 'Table 5: Prediction Performance (mAP@0.5-0.95) of a Mask R-CNN R50 network
     across the PubLayNet, DocBank & DocLayNet data-sets. By evaluating on common label
@@ -12618,7 +12502,6 @@ texts:
     classes of each dataset, we observe that the DocLayNet-trained model has much
     less pronounced variations in performance across all datasets.'
 - children: []
-  hash: 9365044532577791157
   label: text
   orig: Section-header , Table and Text . Before training, we either mapped or excluded
     DocLayNet's other labels as specified in table 3, and also PubLayNet's List to
@@ -12643,7 +12526,6 @@ texts:
     Text . Note that the different clustering of lists (by list-element vs. whole
     list objects) naturally decreases the mAP score for Text .
 - children: []
-  hash: 10416729032588567700
   label: text
   orig: For comparison of DocBank with DocLayNet, we trained only on Picture and Table
     clusters of each dataset. We had to exclude Text because successive paragraphs
@@ -12680,7 +12562,6 @@ texts:
     Thus we conclude that DocLayNet trained models are overall more robust and will
     produce better results for challenging, unseen layouts.
 - children: []
-  hash: 2679590577170717304
   label: section_header
   orig: Example Predictions
   parent:
@@ -12699,7 +12580,6 @@ texts:
   self_ref: '#/texts/96'
   text: Example Predictions
 - children: []
-  hash: 16470836271801010735
   label: text
   orig: To conclude this section, we illustrate the quality of layout predictions
     one can expect from DocLayNet-trained models by providing a selection of examples
@@ -12728,7 +12608,6 @@ texts:
     document categories, however one can also observe mistakes such as overlapping
     clusters of different classes, or entirely missing boxes due to low confidence.
 - children: []
-  hash: 6141036410505996597
   label: section_header
   orig: 6 CONCLUSION
   parent:
@@ -12747,7 +12626,6 @@ texts:
   self_ref: '#/texts/98'
   text: 6 CONCLUSION
 - children: []
-  hash: 6656340812057528524
   label: text
   orig: In this paper, we presented the DocLayNet dataset. It provides the document
     conversion and layout analysis research community a new and challenging dataset
@@ -12778,7 +12656,6 @@ texts:
     of documents outside the scientific publishing domain adds significant value in
     this respect.
 - children: []
-  hash: 5486338775911770271
   label: text
   orig: To date, there is still a significant gap between human and ML accuracy on
     the layout interpretation task, and we hope that this work will inspire the research
@@ -12801,7 +12678,6 @@ texts:
     the layout interpretation task, and we hope that this work will inspire the research
     community to close that gap.
 - children: []
-  hash: 1559959727945683203
   label: section_header
   orig: REFERENCES
   parent:
@@ -12820,7 +12696,6 @@ texts:
   self_ref: '#/texts/101'
   text: REFERENCES
 - children: []
-  hash: 3725067383473502802
   label: list_item
   orig: "[1] Max G\xF6bel, Tamir Hassan, Ermelinda Oro, and Giorgio Orsi. Icdar 2013\
     \ table competition. In 2013 12th International Conference on Document Analysis\
@@ -12843,7 +12718,6 @@ texts:
     \ table competition. In 2013 12th International Conference on Document Analysis\
     \ and Recognition , pages 1449-1453, 2013."
 - children: []
-  hash: 12754243453844555097
   label: list_item
   orig: '[2] Christian Clausner, Apostolos Antonacopoulos, and Stefan Pletschacher.
     Icdar2017 competition on recognition of documents with complex layouts rdcl2017.
@@ -12868,7 +12742,6 @@ texts:
     In 2017 14th IAPR International Conference on Document Analysis and Recognition
     (ICDAR) , volume 01, pages 1404-1410, 2017.'
 - children: []
-  hash: 829063111244650808
   label: list_item
   orig: "[3] Herv\xE9 D\xE9jean, Jean-Luc Meunier, Liangcai Gao, Yilun Huang, Yu Fang,\
     \ Florian Kleber, and Eva-Maria Lang. ICDAR 2019 Competition on Table Detection\
@@ -12891,7 +12764,6 @@ texts:
     \ Florian Kleber, and Eva-Maria Lang. ICDAR 2019 Competition on Table Detection\
     \ and Recognition (cTDaR), April 2019. http://sac.founderit.com/."
 - children: []
-  hash: 11422390329416293389
   label: list_item
   orig: '[4] Antonio Jimeno Yepes, Peter Zhong, and Douglas Burdick. Competition on
     scientific literature parsing. In Proceedings of the International Conference
@@ -12916,7 +12788,6 @@ texts:
     on Document Analysis and Recognition , ICDAR, pages 605-617. LNCS 12824, SpringerVerlag,
     sep 2021.'
 - children: []
-  hash: 6413209670831350329
   label: list_item
   orig: '[5] Logan Markewich, Hao Zhang, Yubin Xing, Navid Lambert-Shirzad, Jiang
     Zhexin, Roy Lee, Zhi Li, and Seok-Bum Ko. Segmentation for document layout analysis:
@@ -12941,7 +12812,6 @@ texts:
     not dead yet. International Journal on Document Analysis and Recognition (IJDAR)
     , pages 1-11, 01 2022.'
 - children: []
-  hash: 11041707939752995350
   label: list_item
   orig: '[6] Xu Zhong, Jianbin Tang, and Antonio Jimeno-Yepes. Publaynet: Largest
     dataset ever for document layout analysis. In Proceedings of the International
@@ -12966,7 +12836,6 @@ texts:
     Conference on Document Analysis and Recognition , ICDAR, pages 1015-1022, sep
     2019.'
 - children: []
-  hash: 5474854589015621307
   label: list_item
   orig: '[7] Minghao Li, Yiheng Xu, Lei Cui, Shaohan Huang, Furu Wei, Zhoujun Li,
     and Ming Zhou. Docbank: A benchmark dataset for document layout analysis. In Proceedings
@@ -12991,7 +12860,6 @@ texts:
     of the 28th International Conference on Computational Linguistics , COLING, pages
     949-960. International Committee on Computational Linguistics, dec 2020.'
 - children: []
-  hash: 5675312684168383010
   label: list_item
   orig: '[8] Riaz Ahmad, Muhammad Tanvir Afzal, and M. Qadir. Information extraction
     from pdf sources based on rule-based system using integrated formats. In SemWebEval@ESWC
@@ -13014,7 +12882,6 @@ texts:
     from pdf sources based on rule-based system using integrated formats. In SemWebEval@ESWC
     , 2016.'
 - children: []
-  hash: 1888275270300033803
   label: list_item
   orig: '[9] Ross B. Girshick, Jeff Donahue, Trevor Darrell, and Jitendra Malik. Rich
     feature hierarchies for accurate object detection and semantic segmentation. In
@@ -13039,7 +12906,6 @@ texts:
     IEEE Conference on Computer Vision and Pattern Recognition , CVPR, pages 580-587.
     IEEE Computer Society, jun 2014.'
 - children: []
-  hash: 16377833059391204670
   label: list_item
   orig: '[10] Ross B. Girshick. Fast R-CNN. In 2015 IEEE International Conference
     on Computer Vision , ICCV, pages 1440-1448. IEEE Computer Society, dec 2015.'
@@ -13060,7 +12926,6 @@ texts:
   text: '[10] Ross B. Girshick. Fast R-CNN. In 2015 IEEE International Conference
     on Computer Vision , ICCV, pages 1440-1448. IEEE Computer Society, dec 2015.'
 - children: []
-  hash: 14299439328578373439
   label: list_item
   orig: '[11] Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. Faster r-cnn:
     Towards real-time object detection with region proposal networks. IEEE Transactions
@@ -13083,7 +12948,6 @@ texts:
     Towards real-time object detection with region proposal networks. IEEE Transactions
     on Pattern Analysis and Machine Intelligence , 39(6):1137-1149, 2017.'
 - children: []
-  hash: 6277082541628795791
   label: list_item
   orig: "[12] Kaiming He, Georgia Gkioxari, Piotr Doll\xE1r, and Ross B. Girshick.\
     \ Mask R-CNN. In IEEE International Conference on Computer Vision , ICCV, pages\
@@ -13106,7 +12970,6 @@ texts:
     \ Mask R-CNN. In IEEE International Conference on Computer Vision , ICCV, pages\
     \ 2980-2988. IEEE Computer Society, Oct 2017."
 - children: []
-  hash: 11395721618283747445
   label: list_item
   orig: '[13] Glenn Jocher, Alex Stoken, Ayush Chaurasia, Jirka Borovec, NanoCode012,
     TaoXie, Yonghye Kwon, Kalen Michael, Liu Changyu, Jiacong Fang, Abhiram V, Laughing,
@@ -13131,7 +12994,6 @@ texts:
     tkianai, yxNONG, Piotr Skalski, Adam Hogan, Jebastin Nadar, imyhxy, Lorenzo Mammana,
     Alex Wang, Cristi Fati, Diego Montes, Jan Hajek, Laurentiu'
 - children: []
-  hash: 10645821634971805003
   label: page_header
   orig: 'DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis'
   parent:
@@ -13150,7 +13012,6 @@ texts:
   self_ref: '#/texts/115'
   text: 'DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis'
 - children: []
-  hash: 16669730915702730112
   label: page_header
   orig: "KDD \u201922, August 14-18, 2022, Washington, DC, USA"
   parent:
@@ -13169,7 +13030,6 @@ texts:
   self_ref: '#/texts/116'
   text: "KDD \u201922, August 14-18, 2022, Washington, DC, USA"
 - children: []
-  hash: 10284244877080556618
   label: caption
   orig: 'Figure 6: Example layout predictions on selected pages from the DocLayNet
     test-set. (A, D) exhibit favourable results on coloured backgrounds. (B, C) show
@@ -13192,7 +13052,6 @@ texts:
     test-set. (A, D) exhibit favourable results on coloured backgrounds. (B, C) show
     accurate list-item and paragraph '
 - children: []
-  hash: 15833845093926891587
   label: text
   orig: 'Diaconu, Mai Thanh Minh, Marc, albinxavi, fatih, oleg, and wanghao yang.
     ultralytics/yolov5: v6.0 - yolov5n nano models, roboflow integration, tensorflow
@@ -13215,7 +13074,6 @@ texts:
     ultralytics/yolov5: v6.0 - yolov5n nano models, roboflow integration, tensorflow
     export, opencv dnn support, October 2021.'
 - children: []
-  hash: 17115238742689219449
   label: list_item
   orig: '[14] Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier,
     Alexander Kirillov, and Sergey Zagoruyko. End-to-end object detection with transformers.
@@ -13238,7 +13096,6 @@ texts:
     Alexander Kirillov, and Sergey Zagoruyko. End-to-end object detection with transformers.
     CoRR , abs/2005.12872, 2020.'
 - children: []
-  hash: 9865955364301030107
   label: list_item
   orig: '[15] Mingxing Tan, Ruoming Pang, and Quoc V. Le. Efficientdet: Scalable and
     efficient object detection. CoRR , abs/1911.09070, 2019.'
@@ -13259,7 +13116,6 @@ texts:
   text: '[15] Mingxing Tan, Ruoming Pang, and Quoc V. Le. Efficientdet: Scalable and
     efficient object detection. CoRR , abs/1911.09070, 2019.'
 - children: []
-  hash: 1886095681092304576
   label: list_item
   orig: "[16] Tsung-Yi Lin, Michael Maire, Serge J. Belongie, Lubomir D. Bourdev,\
     \ Ross B. Girshick, James Hays, Pietro Perona, Deva Ramanan, Piotr Doll\xE1r,\
@@ -13282,7 +13138,6 @@ texts:
     \ Ross B. Girshick, James Hays, Pietro Perona, Deva Ramanan, Piotr Doll\xE1r,\
     \ and C. Lawrence Zitnick. Microsoft COCO: common objects in context, 2014."
 - children: []
-  hash: 725047809578653716
   label: list_item
   orig: '[17] Yuxin Wu, Alexander Kirillov, Francisco Massa, Wan-Yen Lo, and Ross
     Girshick. Detectron2, 2019.'
@@ -13303,7 +13158,6 @@ texts:
   text: '[17] Yuxin Wu, Alexander Kirillov, Francisco Massa, Wan-Yen Lo, and Ross
     Girshick. Detectron2, 2019.'
 - children: []
-  hash: 12769367635298110033
   label: list_item
   orig: '[18] Nikolaos Livathinos, Cesar Berrospi, Maksym Lysak, Viktor Kuropiatnyk,
     Ahmed Nassar, Andre Carvalho, Michele Dolfi, Christoph Auer, Kasper Dinkla, and
@@ -13330,7 +13184,6 @@ texts:
     In Proceedings of the 35th Conference on Artificial Intelligence , AAAI, pages
     1513715145, feb 2021.'
 - children: []
-  hash: 3125069241793675330
   label: list_item
   orig: '[19] Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, and Ming Zhou.
     Layoutlm: Pre-training of text and layout for document image understanding. In
@@ -13357,7 +13210,6 @@ texts:
     and Data Mining , KDD, pages 1192-1200, New York, USA, 2020. Association for Computing
     Machinery.'
 - children: []
-  hash: 7474367108240490719
   label: list_item
   orig: '[20] Shoubin Li, Xuyan Ma, Shuaiqun Pan, Jun Hu, Lin Shi, and Qing Wang.
     Vtlayout: Fusion of visual and text features for document layout analysis, 2021.'
@@ -13378,7 +13230,6 @@ texts:
   text: '[20] Shoubin Li, Xuyan Ma, Shuaiqun Pan, Jun Hu, Lin Shi, and Qing Wang.
     Vtlayout: Fusion of visual and text features for document layout analysis, 2021.'
 - children: []
-  hash: 15036724265562104961
   label: list_item
   orig: '[22] Peter W J Staar, Michele Dolfi, Christoph Auer, and Costas Bekas. Corpus
     conversion service: A machine learning platform to ingest documents at scale.
@@ -13403,7 +13254,6 @@ texts:
     In Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery
     and Data Mining , KDD, pages 774-782. ACM, 2018.'
 - children: []
-  hash: 17388007058339574232
   label: list_item
   orig: '[23] Connor Shorten and Taghi M. Khoshgoftaar. A survey on image data augmentation
     for deep learning. Journal of Big Data , 6(1):60, 2019.'
diff --git a/test/data/experimental/dummy_doc.yaml b/test/data/experimental/dummy_doc.yaml
index 8767349..13d184b 100644
--- a/test/data/experimental/dummy_doc.yaml
+++ b/test/data/experimental/dummy_doc.yaml
@@ -23,7 +23,7 @@ body:
   parent: null # Only root elements have no parent.
   children: # only the first-level children appear here, as references (RefItem)
     - $ref: "/texts/1"
-    - $ref: "/figures/0"
+    - $ref: "/pictures/0"
     - $ref: "/texts/3"
     - $ref: "/tables/0"
 
@@ -71,7 +71,7 @@ texts:
     hash: 6978483
     label: "section_header"
     parent:
-      $ref: "/figures/0"
+      $ref: "/pictures/0"
     children: [ ]
     prov:
       - page_no: 1
@@ -130,8 +130,8 @@ tables: # All tables...
           coord_origin: BOTTOMLEFT
         charspan: [ 1,423 ] # 2-tuple, references to "orig"
 
-figures: # All figures...
-  - self_ref: "#/figures/0"
+pictures: # All pictures...
+  - self_ref: "#/pictures/0"
     hash: 7782482
     label: "picture"
     parent:
@@ -149,7 +149,7 @@ figures: # All figures...
       size:
         width: 231
         height: 351
-      uri: "file:///dummy_doc/figures/0.png"
+      uri: "file:///dummy_doc/pictures/0.png"
       #alternatives: base64 encoded striong
     children:
       - $ref: "/texts/2" # This text element appears inside the figure, hence it is a child.
diff --git a/test/test_docling_doc.py b/test/test_docling_doc.py
index ee4d351..69cae69 100644
--- a/test/test_docling_doc.py
+++ b/test/test_docling_doc.py
@@ -1,7 +1,7 @@
 import yaml
 
 from docling_core.types.experimental.document import (
-    BaseFigureData,
+    BasePictureData,
     BaseTableData,
     DescriptionItem,
     DoclingDocument,
@@ -86,7 +86,7 @@ def _test_export_methods(doc):
         table.export_to_html()
         table.export_to_dataframe()
         table.export_to_document_tokens(doc)
-    for fig in doc.figures:
+    for fig in doc.pictures:
         fig.export_to_document_tokens(doc)
 
 
@@ -95,12 +95,13 @@ def _construct_doc() -> DoclingDocument:
     # group, heading, paragraph, table, figure, title, list, provenance
     doc.add_paragraph(label=DocItemLabel.TEXT, text="Author 1\nAffiliation 1")
     doc.add_paragraph(label=DocItemLabel.TEXT, text="Author 2\nAffiliation 2")
+
     chapter1 = doc.add_group(
         label=GroupLabel.CHAPTER, name="Introduction"
     )  # can be done if such information is present, or ommitted.
+
     doc.add_heading(
         parent=chapter1,
-        label=DocItemLabel.SECTION_HEADER,
         text="1. Introduction",
         level=1,
     )
@@ -192,8 +193,10 @@ def _construct_doc() -> DoclingDocument:
     )
     table_el = BaseTableData(num_rows=3, num_cols=3, table_cells=table_cells)
     doc.add_table(data=table_el)
+
     fig_caption = doc.add_paragraph(
         label=DocItemLabel.CAPTION, text="This is the caption of figure 1."
     )
-    doc.add_figure(data=BaseFigureData(), caption=fig_caption.get_ref())
+    fig_item = doc.add_picture(data=BasePictureData(), caption=fig_caption)
+
     return doc

From fe72b3b2008193b095839311c77339e2e3bcb4ee Mon Sep 17 00:00:00 2001
From: Peter Staar <taa@zurich.ibm.com>
Date: Fri, 27 Sep 2024 16:08:58 +0200
Subject: [PATCH 23/34] updating the tests

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
---
 test/test_docling_doc.py | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/test/test_docling_doc.py b/test/test_docling_doc.py
index 8cd8a43..0bc92bf 100644
--- a/test/test_docling_doc.py
+++ b/test/test_docling_doc.py
@@ -1,6 +1,7 @@
 import yaml
 
-from docling_core.types.experimental.document import DoclingDocument, FileInfo
+from docling_core.types.experimental.labels import DocItemLabel
+from docling_core.types.experimental.document import DoclingDocument, DocItem, TextItem, FileInfo
 
 
 def test_load_serialize_doc():
@@ -76,3 +77,25 @@ def test_construct_doc():
     print(f"\n\n{yaml_dump}")
 
     DoclingDocument.model_validate(yaml.safe_load(yaml_dump))
+
+def test_docitems():
+
+    # Iterate over the derived classes of the BaseClass
+    derived_classes = DocItem.__subclasses__()
+    for dc in derived_classes:
+
+        if issubclass(dc, TextItem):
+            _ = dc(text="whatever", orig="whatever", dloc="sdvsd", label=DocItemLabel.TEXT)
+            yaml_dump = yaml.safe_dump(_.model_dump(mode="json", by_alias=True))
+            print(f"\n\n{yaml_dump}")
+        else:            
+            try:
+                _ = dc()            
+                yaml_dump = yaml.safe_dump(_.model_dump(mode="json", by_alias=True))
+                
+                print(f"\n\n{yaml_dump}")        
+            except TypeError as e:
+                print(f"Could not instantiate {dc.__name__}: {e}")
+            except Exception as e:
+                print(f"Could not instantiate {dc.__name__}: {e}")            
+         

From baced3368a37eb4f3fe9e8bd588e161f9f03778b Mon Sep 17 00:00:00 2001
From: Peter Staar <taa@zurich.ibm.com>
Date: Fri, 27 Sep 2024 16:48:52 +0200
Subject: [PATCH 24/34] added some unit tests for DocItem

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
---
 .../docling_document/unit/FloatingItem.yaml   |   9 +
 .../docling_document/unit/KeyValueItem.yaml   |   5 +
 .../docling_document/unit/PictureItem.yaml    |  10 +
 .../docling_document/unit/SectionItem.yaml    |   8 +
 .../data/docling_document/unit/TableItem.yaml | 179 ++++++++++++++++++
 test/data/docling_document/unit/TextItem.yaml |   7 +
 test/test_docling_doc.py                      | 102 ++++++++--
 7 files changed, 299 insertions(+), 21 deletions(-)
 create mode 100644 test/data/docling_document/unit/FloatingItem.yaml
 create mode 100644 test/data/docling_document/unit/KeyValueItem.yaml
 create mode 100644 test/data/docling_document/unit/PictureItem.yaml
 create mode 100644 test/data/docling_document/unit/SectionItem.yaml
 create mode 100644 test/data/docling_document/unit/TableItem.yaml
 create mode 100644 test/data/docling_document/unit/TextItem.yaml

diff --git a/test/data/docling_document/unit/FloatingItem.yaml b/test/data/docling_document/unit/FloatingItem.yaml
new file mode 100644
index 0000000..1a816a8
--- /dev/null
+++ b/test/data/docling_document/unit/FloatingItem.yaml
@@ -0,0 +1,9 @@
+captions: []
+children: []
+footnotes: []
+image: null
+label: text
+parent: null
+prov: []
+references: []
+self_ref: '#'
diff --git a/test/data/docling_document/unit/KeyValueItem.yaml b/test/data/docling_document/unit/KeyValueItem.yaml
new file mode 100644
index 0000000..ea7a00c
--- /dev/null
+++ b/test/data/docling_document/unit/KeyValueItem.yaml
@@ -0,0 +1,5 @@
+children: []
+label: text
+parent: null
+prov: []
+self_ref: '#'
diff --git a/test/data/docling_document/unit/PictureItem.yaml b/test/data/docling_document/unit/PictureItem.yaml
new file mode 100644
index 0000000..6809aaf
--- /dev/null
+++ b/test/data/docling_document/unit/PictureItem.yaml
@@ -0,0 +1,10 @@
+captions: []
+children: []
+data: {}
+footnotes: []
+image: null
+label: text
+parent: null
+prov: []
+references: []
+self_ref: '#'
diff --git a/test/data/docling_document/unit/SectionItem.yaml b/test/data/docling_document/unit/SectionItem.yaml
new file mode 100644
index 0000000..c31847e
--- /dev/null
+++ b/test/data/docling_document/unit/SectionItem.yaml
@@ -0,0 +1,8 @@
+children: []
+label: text
+level: 1
+orig: whatever
+parent: null
+prov: []
+self_ref: '#'
+text: whatever
diff --git a/test/data/docling_document/unit/TableItem.yaml b/test/data/docling_document/unit/TableItem.yaml
new file mode 100644
index 0000000..746cf06
--- /dev/null
+++ b/test/data/docling_document/unit/TableItem.yaml
@@ -0,0 +1,179 @@
+captions: []
+children: []
+data:
+  grid:
+  - - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 1
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 0
+      text: ''
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 1
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 0
+      text: ''
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 1
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 0
+      text: ''
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 1
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 0
+      text: ''
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 5
+      end_row_offset_idx: 1
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 4
+      start_row_offset_idx: 0
+      text: ''
+  - - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 2
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 1
+      text: ''
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 2
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 1
+      text: ''
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 2
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 1
+      text: ''
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 2
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 1
+      text: ''
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 5
+      end_row_offset_idx: 2
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 4
+      start_row_offset_idx: 1
+      text: ''
+  - - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 1
+      end_row_offset_idx: 3
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 0
+      start_row_offset_idx: 2
+      text: ''
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 2
+      end_row_offset_idx: 3
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 1
+      start_row_offset_idx: 2
+      text: ''
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 3
+      end_row_offset_idx: 3
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 2
+      start_row_offset_idx: 2
+      text: ''
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 4
+      end_row_offset_idx: 3
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 3
+      start_row_offset_idx: 2
+      text: ''
+    - bbox: null
+      col_span: 1
+      column_header: false
+      end_col_offset_idx: 5
+      end_row_offset_idx: 3
+      row_header: false
+      row_section: false
+      row_span: 1
+      start_col_offset_idx: 4
+      start_row_offset_idx: 2
+      text: ''
+  num_cols: 5
+  num_rows: 3
+  table_cells: []
+footnotes: []
+image: null
+label: text
+parent: null
+prov: []
+references: []
+self_ref: '#'
diff --git a/test/data/docling_document/unit/TextItem.yaml b/test/data/docling_document/unit/TextItem.yaml
new file mode 100644
index 0000000..aa56c38
--- /dev/null
+++ b/test/data/docling_document/unit/TextItem.yaml
@@ -0,0 +1,7 @@
+children: []
+label: text
+orig: whatever
+parent: null
+prov: []
+self_ref: '#'
+text: whatever
diff --git a/test/test_docling_doc.py b/test/test_docling_doc.py
index 48ffb7d..75bee1d 100644
--- a/test/test_docling_doc.py
+++ b/test/test_docling_doc.py
@@ -1,16 +1,97 @@
 import yaml
 
+from collections import deque
+
 from docling_core.types.experimental.document import (
     BasePictureData,
     BaseTableData,
     DescriptionItem,
     DoclingDocument,
     TableCell,
+    NodeItem,
     DocItem,
     TextItem,
+    FloatingItem,
+    KeyValueItem,
+    SectionItem,
+    PictureItem,
+    TableItem,
+    BasePictureData,
+    BaseTableData
 )
 from docling_core.types.experimental.labels import DocItemLabel, GroupLabel
 
+def test_docitems():
+
+    # Iterative function to find all subclasses
+    def find_all_subclasses_iterative(base_class):
+        subclasses = deque([base_class])  # Use a deque for efficient popping from the front
+        all_subclasses = []
+
+        while subclasses:
+            current_class = subclasses.popleft()  # Get the next class to process
+            for subclass in current_class.__subclasses__():
+                all_subclasses.append(subclass)
+                subclasses.append(subclass)  # Add the subclass for further exploration
+
+        return all_subclasses
+
+    def serialise(obj):
+        return yaml.safe_dump(obj.model_dump(mode="json", by_alias=True))
+
+    def write(name:str, serialisation:str):
+        with open(f"./test/data/docling_document/unit/{name}.yaml", "w") as fw:
+            fw.write(serialisation)
+
+    def read(name:str):
+        with open(f"./test/data/docling_document/unit/{name}.yaml", "r") as fr:
+            gold = fr.read()
+        return gold
+
+    def generate(dc, obj):
+        write(dc.__name__, pred)
+    
+    def verify(dc, obj):
+        pred = serialise(obj)            
+        #print(f"\t{dc.__name__}:\n {pred}")
+        gold = read(dc.__name__)
+
+        assert pred==gold, f"pred!=gold for {dc.__name__}"        
+    
+    # Iterate over the derived classes of the BaseClass
+    derived_classes = find_all_subclasses_iterative(DocItem)
+    for dc in derived_classes:
+
+        if dc is TextItem:
+            obj = dc(text="whatever", orig="whatever", dloc="sdvsd", label=DocItemLabel.TEXT, self_ref="#")
+            verify(dc, obj)
+            
+        elif dc is FloatingItem:
+            obj = dc(text="whatever", orig="whatever", dloc="sdvsd", label=DocItemLabel.TEXT, self_ref="#")
+            verify(dc, obj)
+            
+        elif dc is KeyValueItem:
+            obj = dc(text="whatever", orig="whatever", dloc="sdvsd", label=DocItemLabel.TEXT, self_ref="#")
+            verify(dc, obj)
+            
+        elif dc is SectionItem:
+            obj = dc(text="whatever", orig="whatever", dloc="sdvsd", label=DocItemLabel.TEXT, self_ref="#")
+            verify(dc, obj)
+            
+        elif dc is PictureItem:
+            obj = dc(text="whatever", orig="whatever", dloc="sdvsd", label=DocItemLabel.TEXT, self_ref="#",
+                     data=BasePictureData())
+            verify(dc, obj)
+            
+        elif dc is TableItem:
+            obj = dc(text="whatever", orig="whatever", dloc="sdvsd", label=DocItemLabel.TEXT, self_ref="#",
+                     data=BaseTableData(num_rows=3, num_cols=5, cells=[]))
+            verify(dc, obj)
+            
+        else:
+            print(f"{dc.__name__} is not known")            
+            assert False, "new derived class detected {dc.__name__}: {e}"
+
 
 def test_reference_doc():
     # Read YAML file
@@ -203,24 +284,3 @@ def _construct_doc() -> DoclingDocument:
 
     return doc
     
-def test_docitems():
-
-    # Iterate over the derived classes of the BaseClass
-    derived_classes = DocItem.__subclasses__()
-    for dc in derived_classes:
-
-        if issubclass(dc, TextItem):
-            _ = dc(text="whatever", orig="whatever", dloc="sdvsd", label=DocItemLabel.TEXT)
-            yaml_dump = yaml.safe_dump(_.model_dump(mode="json", by_alias=True))
-            print(f"\n\n{yaml_dump}")
-        else:            
-            try:
-                _ = dc()            
-                yaml_dump = yaml.safe_dump(_.model_dump(mode="json", by_alias=True))
-                
-                print(f"\n\n{yaml_dump}")        
-            except TypeError as e:
-                print(f"Could not instantiate {dc.__name__}: {e}")
-            except Exception as e:
-                print(f"Could not instantiate {dc.__name__}: {e}")            
-         

From 40d7fa411ec6078266309763d02e66004b742387 Mon Sep 17 00:00:00 2001
From: Christoph Auer <cau@zurich.ibm.com>
Date: Fri, 27 Sep 2024 17:06:05 +0200
Subject: [PATCH 25/34] Add tree validation API and test case

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
---
 docling_core/types/experimental/document.py | 61 +++++++++++++-----
 test/data/experimental/dummy_doc.yaml       |  1 +
 test/test_docling_doc.py                    | 68 +++++++++++++++------
 3 files changed, 97 insertions(+), 33 deletions(-)

diff --git a/docling_core/types/experimental/document.py b/docling_core/types/experimental/document.py
index 80f7a32..bd4fd55 100644
--- a/docling_core/types/experimental/document.py
+++ b/docling_core/types/experimental/document.py
@@ -316,9 +316,10 @@ def export_to_document_tokens(
 
 
 class SectionItem(TextItem):
-    """Section."""
+    """SectionItem."""
 
-    level: LevelNumber = 1
+    label: DocItemLabel = DocItemLabel.SECTION_HEADER
+    level: LevelNumber
 
 
 class FloatingItem(DocItem):
@@ -616,7 +617,7 @@ class DescriptionItem(BaseModel):
 class DoclingDocument(BaseModel):
     """DoclingDocument."""
 
-    version: str = "0.1.0"  # use SemanticVersion type instead
+    version: str = "0.1.0"  # TODO use SemanticVersion type instead
     description: DescriptionItem
     name: str  # The working name of this document, without extensions
     # (could be taken from originating doc, or just "Untitled 1")
@@ -632,7 +633,7 @@ class DoclingDocument(BaseModel):
     body: GroupItem = GroupItem(name="_root_", self_ref="#/body")  # List[RefItem] = []
 
     groups: List[GroupItem] = []
-    texts: List[TextItem] = []
+    texts: List[Union[SectionItem, TextItem]] = []
     pictures: List[PictureItem] = []
     tables: List[TableItem] = []
     key_value_items: List[KeyValueItem] = []
@@ -676,14 +677,13 @@ def add_group(
 
         return group
 
-    def add_paragraph(
+    def add_text(
         self,
         label: str,
         text: str,
         orig: Optional[str] = None,
         prov: Optional[ProvenanceItem] = None,
         parent: Optional[GroupItem] = None,
-        item_cls=TextItem,
     ):
         """add_paragraph.
 
@@ -692,7 +692,6 @@ def add_paragraph(
         :param orig: Optional[str]:  (Default value = None)
         :param prov: Optional[ProvenanceItem]:  (Default value = None)
         :param parent: Optional[GroupItem]:  (Default value = None)
-        :param item_cls:  (Default value = TextItem)
 
         """
         if not parent:
@@ -703,7 +702,7 @@ def add_paragraph(
 
         text_index = len(self.texts)
         cref = f"#/texts/{text_index}"
-        text_item = item_cls(
+        text_item = TextItem(
             label=label,
             text=text,
             orig=orig,
@@ -809,21 +808,44 @@ def add_heading(
         :param parent: Optional[GroupItem]:  (Default value = None)
 
         """
-        item: SectionItem = self.add_paragraph(
-            label=DocItemLabel.SECTION_HEADER,
+        if not parent:
+            parent = self.body
+
+        if not orig:
+            orig = text
+
+        text_index = len(self.texts)
+        cref = f"#/texts/{text_index}"
+        section_header_item = SectionItem(
+            level=level,
             text=text,
             orig=orig,
-            prov=prov,
-            parent=parent,
-            item_cls=SectionItem,
+            self_ref=cref,
+            parent=parent.get_ref(),
         )
-        item.level = level
-        return item
+        if prov:
+            section_header_item.prov.append(prov)
+
+        self.texts.append(section_header_item)
+        parent.children.append(RefItem(cref=cref))
+
+        return section_header_item
 
     def num_pages(self):
         """num_pages."""
         return len(self.pages.values())
 
+    def validate_tree(self, root) -> bool:
+        """Return whether every node under root refers back to its parent."""
+        res = []  # validation result of each child's own subtree
+        for child_ref in root.children:
+            child = child_ref.resolve(self)
+            if child.parent is None or child.parent.resolve(self) != root:
+                return False  # missing or mismatched back-reference
+            res.append(self.validate_tree(child))
+
+        return all(res) or len(res) == 0
+
     def iterate_elements(
         self,
         root: Optional[NodeItem] = None,
@@ -1133,3 +1155,12 @@ def add_page(self, page_no: int, size: Size) -> PageItem:
 
         self.pages[page_no] = pitem
         return pitem
+
+    @model_validator(mode="after")  # type: ignore
+    @classmethod
+    def validate_document(cls, d: "DoclingDocument"):
+        """Reject documents whose body or furniture tree is inconsistent."""
+        if not d.validate_tree(d.body) or not d.validate_tree(d.furniture):
+            raise ValueError("Document hierarchy is inconsistent.")
+
+        return d
diff --git a/test/data/experimental/dummy_doc.yaml b/test/data/experimental/dummy_doc.yaml
index 13d184b..2bd50a0 100644
--- a/test/data/experimental/dummy_doc.yaml
+++ b/test/data/experimental/dummy_doc.yaml
@@ -80,6 +80,7 @@ texts:
           t: 354.3
           b: 334.4
           r: 376.0
+          coord_origin: BOTTOMLEFT
         charspan: [ 0,734 ]
   - orig: "Figure 1: Four examples of complex page layouts across dif-\nferent document categories" # nested inside the figure
     text: "Figure 1: Four examples of complex page layouts across different document categories"
diff --git a/test/test_docling_doc.py b/test/test_docling_doc.py
index 69cae69..66cc2e0 100644
--- a/test/test_docling_doc.py
+++ b/test/test_docling_doc.py
@@ -1,3 +1,4 @@
+import pytest
 import yaml
 
 from docling_core.types.experimental.document import (
@@ -11,7 +12,7 @@
 
 
 def test_reference_doc():
-    # Read YAML file
+    # Read YAML file of manual reference doc
     with open("test/data/experimental/dummy_doc.yaml", "r") as fp:
         dict_from_yaml = yaml.safe_load(fp)
 
@@ -35,19 +36,17 @@ def test_reference_doc():
     assert obj == obj2
     assert obj is obj2
 
-    doc_dumped = doc.model_dump(mode="json", by_alias=True)
-    out_yaml = yaml.safe_dump(doc_dumped)
-
-    doc_reload = DoclingDocument.model_validate(yaml.safe_load(out_yaml))
-
-    assert doc_reload == doc  # must be equal
-    assert doc_reload is not doc  # can't be identical
-
-    ### Iterate all elements
+    # Iterate all elements
 
     for item, level in doc.iterate_elements():
         print(f"Item: {item} at level {level}")
 
+    # Serialize and reload
+    _test_serialize_and_reload(doc)
+
+    # Call Export methods
+    _test_export_methods(doc)
+
 
 def test_parse_doc():
     with open(
@@ -65,15 +64,31 @@ def test_parse_doc():
 def test_construct_doc():
 
     doc = _construct_doc()
+
+    assert doc.validate_tree(doc.body)
+    assert doc.validate_tree(doc.furniture)
+
     _test_export_methods(doc)
     _test_serialize_and_reload(doc)
 
 
+def test_construct_bad_doc():
+    doc = _construct_bad_doc()
+    assert not doc.validate_tree(doc.body)  # tampered parent must be detected
+
+    _test_export_methods(doc)
+    with pytest.raises(ValueError):  # reload re-runs the document validator
+        _test_serialize_and_reload(doc)
+
+
 def _test_serialize_and_reload(doc):
     ### Serialize and deserialize stuff
     yaml_dump = yaml.safe_dump(doc.model_dump(mode="json", by_alias=True))
     # print(f"\n\n{yaml_dump}")
-    DoclingDocument.model_validate(yaml.safe_load(yaml_dump))
+    doc_reload = DoclingDocument.model_validate(yaml.safe_load(yaml_dump))
+
+    assert doc_reload == doc  # must be equal
+    assert doc_reload is not doc  # can't be identical
 
 
 def _test_export_methods(doc):
@@ -90,11 +105,28 @@ def _test_export_methods(doc):
         fig.export_to_document_tokens(doc)
 
 
+def _construct_bad_doc() -> DoclingDocument:
+    doc = DoclingDocument(description=DescriptionItem(), name="Bad doc")
+
+    title = doc.add_text(label=DocItemLabel.TITLE, text="This is the title")
+    group = doc.add_group(parent=title, name="chapter 1")
+    text = doc.add_text(
+        parent=group,
+        label=DocItemLabel.SECTION_HEADER,
+        text="This is the first section",
+    )
+
+    # Corrupt the tree: point the text's parent at the title, not its group.
+    text.parent = title.get_ref()
+
+    return doc
+
+
 def _construct_doc() -> DoclingDocument:
     doc = DoclingDocument(description=DescriptionItem(), name="Untitled 1")
     # group, heading, paragraph, table, figure, title, list, provenance
-    doc.add_paragraph(label=DocItemLabel.TEXT, text="Author 1\nAffiliation 1")
-    doc.add_paragraph(label=DocItemLabel.TEXT, text="Author 2\nAffiliation 2")
+    doc.add_text(label=DocItemLabel.TEXT, text="Author 1\nAffiliation 1")
+    doc.add_text(label=DocItemLabel.TEXT, text="Author 2\nAffiliation 2")
 
     chapter1 = doc.add_group(
         label=GroupLabel.CHAPTER, name="Introduction"
@@ -105,21 +137,21 @@ def _construct_doc() -> DoclingDocument:
         text="1. Introduction",
         level=1,
     )
-    doc.add_paragraph(
+    doc.add_text(
         parent=chapter1,
         label=DocItemLabel.TEXT,
         text="This paper introduces the biggest invention ever made. ...",
     )
     mylist = doc.add_group(parent=chapter1, label=GroupLabel.LIST)
-    doc.add_paragraph(
+    doc.add_text(
         parent=mylist,
         label=DocItemLabel.LIST_ITEM,
         text="Cooks your favourite meal before you know you want it.",
     )
-    doc.add_paragraph(
+    doc.add_text(
         parent=mylist, label=DocItemLabel.LIST_ITEM, text="Cleans up all your dishes."
     )
-    doc.add_paragraph(
+    doc.add_text(
         parent=mylist,
         label=DocItemLabel.LIST_ITEM,
         text="Drains your bank account without consent.",
@@ -194,7 +226,7 @@ def _construct_doc() -> DoclingDocument:
     table_el = BaseTableData(num_rows=3, num_cols=3, table_cells=table_cells)
     doc.add_table(data=table_el)
 
-    fig_caption = doc.add_paragraph(
+    fig_caption = doc.add_text(
         label=DocItemLabel.CAPTION, text="This is the caption of figure 1."
     )
     fig_item = doc.add_picture(data=BasePictureData(), caption=fig_caption)

From adc16f39bdf0c7408a39abeac833fb6d2c67f044 Mon Sep 17 00:00:00 2001
From: Christoph Auer <cau@zurich.ibm.com>
Date: Fri, 27 Sep 2024 17:21:47 +0200
Subject: [PATCH 26/34] Add extra=Forbid to NodeItem

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
---
 docling_core/types/experimental/document.py   |   2 +
 .../docling_document/unit/SectionItem.yaml    |   2 +-
 test/data/experimental/dummy_doc.yaml         |   7 --
 test/test_docling_doc.py                      | 103 ++++++++++++------
 4 files changed, 71 insertions(+), 43 deletions(-)

diff --git a/docling_core/types/experimental/document.py b/docling_core/types/experimental/document.py
index bd4fd55..d188435 100644
--- a/docling_core/types/experimental/document.py
+++ b/docling_core/types/experimental/document.py
@@ -211,6 +211,8 @@ class NodeItem(BaseModel):
     parent: Optional[RefItem] = None
     children: List[RefItem] = []
 
+    model_config = ConfigDict(extra="forbid")
+
     def get_ref(self):
         """get_ref."""
         return RefItem(cref=self.self_ref)
diff --git a/test/data/docling_document/unit/SectionItem.yaml b/test/data/docling_document/unit/SectionItem.yaml
index c31847e..7429499 100644
--- a/test/data/docling_document/unit/SectionItem.yaml
+++ b/test/data/docling_document/unit/SectionItem.yaml
@@ -1,6 +1,6 @@
 children: []
 label: text
-level: 1
+level: 2
 orig: whatever
 parent: null
 prov: []
diff --git a/test/data/experimental/dummy_doc.yaml b/test/data/experimental/dummy_doc.yaml
index 2bd50a0..d72f454 100644
--- a/test/data/experimental/dummy_doc.yaml
+++ b/test/data/experimental/dummy_doc.yaml
@@ -36,7 +36,6 @@ texts:
   - orig: "arXiv:2206.01062v1 [cs.CV] 2 Jun 2022"
     text: "arXiv:2206.01062v1 [cs.CV] 2 Jun 2022"
     self_ref: "#/texts/0"
-    hash: 132103230
     label: "page_header"
     parent:
       $ref: "#/furniture"
@@ -52,7 +51,6 @@ texts:
   - orig: "DocLayNet: A Large Human-Annotated Dataset for\nDocument-Layout Analysis"
     text: "DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis"
     self_ref: "#/texts/1"
-    hash: 2349732 # uint64 hash of self_ref
     label: "title"
     parent:
       $ref: "#/body"
@@ -68,7 +66,6 @@ texts:
   - orig: "OPERATION (cont.)" # nested inside the figure
     text: "OPERATION (cont.)"
     self_ref: "#/texts/2"
-    hash: 6978483
     label: "section_header"
     parent:
       $ref: "/pictures/0"
@@ -85,7 +82,6 @@ texts:
   - orig: "Figure 1: Four examples of complex page layouts across dif-\nferent document categories" # nested inside the figure
     text: "Figure 1: Four examples of complex page layouts across different document categories"
     self_ref: "#/texts/3"
-    hash: 6978483
     label: "caption"
     parent:
       $ref: "#/body"
@@ -103,7 +99,6 @@ texts:
 
 tables: # All tables...
   - self_ref: "#/table/0"
-    hash: 98574
     label: "table"
     parent:
       $ref: "#/body"
@@ -133,7 +128,6 @@ tables: # All tables...
 
 pictures: # All pictures...
   - self_ref: "#/pictures/0"
-    hash: 7782482
     label: "picture"
     parent:
       $ref: "#/body"
@@ -168,7 +162,6 @@ key_value_items: [ ] # All KV-items
 # We should consider this for pages
 pages: # Optional, for layout documents
   1:
-    hash: 6203680922337857390
     size:
       width: 768.23
       height: 583.15
diff --git a/test/test_docling_doc.py b/test/test_docling_doc.py
index eea3f1f..73f644d 100644
--- a/test/test_docling_doc.py
+++ b/test/test_docling_doc.py
@@ -1,32 +1,32 @@
+from collections import deque
+
 import pytest
 import yaml
 
-from collections import deque
-
 from docling_core.types.experimental.document import (
     BasePictureData,
     BaseTableData,
     DescriptionItem,
-    DoclingDocument,
-    TableCell,
-    NodeItem,
     DocItem,
-    TextItem,
+    DoclingDocument,
     FloatingItem,
     KeyValueItem,
-    SectionItem,
     PictureItem,
+    SectionItem,
+    TableCell,
     TableItem,
-    BasePictureData,
-    BaseTableData
+    TextItem,
 )
 from docling_core.types.experimental.labels import DocItemLabel, GroupLabel
 
+
 def test_docitems():
 
     # Iterative function to find all subclasses
     def find_all_subclasses_iterative(base_class):
-        subclasses = deque([base_class])  # Use a deque for efficient popping from the front
+        subclasses = deque(
+            [base_class]
+        )  # Use a deque for efficient popping from the front
         all_subclasses = []
 
         while subclasses:
@@ -40,57 +40,91 @@ def find_all_subclasses_iterative(base_class):
     def serialise(obj):
         return yaml.safe_dump(obj.model_dump(mode="json", by_alias=True))
 
-    def write(name:str, serialisation:str):
+    def write(name: str, serialisation: str):
         with open(f"./test/data/docling_document/unit/{name}.yaml", "w") as fw:
             fw.write(serialisation)
 
-    def read(name:str):
+    def read(name: str):
         with open(f"./test/data/docling_document/unit/{name}.yaml", "r") as fr:
             gold = fr.read()
         return gold
 
-    def generate(dc, obj):
-        write(dc.__name__, pred)
-    
     def verify(dc, obj):
-        pred = serialise(obj)            
-        #print(f"\t{dc.__name__}:\n {pred}")
+        pred = serialise(obj)
+        # print(f"\t{dc.__name__}:\n {pred}")
         gold = read(dc.__name__)
 
-        assert pred==gold, f"pred!=gold for {dc.__name__}"        
-    
+        assert pred == gold, f"pred!=gold for {dc.__name__}"
+
     # Iterate over the derived classes of the BaseClass
     derived_classes = find_all_subclasses_iterative(DocItem)
     for dc in derived_classes:
 
         if dc is TextItem:
-            obj = dc(text="whatever", orig="whatever", dloc="sdvsd", label=DocItemLabel.TEXT, self_ref="#")
+            obj = dc(
+                text="whatever",
+                orig="whatever",
+                dloc="sdvsd",
+                label=DocItemLabel.TEXT,
+                self_ref="#",
+            )
             verify(dc, obj)
-            
+
         elif dc is FloatingItem:
-            obj = dc(text="whatever", orig="whatever", dloc="sdvsd", label=DocItemLabel.TEXT, self_ref="#")
+            obj = dc(
+                text="whatever",
+                orig="whatever",
+                dloc="sdvsd",
+                label=DocItemLabel.TEXT,
+                self_ref="#",
+            )
             verify(dc, obj)
-            
+
         elif dc is KeyValueItem:
-            obj = dc(text="whatever", orig="whatever", dloc="sdvsd", label=DocItemLabel.TEXT, self_ref="#")
+            obj = dc(
+                text="whatever",
+                orig="whatever",
+                dloc="sdvsd",
+                label=DocItemLabel.TEXT,
+                self_ref="#",
+            )
             verify(dc, obj)
-            
+
         elif dc is SectionItem:
-            obj = dc(text="whatever", orig="whatever", dloc="sdvsd", label=DocItemLabel.TEXT, self_ref="#")
+            obj = dc(
+                text="whatever",
+                orig="whatever",
+                dloc="sdvsd",
+                label=DocItemLabel.TEXT,
+                self_ref="#",
+                level=2,
+            )
             verify(dc, obj)
-            
+
         elif dc is PictureItem:
-            obj = dc(text="whatever", orig="whatever", dloc="sdvsd", label=DocItemLabel.TEXT, self_ref="#",
-                     data=BasePictureData())
+            obj = dc(
+                text="whatever",
+                orig="whatever",
+                dloc="sdvsd",
+                label=DocItemLabel.TEXT,
+                self_ref="#",
+                data=BasePictureData(),
+            )
             verify(dc, obj)
-            
+
         elif dc is TableItem:
-            obj = dc(text="whatever", orig="whatever", dloc="sdvsd", label=DocItemLabel.TEXT, self_ref="#",
-                     data=BaseTableData(num_rows=3, num_cols=5, cells=[]))
+            obj = dc(
+                text="whatever",
+                orig="whatever",
+                dloc="sdvsd",
+                label=DocItemLabel.TEXT,
+                self_ref="#",
+                data=BaseTableData(num_rows=3, num_cols=5, cells=[]),
+            )
             verify(dc, obj)
-            
+
         else:
-            print(f"{dc.__name__} is not known")            
+            print(f"{dc.__name__} is not known")
             assert False, "new derived class detected {dc.__name__}: {e}"
 
 
@@ -315,4 +349,3 @@ def _construct_doc() -> DoclingDocument:
     fig_item = doc.add_picture(data=BasePictureData(), caption=fig_caption)
 
     return doc
-    

From 34ce64bd315ad44c4dc41f99dff319340b979854 Mon Sep 17 00:00:00 2001
From: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
Date: Fri, 27 Sep 2024 19:14:19 +0200
Subject: [PATCH 27/34] feat: set DoclingDocument version as SemanticVersion
 with default

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
---
 docling_core/types/experimental/document.py |  13 +-
 docs/Document.json                          |   6 +-
 docs/Document.md                            |   2 +-
 docs/Generic.json                           |   6 +-
 docs/Generic.md                             |   2 +-
 poetry.lock                                 | 665 +-------------------
 pyproject.toml                              |   3 +-
 test/test_docling_doc.py                    |  33 +
 8 files changed, 82 insertions(+), 648 deletions(-)

diff --git a/docling_core/types/experimental/document.py b/docling_core/types/experimental/document.py
index d188435..0595f23 100644
--- a/docling_core/types/experimental/document.py
+++ b/docling_core/types/experimental/document.py
@@ -1,6 +1,7 @@
 """Models for the Docling Document data type."""
 
 import hashlib
+import importlib.metadata
 import mimetypes
 import typing
 from typing import Any, Dict, List, Optional, Tuple, Union
@@ -15,6 +16,7 @@
     field_validator,
     model_validator,
 )
+from pydantic_extra_types.semantic_version import SemanticVersion
 from tabulate import tabulate
 
 from docling_core.types.doc.tokens import DocumentToken
@@ -619,7 +621,7 @@ class DescriptionItem(BaseModel):
 class DoclingDocument(BaseModel):
     """DoclingDocument."""
 
-    version: str = "0.1.0"  # TODO use SemanticVersion type instead
+    version: Optional[SemanticVersion] = Field(default=None, validate_default=True)
     description: DescriptionItem
     name: str  # The working name of this document, without extensions
     # (could be taken from originating doc, or just "Untitled 1")
@@ -642,6 +644,15 @@ class DoclingDocument(BaseModel):
 
     pages: Dict[int, PageItem] = {}  # empty as default
 
+    @field_validator("version")
+    @classmethod
+    def check_version_omitted(cls, v: str) -> str:
+        """Default an omitted (None) version to the installed docling-core version."""
+        if v is None:
+            return importlib.metadata.version("docling-core")
+        else:
+            return v
+
     def _compute_hash(self, obj):
         hash_object = hashlib.sha256(obj.encode("utf-8"))
         # Convert the hash to an integer
diff --git a/docs/Document.json b/docs/Document.json
index 2580e70..55eda68 100644
--- a/docs/Document.json
+++ b/docs/Document.json
@@ -323,11 +323,7 @@
           "type": "string"
         },
         "bounding_box": {
-          "allOf": [
-            {
-              "$ref": "#/$defs/BoundingBoxContainer"
-            }
-          ],
+          "$ref": "#/$defs/BoundingBoxContainer",
           "x-es-suppress": true
         },
         "prov": {
diff --git a/docs/Document.md b/docs/Document.md
index ab50400..40ee7fe 100644
--- a/docs/Document.md
+++ b/docs/Document.md
@@ -6052,7 +6052,7 @@ Must be one of:
 | **Type**                  | `object`                                                                  |
 | **Required**              | Yes                                                                       |
 | **Additional properties** | [[Any type: allowed]](# "Additional Properties of any type are allowed.") |
-| **Defined in**            |                                                                           |
+| **Defined in**            | #/$defs/BoundingBoxContainer                                              |
 
 **Description:** Bounding box container.
 
diff --git a/docs/Generic.json b/docs/Generic.json
index 6b15029..c900e11 100644
--- a/docs/Generic.json
+++ b/docs/Generic.json
@@ -58,11 +58,7 @@
       "x-es-type": "text"
     },
     "file-info": {
-      "allOf": [
-        {
-          "$ref": "#/$defs/FileInfoObject"
-        }
-      ],
+      "$ref": "#/$defs/FileInfoObject",
       "description": "Minimal identification information of the document within a collection.",
       "title": "Document information"
     }
diff --git a/docs/Generic.md b/docs/Generic.md
index 175187f..32b5066 100644
--- a/docs/Generic.md
+++ b/docs/Generic.md
@@ -75,7 +75,7 @@
 | **Type**                  | `object`                                                                  |
 | **Required**              | Yes                                                                       |
 | **Additional properties** | [[Any type: allowed]](# "Additional Properties of any type are allowed.") |
-| **Defined in**            |                                                                           |
+| **Defined in**            | #/$defs/FileInfoObject                                                    |
 
 **Description:** Minimal identification information of the document within a collection.
 
diff --git a/poetry.lock b/poetry.lock
index ffa50de..5776567 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -45,21 +45,6 @@ files = [
 pyflakes = ">=3.0.0"
 tomli = {version = ">=2.0.1", markers = "python_version < \"3.11\""}
 
-[[package]]
-name = "backports-tarfile"
-version = "1.2.0"
-description = "Backport of CPython tarfile module"
-optional = false
-python-versions = ">=3.8"
-files = [
-    {file = "backports.tarfile-1.2.0-py3-none-any.whl", hash = "sha256:77e284d754527b01fb1e6fa8a1afe577858ebe4e9dad8919e34c862cb399bc34"},
-    {file = "backports_tarfile-1.2.0.tar.gz", hash = "sha256:d75e02c268746e1b8144c278978b6e98e85de6ad16f8e4b0844a154557eca991"},
-]
-
-[package.extras]
-docs = ["furo", "jaraco.packaging (>=9.3)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"]
-testing = ["jaraco.test", "pytest (!=8.0.*)", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)"]
-
 [[package]]
 name = "black"
 version = "24.8.0"
@@ -117,85 +102,6 @@ files = [
     {file = "certifi-2024.8.30.tar.gz", hash = "sha256:bec941d2aa8195e248a60b31ff9f0558284cf01a52591ceda73ea9afffd69fd9"},
 ]
 
-[[package]]
-name = "cffi"
-version = "1.17.1"
-description = "Foreign Function Interface for Python calling C code."
-optional = false
-python-versions = ">=3.8"
-files = [
-    {file = "cffi-1.17.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:df8b1c11f177bc2313ec4b2d46baec87a5f3e71fc8b45dab2ee7cae86d9aba14"},
-    {file = "cffi-1.17.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8f2cdc858323644ab277e9bb925ad72ae0e67f69e804f4898c070998d50b1a67"},
-    {file = "cffi-1.17.1-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:edae79245293e15384b51f88b00613ba9f7198016a5948b5dddf4917d4d26382"},
-    {file = "cffi-1.17.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45398b671ac6d70e67da8e4224a065cec6a93541bb7aebe1b198a61b58c7b702"},
-    {file = "cffi-1.17.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ad9413ccdeda48c5afdae7e4fa2192157e991ff761e7ab8fdd8926f40b160cc3"},
-    {file = "cffi-1.17.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5da5719280082ac6bd9aa7becb3938dc9f9cbd57fac7d2871717b1feb0902ab6"},
-    {file = "cffi-1.17.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2bb1a08b8008b281856e5971307cc386a8e9c5b625ac297e853d36da6efe9c17"},
-    {file = "cffi-1.17.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:045d61c734659cc045141be4bae381a41d89b741f795af1dd018bfb532fd0df8"},
-    {file = "cffi-1.17.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:6883e737d7d9e4899a8a695e00ec36bd4e5e4f18fabe0aca0efe0a4b44cdb13e"},
-    {file = "cffi-1.17.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:6b8b4a92e1c65048ff98cfe1f735ef8f1ceb72e3d5f0c25fdb12087a23da22be"},
-    {file = "cffi-1.17.1-cp310-cp310-win32.whl", hash = "sha256:c9c3d058ebabb74db66e431095118094d06abf53284d9c81f27300d0e0d8bc7c"},
-    {file = "cffi-1.17.1-cp310-cp310-win_amd64.whl", hash = "sha256:0f048dcf80db46f0098ccac01132761580d28e28bc0f78ae0d58048063317e15"},
-    {file = "cffi-1.17.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a45e3c6913c5b87b3ff120dcdc03f6131fa0065027d0ed7ee6190736a74cd401"},
-    {file = "cffi-1.17.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:30c5e0cb5ae493c04c8b42916e52ca38079f1b235c2f8ae5f4527b963c401caf"},
-    {file = "cffi-1.17.1-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f75c7ab1f9e4aca5414ed4d8e5c0e303a34f4421f8a0d47a4d019ceff0ab6af4"},
-    {file = "cffi-1.17.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1ed2dd2972641495a3ec98445e09766f077aee98a1c896dcb4ad0d303628e41"},
-    {file = "cffi-1.17.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:46bf43160c1a35f7ec506d254e5c890f3c03648a4dbac12d624e4490a7046cd1"},
-    {file = "cffi-1.17.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a24ed04c8ffd54b0729c07cee15a81d964e6fee0e3d4d342a27b020d22959dc6"},
-    {file = "cffi-1.17.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:610faea79c43e44c71e1ec53a554553fa22321b65fae24889706c0a84d4ad86d"},
-    {file = "cffi-1.17.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:a9b15d491f3ad5d692e11f6b71f7857e7835eb677955c00cc0aefcd0669adaf6"},
-    {file = "cffi-1.17.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:de2ea4b5833625383e464549fec1bc395c1bdeeb5f25c4a3a82b5a8c756ec22f"},
-    {file = "cffi-1.17.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:fc48c783f9c87e60831201f2cce7f3b2e4846bf4d8728eabe54d60700b318a0b"},
-    {file = "cffi-1.17.1-cp311-cp311-win32.whl", hash = "sha256:85a950a4ac9c359340d5963966e3e0a94a676bd6245a4b55bc43949eee26a655"},
-    {file = "cffi-1.17.1-cp311-cp311-win_amd64.whl", hash = "sha256:caaf0640ef5f5517f49bc275eca1406b0ffa6aa184892812030f04c2abf589a0"},
-    {file = "cffi-1.17.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:805b4371bf7197c329fcb3ead37e710d1bca9da5d583f5073b799d5c5bd1eee4"},
-    {file = "cffi-1.17.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:733e99bc2df47476e3848417c5a4540522f234dfd4ef3ab7fafdf555b082ec0c"},
-    {file = "cffi-1.17.1-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1257bdabf294dceb59f5e70c64a3e2f462c30c7ad68092d01bbbfb1c16b1ba36"},
-    {file = "cffi-1.17.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:da95af8214998d77a98cc14e3a3bd00aa191526343078b530ceb0bd710fb48a5"},
-    {file = "cffi-1.17.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d63afe322132c194cf832bfec0dc69a99fb9bb6bbd550f161a49e9e855cc78ff"},
-    {file = "cffi-1.17.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f79fc4fc25f1c8698ff97788206bb3c2598949bfe0fef03d299eb1b5356ada99"},
-    {file = "cffi-1.17.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b62ce867176a75d03a665bad002af8e6d54644fad99a3c70905c543130e39d93"},
-    {file = "cffi-1.17.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:386c8bf53c502fff58903061338ce4f4950cbdcb23e2902d86c0f722b786bbe3"},
-    {file = "cffi-1.17.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4ceb10419a9adf4460ea14cfd6bc43d08701f0835e979bf821052f1805850fe8"},
-    {file = "cffi-1.17.1-cp312-cp312-win32.whl", hash = "sha256:a08d7e755f8ed21095a310a693525137cfe756ce62d066e53f502a83dc550f65"},
-    {file = "cffi-1.17.1-cp312-cp312-win_amd64.whl", hash = "sha256:51392eae71afec0d0c8fb1a53b204dbb3bcabcb3c9b807eedf3e1e6ccf2de903"},
-    {file = "cffi-1.17.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f3a2b4222ce6b60e2e8b337bb9596923045681d71e5a082783484d845390938e"},
-    {file = "cffi-1.17.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:0984a4925a435b1da406122d4d7968dd861c1385afe3b45ba82b750f229811e2"},
-    {file = "cffi-1.17.1-cp313-cp313-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d01b12eeeb4427d3110de311e1774046ad344f5b1a7403101878976ecd7a10f3"},
-    {file = "cffi-1.17.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:706510fe141c86a69c8ddc029c7910003a17353970cff3b904ff0686a5927683"},
-    {file = "cffi-1.17.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:de55b766c7aa2e2a3092c51e0483d700341182f08e67c63630d5b6f200bb28e5"},
-    {file = "cffi-1.17.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c59d6e989d07460165cc5ad3c61f9fd8f1b4796eacbd81cee78957842b834af4"},
-    {file = "cffi-1.17.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd398dbc6773384a17fe0d3e7eeb8d1a21c2200473ee6806bb5e6a8e62bb73dd"},
-    {file = "cffi-1.17.1-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:3edc8d958eb099c634dace3c7e16560ae474aa3803a5df240542b305d14e14ed"},
-    {file = "cffi-1.17.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:72e72408cad3d5419375fc87d289076ee319835bdfa2caad331e377589aebba9"},
-    {file = "cffi-1.17.1-cp313-cp313-win32.whl", hash = "sha256:e03eab0a8677fa80d646b5ddece1cbeaf556c313dcfac435ba11f107ba117b5d"},
-    {file = "cffi-1.17.1-cp313-cp313-win_amd64.whl", hash = "sha256:f6a16c31041f09ead72d69f583767292f750d24913dadacf5756b966aacb3f1a"},
-    {file = "cffi-1.17.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:636062ea65bd0195bc012fea9321aca499c0504409f413dc88af450b57ffd03b"},
-    {file = "cffi-1.17.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c7eac2ef9b63c79431bc4b25f1cd649d7f061a28808cbc6c47b534bd789ef964"},
-    {file = "cffi-1.17.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e221cf152cff04059d011ee126477f0d9588303eb57e88923578ace7baad17f9"},
-    {file = "cffi-1.17.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:31000ec67d4221a71bd3f67df918b1f88f676f1c3b535a7eb473255fdc0b83fc"},
-    {file = "cffi-1.17.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6f17be4345073b0a7b8ea599688f692ac3ef23ce28e5df79c04de519dbc4912c"},
-    {file = "cffi-1.17.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e2b1fac190ae3ebfe37b979cc1ce69c81f4e4fe5746bb401dca63a9062cdaf1"},
-    {file = "cffi-1.17.1-cp38-cp38-win32.whl", hash = "sha256:7596d6620d3fa590f677e9ee430df2958d2d6d6de2feeae5b20e82c00b76fbf8"},
-    {file = "cffi-1.17.1-cp38-cp38-win_amd64.whl", hash = "sha256:78122be759c3f8a014ce010908ae03364d00a1f81ab5c7f4a7a5120607ea56e1"},
-    {file = "cffi-1.17.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b2ab587605f4ba0bf81dc0cb08a41bd1c0a5906bd59243d56bad7668a6fc6c16"},
-    {file = "cffi-1.17.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:28b16024becceed8c6dfbc75629e27788d8a3f9030691a1dbf9821a128b22c36"},
-    {file = "cffi-1.17.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1d599671f396c4723d016dbddb72fe8e0397082b0a77a4fab8028923bec050e8"},
-    {file = "cffi-1.17.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ca74b8dbe6e8e8263c0ffd60277de77dcee6c837a3d0881d8c1ead7268c9e576"},
-    {file = "cffi-1.17.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f7f5baafcc48261359e14bcd6d9bff6d4b28d9103847c9e136694cb0501aef87"},
-    {file = "cffi-1.17.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:98e3969bcff97cae1b2def8ba499ea3d6f31ddfdb7635374834cf89a1a08ecf0"},
-    {file = "cffi-1.17.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cdf5ce3acdfd1661132f2a9c19cac174758dc2352bfe37d98aa7512c6b7178b3"},
-    {file = "cffi-1.17.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:9755e4345d1ec879e3849e62222a18c7174d65a6a92d5b346b1863912168b595"},
-    {file = "cffi-1.17.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:f1e22e8c4419538cb197e4dd60acc919d7696e5ef98ee4da4e01d3f8cfa4cc5a"},
-    {file = "cffi-1.17.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:c03e868a0b3bc35839ba98e74211ed2b05d2119be4e8a0f224fba9384f1fe02e"},
-    {file = "cffi-1.17.1-cp39-cp39-win32.whl", hash = "sha256:e31ae45bc2e29f6b2abd0de1cc3b9d5205aa847cafaecb8af1476a609a2f6eb7"},
-    {file = "cffi-1.17.1-cp39-cp39-win_amd64.whl", hash = "sha256:d016c76bdd850f3c626af19b0542c9677ba156e4ee4fccfdd7848803533ef662"},
-    {file = "cffi-1.17.1.tar.gz", hash = "sha256:1c39c6016c32bc48dd54561950ebd6836e1670f2ae46128f67cf49e789c52824"},
-]
-
-[package.dependencies]
-pycparser = "*"
-
 [[package]]
 name = "cfgv"
 version = "3.4.0"
@@ -320,20 +226,6 @@ files = [
 [package.dependencies]
 colorama = {version = "*", markers = "platform_system == \"Windows\""}
 
-[[package]]
-name = "click-log"
-version = "0.4.0"
-description = "Logging integration for Click"
-optional = false
-python-versions = "*"
-files = [
-    {file = "click-log-0.4.0.tar.gz", hash = "sha256:3970f8570ac54491237bcdb3d8ab5e3eef6c057df29f8c3d1151a51a9c23b975"},
-    {file = "click_log-0.4.0-py2.py3-none-any.whl", hash = "sha256:a43e394b528d52112af599f2fc9e4b7cf3c15f94e53581f74fa6867e68c91756"},
-]
-
-[package.dependencies]
-click = "*"
-
 [[package]]
 name = "colorama"
 version = "0.4.6"
@@ -345,55 +237,6 @@ files = [
     {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
 ]
 
-[[package]]
-name = "cryptography"
-version = "43.0.1"
-description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers."
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "cryptography-43.0.1-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:8385d98f6a3bf8bb2d65a73e17ed87a3ba84f6991c155691c51112075f9ffc5d"},
-    {file = "cryptography-43.0.1-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:27e613d7077ac613e399270253259d9d53872aaf657471473ebfc9a52935c062"},
-    {file = "cryptography-43.0.1-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:68aaecc4178e90719e95298515979814bda0cbada1256a4485414860bd7ab962"},
-    {file = "cryptography-43.0.1-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:de41fd81a41e53267cb020bb3a7212861da53a7d39f863585d13ea11049cf277"},
-    {file = "cryptography-43.0.1-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:f98bf604c82c416bc829e490c700ca1553eafdf2912a91e23a79d97d9801372a"},
-    {file = "cryptography-43.0.1-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:61ec41068b7b74268fa86e3e9e12b9f0c21fcf65434571dbb13d954bceb08042"},
-    {file = "cryptography-43.0.1-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:014f58110f53237ace6a408b5beb6c427b64e084eb451ef25a28308270086494"},
-    {file = "cryptography-43.0.1-cp37-abi3-win32.whl", hash = "sha256:2bd51274dcd59f09dd952afb696bf9c61a7a49dfc764c04dd33ef7a6b502a1e2"},
-    {file = "cryptography-43.0.1-cp37-abi3-win_amd64.whl", hash = "sha256:666ae11966643886c2987b3b721899d250855718d6d9ce41b521252a17985f4d"},
-    {file = "cryptography-43.0.1-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:ac119bb76b9faa00f48128b7f5679e1d8d437365c5d26f1c2c3f0da4ce1b553d"},
-    {file = "cryptography-43.0.1-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1bbcce1a551e262dfbafb6e6252f1ae36a248e615ca44ba302df077a846a8806"},
-    {file = "cryptography-43.0.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:58d4e9129985185a06d849aa6df265bdd5a74ca6e1b736a77959b498e0505b85"},
-    {file = "cryptography-43.0.1-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:d03a475165f3134f773d1388aeb19c2d25ba88b6a9733c5c590b9ff7bbfa2e0c"},
-    {file = "cryptography-43.0.1-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:511f4273808ab590912a93ddb4e3914dfd8a388fed883361b02dea3791f292e1"},
-    {file = "cryptography-43.0.1-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:80eda8b3e173f0f247f711eef62be51b599b5d425c429b5d4ca6a05e9e856baa"},
-    {file = "cryptography-43.0.1-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:38926c50cff6f533f8a2dae3d7f19541432610d114a70808f0926d5aaa7121e4"},
-    {file = "cryptography-43.0.1-cp39-abi3-win32.whl", hash = "sha256:a575913fb06e05e6b4b814d7f7468c2c660e8bb16d8d5a1faf9b33ccc569dd47"},
-    {file = "cryptography-43.0.1-cp39-abi3-win_amd64.whl", hash = "sha256:d75601ad10b059ec832e78823b348bfa1a59f6b8d545db3a24fd44362a1564cb"},
-    {file = "cryptography-43.0.1-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:ea25acb556320250756e53f9e20a4177515f012c9eaea17eb7587a8c4d8ae034"},
-    {file = "cryptography-43.0.1-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:c1332724be35d23a854994ff0b66530119500b6053d0bd3363265f7e5e77288d"},
-    {file = "cryptography-43.0.1-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:fba1007b3ef89946dbbb515aeeb41e30203b004f0b4b00e5e16078b518563289"},
-    {file = "cryptography-43.0.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:5b43d1ea6b378b54a1dc99dd8a2b5be47658fe9a7ce0a58ff0b55f4b43ef2b84"},
-    {file = "cryptography-43.0.1-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:88cce104c36870d70c49c7c8fd22885875d950d9ee6ab54df2745f83ba0dc365"},
-    {file = "cryptography-43.0.1-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:9d3cdb25fa98afdd3d0892d132b8d7139e2c087da1712041f6b762e4f807cc96"},
-    {file = "cryptography-43.0.1-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:e710bf40870f4db63c3d7d929aa9e09e4e7ee219e703f949ec4073b4294f6172"},
-    {file = "cryptography-43.0.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:7c05650fe8023c5ed0d46793d4b7d7e6cd9c04e68eabe5b0aeea836e37bdcec2"},
-    {file = "cryptography-43.0.1.tar.gz", hash = "sha256:203e92a75716d8cfb491dc47c79e17d0d9207ccffcbcb35f598fbe463ae3444d"},
-]
-
-[package.dependencies]
-cffi = {version = ">=1.12", markers = "platform_python_implementation != \"PyPy\""}
-
-[package.extras]
-docs = ["sphinx (>=5.3.0)", "sphinx-rtd-theme (>=1.1.1)"]
-docstest = ["pyenchant (>=1.6.11)", "readme-renderer", "sphinxcontrib-spelling (>=4.0.1)"]
-nox = ["nox"]
-pep8test = ["check-sdist", "click", "mypy", "ruff"]
-sdist = ["build"]
-ssh = ["bcrypt (>=3.1.5)"]
-test = ["certifi", "cryptography-vectors (==43.0.1)", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-xdist"]
-test-randomorder = ["pytest-randomly"]
-
 [[package]]
 name = "dataclasses-json"
 version = "0.5.9"
@@ -424,28 +267,6 @@ files = [
     {file = "distlib-0.3.8.tar.gz", hash = "sha256:1530ea13e350031b6312d8580ddb6b27a104275a31106523b8f123787f494f64"},
 ]
 
-[[package]]
-name = "docutils"
-version = "0.21.2"
-description = "Docutils -- Python Documentation Utilities"
-optional = false
-python-versions = ">=3.9"
-files = [
-    {file = "docutils-0.21.2-py3-none-any.whl", hash = "sha256:dafca5b9e384f0e419294eb4d2ff9fa826435bf15f15b7bd45723e8ad76811b2"},
-    {file = "docutils-0.21.2.tar.gz", hash = "sha256:3a6b18732edf182daa3cd12775bbb338cf5691468f91eeeb109deff6ebfa986f"},
-]
-
-[[package]]
-name = "dotty-dict"
-version = "1.3.1"
-description = "Dictionary wrapper for quick access to deeply nested keys."
-optional = false
-python-versions = ">=3.5,<4.0"
-files = [
-    {file = "dotty_dict-1.3.1-py3-none-any.whl", hash = "sha256:5022d234d9922f13aa711b4950372a06a6d64cb6d6db9ba43d0ba133ebfce31f"},
-    {file = "dotty_dict-1.3.1.tar.gz", hash = "sha256:4b016e03b8ae265539757a53eba24b9bfda506fb94fbce0bee843c6f05541a15"},
-]
-
 [[package]]
 name = "exceptiongroup"
 version = "1.2.2"
@@ -507,38 +328,6 @@ files = [
 flake8 = ">=3"
 pydocstyle = ">=2.1"
 
-[[package]]
-name = "gitdb"
-version = "4.0.11"
-description = "Git Object Database"
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "gitdb-4.0.11-py3-none-any.whl", hash = "sha256:81a3407ddd2ee8df444cbacea00e2d038e40150acfa3001696fe0dcf1d3adfa4"},
-    {file = "gitdb-4.0.11.tar.gz", hash = "sha256:bf5421126136d6d0af55bc1e7c1af1c397a34f5b7bd79e776cd3e89785c2b04b"},
-]
-
-[package.dependencies]
-smmap = ">=3.0.1,<6"
-
-[[package]]
-name = "gitpython"
-version = "3.1.43"
-description = "GitPython is a Python library used to interact with Git repositories"
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "GitPython-3.1.43-py3-none-any.whl", hash = "sha256:eec7ec56b92aad751f9912a73404bc02ba212a23adb2c7098ee668417051a1ff"},
-    {file = "GitPython-3.1.43.tar.gz", hash = "sha256:35f314a9f878467f5453cc1fee295c3e18e52f1b99f10f6cf5b1682e968a9e7c"},
-]
-
-[package.dependencies]
-gitdb = ">=4.0.1,<5"
-
-[package.extras]
-doc = ["sphinx (==4.3.2)", "sphinx-autodoc-typehints", "sphinx-rtd-theme", "sphinxcontrib-applehelp (>=1.0.2,<=1.0.4)", "sphinxcontrib-devhelp (==1.0.2)", "sphinxcontrib-htmlhelp (>=2.0.0,<=2.0.1)", "sphinxcontrib-qthelp (==1.0.3)", "sphinxcontrib-serializinghtml (==1.1.5)"]
-test = ["coverage[toml]", "ddt (>=1.1.1,!=1.4.3)", "mock", "mypy", "pre-commit", "pytest (>=7.3.1)", "pytest-cov", "pytest-instafail", "pytest-mock", "pytest-sugar", "typing-extensions"]
-
 [[package]]
 name = "htmlmin"
 version = "0.1.12"
@@ -577,29 +366,6 @@ files = [
 [package.extras]
 all = ["flake8 (>=7.1.1)", "mypy (>=1.11.2)", "pytest (>=8.3.2)", "ruff (>=0.6.2)"]
 
-[[package]]
-name = "importlib-metadata"
-version = "8.5.0"
-description = "Read metadata from Python packages"
-optional = false
-python-versions = ">=3.8"
-files = [
-    {file = "importlib_metadata-8.5.0-py3-none-any.whl", hash = "sha256:45e54197d28b7a7f1559e60b95e7c567032b602131fbd588f1497f47880aa68b"},
-    {file = "importlib_metadata-8.5.0.tar.gz", hash = "sha256:71522656f0abace1d072b9e5481a48f07c138e00f079c38c8f883823f9c26bd7"},
-]
-
-[package.dependencies]
-zipp = ">=3.20"
-
-[package.extras]
-check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)"]
-cover = ["pytest-cov"]
-doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"]
-enabler = ["pytest-enabler (>=2.2)"]
-perf = ["ipython"]
-test = ["flufl.flake8", "importlib-resources (>=1.3)", "jaraco.test (>=5.4)", "packaging", "pyfakefs", "pytest (>=6,!=8.1.*)", "pytest-perf (>=0.9.2)"]
-type = ["pytest-mypy"]
-
 [[package]]
 name = "iniconfig"
 version = "2.0.0"
@@ -611,17 +377,6 @@ files = [
     {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"},
 ]
 
-[[package]]
-name = "invoke"
-version = "2.2.0"
-description = "Pythonic task execution"
-optional = false
-python-versions = ">=3.6"
-files = [
-    {file = "invoke-2.2.0-py3-none-any.whl", hash = "sha256:6ea924cc53d4f78e3d98bc436b08069a03077e6f85ad1ddaa8a116d7dad15820"},
-    {file = "invoke-2.2.0.tar.gz", hash = "sha256:ee6cbb101af1a859c7fe84f2a264c059020b0cb7fe3535f9424300ab568f6bd5"},
-]
-
 [[package]]
 name = "isort"
 version = "5.13.2"
@@ -636,75 +391,6 @@ files = [
 [package.extras]
 colors = ["colorama (>=0.4.6)"]
 
-[[package]]
-name = "jaraco-classes"
-version = "3.4.0"
-description = "Utility functions for Python class constructs"
-optional = false
-python-versions = ">=3.8"
-files = [
-    {file = "jaraco.classes-3.4.0-py3-none-any.whl", hash = "sha256:f662826b6bed8cace05e7ff873ce0f9283b5c924470fe664fff1c2f00f581790"},
-    {file = "jaraco.classes-3.4.0.tar.gz", hash = "sha256:47a024b51d0239c0dd8c8540c6c7f484be3b8fcf0b2d85c13825780d3b3f3acd"},
-]
-
-[package.dependencies]
-more-itertools = "*"
-
-[package.extras]
-docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"]
-testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy", "pytest-ruff (>=0.2.1)"]
-
-[[package]]
-name = "jaraco-context"
-version = "6.0.1"
-description = "Useful decorators and context managers"
-optional = false
-python-versions = ">=3.8"
-files = [
-    {file = "jaraco.context-6.0.1-py3-none-any.whl", hash = "sha256:f797fc481b490edb305122c9181830a3a5b76d84ef6d1aef2fb9b47ab956f9e4"},
-    {file = "jaraco_context-6.0.1.tar.gz", hash = "sha256:9bae4ea555cf0b14938dc0aee7c9f32ed303aa20a3b73e7dc80111628792d1b3"},
-]
-
-[package.dependencies]
-"backports.tarfile" = {version = "*", markers = "python_version < \"3.12\""}
-
-[package.extras]
-doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"]
-test = ["portend", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy", "pytest-ruff (>=0.2.1)"]
-
-[[package]]
-name = "jaraco-functools"
-version = "4.0.2"
-description = "Functools like those found in stdlib"
-optional = false
-python-versions = ">=3.8"
-files = [
-    {file = "jaraco.functools-4.0.2-py3-none-any.whl", hash = "sha256:c9d16a3ed4ccb5a889ad8e0b7a343401ee5b2a71cee6ed192d3f68bc351e94e3"},
-    {file = "jaraco_functools-4.0.2.tar.gz", hash = "sha256:3460c74cd0d32bf82b9576bbb3527c4364d5b27a21f5158a62aed6c4b42e23f5"},
-]
-
-[package.dependencies]
-more-itertools = "*"
-
-[package.extras]
-doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"]
-test = ["jaraco.classes", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy", "pytest-ruff (>=0.2.1)"]
-
-[[package]]
-name = "jeepney"
-version = "0.8.0"
-description = "Low-level, pure Python DBus protocol wrapper."
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "jeepney-0.8.0-py3-none-any.whl", hash = "sha256:c0a454ad016ca575060802ee4d590dd912e35c122fa04e70306de3d076cce755"},
-    {file = "jeepney-0.8.0.tar.gz", hash = "sha256:5efe48d255973902f6badc3ce55e2aa6c5c3b3bc642059ef3a91247bcfcc5806"},
-]
-
-[package.extras]
-test = ["async-timeout", "pytest", "pytest-asyncio (>=0.17)", "pytest-trio", "testpath", "trio"]
-trio = ["async_generator", "trio"]
-
 [[package]]
 name = "jinja2"
 version = "3.1.4"
@@ -808,35 +494,6 @@ files = [
 [package.dependencies]
 referencing = ">=0.31.0"
 
-[[package]]
-name = "keyring"
-version = "25.4.1"
-description = "Store and access your passwords safely."
-optional = false
-python-versions = ">=3.8"
-files = [
-    {file = "keyring-25.4.1-py3-none-any.whl", hash = "sha256:5426f817cf7f6f007ba5ec722b1bcad95a75b27d780343772ad76b17cb47b0bf"},
-    {file = "keyring-25.4.1.tar.gz", hash = "sha256:b07ebc55f3e8ed86ac81dd31ef14e81ace9dd9c3d4b5d77a6e9a2016d0d71a1b"},
-]
-
-[package.dependencies]
-importlib-metadata = {version = ">=4.11.4", markers = "python_version < \"3.12\""}
-"jaraco.classes" = "*"
-"jaraco.context" = "*"
-"jaraco.functools" = "*"
-jeepney = {version = ">=0.4.2", markers = "sys_platform == \"linux\""}
-pywin32-ctypes = {version = ">=0.2.0", markers = "sys_platform == \"win32\""}
-SecretStorage = {version = ">=3.2", markers = "sys_platform == \"linux\""}
-
-[package.extras]
-check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)"]
-completion = ["shtab (>=1.1.0)"]
-cover = ["pytest-cov"]
-doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"]
-enabler = ["pytest-enabler (>=2.2)"]
-test = ["pyfakefs", "pytest (>=6,!=8.1.*)"]
-type = ["pygobject-stubs", "pytest-mypy", "shtab", "types-pywin32"]
-
 [[package]]
 name = "markdown2"
 version = "2.5.0"
@@ -967,17 +624,6 @@ files = [
     {file = "mccabe-0.7.0.tar.gz", hash = "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325"},
 ]
 
-[[package]]
-name = "more-itertools"
-version = "10.5.0"
-description = "More routines for operating on iterables, beyond itertools"
-optional = false
-python-versions = ">=3.8"
-files = [
-    {file = "more-itertools-10.5.0.tar.gz", hash = "sha256:5482bfef7849c25dc3c6dd53a6173ae4795da2a41a80faea6700d9f5846c5da6"},
-    {file = "more_itertools-10.5.0-py3-none-any.whl", hash = "sha256:037b0d3203ce90cca8ab1defbbdac29d5f993fc20131f3664dc8d6acfa872aef"},
-]
-
 [[package]]
 name = "mypy"
 version = "1.11.2"
@@ -1036,31 +682,6 @@ files = [
     {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"},
 ]
 
-[[package]]
-name = "nh3"
-version = "0.2.18"
-description = "Python bindings to the ammonia HTML sanitization library."
-optional = false
-python-versions = "*"
-files = [
-    {file = "nh3-0.2.18-cp37-abi3-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:14c5a72e9fe82aea5fe3072116ad4661af5cf8e8ff8fc5ad3450f123e4925e86"},
-    {file = "nh3-0.2.18-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:7b7c2a3c9eb1a827d42539aa64091640bd275b81e097cd1d8d82ef91ffa2e811"},
-    {file = "nh3-0.2.18-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42c64511469005058cd17cc1537578eac40ae9f7200bedcfd1fc1a05f4f8c200"},
-    {file = "nh3-0.2.18-cp37-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:0411beb0589eacb6734f28d5497ca2ed379eafab8ad8c84b31bb5c34072b7164"},
-    {file = "nh3-0.2.18-cp37-abi3-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:5f36b271dae35c465ef5e9090e1fdaba4a60a56f0bb0ba03e0932a66f28b9189"},
-    {file = "nh3-0.2.18-cp37-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:34c03fa78e328c691f982b7c03d4423bdfd7da69cd707fe572f544cf74ac23ad"},
-    {file = "nh3-0.2.18-cp37-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:19aaba96e0f795bd0a6c56291495ff59364f4300d4a39b29a0abc9cb3774a84b"},
-    {file = "nh3-0.2.18-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:de3ceed6e661954871d6cd78b410213bdcb136f79aafe22aa7182e028b8c7307"},
-    {file = "nh3-0.2.18-cp37-abi3-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6955369e4d9f48f41e3f238a9e60f9410645db7e07435e62c6a9ea6135a4907f"},
-    {file = "nh3-0.2.18-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:f0eca9ca8628dbb4e916ae2491d72957fdd35f7a5d326b7032a345f111ac07fe"},
-    {file = "nh3-0.2.18-cp37-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:3a157ab149e591bb638a55c8c6bcb8cdb559c8b12c13a8affaba6cedfe51713a"},
-    {file = "nh3-0.2.18-cp37-abi3-musllinux_1_2_i686.whl", hash = "sha256:c8b3a1cebcba9b3669ed1a84cc65bf005728d2f0bc1ed2a6594a992e817f3a50"},
-    {file = "nh3-0.2.18-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:36c95d4b70530b320b365659bb5034341316e6a9b30f0b25fa9c9eff4c27a204"},
-    {file = "nh3-0.2.18-cp37-abi3-win32.whl", hash = "sha256:a7f1b5b2c15866f2db413a3649a8fe4fd7b428ae58be2c0f6bca5eefd53ca2be"},
-    {file = "nh3-0.2.18-cp37-abi3-win_amd64.whl", hash = "sha256:8ce0f819d2f1933953fca255db2471ad58184a60508f03e6285e5114b6254844"},
-    {file = "nh3-0.2.18.tar.gz", hash = "sha256:94a166927e53972a9698af9542ace4e38b9de50c34352b962f4d9a7d4c927af4"},
-]
-
 [[package]]
 name = "nodeenv"
 version = "1.9.1"
@@ -1310,20 +931,6 @@ files = [
 [package.dependencies]
 flake8 = ">=5.0.0"
 
-[[package]]
-name = "pkginfo"
-version = "1.11.1"
-description = "Query metadata from sdists / bdists / installed packages."
-optional = false
-python-versions = ">=3.8"
-files = [
-    {file = "pkginfo-1.11.1-py3-none-any.whl", hash = "sha256:bfa76a714fdfc18a045fcd684dbfc3816b603d9d075febef17cb6582bea29573"},
-    {file = "pkginfo-1.11.1.tar.gz", hash = "sha256:2e0dca1cf4c8e39644eed32408ea9966ee15e0d324c62ba899a393b3c6b467aa"},
-]
-
-[package.extras]
-testing = ["pytest", "pytest-cov", "wheel"]
-
 [[package]]
 name = "platformdirs"
 version = "4.3.6"
@@ -1384,17 +991,6 @@ files = [
     {file = "pycodestyle-2.12.1.tar.gz", hash = "sha256:6838eae08bbce4f6accd5d5572075c63626a15ee3e6f842df996bf62f6d73521"},
 ]
 
-[[package]]
-name = "pycparser"
-version = "2.22"
-description = "C parser in Python"
-optional = false
-python-versions = ">=3.8"
-files = [
-    {file = "pycparser-2.22-py3-none-any.whl", hash = "sha256:c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc"},
-    {file = "pycparser-2.22.tar.gz", hash = "sha256:491c8be9c040f5390f5bf44a5b07752bd07f56edf992381b05c701439eec10f6"},
-]
-
 [[package]]
 name = "pydantic"
 version = "2.9.2"
@@ -1519,6 +1115,28 @@ files = [
 [package.dependencies]
 typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0"
 
+[[package]]
+name = "pydantic-extra-types"
+version = "2.9.0"
+description = "Extra Pydantic types."
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "pydantic_extra_types-2.9.0-py3-none-any.whl", hash = "sha256:f0bb975508572ba7bf3390b7337807588463b7248587e69f43b1ad7c797530d0"},
+    {file = "pydantic_extra_types-2.9.0.tar.gz", hash = "sha256:e061c01636188743bb69f368dcd391f327b8cfbfede2fe1cbb1211b06601ba3b"},
+]
+
+[package.dependencies]
+pydantic = ">=2.5.2"
+
+[package.extras]
+all = ["pendulum (>=3.0.0,<4.0.0)", "phonenumbers (>=8,<9)", "pycountry (>=23)", "python-ulid (>=1,<2)", "python-ulid (>=1,<3)", "pytz (>=2024.1)", "semver (>=3.0.2)", "tzdata (>=2024.1)"]
+pendulum = ["pendulum (>=3.0.0,<4.0.0)"]
+phonenumbers = ["phonenumbers (>=8,<9)"]
+pycountry = ["pycountry (>=23)"]
+python-ulid = ["python-ulid (>=1,<2)", "python-ulid (>=1,<3)"]
+semver = ["semver (>=3.0.2)"]
+
 [[package]]
 name = "pydocstyle"
 version = "6.3.0"
@@ -1597,56 +1215,6 @@ files = [
 [package.dependencies]
 six = ">=1.5"
 
-[[package]]
-name = "python-gitlab"
-version = "3.15.0"
-description = "Interact with GitLab API"
-optional = false
-python-versions = ">=3.7.0"
-files = [
-    {file = "python-gitlab-3.15.0.tar.gz", hash = "sha256:c9e65eb7612a9fbb8abf0339972eca7fd7a73d4da66c9b446ffe528930aff534"},
-    {file = "python_gitlab-3.15.0-py3-none-any.whl", hash = "sha256:8f8d1c0d387f642eb1ac7bf5e8e0cd8b3dd49c6f34170cee3c7deb7d384611f3"},
-]
-
-[package.dependencies]
-requests = ">=2.25.0"
-requests-toolbelt = ">=0.10.1"
-
-[package.extras]
-autocompletion = ["argcomplete (>=1.10.0,<3)"]
-yaml = ["PyYaml (>=5.2)"]
-
-[[package]]
-name = "python-semantic-release"
-version = "7.34.6"
-description = "Automatic Semantic Versioning for Python projects"
-optional = false
-python-versions = "*"
-files = [
-    {file = "python-semantic-release-7.34.6.tar.gz", hash = "sha256:e9b8fb788024ae9510a924136d573588415a16eeca31cc5240f2754a80a2e831"},
-    {file = "python_semantic_release-7.34.6-py3-none-any.whl", hash = "sha256:7e3969ba4663d9b2087b02bf3ac140e202551377bf045c34e09bfe19753e19ab"},
-]
-
-[package.dependencies]
-click = ">=7,<9"
-click-log = ">=0.3,<1"
-dotty-dict = ">=1.3.0,<2"
-gitpython = ">=3.0.8,<4"
-invoke = ">=1.4.1,<3"
-packaging = "*"
-python-gitlab = ">=2,<4"
-requests = ">=2.25,<3"
-semver = ">=2.10,<3"
-tomlkit = ">=0.10,<1.0"
-twine = ">=3,<4"
-wheel = "*"
-
-[package.extras]
-dev = ["black", "isort", "tox"]
-docs = ["Jinja2 (==3.0.3)", "Sphinx (==1.8.6)"]
-mypy = ["mypy", "types-requests"]
-test = ["coverage (>=5,<6)", "mock (==1.3.0)", "pytest (>=7,<8)", "pytest-mock (>=2,<3)", "pytest-xdist (>=1,<2)", "responses (==0.13.3)"]
-
 [[package]]
 name = "pytz"
 version = "2024.2"
@@ -1658,17 +1226,6 @@ files = [
     {file = "pytz-2024.2.tar.gz", hash = "sha256:2aa355083c50a0f93fa581709deac0c9ad65cca8a9e9beac660adcbd493c798a"},
 ]
 
-[[package]]
-name = "pywin32-ctypes"
-version = "0.2.3"
-description = "A (partial) reimplementation of pywin32 using ctypes/cffi"
-optional = false
-python-versions = ">=3.6"
-files = [
-    {file = "pywin32-ctypes-0.2.3.tar.gz", hash = "sha256:d162dc04946d704503b2edc4d55f3dba5c1d539ead017afa00142c38b9885755"},
-    {file = "pywin32_ctypes-0.2.3-py3-none-any.whl", hash = "sha256:8a1513379d709975552d202d942d9837758905c8d01eb82b8bcc30918929e7b8"},
-]
-
 [[package]]
 name = "pyyaml"
 version = "6.0.2"
@@ -1731,25 +1288,6 @@ files = [
     {file = "pyyaml-6.0.2.tar.gz", hash = "sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e"},
 ]
 
-[[package]]
-name = "readme-renderer"
-version = "44.0"
-description = "readme_renderer is a library for rendering readme descriptions for Warehouse"
-optional = false
-python-versions = ">=3.9"
-files = [
-    {file = "readme_renderer-44.0-py3-none-any.whl", hash = "sha256:2fbca89b81a08526aadf1357a8c2ae889ec05fb03f5da67f9769c9a592166151"},
-    {file = "readme_renderer-44.0.tar.gz", hash = "sha256:8712034eabbfa6805cacf1402b4eeb2a73028f72d1166d6f5cb7f9c047c5d1e1"},
-]
-
-[package.dependencies]
-docutils = ">=0.21.2"
-nh3 = ">=0.2.14"
-Pygments = ">=2.5.1"
-
-[package.extras]
-md = ["cmarkgfm (>=0.8.0)"]
-
 [[package]]
 name = "referencing"
 version = "0.35.1"
@@ -1786,34 +1324,6 @@ urllib3 = ">=1.21.1,<3"
 socks = ["PySocks (>=1.5.6,!=1.5.7)"]
 use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"]
 
-[[package]]
-name = "requests-toolbelt"
-version = "1.0.0"
-description = "A utility belt for advanced users of python-requests"
-optional = false
-python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
-files = [
-    {file = "requests-toolbelt-1.0.0.tar.gz", hash = "sha256:7681a0a3d047012b5bdc0ee37d7f8f07ebe76ab08caeccfc3921ce23c88d5bc6"},
-    {file = "requests_toolbelt-1.0.0-py2.py3-none-any.whl", hash = "sha256:cccfdd665f0a24fcf4726e690f65639d272bb0637b9b92dfd91a5568ccf6bd06"},
-]
-
-[package.dependencies]
-requests = ">=2.0.1,<3.0.0"
-
-[[package]]
-name = "rfc3986"
-version = "2.0.0"
-description = "Validating URI References per RFC 3986"
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "rfc3986-2.0.0-py2.py3-none-any.whl", hash = "sha256:50b1502b60e289cb37883f3dfd34532b8873c7de9f49bb546641ce9cbd256ebd"},
-    {file = "rfc3986-2.0.0.tar.gz", hash = "sha256:97aacf9dbd4bfd829baad6e6309fa6573aaf1be3f6fa735c8ab05e46cecb261c"},
-]
-
-[package.extras]
-idna2008 = ["idna"]
-
 [[package]]
 name = "rpds-py"
 version = "0.20.0"
@@ -1926,30 +1436,15 @@ files = [
     {file = "rpds_py-0.20.0.tar.gz", hash = "sha256:d72a210824facfdaf8768cf2d7ca25a042c30320b3020de2fa04640920d4e121"},
 ]
 
-[[package]]
-name = "secretstorage"
-version = "3.3.3"
-description = "Python bindings to FreeDesktop.org Secret Service API"
-optional = false
-python-versions = ">=3.6"
-files = [
-    {file = "SecretStorage-3.3.3-py3-none-any.whl", hash = "sha256:f356e6628222568e3af06f2eba8df495efa13b3b63081dafd4f7d9a7b7bc9f99"},
-    {file = "SecretStorage-3.3.3.tar.gz", hash = "sha256:2403533ef369eca6d2ba81718576c5e0f564d5cca1b58f73a8b23e7d4eeebd77"},
-]
-
-[package.dependencies]
-cryptography = ">=2.0"
-jeepney = ">=0.6"
-
 [[package]]
 name = "semver"
-version = "2.13.0"
-description = "Python helper for Semantic Versioning (http://semver.org/)"
+version = "3.0.2"
+description = "Python helper for Semantic Versioning (https://semver.org)"
 optional = false
-python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
+python-versions = ">=3.7"
 files = [
-    {file = "semver-2.13.0-py2.py3-none-any.whl", hash = "sha256:ced8b23dceb22134307c1b8abfa523da14198793d9787ac838e70e29e77458d4"},
-    {file = "semver-2.13.0.tar.gz", hash = "sha256:fa0fe2722ee1c3f57eac478820c3a5ae2f624af8264cbdf9000c980ff7f75e3f"},
+    {file = "semver-3.0.2-py3-none-any.whl", hash = "sha256:b1ea4686fe70b981f85359eda33199d60c53964284e0cfb4977d243e37cf4bf4"},
+    {file = "semver-3.0.2.tar.gz", hash = "sha256:6253adb39c70f6e51afed2fa7152bcd414c411286088fb4b9effb133885ab4cc"},
 ]
 
 [[package]]
@@ -1963,17 +1458,6 @@ files = [
     {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"},
 ]
 
-[[package]]
-name = "smmap"
-version = "5.0.1"
-description = "A pure Python implementation of a sliding window memory map manager"
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "smmap-5.0.1-py3-none-any.whl", hash = "sha256:e6d8668fa5f93e706934a62d7b4db19c8d9eb8cf2adbb75ef1b675aa332b69da"},
-    {file = "smmap-5.0.1.tar.gz", hash = "sha256:dceeb6c0028fdb6734471eb07c0cd2aae706ccaecab45965ee83f11c8d3b1f62"},
-]
-
 [[package]]
 name = "snowballstemmer"
 version = "2.2.0"
@@ -2010,60 +1494,6 @@ files = [
     {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"},
 ]
 
-[[package]]
-name = "tomlkit"
-version = "0.13.2"
-description = "Style preserving TOML library"
-optional = false
-python-versions = ">=3.8"
-files = [
-    {file = "tomlkit-0.13.2-py3-none-any.whl", hash = "sha256:7a974427f6e119197f670fbbbeae7bef749a6c14e793db934baefc1b5f03efde"},
-    {file = "tomlkit-0.13.2.tar.gz", hash = "sha256:fff5fe59a87295b278abd31bec92c15d9bc4a06885ab12bcea52c71119392e79"},
-]
-
-[[package]]
-name = "tqdm"
-version = "4.66.5"
-description = "Fast, Extensible Progress Meter"
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "tqdm-4.66.5-py3-none-any.whl", hash = "sha256:90279a3770753eafc9194a0364852159802111925aa30eb3f9d85b0e805ac7cd"},
-    {file = "tqdm-4.66.5.tar.gz", hash = "sha256:e1020aef2e5096702d8a025ac7d16b1577279c9d63f8375b63083e9a5f0fcbad"},
-]
-
-[package.dependencies]
-colorama = {version = "*", markers = "platform_system == \"Windows\""}
-
-[package.extras]
-dev = ["pytest (>=6)", "pytest-cov", "pytest-timeout", "pytest-xdist"]
-notebook = ["ipywidgets (>=6)"]
-slack = ["slack-sdk"]
-telegram = ["requests"]
-
-[[package]]
-name = "twine"
-version = "3.8.0"
-description = "Collection of utilities for publishing packages on PyPI"
-optional = false
-python-versions = ">=3.6"
-files = [
-    {file = "twine-3.8.0-py3-none-any.whl", hash = "sha256:d0550fca9dc19f3d5e8eadfce0c227294df0a2a951251a4385797c8a6198b7c8"},
-    {file = "twine-3.8.0.tar.gz", hash = "sha256:8efa52658e0ae770686a13b675569328f1fba9837e5de1867bfe5f46a9aefe19"},
-]
-
-[package.dependencies]
-colorama = ">=0.4.3"
-importlib-metadata = ">=3.6"
-keyring = ">=15.1"
-pkginfo = ">=1.8.1"
-readme-renderer = ">=21.0"
-requests = ">=2.20"
-requests-toolbelt = ">=0.8.0,<0.9.0 || >0.9.0"
-rfc3986 = ">=1.4.0"
-tqdm = ">=4.14"
-urllib3 = ">=1.26.0"
-
 [[package]]
 name = "types-setuptools"
 version = "70.3.0.20240710"
@@ -2131,13 +1561,13 @@ zstd = ["zstandard (>=0.18.0)"]
 
 [[package]]
 name = "virtualenv"
-version = "20.26.5"
+version = "20.26.6"
 description = "Virtual Python Environment builder"
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "virtualenv-20.26.5-py3-none-any.whl", hash = "sha256:4f3ac17b81fba3ce3bd6f4ead2749a72da5929c01774948e243db9ba41df4ff6"},
-    {file = "virtualenv-20.26.5.tar.gz", hash = "sha256:ce489cac131aa58f4b25e321d6d186171f78e6cb13fafbf32a840cee67733ff4"},
+    {file = "virtualenv-20.26.6-py3-none-any.whl", hash = "sha256:7345cc5b25405607a624d8418154577459c3e0277f5466dd79c49d5e492995f2"},
+    {file = "virtualenv-20.26.6.tar.gz", hash = "sha256:280aede09a2a5c317e409a00102e7077c6432c5a38f0ef938e643805a7ad2c48"},
 ]
 
 [package.dependencies]
@@ -2149,40 +1579,7 @@ platformdirs = ">=3.9.1,<5"
 docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.2,!=7.3)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.6)"]
 test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8)", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10)"]
 
-[[package]]
-name = "wheel"
-version = "0.44.0"
-description = "A built-package format for Python"
-optional = false
-python-versions = ">=3.8"
-files = [
-    {file = "wheel-0.44.0-py3-none-any.whl", hash = "sha256:2376a90c98cc337d18623527a97c31797bd02bad0033d41547043a1cbfbe448f"},
-    {file = "wheel-0.44.0.tar.gz", hash = "sha256:a29c3f2817e95ab89aa4660681ad547c0e9547f20e75b0562fe7723c9a2a9d49"},
-]
-
-[package.extras]
-test = ["pytest (>=6.0.0)", "setuptools (>=65)"]
-
-[[package]]
-name = "zipp"
-version = "3.20.2"
-description = "Backport of pathlib-compatible object wrapper for zip files"
-optional = false
-python-versions = ">=3.8"
-files = [
-    {file = "zipp-3.20.2-py3-none-any.whl", hash = "sha256:a817ac80d6cf4b23bf7f2828b7cabf326f15a001bea8b1f9b49631780ba28350"},
-    {file = "zipp-3.20.2.tar.gz", hash = "sha256:bc9eb26f4506fda01b81bcde0ca78103b6e62f991b381fec825435c836edbc29"},
-]
-
-[package.extras]
-check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)"]
-cover = ["pytest-cov"]
-doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"]
-enabler = ["pytest-enabler (>=2.2)"]
-test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-ignore-flaky"]
-type = ["pytest-mypy"]
-
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "2256d7b264ca3af01e83a71107252a8f9cc57abcbe73bf1e4b6bebd33906cf9e"
+content-hash = "0f99802ae048309d0d17af28c4328eae27b36bccb30130c8f3b06537cb16c264"
diff --git a/pyproject.toml b/pyproject.toml
index f6565b8..f29fd5d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -53,6 +53,8 @@ jsonref = "^1.1.0"
 json-schema-for-humans = "^1.0.0"
 tabulate = "^0.9.0"
 pandas = "^2.2.2"
+pydantic-extra-types = "^2.9.0"
+semver = "^3.0.2"
 
 [tool.poetry.group.dev.dependencies]
 black = "^24.4.2"
@@ -67,7 +69,6 @@ flake8-docstrings = "^1.6.0"
 pep8-naming = "^0.13.2"
 jsondiff = "^2.0.0"
 types-setuptools = "^70.3.0"
-python-semantic-release = "^7.32.2"
 
 [tool.setuptools.packages.find]
 where = ["docling_core/resources/schemas"]
diff --git a/test/test_docling_doc.py b/test/test_docling_doc.py
index 73f644d..892fd48 100644
--- a/test/test_docling_doc.py
+++ b/test/test_docling_doc.py
@@ -1,7 +1,9 @@
+import importlib
 from collections import deque
 
 import pytest
 import yaml
+from pydantic import ValidationError
 
 from docling_core.types.experimental.document import (
     BasePictureData,
@@ -349,3 +351,34 @@ def _construct_doc() -> DoclingDocument:
     fig_item = doc.add_picture(data=BasePictureData(), caption=fig_caption)
 
     return doc
+
+
+def test_version_doc():
+
+    # default version
+    version = importlib.metadata.version("docling-core")
+    doc = DoclingDocument(description=DescriptionItem(), name="Untitled 1")
+    assert doc.version == version
+
+    with open("test/data/experimental/dummy_doc.yaml", "r") as fp:
+        dict_from_yaml = yaml.safe_load(fp)
+    doc = DoclingDocument.model_validate(dict_from_yaml)
+    assert doc.version == version
+
+    # custom version at construction
+    doc = DoclingDocument(
+        description=DescriptionItem(),
+        name="Untitled 1",
+        version="2.1.0-post.8+96354bda",
+    )
+    assert doc.version.major == 2
+    assert doc.version.minor == 1
+    assert doc.version.patch == 0
+    assert doc.version.prerelease == "post.8"
+    assert doc.version.build == "96354bda"
+    doc_json = doc.model_dump()
+    assert doc_json["version"] == "2.1.0-post.8+96354bda"
+
+    # invalid version
+    with pytest.raises(ValidationError, match="SemVer"):
+        DoclingDocument(description=DescriptionItem(), name="Untitled 1", version="abc")

From 62a3ebe3cacf4ffcc8a848d5d7ac48ed2f1dea51 Mon Sep 17 00:00:00 2001
From: Christoph Auer <cau@zurich.ibm.com>
Date: Mon, 30 Sep 2024 10:10:59 +0200
Subject: [PATCH 28/34] Cleanup

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
---
 docling_core/types/experimental/document.py | 26 +++++++++------------
 test/test_docling_doc.py                    | 18 ++------------
 2 files changed, 13 insertions(+), 31 deletions(-)

diff --git a/docling_core/types/experimental/document.py b/docling_core/types/experimental/document.py
index d188435..3101e9b 100644
--- a/docling_core/types/experimental/document.py
+++ b/docling_core/types/experimental/document.py
@@ -1,6 +1,5 @@
 """Models for the Docling Document data type."""
 
-import hashlib
 import mimetypes
 import typing
 from typing import Any, Dict, List, Optional, Tuple, Union
@@ -131,7 +130,9 @@ def parse_hex_string(cls, value):
                 # Convert hex string to an integer
                 hash_int = Uint64(value, 16)
                 # Mask to fit within 64 bits (unsigned)
-                return hash_int & 0xFFFFFFFFFFFFFFFF
+                return (
+                    hash_int & 0xFFFFFFFFFFFFFFFF
+                )  # TODO be sure it doesn't clip uint64 max
             except ValueError:
                 raise ValueError(f"Invalid sha256 hexdigest: {value}")
         return value  # If already an int, return it as is.
@@ -599,7 +600,7 @@ class KeyValueItem(DocItem):
     """KeyValueItem."""
 
 
-ContentItem = Union[TextItem, PictureItem, TableItem, KeyValueItem]
+ContentItem = Union[TextItem, SectionItem, PictureItem, TableItem, KeyValueItem]
 
 
 class PageItem(BaseModel):
@@ -619,6 +620,8 @@ class DescriptionItem(BaseModel):
 class DoclingDocument(BaseModel):
     """DoclingDocument."""
 
+    schema_name: typing.Literal["DoclingDocument"] = "DoclingDocument"
+
     version: str = "0.1.0"  # TODO use SemanticVersion type instead
     description: DescriptionItem
     name: str  # The working name of this document, without extensions
@@ -642,13 +645,6 @@ class DoclingDocument(BaseModel):
 
     pages: Dict[int, PageItem] = {}  # empty as default
 
-    def _compute_hash(self, obj):
-        hash_object = hashlib.sha256(obj.encode("utf-8"))
-        # Convert the hash to an integer
-        hash_int = int.from_bytes(hash_object.digest(), "big")
-        # Mask it to fit within 64 bits
-        return Uint64(hash_int & 0xFFFFFFFFFFFFFFFF)  # 64-bit unsigned integer mask
-
     def add_group(
         self,
         label: Optional[GroupLabel] = None,
@@ -848,7 +844,7 @@ def validate_tree(self, root) -> bool:
 
         return all(res) or len(res) == 0
 
-    def iterate_elements(
+    def iterate_items(
         self,
         root: Optional[NodeItem] = None,
         with_groups: bool = False,
@@ -887,13 +883,13 @@ def iterate_elements(
             if isinstance(child, NodeItem):
                 # If the child is a NodeItem, recursively traverse it
                 if not isinstance(child, PictureItem) or traverse_pictures:
-                    yield from self.iterate_elements(
+                    yield from self.iterate_items(
                         child, _level=_level + 1, with_groups=with_groups
                     )
 
     def print_element_tree(self):
         """print_element_tree."""
-        for ix, (item, level) in enumerate(self.iterate_elements(with_groups=True)):
+        for ix, (item, level) in enumerate(self.iterate_items(with_groups=True)):
             if isinstance(item, GroupItem):
                 print(" " * level, f"{ix}: {item.label.value} with name={item.name}")
             elif isinstance(item, DocItem):
@@ -948,7 +944,7 @@ def export_to_markdown(
         md_texts: list[str] = []
 
         skip_count = 0
-        for ix, (item, level) in enumerate(self.iterate_elements(self.body)):
+        for ix, (item, level) in enumerate(self.iterate_items(self.body)):
             if skip_count < from_element:
                 skip_count += 1
                 continue  # skip as many items as you want
@@ -1077,7 +1073,7 @@ def export_to_document_tokens(
         # pagedims = self.get_map_to_page_dimensions()
 
         skip_count = 0
-        for ix, (item, level) in enumerate(self.iterate_elements(self.body)):
+        for ix, (item, level) in enumerate(self.iterate_items(self.body)):
             if skip_count < from_element:
                 skip_count += 1
                 continue  # skip as many items as you want
diff --git a/test/test_docling_doc.py b/test/test_docling_doc.py
index 73f644d..aa0ee1c 100644
--- a/test/test_docling_doc.py
+++ b/test/test_docling_doc.py
@@ -64,7 +64,6 @@ def verify(dc, obj):
             obj = dc(
                 text="whatever",
                 orig="whatever",
-                dloc="sdvsd",
                 label=DocItemLabel.TEXT,
                 self_ref="#",
             )
@@ -72,9 +71,6 @@ def verify(dc, obj):
 
         elif dc is FloatingItem:
             obj = dc(
-                text="whatever",
-                orig="whatever",
-                dloc="sdvsd",
                 label=DocItemLabel.TEXT,
                 self_ref="#",
             )
@@ -82,9 +78,6 @@ def verify(dc, obj):
 
         elif dc is KeyValueItem:
             obj = dc(
-                text="whatever",
-                orig="whatever",
-                dloc="sdvsd",
                 label=DocItemLabel.TEXT,
                 self_ref="#",
             )
@@ -94,7 +87,6 @@ def verify(dc, obj):
             obj = dc(
                 text="whatever",
                 orig="whatever",
-                dloc="sdvsd",
                 label=DocItemLabel.TEXT,
                 self_ref="#",
                 level=2,
@@ -103,9 +95,6 @@ def verify(dc, obj):
 
         elif dc is PictureItem:
             obj = dc(
-                text="whatever",
-                orig="whatever",
-                dloc="sdvsd",
                 label=DocItemLabel.TEXT,
                 self_ref="#",
                 data=BasePictureData(),
@@ -114,12 +103,9 @@ def verify(dc, obj):
 
         elif dc is TableItem:
             obj = dc(
-                text="whatever",
-                orig="whatever",
-                dloc="sdvsd",
                 label=DocItemLabel.TEXT,
                 self_ref="#",
-                data=BaseTableData(num_rows=3, num_cols=5, cells=[]),
+                data=BaseTableData(num_rows=3, num_cols=5, table_cells=[]),
             )
             verify(dc, obj)
 
@@ -155,7 +141,7 @@ def test_reference_doc():
 
     # Iterate all elements
 
-    for item, level in doc.iterate_elements():
+    for item, level in doc.iterate_items():
         print(f"Item: {item} at level {level}")
 
     # Serialize and reload

From 46a6d8d6b670c0db1c93b1c5638e4a72b348738d Mon Sep 17 00:00:00 2001
From: Christoph Auer <cau@zurich.ibm.com>
Date: Mon, 30 Sep 2024 11:07:21 +0200
Subject: [PATCH 29/34] Simpler literal enforcement

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
---
 docling_core/types/experimental/document.py | 14 ++++----------
 test/test_docling_doc.py                    |  4 +---
 2 files changed, 5 insertions(+), 13 deletions(-)

diff --git a/docling_core/types/experimental/document.py b/docling_core/types/experimental/document.py
index 813e0f4..1f33b04 100644
--- a/docling_core/types/experimental/document.py
+++ b/docling_core/types/experimental/document.py
@@ -7,7 +7,6 @@
 
 import pandas as pd
 from pydantic import (
-    AfterValidator,
     AnyUrl,
     BaseModel,
     ConfigDict,
@@ -324,9 +323,7 @@ def export_to_document_tokens(
 class SectionHeaderItem(TextItem):
     """SectionItem."""
 
-    label: typing.Annotated[
-        DocItemLabel, AfterValidator(lambda x: DocItemLabel.SECTION_HEADER)
-    ] = Field(default=DocItemLabel.SECTION_HEADER, frozen=True)
+    label: typing.Literal[DocItemLabel.SECTION_HEADER] = DocItemLabel.SECTION_HEADER
     level: LevelNumber
 
 
@@ -342,9 +339,8 @@ class FloatingItem(DocItem):
 class PictureItem(FloatingItem):
     """PictureItem."""
 
-    label: typing.Annotated[
-        DocItemLabel, AfterValidator(lambda x: DocItemLabel.PICTURE)
-    ] = Field(default=DocItemLabel.PICTURE, frozen=True)
+    label: typing.Literal[DocItemLabel.PICTURE] = DocItemLabel.PICTURE
+
     data: BasePictureData
 
     def export_to_document_tokens(
@@ -401,9 +397,7 @@ class TableItem(FloatingItem):
     """TableItem."""
 
     data: BaseTableData
-    label: typing.Annotated[
-        DocItemLabel, AfterValidator(lambda x: DocItemLabel.TABLE)
-    ] = Field(default=DocItemLabel.TABLE, frozen=True)
+    label: typing.Literal[DocItemLabel.TABLE] = DocItemLabel.TABLE
 
     def export_to_dataframe(self) -> pd.DataFrame:
         """Export the table as a Pandas DataFrame."""
diff --git a/test/test_docling_doc.py b/test/test_docling_doc.py
index 83ef08f..9cc00ef 100644
--- a/test/test_docling_doc.py
+++ b/test/test_docling_doc.py
@@ -89,7 +89,7 @@ def verify(dc, obj):
             obj = dc(
                 text="whatever",
                 orig="whatever",
-                label=DocItemLabel.TEXT,
+                label=DocItemLabel.SECTION_HEADER,
                 self_ref="#",
                 level=2,
             )
@@ -97,7 +97,6 @@ def verify(dc, obj):
 
         elif dc is PictureItem:
             obj = dc(
-                label=DocItemLabel.TEXT,
                 self_ref="#",
                 data=BasePictureData(),
             )
@@ -105,7 +104,6 @@ def verify(dc, obj):
 
         elif dc is TableItem:
             obj = dc(
-                label=DocItemLabel.TEXT,
                 self_ref="#",
                 data=BaseTableData(num_rows=3, num_cols=5, table_cells=[]),
             )

From bb96c848356264661e40e84c8dd14aeb80bb121e Mon Sep 17 00:00:00 2001
From: Christoph Auer <cau@zurich.ibm.com>
Date: Mon, 30 Sep 2024 13:16:23 +0200
Subject: [PATCH 30/34] Fix static document version

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
---
 docling_core/types/experimental/document.py         | 8 +++++---
 test/data/experimental/2206.01062.experimental.yaml | 2 +-
 test/test_docling_doc.py                            | 3 +--
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/docling_core/types/experimental/document.py b/docling_core/types/experimental/document.py
index 1f33b04..4d90708 100644
--- a/docling_core/types/experimental/document.py
+++ b/docling_core/types/experimental/document.py
@@ -1,6 +1,5 @@
 """Models for the Docling Document data type."""
 
-import importlib
 import mimetypes
 import typing
 from typing import Any, Dict, List, Optional, Tuple, Union
@@ -224,7 +223,10 @@ def get_ref(self):
 class GroupItem(NodeItem):  # Container type, can't be a leaf node
     """GroupItem."""
 
-    name: str = "group"
+    name: str = (
+        "group"  # Name of the group, e.g. "Introduction Chapter",
+        # "Slide 5", "Navigation menu list", ...
+    )
     label: GroupLabel = GroupLabel.UNSPECIFIED
 
 
@@ -654,7 +656,7 @@ class DoclingDocument(BaseModel):
     def check_version_omitted(cls, v: str) -> str:
         """Set the version field to this library version by default."""
         if v is None:
-            return importlib.metadata.version("docling-core")
+            return "1.0.0"
         else:
             return v
 
diff --git a/test/data/experimental/2206.01062.experimental.yaml b/test/data/experimental/2206.01062.experimental.yaml
index e927205..4a608ff 100644
--- a/test/data/experimental/2206.01062.experimental.yaml
+++ b/test/data/experimental/2206.01062.experimental.yaml
@@ -13273,4 +13273,4 @@ texts:
   self_ref: '#/texts/127'
   text: '[23] Connor Shorten and Taghi M. Khoshgoftaar. A survey on image data augmentation
     for deep learning. Journal of Big Data , 6(1):60, 2019.'
-version: 0.1.0
+version: 1.0.0
diff --git a/test/test_docling_doc.py b/test/test_docling_doc.py
index 9cc00ef..2cb2c14 100644
--- a/test/test_docling_doc.py
+++ b/test/test_docling_doc.py
@@ -1,4 +1,3 @@
-import importlib
 from collections import deque
 
 import pytest
@@ -340,7 +339,7 @@ def _construct_doc() -> DoclingDocument:
 def test_version_doc():
 
     # default version
-    version = importlib.metadata.version("docling-core")
+    version = "1.0.0"
     doc = DoclingDocument(description=DescriptionItem(), name="Untitled 1")
     assert doc.version == version
 

From 089d692b95dd704a503e9d15c5072e2ff58a65e6 Mon Sep 17 00:00:00 2001
From: Christoph Auer <cau@zurich.ibm.com>
Date: Mon, 30 Sep 2024 13:30:52 +0200
Subject: [PATCH 31/34] Rollback changes to allow for semver<3

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
---
 docling_core/types/experimental/document.py | 22 ++++----
 poetry.lock                                 | 35 +-----------
 pyproject.toml                              |  2 -
 test/test_docling_doc.py                    | 59 ++++++++++-----------
 4 files changed, 42 insertions(+), 76 deletions(-)

diff --git a/docling_core/types/experimental/document.py b/docling_core/types/experimental/document.py
index 4d90708..6b4a9a2 100644
--- a/docling_core/types/experimental/document.py
+++ b/docling_core/types/experimental/document.py
@@ -14,7 +14,6 @@
     field_validator,
     model_validator,
 )
-from pydantic_extra_types.semantic_version import SemanticVersion
 from tabulate import tabulate
 
 from docling_core.types.doc.tokens import DocumentToken
@@ -628,7 +627,10 @@ class DoclingDocument(BaseModel):
     """DoclingDocument."""
 
     schema_name: typing.Literal["DoclingDocument"] = "DoclingDocument"
-    version: Optional[SemanticVersion] = Field(default=None, validate_default=True)
+    version: typing.Literal["1.0.0"] = (
+        "1.0.0"
+        # Optional[SemanticVersion] = Field(default=None, validate_default=True)
+    )
     description: DescriptionItem
     name: str  # The working name of this document, without extensions
     # (could be taken from originating doc, or just "Untitled 1")
@@ -651,14 +653,14 @@ class DoclingDocument(BaseModel):
 
     pages: Dict[int, PageItem] = {}  # empty as default
 
-    @field_validator("version")
-    @classmethod
-    def check_version_omitted(cls, v: str) -> str:
-        """Set the version field to this library version by default."""
-        if v is None:
-            return "1.0.0"
-        else:
-            return v
+    # @field_validator("version")
+    # @classmethod
+    # def check_version_omitted(cls, v: str) -> str:
+    #     """Set the version field to this library version by default."""
+    #     if v is None:
+    #         return "1.0.0"
+    #     else:
+    #         return v
 
     def add_group(
         self,
diff --git a/poetry.lock b/poetry.lock
index 5776567..54c4f81 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1115,28 +1115,6 @@ files = [
 [package.dependencies]
 typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0"
 
-[[package]]
-name = "pydantic-extra-types"
-version = "2.9.0"
-description = "Extra Pydantic types."
-optional = false
-python-versions = ">=3.8"
-files = [
-    {file = "pydantic_extra_types-2.9.0-py3-none-any.whl", hash = "sha256:f0bb975508572ba7bf3390b7337807588463b7248587e69f43b1ad7c797530d0"},
-    {file = "pydantic_extra_types-2.9.0.tar.gz", hash = "sha256:e061c01636188743bb69f368dcd391f327b8cfbfede2fe1cbb1211b06601ba3b"},
-]
-
-[package.dependencies]
-pydantic = ">=2.5.2"
-
-[package.extras]
-all = ["pendulum (>=3.0.0,<4.0.0)", "phonenumbers (>=8,<9)", "pycountry (>=23)", "python-ulid (>=1,<2)", "python-ulid (>=1,<3)", "pytz (>=2024.1)", "semver (>=3.0.2)", "tzdata (>=2024.1)"]
-pendulum = ["pendulum (>=3.0.0,<4.0.0)"]
-phonenumbers = ["phonenumbers (>=8,<9)"]
-pycountry = ["pycountry (>=23)"]
-python-ulid = ["python-ulid (>=1,<2)", "python-ulid (>=1,<3)"]
-semver = ["semver (>=3.0.2)"]
-
 [[package]]
 name = "pydocstyle"
 version = "6.3.0"
@@ -1436,17 +1414,6 @@ files = [
     {file = "rpds_py-0.20.0.tar.gz", hash = "sha256:d72a210824facfdaf8768cf2d7ca25a042c30320b3020de2fa04640920d4e121"},
 ]
 
-[[package]]
-name = "semver"
-version = "3.0.2"
-description = "Python helper for Semantic Versioning (https://semver.org)"
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "semver-3.0.2-py3-none-any.whl", hash = "sha256:b1ea4686fe70b981f85359eda33199d60c53964284e0cfb4977d243e37cf4bf4"},
-    {file = "semver-3.0.2.tar.gz", hash = "sha256:6253adb39c70f6e51afed2fa7152bcd414c411286088fb4b9effb133885ab4cc"},
-]
-
 [[package]]
 name = "six"
 version = "1.16.0"
@@ -1582,4 +1549,4 @@ test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "0f99802ae048309d0d17af28c4328eae27b36bccb30130c8f3b06537cb16c264"
+content-hash = "329b132a93271e27c24b2809afc4db6a95fff4e605d964d29966c09eb6f1443d"
diff --git a/pyproject.toml b/pyproject.toml
index f29fd5d..603c168 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -53,8 +53,6 @@ jsonref = "^1.1.0"
 json-schema-for-humans = "^1.0.0"
 tabulate = "^0.9.0"
 pandas = "^2.2.2"
-pydantic-extra-types = "^2.9.0"
-semver = "^3.0.2"
 
 [tool.poetry.group.dev.dependencies]
 black = "^24.4.2"
diff --git a/test/test_docling_doc.py b/test/test_docling_doc.py
index 2cb2c14..ae60c8f 100644
--- a/test/test_docling_doc.py
+++ b/test/test_docling_doc.py
@@ -2,7 +2,6 @@
 
 import pytest
 import yaml
-from pydantic import ValidationError
 
 from docling_core.types.experimental.document import (
     BasePictureData,
@@ -336,32 +335,32 @@ def _construct_doc() -> DoclingDocument:
     return doc
 
 
-def test_version_doc():
-
-    # default version
-    version = "1.0.0"
-    doc = DoclingDocument(description=DescriptionItem(), name="Untitled 1")
-    assert doc.version == version
-
-    with open("test/data/experimental/dummy_doc.yaml", "r") as fp:
-        dict_from_yaml = yaml.safe_load(fp)
-    doc = DoclingDocument.model_validate(dict_from_yaml)
-    assert doc.version == version
-
-    # custom version at construction
-    doc = DoclingDocument(
-        description=DescriptionItem(),
-        name="Untitled 1",
-        version="2.1.0-post.8+96354bda",
-    )
-    assert doc.version.major == 2
-    assert doc.version.minor == 1
-    assert doc.version.patch == 0
-    assert doc.version.prerelease == "post.8"
-    assert doc.version.build == "96354bda"
-    doc_json = doc.model_dump()
-    assert doc_json["version"] == "2.1.0-post.8+96354bda"
-
-    # invalid version
-    with pytest.raises(ValidationError, match="SemVer"):
-        DoclingDocument(description=DescriptionItem(), name="Untitled 1", version="abc")
+# def test_version_doc():
+#
+#     # default version
+#     version = "1.0.0"
+#     doc = DoclingDocument(description=DescriptionItem(), name="Untitled 1")
+#     assert doc.version == version
+#
+#     with open("test/data/experimental/dummy_doc.yaml", "r") as fp:
+#         dict_from_yaml = yaml.safe_load(fp)
+#     doc = DoclingDocument.model_validate(dict_from_yaml)
+#     assert doc.version == version
+#
+#     # custom version at construction
+#     doc = DoclingDocument(
+#         description=DescriptionItem(),
+#         name="Untitled 1",
+#         version="2.1.0-post.8+96354bda",
+#     )
+#     assert doc.version.major == 2
+#     assert doc.version.minor == 1
+#     assert doc.version.patch == 0
+#     assert doc.version.prerelease == "post.8"
+#     assert doc.version.build == "96354bda"
+#     doc_json = doc.model_dump()
+#     assert doc_json["version"] == "2.1.0-post.8+96354bda"
+#
+#     # invalid version
+#     with pytest.raises(ValidationError, match="SemVer"):
+#         DoclingDocument(description=DescriptionItem(), name="Untitled 1", version="abc")

From 9fcaa887e5037868cdbc7d45644a40c431b1f785 Mon Sep 17 00:00:00 2001
From: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
Date: Mon, 30 Sep 2024 14:06:31 +0200
Subject: [PATCH 32/34] build: rollback changes to include
 python-semantic-release as dev dependency

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
---
 poetry.lock    | 646 ++++++++++++++++++++++++++++++++++++++++++++++++-
 pyproject.toml |   1 +
 2 files changed, 644 insertions(+), 3 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index 54c4f81..e84b67a 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -45,6 +45,21 @@ files = [
 pyflakes = ">=3.0.0"
 tomli = {version = ">=2.0.1", markers = "python_version < \"3.11\""}
 
+[[package]]
+name = "backports-tarfile"
+version = "1.2.0"
+description = "Backport of CPython tarfile module"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "backports.tarfile-1.2.0-py3-none-any.whl", hash = "sha256:77e284d754527b01fb1e6fa8a1afe577858ebe4e9dad8919e34c862cb399bc34"},
+    {file = "backports_tarfile-1.2.0.tar.gz", hash = "sha256:d75e02c268746e1b8144c278978b6e98e85de6ad16f8e4b0844a154557eca991"},
+]
+
+[package.extras]
+docs = ["furo", "jaraco.packaging (>=9.3)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"]
+testing = ["jaraco.test", "pytest (!=8.0.*)", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)"]
+
 [[package]]
 name = "black"
 version = "24.8.0"
@@ -102,6 +117,85 @@ files = [
     {file = "certifi-2024.8.30.tar.gz", hash = "sha256:bec941d2aa8195e248a60b31ff9f0558284cf01a52591ceda73ea9afffd69fd9"},
 ]
 
+[[package]]
+name = "cffi"
+version = "1.17.1"
+description = "Foreign Function Interface for Python calling C code."
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "cffi-1.17.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:df8b1c11f177bc2313ec4b2d46baec87a5f3e71fc8b45dab2ee7cae86d9aba14"},
+    {file = "cffi-1.17.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8f2cdc858323644ab277e9bb925ad72ae0e67f69e804f4898c070998d50b1a67"},
+    {file = "cffi-1.17.1-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:edae79245293e15384b51f88b00613ba9f7198016a5948b5dddf4917d4d26382"},
+    {file = "cffi-1.17.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45398b671ac6d70e67da8e4224a065cec6a93541bb7aebe1b198a61b58c7b702"},
+    {file = "cffi-1.17.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ad9413ccdeda48c5afdae7e4fa2192157e991ff761e7ab8fdd8926f40b160cc3"},
+    {file = "cffi-1.17.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5da5719280082ac6bd9aa7becb3938dc9f9cbd57fac7d2871717b1feb0902ab6"},
+    {file = "cffi-1.17.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2bb1a08b8008b281856e5971307cc386a8e9c5b625ac297e853d36da6efe9c17"},
+    {file = "cffi-1.17.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:045d61c734659cc045141be4bae381a41d89b741f795af1dd018bfb532fd0df8"},
+    {file = "cffi-1.17.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:6883e737d7d9e4899a8a695e00ec36bd4e5e4f18fabe0aca0efe0a4b44cdb13e"},
+    {file = "cffi-1.17.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:6b8b4a92e1c65048ff98cfe1f735ef8f1ceb72e3d5f0c25fdb12087a23da22be"},
+    {file = "cffi-1.17.1-cp310-cp310-win32.whl", hash = "sha256:c9c3d058ebabb74db66e431095118094d06abf53284d9c81f27300d0e0d8bc7c"},
+    {file = "cffi-1.17.1-cp310-cp310-win_amd64.whl", hash = "sha256:0f048dcf80db46f0098ccac01132761580d28e28bc0f78ae0d58048063317e15"},
+    {file = "cffi-1.17.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a45e3c6913c5b87b3ff120dcdc03f6131fa0065027d0ed7ee6190736a74cd401"},
+    {file = "cffi-1.17.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:30c5e0cb5ae493c04c8b42916e52ca38079f1b235c2f8ae5f4527b963c401caf"},
+    {file = "cffi-1.17.1-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f75c7ab1f9e4aca5414ed4d8e5c0e303a34f4421f8a0d47a4d019ceff0ab6af4"},
+    {file = "cffi-1.17.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1ed2dd2972641495a3ec98445e09766f077aee98a1c896dcb4ad0d303628e41"},
+    {file = "cffi-1.17.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:46bf43160c1a35f7ec506d254e5c890f3c03648a4dbac12d624e4490a7046cd1"},
+    {file = "cffi-1.17.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a24ed04c8ffd54b0729c07cee15a81d964e6fee0e3d4d342a27b020d22959dc6"},
+    {file = "cffi-1.17.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:610faea79c43e44c71e1ec53a554553fa22321b65fae24889706c0a84d4ad86d"},
+    {file = "cffi-1.17.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:a9b15d491f3ad5d692e11f6b71f7857e7835eb677955c00cc0aefcd0669adaf6"},
+    {file = "cffi-1.17.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:de2ea4b5833625383e464549fec1bc395c1bdeeb5f25c4a3a82b5a8c756ec22f"},
+    {file = "cffi-1.17.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:fc48c783f9c87e60831201f2cce7f3b2e4846bf4d8728eabe54d60700b318a0b"},
+    {file = "cffi-1.17.1-cp311-cp311-win32.whl", hash = "sha256:85a950a4ac9c359340d5963966e3e0a94a676bd6245a4b55bc43949eee26a655"},
+    {file = "cffi-1.17.1-cp311-cp311-win_amd64.whl", hash = "sha256:caaf0640ef5f5517f49bc275eca1406b0ffa6aa184892812030f04c2abf589a0"},
+    {file = "cffi-1.17.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:805b4371bf7197c329fcb3ead37e710d1bca9da5d583f5073b799d5c5bd1eee4"},
+    {file = "cffi-1.17.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:733e99bc2df47476e3848417c5a4540522f234dfd4ef3ab7fafdf555b082ec0c"},
+    {file = "cffi-1.17.1-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1257bdabf294dceb59f5e70c64a3e2f462c30c7ad68092d01bbbfb1c16b1ba36"},
+    {file = "cffi-1.17.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:da95af8214998d77a98cc14e3a3bd00aa191526343078b530ceb0bd710fb48a5"},
+    {file = "cffi-1.17.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d63afe322132c194cf832bfec0dc69a99fb9bb6bbd550f161a49e9e855cc78ff"},
+    {file = "cffi-1.17.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f79fc4fc25f1c8698ff97788206bb3c2598949bfe0fef03d299eb1b5356ada99"},
+    {file = "cffi-1.17.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b62ce867176a75d03a665bad002af8e6d54644fad99a3c70905c543130e39d93"},
+    {file = "cffi-1.17.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:386c8bf53c502fff58903061338ce4f4950cbdcb23e2902d86c0f722b786bbe3"},
+    {file = "cffi-1.17.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4ceb10419a9adf4460ea14cfd6bc43d08701f0835e979bf821052f1805850fe8"},
+    {file = "cffi-1.17.1-cp312-cp312-win32.whl", hash = "sha256:a08d7e755f8ed21095a310a693525137cfe756ce62d066e53f502a83dc550f65"},
+    {file = "cffi-1.17.1-cp312-cp312-win_amd64.whl", hash = "sha256:51392eae71afec0d0c8fb1a53b204dbb3bcabcb3c9b807eedf3e1e6ccf2de903"},
+    {file = "cffi-1.17.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f3a2b4222ce6b60e2e8b337bb9596923045681d71e5a082783484d845390938e"},
+    {file = "cffi-1.17.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:0984a4925a435b1da406122d4d7968dd861c1385afe3b45ba82b750f229811e2"},
+    {file = "cffi-1.17.1-cp313-cp313-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d01b12eeeb4427d3110de311e1774046ad344f5b1a7403101878976ecd7a10f3"},
+    {file = "cffi-1.17.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:706510fe141c86a69c8ddc029c7910003a17353970cff3b904ff0686a5927683"},
+    {file = "cffi-1.17.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:de55b766c7aa2e2a3092c51e0483d700341182f08e67c63630d5b6f200bb28e5"},
+    {file = "cffi-1.17.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c59d6e989d07460165cc5ad3c61f9fd8f1b4796eacbd81cee78957842b834af4"},
+    {file = "cffi-1.17.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd398dbc6773384a17fe0d3e7eeb8d1a21c2200473ee6806bb5e6a8e62bb73dd"},
+    {file = "cffi-1.17.1-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:3edc8d958eb099c634dace3c7e16560ae474aa3803a5df240542b305d14e14ed"},
+    {file = "cffi-1.17.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:72e72408cad3d5419375fc87d289076ee319835bdfa2caad331e377589aebba9"},
+    {file = "cffi-1.17.1-cp313-cp313-win32.whl", hash = "sha256:e03eab0a8677fa80d646b5ddece1cbeaf556c313dcfac435ba11f107ba117b5d"},
+    {file = "cffi-1.17.1-cp313-cp313-win_amd64.whl", hash = "sha256:f6a16c31041f09ead72d69f583767292f750d24913dadacf5756b966aacb3f1a"},
+    {file = "cffi-1.17.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:636062ea65bd0195bc012fea9321aca499c0504409f413dc88af450b57ffd03b"},
+    {file = "cffi-1.17.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c7eac2ef9b63c79431bc4b25f1cd649d7f061a28808cbc6c47b534bd789ef964"},
+    {file = "cffi-1.17.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e221cf152cff04059d011ee126477f0d9588303eb57e88923578ace7baad17f9"},
+    {file = "cffi-1.17.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:31000ec67d4221a71bd3f67df918b1f88f676f1c3b535a7eb473255fdc0b83fc"},
+    {file = "cffi-1.17.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6f17be4345073b0a7b8ea599688f692ac3ef23ce28e5df79c04de519dbc4912c"},
+    {file = "cffi-1.17.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e2b1fac190ae3ebfe37b979cc1ce69c81f4e4fe5746bb401dca63a9062cdaf1"},
+    {file = "cffi-1.17.1-cp38-cp38-win32.whl", hash = "sha256:7596d6620d3fa590f677e9ee430df2958d2d6d6de2feeae5b20e82c00b76fbf8"},
+    {file = "cffi-1.17.1-cp38-cp38-win_amd64.whl", hash = "sha256:78122be759c3f8a014ce010908ae03364d00a1f81ab5c7f4a7a5120607ea56e1"},
+    {file = "cffi-1.17.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b2ab587605f4ba0bf81dc0cb08a41bd1c0a5906bd59243d56bad7668a6fc6c16"},
+    {file = "cffi-1.17.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:28b16024becceed8c6dfbc75629e27788d8a3f9030691a1dbf9821a128b22c36"},
+    {file = "cffi-1.17.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1d599671f396c4723d016dbddb72fe8e0397082b0a77a4fab8028923bec050e8"},
+    {file = "cffi-1.17.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ca74b8dbe6e8e8263c0ffd60277de77dcee6c837a3d0881d8c1ead7268c9e576"},
+    {file = "cffi-1.17.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f7f5baafcc48261359e14bcd6d9bff6d4b28d9103847c9e136694cb0501aef87"},
+    {file = "cffi-1.17.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:98e3969bcff97cae1b2def8ba499ea3d6f31ddfdb7635374834cf89a1a08ecf0"},
+    {file = "cffi-1.17.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cdf5ce3acdfd1661132f2a9c19cac174758dc2352bfe37d98aa7512c6b7178b3"},
+    {file = "cffi-1.17.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:9755e4345d1ec879e3849e62222a18c7174d65a6a92d5b346b1863912168b595"},
+    {file = "cffi-1.17.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:f1e22e8c4419538cb197e4dd60acc919d7696e5ef98ee4da4e01d3f8cfa4cc5a"},
+    {file = "cffi-1.17.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:c03e868a0b3bc35839ba98e74211ed2b05d2119be4e8a0f224fba9384f1fe02e"},
+    {file = "cffi-1.17.1-cp39-cp39-win32.whl", hash = "sha256:e31ae45bc2e29f6b2abd0de1cc3b9d5205aa847cafaecb8af1476a609a2f6eb7"},
+    {file = "cffi-1.17.1-cp39-cp39-win_amd64.whl", hash = "sha256:d016c76bdd850f3c626af19b0542c9677ba156e4ee4fccfdd7848803533ef662"},
+    {file = "cffi-1.17.1.tar.gz", hash = "sha256:1c39c6016c32bc48dd54561950ebd6836e1670f2ae46128f67cf49e789c52824"},
+]
+
+[package.dependencies]
+pycparser = "*"
+
 [[package]]
 name = "cfgv"
 version = "3.4.0"
@@ -226,6 +320,20 @@ files = [
 [package.dependencies]
 colorama = {version = "*", markers = "platform_system == \"Windows\""}
 
+[[package]]
+name = "click-log"
+version = "0.4.0"
+description = "Logging integration for Click"
+optional = false
+python-versions = "*"
+files = [
+    {file = "click-log-0.4.0.tar.gz", hash = "sha256:3970f8570ac54491237bcdb3d8ab5e3eef6c057df29f8c3d1151a51a9c23b975"},
+    {file = "click_log-0.4.0-py2.py3-none-any.whl", hash = "sha256:a43e394b528d52112af599f2fc9e4b7cf3c15f94e53581f74fa6867e68c91756"},
+]
+
+[package.dependencies]
+click = "*"
+
 [[package]]
 name = "colorama"
 version = "0.4.6"
@@ -237,6 +345,55 @@ files = [
     {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
 ]
 
+[[package]]
+name = "cryptography"
+version = "43.0.1"
+description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers."
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "cryptography-43.0.1-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:8385d98f6a3bf8bb2d65a73e17ed87a3ba84f6991c155691c51112075f9ffc5d"},
+    {file = "cryptography-43.0.1-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:27e613d7077ac613e399270253259d9d53872aaf657471473ebfc9a52935c062"},
+    {file = "cryptography-43.0.1-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:68aaecc4178e90719e95298515979814bda0cbada1256a4485414860bd7ab962"},
+    {file = "cryptography-43.0.1-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:de41fd81a41e53267cb020bb3a7212861da53a7d39f863585d13ea11049cf277"},
+    {file = "cryptography-43.0.1-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:f98bf604c82c416bc829e490c700ca1553eafdf2912a91e23a79d97d9801372a"},
+    {file = "cryptography-43.0.1-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:61ec41068b7b74268fa86e3e9e12b9f0c21fcf65434571dbb13d954bceb08042"},
+    {file = "cryptography-43.0.1-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:014f58110f53237ace6a408b5beb6c427b64e084eb451ef25a28308270086494"},
+    {file = "cryptography-43.0.1-cp37-abi3-win32.whl", hash = "sha256:2bd51274dcd59f09dd952afb696bf9c61a7a49dfc764c04dd33ef7a6b502a1e2"},
+    {file = "cryptography-43.0.1-cp37-abi3-win_amd64.whl", hash = "sha256:666ae11966643886c2987b3b721899d250855718d6d9ce41b521252a17985f4d"},
+    {file = "cryptography-43.0.1-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:ac119bb76b9faa00f48128b7f5679e1d8d437365c5d26f1c2c3f0da4ce1b553d"},
+    {file = "cryptography-43.0.1-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1bbcce1a551e262dfbafb6e6252f1ae36a248e615ca44ba302df077a846a8806"},
+    {file = "cryptography-43.0.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:58d4e9129985185a06d849aa6df265bdd5a74ca6e1b736a77959b498e0505b85"},
+    {file = "cryptography-43.0.1-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:d03a475165f3134f773d1388aeb19c2d25ba88b6a9733c5c590b9ff7bbfa2e0c"},
+    {file = "cryptography-43.0.1-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:511f4273808ab590912a93ddb4e3914dfd8a388fed883361b02dea3791f292e1"},
+    {file = "cryptography-43.0.1-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:80eda8b3e173f0f247f711eef62be51b599b5d425c429b5d4ca6a05e9e856baa"},
+    {file = "cryptography-43.0.1-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:38926c50cff6f533f8a2dae3d7f19541432610d114a70808f0926d5aaa7121e4"},
+    {file = "cryptography-43.0.1-cp39-abi3-win32.whl", hash = "sha256:a575913fb06e05e6b4b814d7f7468c2c660e8bb16d8d5a1faf9b33ccc569dd47"},
+    {file = "cryptography-43.0.1-cp39-abi3-win_amd64.whl", hash = "sha256:d75601ad10b059ec832e78823b348bfa1a59f6b8d545db3a24fd44362a1564cb"},
+    {file = "cryptography-43.0.1-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:ea25acb556320250756e53f9e20a4177515f012c9eaea17eb7587a8c4d8ae034"},
+    {file = "cryptography-43.0.1-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:c1332724be35d23a854994ff0b66530119500b6053d0bd3363265f7e5e77288d"},
+    {file = "cryptography-43.0.1-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:fba1007b3ef89946dbbb515aeeb41e30203b004f0b4b00e5e16078b518563289"},
+    {file = "cryptography-43.0.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:5b43d1ea6b378b54a1dc99dd8a2b5be47658fe9a7ce0a58ff0b55f4b43ef2b84"},
+    {file = "cryptography-43.0.1-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:88cce104c36870d70c49c7c8fd22885875d950d9ee6ab54df2745f83ba0dc365"},
+    {file = "cryptography-43.0.1-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:9d3cdb25fa98afdd3d0892d132b8d7139e2c087da1712041f6b762e4f807cc96"},
+    {file = "cryptography-43.0.1-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:e710bf40870f4db63c3d7d929aa9e09e4e7ee219e703f949ec4073b4294f6172"},
+    {file = "cryptography-43.0.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:7c05650fe8023c5ed0d46793d4b7d7e6cd9c04e68eabe5b0aeea836e37bdcec2"},
+    {file = "cryptography-43.0.1.tar.gz", hash = "sha256:203e92a75716d8cfb491dc47c79e17d0d9207ccffcbcb35f598fbe463ae3444d"},
+]
+
+[package.dependencies]
+cffi = {version = ">=1.12", markers = "platform_python_implementation != \"PyPy\""}
+
+[package.extras]
+docs = ["sphinx (>=5.3.0)", "sphinx-rtd-theme (>=1.1.1)"]
+docstest = ["pyenchant (>=1.6.11)", "readme-renderer", "sphinxcontrib-spelling (>=4.0.1)"]
+nox = ["nox"]
+pep8test = ["check-sdist", "click", "mypy", "ruff"]
+sdist = ["build"]
+ssh = ["bcrypt (>=3.1.5)"]
+test = ["certifi", "cryptography-vectors (==43.0.1)", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-xdist"]
+test-randomorder = ["pytest-randomly"]
+
 [[package]]
 name = "dataclasses-json"
 version = "0.5.9"
@@ -267,6 +424,28 @@ files = [
     {file = "distlib-0.3.8.tar.gz", hash = "sha256:1530ea13e350031b6312d8580ddb6b27a104275a31106523b8f123787f494f64"},
 ]
 
+[[package]]
+name = "docutils"
+version = "0.21.2"
+description = "Docutils -- Python Documentation Utilities"
+optional = false
+python-versions = ">=3.9"
+files = [
+    {file = "docutils-0.21.2-py3-none-any.whl", hash = "sha256:dafca5b9e384f0e419294eb4d2ff9fa826435bf15f15b7bd45723e8ad76811b2"},
+    {file = "docutils-0.21.2.tar.gz", hash = "sha256:3a6b18732edf182daa3cd12775bbb338cf5691468f91eeeb109deff6ebfa986f"},
+]
+
+[[package]]
+name = "dotty-dict"
+version = "1.3.1"
+description = "Dictionary wrapper for quick access to deeply nested keys."
+optional = false
+python-versions = ">=3.5,<4.0"
+files = [
+    {file = "dotty_dict-1.3.1-py3-none-any.whl", hash = "sha256:5022d234d9922f13aa711b4950372a06a6d64cb6d6db9ba43d0ba133ebfce31f"},
+    {file = "dotty_dict-1.3.1.tar.gz", hash = "sha256:4b016e03b8ae265539757a53eba24b9bfda506fb94fbce0bee843c6f05541a15"},
+]
+
 [[package]]
 name = "exceptiongroup"
 version = "1.2.2"
@@ -328,6 +507,38 @@ files = [
 flake8 = ">=3"
 pydocstyle = ">=2.1"
 
+[[package]]
+name = "gitdb"
+version = "4.0.11"
+description = "Git Object Database"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "gitdb-4.0.11-py3-none-any.whl", hash = "sha256:81a3407ddd2ee8df444cbacea00e2d038e40150acfa3001696fe0dcf1d3adfa4"},
+    {file = "gitdb-4.0.11.tar.gz", hash = "sha256:bf5421126136d6d0af55bc1e7c1af1c397a34f5b7bd79e776cd3e89785c2b04b"},
+]
+
+[package.dependencies]
+smmap = ">=3.0.1,<6"
+
+[[package]]
+name = "gitpython"
+version = "3.1.43"
+description = "GitPython is a Python library used to interact with Git repositories"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "GitPython-3.1.43-py3-none-any.whl", hash = "sha256:eec7ec56b92aad751f9912a73404bc02ba212a23adb2c7098ee668417051a1ff"},
+    {file = "GitPython-3.1.43.tar.gz", hash = "sha256:35f314a9f878467f5453cc1fee295c3e18e52f1b99f10f6cf5b1682e968a9e7c"},
+]
+
+[package.dependencies]
+gitdb = ">=4.0.1,<5"
+
+[package.extras]
+doc = ["sphinx (==4.3.2)", "sphinx-autodoc-typehints", "sphinx-rtd-theme", "sphinxcontrib-applehelp (>=1.0.2,<=1.0.4)", "sphinxcontrib-devhelp (==1.0.2)", "sphinxcontrib-htmlhelp (>=2.0.0,<=2.0.1)", "sphinxcontrib-qthelp (==1.0.3)", "sphinxcontrib-serializinghtml (==1.1.5)"]
+test = ["coverage[toml]", "ddt (>=1.1.1,!=1.4.3)", "mock", "mypy", "pre-commit", "pytest (>=7.3.1)", "pytest-cov", "pytest-instafail", "pytest-mock", "pytest-sugar", "typing-extensions"]
+
 [[package]]
 name = "htmlmin"
 version = "0.1.12"
@@ -366,6 +577,29 @@ files = [
 [package.extras]
 all = ["flake8 (>=7.1.1)", "mypy (>=1.11.2)", "pytest (>=8.3.2)", "ruff (>=0.6.2)"]
 
+[[package]]
+name = "importlib-metadata"
+version = "8.5.0"
+description = "Read metadata from Python packages"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "importlib_metadata-8.5.0-py3-none-any.whl", hash = "sha256:45e54197d28b7a7f1559e60b95e7c567032b602131fbd588f1497f47880aa68b"},
+    {file = "importlib_metadata-8.5.0.tar.gz", hash = "sha256:71522656f0abace1d072b9e5481a48f07c138e00f079c38c8f883823f9c26bd7"},
+]
+
+[package.dependencies]
+zipp = ">=3.20"
+
+[package.extras]
+check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)"]
+cover = ["pytest-cov"]
+doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"]
+enabler = ["pytest-enabler (>=2.2)"]
+perf = ["ipython"]
+test = ["flufl.flake8", "importlib-resources (>=1.3)", "jaraco.test (>=5.4)", "packaging", "pyfakefs", "pytest (>=6,!=8.1.*)", "pytest-perf (>=0.9.2)"]
+type = ["pytest-mypy"]
+
 [[package]]
 name = "iniconfig"
 version = "2.0.0"
@@ -377,6 +611,17 @@ files = [
     {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"},
 ]
 
+[[package]]
+name = "invoke"
+version = "2.2.0"
+description = "Pythonic task execution"
+optional = false
+python-versions = ">=3.6"
+files = [
+    {file = "invoke-2.2.0-py3-none-any.whl", hash = "sha256:6ea924cc53d4f78e3d98bc436b08069a03077e6f85ad1ddaa8a116d7dad15820"},
+    {file = "invoke-2.2.0.tar.gz", hash = "sha256:ee6cbb101af1a859c7fe84f2a264c059020b0cb7fe3535f9424300ab568f6bd5"},
+]
+
 [[package]]
 name = "isort"
 version = "5.13.2"
@@ -391,6 +636,79 @@ files = [
 [package.extras]
 colors = ["colorama (>=0.4.6)"]
 
+[[package]]
+name = "jaraco-classes"
+version = "3.4.0"
+description = "Utility functions for Python class constructs"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "jaraco.classes-3.4.0-py3-none-any.whl", hash = "sha256:f662826b6bed8cace05e7ff873ce0f9283b5c924470fe664fff1c2f00f581790"},
+    {file = "jaraco.classes-3.4.0.tar.gz", hash = "sha256:47a024b51d0239c0dd8c8540c6c7f484be3b8fcf0b2d85c13825780d3b3f3acd"},
+]
+
+[package.dependencies]
+more-itertools = "*"
+
+[package.extras]
+docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"]
+testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy", "pytest-ruff (>=0.2.1)"]
+
+[[package]]
+name = "jaraco-context"
+version = "6.0.1"
+description = "Useful decorators and context managers"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "jaraco.context-6.0.1-py3-none-any.whl", hash = "sha256:f797fc481b490edb305122c9181830a3a5b76d84ef6d1aef2fb9b47ab956f9e4"},
+    {file = "jaraco_context-6.0.1.tar.gz", hash = "sha256:9bae4ea555cf0b14938dc0aee7c9f32ed303aa20a3b73e7dc80111628792d1b3"},
+]
+
+[package.dependencies]
+"backports.tarfile" = {version = "*", markers = "python_version < \"3.12\""}
+
+[package.extras]
+doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"]
+test = ["portend", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy", "pytest-ruff (>=0.2.1)"]
+
+[[package]]
+name = "jaraco-functools"
+version = "4.1.0"
+description = "Functools like those found in stdlib"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "jaraco.functools-4.1.0-py3-none-any.whl", hash = "sha256:ad159f13428bc4acbf5541ad6dec511f91573b90fba04df61dafa2a1231cf649"},
+    {file = "jaraco_functools-4.1.0.tar.gz", hash = "sha256:70f7e0e2ae076498e212562325e805204fc092d7b4c17e0e86c959e249701a9d"},
+]
+
+[package.dependencies]
+more-itertools = "*"
+
+[package.extras]
+check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)"]
+cover = ["pytest-cov"]
+doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"]
+enabler = ["pytest-enabler (>=2.2)"]
+test = ["jaraco.classes", "pytest (>=6,!=8.1.*)"]
+type = ["pytest-mypy"]
+
+[[package]]
+name = "jeepney"
+version = "0.8.0"
+description = "Low-level, pure Python DBus protocol wrapper."
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "jeepney-0.8.0-py3-none-any.whl", hash = "sha256:c0a454ad016ca575060802ee4d590dd912e35c122fa04e70306de3d076cce755"},
+    {file = "jeepney-0.8.0.tar.gz", hash = "sha256:5efe48d255973902f6badc3ce55e2aa6c5c3b3bc642059ef3a91247bcfcc5806"},
+]
+
+[package.extras]
+test = ["async-timeout", "pytest", "pytest-asyncio (>=0.17)", "pytest-trio", "testpath", "trio"]
+trio = ["async_generator", "trio"]
+
 [[package]]
 name = "jinja2"
 version = "3.1.4"
@@ -494,6 +812,35 @@ files = [
 [package.dependencies]
 referencing = ">=0.31.0"
 
+[[package]]
+name = "keyring"
+version = "25.4.1"
+description = "Store and access your passwords safely."
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "keyring-25.4.1-py3-none-any.whl", hash = "sha256:5426f817cf7f6f007ba5ec722b1bcad95a75b27d780343772ad76b17cb47b0bf"},
+    {file = "keyring-25.4.1.tar.gz", hash = "sha256:b07ebc55f3e8ed86ac81dd31ef14e81ace9dd9c3d4b5d77a6e9a2016d0d71a1b"},
+]
+
+[package.dependencies]
+importlib-metadata = {version = ">=4.11.4", markers = "python_version < \"3.12\""}
+"jaraco.classes" = "*"
+"jaraco.context" = "*"
+"jaraco.functools" = "*"
+jeepney = {version = ">=0.4.2", markers = "sys_platform == \"linux\""}
+pywin32-ctypes = {version = ">=0.2.0", markers = "sys_platform == \"win32\""}
+SecretStorage = {version = ">=3.2", markers = "sys_platform == \"linux\""}
+
+[package.extras]
+check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)"]
+completion = ["shtab (>=1.1.0)"]
+cover = ["pytest-cov"]
+doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"]
+enabler = ["pytest-enabler (>=2.2)"]
+test = ["pyfakefs", "pytest (>=6,!=8.1.*)"]
+type = ["pygobject-stubs", "pytest-mypy", "shtab", "types-pywin32"]
+
 [[package]]
 name = "markdown2"
 version = "2.5.0"
@@ -624,6 +971,17 @@ files = [
     {file = "mccabe-0.7.0.tar.gz", hash = "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325"},
 ]
 
+[[package]]
+name = "more-itertools"
+version = "10.5.0"
+description = "More routines for operating on iterables, beyond itertools"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "more-itertools-10.5.0.tar.gz", hash = "sha256:5482bfef7849c25dc3c6dd53a6173ae4795da2a41a80faea6700d9f5846c5da6"},
+    {file = "more_itertools-10.5.0-py3-none-any.whl", hash = "sha256:037b0d3203ce90cca8ab1defbbdac29d5f993fc20131f3664dc8d6acfa872aef"},
+]
+
 [[package]]
 name = "mypy"
 version = "1.11.2"
@@ -682,6 +1040,31 @@ files = [
     {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"},
 ]
 
+[[package]]
+name = "nh3"
+version = "0.2.18"
+description = "Python bindings to the ammonia HTML sanitization library."
+optional = false
+python-versions = "*"
+files = [
+    {file = "nh3-0.2.18-cp37-abi3-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:14c5a72e9fe82aea5fe3072116ad4661af5cf8e8ff8fc5ad3450f123e4925e86"},
+    {file = "nh3-0.2.18-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:7b7c2a3c9eb1a827d42539aa64091640bd275b81e097cd1d8d82ef91ffa2e811"},
+    {file = "nh3-0.2.18-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42c64511469005058cd17cc1537578eac40ae9f7200bedcfd1fc1a05f4f8c200"},
+    {file = "nh3-0.2.18-cp37-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:0411beb0589eacb6734f28d5497ca2ed379eafab8ad8c84b31bb5c34072b7164"},
+    {file = "nh3-0.2.18-cp37-abi3-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:5f36b271dae35c465ef5e9090e1fdaba4a60a56f0bb0ba03e0932a66f28b9189"},
+    {file = "nh3-0.2.18-cp37-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:34c03fa78e328c691f982b7c03d4423bdfd7da69cd707fe572f544cf74ac23ad"},
+    {file = "nh3-0.2.18-cp37-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:19aaba96e0f795bd0a6c56291495ff59364f4300d4a39b29a0abc9cb3774a84b"},
+    {file = "nh3-0.2.18-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:de3ceed6e661954871d6cd78b410213bdcb136f79aafe22aa7182e028b8c7307"},
+    {file = "nh3-0.2.18-cp37-abi3-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6955369e4d9f48f41e3f238a9e60f9410645db7e07435e62c6a9ea6135a4907f"},
+    {file = "nh3-0.2.18-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:f0eca9ca8628dbb4e916ae2491d72957fdd35f7a5d326b7032a345f111ac07fe"},
+    {file = "nh3-0.2.18-cp37-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:3a157ab149e591bb638a55c8c6bcb8cdb559c8b12c13a8affaba6cedfe51713a"},
+    {file = "nh3-0.2.18-cp37-abi3-musllinux_1_2_i686.whl", hash = "sha256:c8b3a1cebcba9b3669ed1a84cc65bf005728d2f0bc1ed2a6594a992e817f3a50"},
+    {file = "nh3-0.2.18-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:36c95d4b70530b320b365659bb5034341316e6a9b30f0b25fa9c9eff4c27a204"},
+    {file = "nh3-0.2.18-cp37-abi3-win32.whl", hash = "sha256:a7f1b5b2c15866f2db413a3649a8fe4fd7b428ae58be2c0f6bca5eefd53ca2be"},
+    {file = "nh3-0.2.18-cp37-abi3-win_amd64.whl", hash = "sha256:8ce0f819d2f1933953fca255db2471ad58184a60508f03e6285e5114b6254844"},
+    {file = "nh3-0.2.18.tar.gz", hash = "sha256:94a166927e53972a9698af9542ace4e38b9de50c34352b962f4d9a7d4c927af4"},
+]
+
 [[package]]
 name = "nodeenv"
 version = "1.9.1"
@@ -873,9 +1256,9 @@ files = [
 
 [package.dependencies]
 numpy = [
-    {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
     {version = ">=1.22.4", markers = "python_version < \"3.11\""},
     {version = ">=1.23.2", markers = "python_version == \"3.11\""},
+    {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
 ]
 python-dateutil = ">=2.8.2"
 pytz = ">=2020.1"
@@ -931,6 +1314,20 @@ files = [
 [package.dependencies]
 flake8 = ">=5.0.0"
 
+[[package]]
+name = "pkginfo"
+version = "1.11.1"
+description = "Query metadata from sdists / bdists / installed packages."
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "pkginfo-1.11.1-py3-none-any.whl", hash = "sha256:bfa76a714fdfc18a045fcd684dbfc3816b603d9d075febef17cb6582bea29573"},
+    {file = "pkginfo-1.11.1.tar.gz", hash = "sha256:2e0dca1cf4c8e39644eed32408ea9966ee15e0d324c62ba899a393b3c6b467aa"},
+]
+
+[package.extras]
+testing = ["pytest", "pytest-cov", "wheel"]
+
 [[package]]
 name = "platformdirs"
 version = "4.3.6"
@@ -991,6 +1388,17 @@ files = [
     {file = "pycodestyle-2.12.1.tar.gz", hash = "sha256:6838eae08bbce4f6accd5d5572075c63626a15ee3e6f842df996bf62f6d73521"},
 ]
 
+[[package]]
+name = "pycparser"
+version = "2.22"
+description = "C parser in Python"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "pycparser-2.22-py3-none-any.whl", hash = "sha256:c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc"},
+    {file = "pycparser-2.22.tar.gz", hash = "sha256:491c8be9c040f5390f5bf44a5b07752bd07f56edf992381b05c701439eec10f6"},
+]
+
 [[package]]
 name = "pydantic"
 version = "2.9.2"
@@ -1006,8 +1414,8 @@ files = [
 annotated-types = ">=0.6.0"
 pydantic-core = "2.23.4"
 typing-extensions = [
-    {version = ">=4.12.2", markers = "python_version >= \"3.13\""},
     {version = ">=4.6.1", markers = "python_version < \"3.13\""},
+    {version = ">=4.12.2", markers = "python_version >= \"3.13\""},
 ]
 
 [package.extras]
@@ -1193,6 +1601,56 @@ files = [
 [package.dependencies]
 six = ">=1.5"
 
+[[package]]
+name = "python-gitlab"
+version = "3.15.0"
+description = "Interact with GitLab API"
+optional = false
+python-versions = ">=3.7.0"
+files = [
+    {file = "python-gitlab-3.15.0.tar.gz", hash = "sha256:c9e65eb7612a9fbb8abf0339972eca7fd7a73d4da66c9b446ffe528930aff534"},
+    {file = "python_gitlab-3.15.0-py3-none-any.whl", hash = "sha256:8f8d1c0d387f642eb1ac7bf5e8e0cd8b3dd49c6f34170cee3c7deb7d384611f3"},
+]
+
+[package.dependencies]
+requests = ">=2.25.0"
+requests-toolbelt = ">=0.10.1"
+
+[package.extras]
+autocompletion = ["argcomplete (>=1.10.0,<3)"]
+yaml = ["PyYaml (>=5.2)"]
+
+[[package]]
+name = "python-semantic-release"
+version = "7.34.6"
+description = "Automatic Semantic Versioning for Python projects"
+optional = false
+python-versions = "*"
+files = [
+    {file = "python-semantic-release-7.34.6.tar.gz", hash = "sha256:e9b8fb788024ae9510a924136d573588415a16eeca31cc5240f2754a80a2e831"},
+    {file = "python_semantic_release-7.34.6-py3-none-any.whl", hash = "sha256:7e3969ba4663d9b2087b02bf3ac140e202551377bf045c34e09bfe19753e19ab"},
+]
+
+[package.dependencies]
+click = ">=7,<9"
+click-log = ">=0.3,<1"
+dotty-dict = ">=1.3.0,<2"
+gitpython = ">=3.0.8,<4"
+invoke = ">=1.4.1,<3"
+packaging = "*"
+python-gitlab = ">=2,<4"
+requests = ">=2.25,<3"
+semver = ">=2.10,<3"
+tomlkit = ">=0.10,<1.0"
+twine = ">=3,<4"
+wheel = "*"
+
+[package.extras]
+dev = ["black", "isort", "tox"]
+docs = ["Jinja2 (==3.0.3)", "Sphinx (==1.8.6)"]
+mypy = ["mypy", "types-requests"]
+test = ["coverage (>=5,<6)", "mock (==1.3.0)", "pytest (>=7,<8)", "pytest-mock (>=2,<3)", "pytest-xdist (>=1,<2)", "responses (==0.13.3)"]
+
 [[package]]
 name = "pytz"
 version = "2024.2"
@@ -1204,6 +1662,17 @@ files = [
     {file = "pytz-2024.2.tar.gz", hash = "sha256:2aa355083c50a0f93fa581709deac0c9ad65cca8a9e9beac660adcbd493c798a"},
 ]
 
+[[package]]
+name = "pywin32-ctypes"
+version = "0.2.3"
+description = "A (partial) reimplementation of pywin32 using ctypes/cffi"
+optional = false
+python-versions = ">=3.6"
+files = [
+    {file = "pywin32-ctypes-0.2.3.tar.gz", hash = "sha256:d162dc04946d704503b2edc4d55f3dba5c1d539ead017afa00142c38b9885755"},
+    {file = "pywin32_ctypes-0.2.3-py3-none-any.whl", hash = "sha256:8a1513379d709975552d202d942d9837758905c8d01eb82b8bcc30918929e7b8"},
+]
+
 [[package]]
 name = "pyyaml"
 version = "6.0.2"
@@ -1266,6 +1735,25 @@ files = [
     {file = "pyyaml-6.0.2.tar.gz", hash = "sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e"},
 ]
 
+[[package]]
+name = "readme-renderer"
+version = "44.0"
+description = "readme_renderer is a library for rendering readme descriptions for Warehouse"
+optional = false
+python-versions = ">=3.9"
+files = [
+    {file = "readme_renderer-44.0-py3-none-any.whl", hash = "sha256:2fbca89b81a08526aadf1357a8c2ae889ec05fb03f5da67f9769c9a592166151"},
+    {file = "readme_renderer-44.0.tar.gz", hash = "sha256:8712034eabbfa6805cacf1402b4eeb2a73028f72d1166d6f5cb7f9c047c5d1e1"},
+]
+
+[package.dependencies]
+docutils = ">=0.21.2"
+nh3 = ">=0.2.14"
+Pygments = ">=2.5.1"
+
+[package.extras]
+md = ["cmarkgfm (>=0.8.0)"]
+
 [[package]]
 name = "referencing"
 version = "0.35.1"
@@ -1302,6 +1790,34 @@ urllib3 = ">=1.21.1,<3"
 socks = ["PySocks (>=1.5.6,!=1.5.7)"]
 use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"]
 
+[[package]]
+name = "requests-toolbelt"
+version = "1.0.0"
+description = "A utility belt for advanced users of python-requests"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
+files = [
+    {file = "requests-toolbelt-1.0.0.tar.gz", hash = "sha256:7681a0a3d047012b5bdc0ee37d7f8f07ebe76ab08caeccfc3921ce23c88d5bc6"},
+    {file = "requests_toolbelt-1.0.0-py2.py3-none-any.whl", hash = "sha256:cccfdd665f0a24fcf4726e690f65639d272bb0637b9b92dfd91a5568ccf6bd06"},
+]
+
+[package.dependencies]
+requests = ">=2.0.1,<3.0.0"
+
+[[package]]
+name = "rfc3986"
+version = "2.0.0"
+description = "Validating URI References per RFC 3986"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "rfc3986-2.0.0-py2.py3-none-any.whl", hash = "sha256:50b1502b60e289cb37883f3dfd34532b8873c7de9f49bb546641ce9cbd256ebd"},
+    {file = "rfc3986-2.0.0.tar.gz", hash = "sha256:97aacf9dbd4bfd829baad6e6309fa6573aaf1be3f6fa735c8ab05e46cecb261c"},
+]
+
+[package.extras]
+idna2008 = ["idna"]
+
 [[package]]
 name = "rpds-py"
 version = "0.20.0"
@@ -1414,6 +1930,32 @@ files = [
     {file = "rpds_py-0.20.0.tar.gz", hash = "sha256:d72a210824facfdaf8768cf2d7ca25a042c30320b3020de2fa04640920d4e121"},
 ]
 
+[[package]]
+name = "secretstorage"
+version = "3.3.3"
+description = "Python bindings to FreeDesktop.org Secret Service API"
+optional = false
+python-versions = ">=3.6"
+files = [
+    {file = "SecretStorage-3.3.3-py3-none-any.whl", hash = "sha256:f356e6628222568e3af06f2eba8df495efa13b3b63081dafd4f7d9a7b7bc9f99"},
+    {file = "SecretStorage-3.3.3.tar.gz", hash = "sha256:2403533ef369eca6d2ba81718576c5e0f564d5cca1b58f73a8b23e7d4eeebd77"},
+]
+
+[package.dependencies]
+cryptography = ">=2.0"
+jeepney = ">=0.6"
+
+[[package]]
+name = "semver"
+version = "2.13.0"
+description = "Python helper for Semantic Versioning (http://semver.org/)"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
+files = [
+    {file = "semver-2.13.0-py2.py3-none-any.whl", hash = "sha256:ced8b23dceb22134307c1b8abfa523da14198793d9787ac838e70e29e77458d4"},
+    {file = "semver-2.13.0.tar.gz", hash = "sha256:fa0fe2722ee1c3f57eac478820c3a5ae2f624af8264cbdf9000c980ff7f75e3f"},
+]
+
 [[package]]
 name = "six"
 version = "1.16.0"
@@ -1425,6 +1967,17 @@ files = [
     {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"},
 ]
 
+[[package]]
+name = "smmap"
+version = "5.0.1"
+description = "A pure Python implementation of a sliding window memory map manager"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "smmap-5.0.1-py3-none-any.whl", hash = "sha256:e6d8668fa5f93e706934a62d7b4db19c8d9eb8cf2adbb75ef1b675aa332b69da"},
+    {file = "smmap-5.0.1.tar.gz", hash = "sha256:dceeb6c0028fdb6734471eb07c0cd2aae706ccaecab45965ee83f11c8d3b1f62"},
+]
+
 [[package]]
 name = "snowballstemmer"
 version = "2.2.0"
@@ -1461,6 +2014,60 @@ files = [
     {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"},
 ]
 
+[[package]]
+name = "tomlkit"
+version = "0.13.2"
+description = "Style preserving TOML library"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "tomlkit-0.13.2-py3-none-any.whl", hash = "sha256:7a974427f6e119197f670fbbbeae7bef749a6c14e793db934baefc1b5f03efde"},
+    {file = "tomlkit-0.13.2.tar.gz", hash = "sha256:fff5fe59a87295b278abd31bec92c15d9bc4a06885ab12bcea52c71119392e79"},
+]
+
+[[package]]
+name = "tqdm"
+version = "4.66.5"
+description = "Fast, Extensible Progress Meter"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "tqdm-4.66.5-py3-none-any.whl", hash = "sha256:90279a3770753eafc9194a0364852159802111925aa30eb3f9d85b0e805ac7cd"},
+    {file = "tqdm-4.66.5.tar.gz", hash = "sha256:e1020aef2e5096702d8a025ac7d16b1577279c9d63f8375b63083e9a5f0fcbad"},
+]
+
+[package.dependencies]
+colorama = {version = "*", markers = "platform_system == \"Windows\""}
+
+[package.extras]
+dev = ["pytest (>=6)", "pytest-cov", "pytest-timeout", "pytest-xdist"]
+notebook = ["ipywidgets (>=6)"]
+slack = ["slack-sdk"]
+telegram = ["requests"]
+
+[[package]]
+name = "twine"
+version = "3.8.0"
+description = "Collection of utilities for publishing packages on PyPI"
+optional = false
+python-versions = ">=3.6"
+files = [
+    {file = "twine-3.8.0-py3-none-any.whl", hash = "sha256:d0550fca9dc19f3d5e8eadfce0c227294df0a2a951251a4385797c8a6198b7c8"},
+    {file = "twine-3.8.0.tar.gz", hash = "sha256:8efa52658e0ae770686a13b675569328f1fba9837e5de1867bfe5f46a9aefe19"},
+]
+
+[package.dependencies]
+colorama = ">=0.4.3"
+importlib-metadata = ">=3.6"
+keyring = ">=15.1"
+pkginfo = ">=1.8.1"
+readme-renderer = ">=21.0"
+requests = ">=2.20"
+requests-toolbelt = ">=0.8.0,<0.9.0 || >0.9.0"
+rfc3986 = ">=1.4.0"
+tqdm = ">=4.14"
+urllib3 = ">=1.26.0"
+
 [[package]]
 name = "types-setuptools"
 version = "70.3.0.20240710"
@@ -1546,7 +2153,40 @@ platformdirs = ">=3.9.1,<5"
 docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.2,!=7.3)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.6)"]
 test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8)", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10)"]
 
+[[package]]
+name = "wheel"
+version = "0.44.0"
+description = "A built-package format for Python"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "wheel-0.44.0-py3-none-any.whl", hash = "sha256:2376a90c98cc337d18623527a97c31797bd02bad0033d41547043a1cbfbe448f"},
+    {file = "wheel-0.44.0.tar.gz", hash = "sha256:a29c3f2817e95ab89aa4660681ad547c0e9547f20e75b0562fe7723c9a2a9d49"},
+]
+
+[package.extras]
+test = ["pytest (>=6.0.0)", "setuptools (>=65)"]
+
+[[package]]
+name = "zipp"
+version = "3.20.2"
+description = "Backport of pathlib-compatible object wrapper for zip files"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "zipp-3.20.2-py3-none-any.whl", hash = "sha256:a817ac80d6cf4b23bf7f2828b7cabf326f15a001bea8b1f9b49631780ba28350"},
+    {file = "zipp-3.20.2.tar.gz", hash = "sha256:bc9eb26f4506fda01b81bcde0ca78103b6e62f991b381fec825435c836edbc29"},
+]
+
+[package.extras]
+check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)"]
+cover = ["pytest-cov"]
+doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"]
+enabler = ["pytest-enabler (>=2.2)"]
+test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-ignore-flaky"]
+type = ["pytest-mypy"]
+
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "329b132a93271e27c24b2809afc4db6a95fff4e605d964d29966c09eb6f1443d"
+content-hash = "2256d7b264ca3af01e83a71107252a8f9cc57abcbe73bf1e4b6bebd33906cf9e"
diff --git a/pyproject.toml b/pyproject.toml
index 603c168..f6565b8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -67,6 +67,7 @@ flake8-docstrings = "^1.6.0"
 pep8-naming = "^0.13.2"
 jsondiff = "^2.0.0"
 types-setuptools = "^70.3.0"
+python-semantic-release = "^7.32.2"
 
 [tool.setuptools.packages.find]
 where = ["docling_core/resources/schemas"]

From 48ee25e0897bfd70f90c7f614c5a5cb10b823f3c Mon Sep 17 00:00:00 2001
From: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
Date: Mon, 30 Sep 2024 15:56:47 +0200
Subject: [PATCH 33/34] feat: set version field as string with pattern and
 check compatibility

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
---
 docling_core/types/experimental/document.py | 39 +++++++----
 test/test_docling_doc.py                    | 71 ++++++++++++---------
 2 files changed, 68 insertions(+), 42 deletions(-)

diff --git a/docling_core/types/experimental/document.py b/docling_core/types/experimental/document.py
index 6b4a9a2..2bf1420 100644
--- a/docling_core/types/experimental/document.py
+++ b/docling_core/types/experimental/document.py
@@ -1,8 +1,9 @@
 """Models for the Docling Document data type."""
 
 import mimetypes
+import re
 import typing
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, Final, List, Optional, Tuple, Union
 
 import pandas as pd
 from pydantic import (
@@ -10,18 +11,22 @@
     BaseModel,
     ConfigDict,
     Field,
+    StringConstraints,
     computed_field,
     field_validator,
     model_validator,
 )
 from tabulate import tabulate
+from typing_extensions import Annotated
 
+from docling_core.search.package import VERSION_PATTERN
 from docling_core.types.doc.tokens import DocumentToken
 from docling_core.types.experimental import BoundingBox, Size
 from docling_core.types.experimental.labels import DocItemLabel, GroupLabel
 
 Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
 LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
+CURRENT_VERSION: Final = "1.0.0"
 
 
 class BasePictureData(BaseModel):  # TBD
@@ -627,9 +632,8 @@ class DoclingDocument(BaseModel):
     """DoclingDocument."""
 
     schema_name: typing.Literal["DoclingDocument"] = "DoclingDocument"
-    version: typing.Literal["1.0.0"] = (
-        "1.0.0"
-        # Optional[SemanticVersion] = Field(default=None, validate_default=True)
+    version: Annotated[str, StringConstraints(pattern=VERSION_PATTERN, strict=True)] = (
+        CURRENT_VERSION
     )
     description: DescriptionItem
     name: str  # The working name of this document, without extensions
@@ -653,15 +657,6 @@ class DoclingDocument(BaseModel):
 
     pages: Dict[int, PageItem] = {}  # empty as default
 
-    # @field_validator("version")
-    # @classmethod
-    # def check_version_omitted(cls, v: str) -> str:
-    #     """Set the version field to this library version by default."""
-    #     if v is None:
-    #         return "1.0.0"
-    #     else:
-    #         return v
-
     def add_group(
         self,
         label: Optional[GroupLabel] = None,
@@ -1171,6 +1166,35 @@ def add_page(self, page_no: int, size: Size) -> PageItem:
         self.pages[page_no] = pitem
         return pitem
 
+    @field_validator("version")
+    @classmethod
+    def check_version_is_compatible(cls, v: str) -> str:
+        """Check if this document version is compatible with current version.
+
+        The version is accepted when its major number equals the major number of
+        CURRENT_VERSION and its minor number is not greater than the minor number
+        of CURRENT_VERSION. An accepted version is replaced by CURRENT_VERSION.
+
+        Raises:
+            ValueError: If the version does not match VERSION_PATTERN or is not
+                compatible with CURRENT_VERSION.
+        """
+        current_match = re.match(VERSION_PATTERN, CURRENT_VERSION)
+        doc_match = re.match(VERSION_PATTERN, v)
+        # The regex groups are strings: compare the minor numbers as integers,
+        # otherwise "10" > "9" would be evaluated lexicographically as False.
+        if (
+            doc_match is None
+            or current_match is None
+            or doc_match["major"] != current_match["major"]
+            or int(doc_match["minor"]) > int(current_match["minor"])
+        ):
+            raise ValueError(
+                f"incompatible version {v} with schema version {CURRENT_VERSION}"
+            )
+        else:
+            return CURRENT_VERSION
+
     @model_validator(mode="after")  # type: ignore
     @classmethod
     def validate_document(cls, d: "DoclingDocument"):
diff --git a/test/test_docling_doc.py b/test/test_docling_doc.py
index ae60c8f..2329f46 100644
--- a/test/test_docling_doc.py
+++ b/test/test_docling_doc.py
@@ -2,8 +2,10 @@
 
 import pytest
 import yaml
+from pydantic import ValidationError
 
 from docling_core.types.experimental.document import (
+    CURRENT_VERSION,
     BasePictureData,
     BaseTableData,
     DescriptionItem,
@@ -335,32 +337,43 @@ def _construct_doc() -> DoclingDocument:
     return doc
 
 
-# def test_version_doc():
-#
-#     # default version
-#     version = "1.0.0"
-#     doc = DoclingDocument(description=DescriptionItem(), name="Untitled 1")
-#     assert doc.version == version
-#
-#     with open("test/data/experimental/dummy_doc.yaml", "r") as fp:
-#         dict_from_yaml = yaml.safe_load(fp)
-#     doc = DoclingDocument.model_validate(dict_from_yaml)
-#     assert doc.version == version
-#
-#     # custom version at construction
-#     doc = DoclingDocument(
-#         description=DescriptionItem(),
-#         name="Untitled 1",
-#         version="2.1.0-post.8+96354bda",
-#     )
-#     assert doc.version.major == 2
-#     assert doc.version.minor == 1
-#     assert doc.version.patch == 0
-#     assert doc.version.prerelease == "post.8"
-#     assert doc.version.build == "96354bda"
-#     doc_json = doc.model_dump()
-#     assert doc_json["version"] == "2.1.0-post.8+96354bda"
-#
-#     # invalid version
-#     with pytest.raises(ValidationError, match="SemVer"):
-#         DoclingDocument(description=DescriptionItem(), name="Untitled 1", version="abc")
+def test_version_doc():
+    """Check default, validation and compatibility of the document version."""
+    # default version: an omitted version is set to CURRENT_VERSION
+    doc = DoclingDocument(description=DescriptionItem(), name="Untitled 1")
+    assert doc.version == CURRENT_VERSION
+
+    with open("test/data/experimental/dummy_doc.yaml") as fp:
+        dict_from_yaml = yaml.safe_load(fp)
+    doc = DoclingDocument.model_validate(dict_from_yaml)
+    assert doc.version == CURRENT_VERSION
+
+    # invalid version: None is rejected, and so is a non-matching string
+    with pytest.raises(ValidationError, match="NoneType"):
+        DoclingDocument(description=DescriptionItem(), name="Untitled 1", version=None)
+    with pytest.raises(ValidationError, match="pattern"):
+        DoclingDocument(description=DescriptionItem(), name="Untitled 1", version="abc")
+
+    # incompatible version (major): bump the major number by one
+    major_split = CURRENT_VERSION.split(".", 1)
+    new_version = f"{int(major_split[0]) + 1}.{major_split[1]}"
+    with pytest.raises(ValidationError, match="incompatible"):
+        DoclingDocument(
+            description=DescriptionItem(), name="Untitled 1", version=new_version
+        )
+
+    # incompatible version (minor): bump the minor number by one
+    minor_split = major_split[1].split(".", 1)
+    new_version = f"{major_split[0]}.{int(minor_split[0]) + 1}.{minor_split[1]}"
+    with pytest.raises(ValidationError, match="incompatible"):
+        DoclingDocument(
+            description=DescriptionItem(), name="Untitled 1", version=new_version
+        )
+
+    # compatible version (same major and minor, patch bumped by one)
+    patch_split = minor_split[1].split(".", 1)
+    comp_version = f"{major_split[0]}.{minor_split[0]}.{int(patch_split[0]) + 1}"
+    doc = DoclingDocument(
+        description=DescriptionItem(), name="Untitled 1", version=comp_version
+    )
+    assert doc.version == CURRENT_VERSION

From 0ed6c5c7f063eb40b311a987989db89449ccc86f Mon Sep 17 00:00:00 2001
From: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
Date: Mon, 30 Sep 2024 22:49:41 +0200
Subject: [PATCH 34/34] add JSON Pointer validation to refs, fix test data

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
---
 docling_core/types/experimental/document.py | 20 ++++++++++----------
 test/data/experimental/dummy_doc.yaml       | 18 +++++++++---------
 2 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/docling_core/types/experimental/document.py b/docling_core/types/experimental/document.py
index 2bf1420..a528dd4 100644
--- a/docling_core/types/experimental/document.py
+++ b/docling_core/types/experimental/document.py
@@ -28,6 +28,9 @@
 LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
 CURRENT_VERSION: Final = "1.0.0"
 
+# (subset of) JSON Pointer URI fragment identifier format:
+_JSON_POINTER_REGEX = r"^#(/[\w\-]+(/\d+)?)?$"
+
 
 class BasePictureData(BaseModel):  # TBD
     """BasePictureData."""
@@ -155,7 +158,7 @@ def validate_mimetype(cls, v):
 class RefItem(BaseModel):
     """RefItem."""
 
-    cref: str = Field(alias="$ref")
+    cref: str = Field(alias="$ref", pattern=_JSON_POINTER_REGEX)
 
     # This method makes RefItem compatible with DocItem
     def get_ref(self):
@@ -169,18 +172,15 @@ def get_ref(self):
     def resolve(self, doc: "DoclingDocument"):
         """resolve."""
         path_components = self.cref.split("/")
-        if len(path_components) > 2:
+        if (num_comps := len(path_components)) == 3:  # "#/<path>/<index>"
             _, path, index_str = path_components
-        else:
-            _, path = path_components
-            index_str = None
-
-        if index_str:
             index = int(index_str)
             obj = doc.__getattribute__(path)[index]
-        else:
+        elif num_comps == 2:  # "#/<path>": the attribute itself, no index
+            _, path = path_components
             obj = doc.__getattribute__(path)
-
+        else:  # neither "#/<path>" nor "#/<path>/<index>"
+            raise RuntimeError(f"Unsupported number of path components: {num_comps}")
         return obj
 
 
@@ -213,7 +213,7 @@ class ProvenanceItem(BaseModel):
 class NodeItem(BaseModel):
     """NodeItem."""
 
-    self_ref: str  # format spec: json-path
+    self_ref: str = Field(pattern=_JSON_POINTER_REGEX)
     parent: Optional[RefItem] = None
     children: List[RefItem] = []
 
diff --git a/test/data/experimental/dummy_doc.yaml b/test/data/experimental/dummy_doc.yaml
index d72f454..c1fbe59 100644
--- a/test/data/experimental/dummy_doc.yaml
+++ b/test/data/experimental/dummy_doc.yaml
@@ -14,7 +14,7 @@ furniture:
   self_ref: "#/furniture"
   parent: null # Only root elements have no parent.
   children: # only the first-level children appear here, as references (RefItem)
-    - $ref: "/texts/0"
+    - $ref: "#/texts/0"
 
 # Root element for anything in the document body, type GroupItem
 body:
@@ -22,10 +22,10 @@ body:
   self_ref: "#/body"
   parent: null # Only root elements have no parent.
   children: # only the first-level children appear here, as references (RefItem)
-    - $ref: "/texts/1"
-    - $ref: "/pictures/0"
-    - $ref: "/texts/3"
-    - $ref: "/tables/0"
+    - $ref: "#/texts/1"
+    - $ref: "#/pictures/0"
+    - $ref: "#/texts/3"
+    - $ref: "#/tables/0"
 
 # All groups of items nested deeper in body or furniture roots, type List[GroupItem]
 groups: [] # The parent + children relations capture nesting and reading-order.
@@ -68,7 +68,7 @@ texts:
     self_ref: "#/texts/2"
     label: "section_header"
     parent:
-      $ref: "/pictures/0"
+      $ref: "#/pictures/0"
     children: [ ]
     prov:
       - page_no: 1
@@ -132,7 +132,7 @@ pictures: # All pictures...
     parent:
       $ref: "#/body"
     captions:
-      - $ref: "/texts/3"
+      - $ref: "#/texts/3"
     data: # BaseFigureData Type
       classification: "illustration"
       confidence: 0.78
@@ -147,7 +147,7 @@ pictures: # All pictures...
       uri: "file:///dummy_doc/pictures/0.png"
       #alternatives: base64 encoded striong
     children:
-      - $ref: "/texts/2" # This text element appears inside the figure, hence it is a child.
+      - $ref: "#/texts/2" # This text element appears inside the figure, hence it is a child.
     prov:
       - page_no: 1
         bbox:
@@ -174,4 +174,4 @@ pages: # Optional, for layout documents
       uri: "file:///dummy_doc/pages/1.png"
       #alternatives: base64 encoded string
     num_elements: 23
-    page_no: 1
\ No newline at end of file
+    page_no: 1