Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: (experimental) introduce new document format #21

Merged
merged 43 commits into from
Oct 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
a90cc19
Draft new docling document format, pydantic model and tests
cau-git Sep 17, 2024
43c23b9
Fix tests to have unique document_hashes per test
cau-git Sep 18, 2024
384d12a
Manual update from main
cau-git Sep 19, 2024
ed08764
Merge from main
cau-git Sep 19, 2024
1c753e9
Merge from main
cau-git Sep 19, 2024
3d3c2f3
Move new-format to experimental path
cau-git Sep 20, 2024
bdbd93e
Move new-format to experimental path
cau-git Sep 20, 2024
7dcbde7
Updates for document construction API and format
cau-git Sep 20, 2024
940f6cd
Add comments
cau-git Sep 20, 2024
ccbe241
Add BaseTableData and table cell typing
cau-git Sep 21, 2024
0685709
Tree element iterator, several API fixes
cau-git Sep 23, 2024
f791f74
Turn captions into list field
cau-git Sep 23, 2024
22137a7
Merge branch 'main' of github.com:DS4SD/docling-core into cau/new-for…
cau-git Sep 23, 2024
4f1c190
Add export methods to DoclingDocument and types
cau-git Sep 24, 2024
0a1e6ce
Change DoclingDocument.iterate_elements and add print tree function
cau-git Sep 24, 2024
a83ff00
Introduce label enum types, apply everywhere
cau-git Sep 24, 2024
622f625
Introduce provenance info, use enum labels
cau-git Sep 24, 2024
70ec6b1
Fixes for iterator, add label to GroupItem
cau-git Sep 25, 2024
b50d53c
Update formatting
cau-git Sep 25, 2024
4c12a69
Docstrings and linter fixes
cau-git Sep 25, 2024
8251f99
Lockfile rollback, since updating breaks tests
cau-git Sep 25, 2024
acb1cdc
Cleanup
cau-git Sep 25, 2024
ce0b7ee
Several improvements and cleanup
cau-git Sep 25, 2024
c986ea5
Format fixes
cau-git Sep 25, 2024
2389406
Big redesign for usage of hashes, several other fixes
cau-git Sep 27, 2024
67109c4
Fix flake8 config
cau-git Sep 27, 2024
a6167f7
Merge branch 'main' of github.com:DS4SD/docling-core into cau/new-for…
cau-git Sep 27, 2024
23e1a52
Remove hash, renamings
cau-git Sep 27, 2024
fe72b3b
updating the tests
PeterStaar-IBM Sep 27, 2024
b6a7049
fixed merge conflicts
PeterStaar-IBM Sep 27, 2024
baced33
added some unit tests for DocItem
PeterStaar-IBM Sep 27, 2024
40d7fa4
Add tree validation API and test case
cau-git Sep 27, 2024
5084853
Merge branch 'cau/new-format-dev' of github.com:DS4SD/docling-core in…
cau-git Sep 27, 2024
adc16f3
Add extra=Forbid to NodeItem
cau-git Sep 27, 2024
34ce64b
feat: set DoclingDocument version as SemanticVersion with default
ceberam Sep 27, 2024
62a3ebe
Cleanup
cau-git Sep 30, 2024
677c759
More renames, schema_name field, literal types
cau-git Sep 30, 2024
46a6d8d
Simpler literal enforcement
cau-git Sep 30, 2024
bb96c84
Fix static document version
cau-git Sep 30, 2024
089d692
Rollback changes to allow for semver<3
cau-git Sep 30, 2024
9fcaa88
build: rollback changes to include python-semantic-release as dev dep…
ceberam Sep 30, 2024
48ee25e
feat: set version field as string with pattern and check compatibility
ceberam Sep 30, 2024
0ed6c5c
add JSON Pointer validation to refs, fix test data
vagenas Sep 30, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .flake8
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
[flake8]
per-file-ignores = __init__.py:F401
max-line-length = 88
exclude = test/*
max-complexity = 18
docstring-convention = google
ignore = W503,E203
classmethod-decorators = classmethod,validator
classmethod-decorators = classmethod,validator
1 change: 0 additions & 1 deletion docling_core/types/doc/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -440,7 +440,6 @@ def export_to_document_tokens(
):
"""Export text element to document tokens format."""
body = f"<{self.obj_type}>"
# body = f"<{self.name}>"

assert DocumentToken.is_known_token(
body
Expand Down
30 changes: 30 additions & 0 deletions docling_core/types/experimental/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#
# Copyright IBM Corp. 2024 - 2024
# SPDX-License-Identifier: MIT
#

"""Package for models defined by the Document type."""

from .base import BoundingBox, CoordOrigin, Size
from .document import (
BasePictureData,
BaseTableData,
DescriptionItem,
DocItem,
DoclingDocument,
DocumentOrigin,
FloatingItem,
GroupItem,
ImageRef,
KeyValueItem,
NodeItem,
PageItem,
PictureItem,
ProvenanceItem,
RefItem,
SectionHeaderItem,
TableCell,
TableItem,
TextItem,
)
from .labels import DocItemLabel, GroupLabel, TableCellLabel
167 changes: 167 additions & 0 deletions docling_core/types/experimental/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
"""Models for the base data types."""

import copy
from enum import Enum
from typing import Tuple

from pydantic import BaseModel


class CoordOrigin(str, Enum):
    """Reference corner from which bounding-box coordinates are measured.

    Members are also ``str`` instances, so each one compares equal to the
    plain string holding its name.
    """

    # Coordinates are measured from the top-left corner.
    TOPLEFT = "TOPLEFT"
    # Coordinates are measured from the bottom-left corner.
    BOTTOMLEFT = "BOTTOMLEFT"


class Size(BaseModel):
    """Two-dimensional extent expressed as a width and a height.

    Both fields default to ``0.0``.
    """

    width: float = 0.0
    height: float = 0.0

    def as_tuple(self) -> Tuple[float, float]:
        """Return the size as a ``(width, height)`` pair."""
        dimensions = (self.width, self.height)
        return dimensions


class BoundingBox(BaseModel):
    """Axis-aligned rectangle described by left, top, right and bottom edges.

    The vertical meaning of ``t`` and ``b`` depends on ``coord_origin``:

    * ``TOPLEFT``: a well-formed box has ``t <= b``.
    * ``BOTTOMLEFT``: a well-formed box has ``t >= b``.

    The constructor does not enforce this ordering; ``from_tuple`` does.
    """

    l: float  # left
    t: float  # top
    r: float  # right
    b: float  # bottom

    coord_origin: CoordOrigin = CoordOrigin.TOPLEFT

    @property
    def width(self):
        """Horizontal extent, computed as ``r - l``."""
        return self.r - self.l

    @property
    def height(self):
        """Vertical extent as a non-negative value, for either origin."""
        return abs(self.t - self.b)

    def scaled(self, scale: float) -> "BoundingBox":
        """Return a copy with all four coordinates multiplied by ``scale``.

        Args:
            scale: Factor applied to ``l``, ``r``, ``t`` and ``b``.

        Returns:
            A new box; the instance itself is left untouched.
        """
        out_bbox = copy.deepcopy(self)
        out_bbox.l *= scale
        out_bbox.r *= scale
        out_bbox.t *= scale
        out_bbox.b *= scale

        return out_bbox

    def normalized(self, page_size: Size) -> "BoundingBox":
        """Return a copy with coordinates divided by the page dimensions.

        Args:
            page_size: ``l``/``r`` are divided by its ``width`` and
                ``t``/``b`` by its ``height``.

        Returns:
            A new box; the instance itself is left untouched.

        Raises:
            ZeroDivisionError: If the relevant page dimension is zero.
        """
        out_bbox = copy.deepcopy(self)
        out_bbox.l /= page_size.width
        out_bbox.r /= page_size.width
        out_bbox.t /= page_size.height
        out_bbox.b /= page_size.height

        return out_bbox

    def as_tuple(self):
        """Return the coordinates as a 4-tuple ordered according to the origin.

        Returns:
            ``(l, t, r, b)`` for ``TOPLEFT`` and ``(l, b, r, t)`` for
            ``BOTTOMLEFT``.
        """
        if self.coord_origin == CoordOrigin.TOPLEFT:
            return (self.l, self.t, self.r, self.b)
        elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
            return (self.l, self.b, self.r, self.t)

    @classmethod
    def from_tuple(cls, coord: Tuple[float, ...], origin: CoordOrigin):
        """Build a box from a coordinate tuple, swapping misordered edges.

        Args:
            coord: At least four values, read as ``(l, t, r, b)`` for
                ``TOPLEFT`` and as ``(l, b, r, t)`` for ``BOTTOMLEFT``.
            origin: Coordinate origin the values are expressed in.

        Returns:
            An instance of ``cls`` with ``l <= r`` and with ``t``/``b``
            ordered as the origin requires.

        Raises:
            IndexError: If ``coord`` has fewer than four values.
        """
        if origin == CoordOrigin.TOPLEFT:
            l, t, r, b = coord[0], coord[1], coord[2], coord[3]
            if r < l:
                l, r = r, l
            if b < t:
                b, t = t, b

            # Use cls rather than the hard-coded class so subclasses get
            # instances of their own type.
            return cls(l=l, t=t, r=r, b=b, coord_origin=origin)
        elif origin == CoordOrigin.BOTTOMLEFT:
            l, b, r, t = coord[0], coord[1], coord[2], coord[3]
            if r < l:
                l, r = r, l
            if b > t:
                b, t = t, b

            return cls(l=l, t=t, r=r, b=b, coord_origin=origin)

    def area(self) -> float:
        """Return the non-negative area of the box.

        Absolute extents are used because ``b - t`` is negative for a
        well-formed ``BOTTOMLEFT`` box, which would otherwise yield a
        negative area.
        """
        return abs(self.r - self.l) * abs(self.b - self.t)

    def intersection_area_with(self, other: "BoundingBox") -> float:
        """Return the area of overlap between this box and ``other``.

        Both boxes are expected to share the same ``coord_origin``; the
        result is not meaningful otherwise, because no page height is
        available here to convert between origins.

        Args:
            other: Box to intersect with.

        Returns:
            The overlapping area, or ``0.0`` when the boxes do not overlap.
        """
        # Horizontal overlap.
        left = max(self.l, other.l)
        right = min(self.r, other.r)

        # Vertical overlap: order each box's edges first, so the computation
        # is valid whether t <= b (TOPLEFT) or t >= b (BOTTOMLEFT).
        lower = max(min(self.t, self.b), min(other.t, other.b))
        upper = min(max(self.t, self.b), max(other.t, other.b))

        width = right - left
        height = upper - lower

        # If the bounding boxes do not overlap, width or height is not positive.
        if width <= 0 or height <= 0:
            return 0.0

        return width * height

    def to_bottom_left_origin(self, page_height: float) -> "BoundingBox":
        """Return this box expressed with a bottom-left origin.

        Args:
            page_height: Page height used to flip the vertical coordinates.

        Returns:
            ``self`` (the same instance) if the box is already
            ``BOTTOMLEFT``, otherwise a new box with ``t`` and ``b``
            replaced by ``page_height - t`` and ``page_height - b``.
        """
        if self.coord_origin == CoordOrigin.BOTTOMLEFT:
            return self
        elif self.coord_origin == CoordOrigin.TOPLEFT:
            return BoundingBox(
                l=self.l,
                r=self.r,
                t=page_height - self.t,
                b=page_height - self.b,
                coord_origin=CoordOrigin.BOTTOMLEFT,
            )

    def to_top_left_origin(self, page_height: float) -> "BoundingBox":
        """Return this box expressed with a top-left origin.

        Args:
            page_height: Page height used to flip the vertical coordinates.

        Returns:
            ``self`` (the same instance) if the box is already ``TOPLEFT``,
            otherwise a new box with ``t`` and ``b`` replaced by
            ``page_height - t`` and ``page_height - b``.
        """
        if self.coord_origin == CoordOrigin.TOPLEFT:
            return self
        elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
            return BoundingBox(
                l=self.l,
                r=self.r,
                t=page_height - self.t,
                b=page_height - self.b,
                coord_origin=CoordOrigin.TOPLEFT,
            )
Loading
Loading