Skip to content

Commit

Permalink
feat: extend chunk meta with schema, version, origin (#49)
Browse files Browse the repository at this point in the history
Signed-off-by: Panos Vagenas <[email protected]>
  • Loading branch information
vagenas authored Oct 22, 2024
1 parent 1388e67 commit d09fe7e
Show file tree
Hide file tree
Showing 5 changed files with 1,880 additions and 236 deletions.
5 changes: 4 additions & 1 deletion docling_core/transforms/chunker/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,7 @@
"""Define the chunker types."""

from docling_core.transforms.chunker.base import BaseChunk, BaseChunker, BaseMeta
from docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChunker
from docling_core.transforms.chunker.hierarchical_chunker import (
DocMeta,
HierarchicalChunker,
)
2 changes: 1 addition & 1 deletion docling_core/transforms/chunker/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@


class BaseMeta(BaseModel):
"""Metadata base class."""
"""Chunk metadata base class."""

excluded_embed: ClassVar[list[str]] = []
excluded_llm: ClassVar[list[str]] = []
Expand Down
64 changes: 58 additions & 6 deletions docling_core/transforms/chunker/hierarchical_chunker.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,19 @@
from __future__ import annotations

import logging
from typing import Any, ClassVar, Iterator, Optional
import re
from typing import Any, ClassVar, Final, Iterator, Literal, Optional

from pandas import DataFrame
from pydantic import Field
from pydantic import Field, StringConstraints, field_validator
from typing_extensions import Annotated

from docling_core.search.package import VERSION_PATTERN
from docling_core.transforms.chunker import BaseChunk, BaseChunker, BaseMeta
from docling_core.types.doc import DoclingDocument as DLDocument
from docling_core.types import DoclingDocument as DLDocument
from docling_core.types.doc.document import (
DocItem,
DocumentOrigin,
LevelNumber,
ListItem,
SectionHeaderItem,
Expand All @@ -25,16 +29,31 @@
)
from docling_core.types.doc.labels import DocItemLabel

_VERSION: Final = "1.0.0"

_KEY_SCHEMA_NAME = "schema_name"
_KEY_VERSION = "version"
_KEY_DOC_ITEMS = "doc_items"
_KEY_HEADINGS = "headings"
_KEY_CAPTIONS = "captions"
_KEY_ORIGIN = "origin"

_logger = logging.getLogger(__name__)


class DocMeta(BaseMeta):
"""Data model for Hierarchical Chunker metadata."""
"""Data model for Hierarchical Chunker chunk metadata."""

schema_name: Literal["docling_core.transforms.chunker.DocMeta"] = Field(
default="docling_core.transforms.chunker.DocMeta",
alias=_KEY_SCHEMA_NAME,
)
version: Annotated[str, StringConstraints(pattern=VERSION_PATTERN, strict=True)] = (
Field(
default=_VERSION,
alias=_KEY_VERSION,
)
)
doc_items: list[DocItem] = Field(
alias=_KEY_DOC_ITEMS,
min_length=1,
Expand All @@ -49,9 +68,39 @@ class DocMeta(BaseMeta):
alias=_KEY_CAPTIONS,
min_length=1,
)
origin: Optional[DocumentOrigin] = Field(
default=None,
alias=_KEY_ORIGIN,
)

excluded_embed: ClassVar[list[str]] = [_KEY_DOC_ITEMS]
excluded_llm: ClassVar[list[str]] = [_KEY_DOC_ITEMS]
excluded_embed: ClassVar[list[str]] = [
_KEY_SCHEMA_NAME,
_KEY_VERSION,
_KEY_DOC_ITEMS,
_KEY_ORIGIN,
]
excluded_llm: ClassVar[list[str]] = [
_KEY_SCHEMA_NAME,
_KEY_VERSION,
_KEY_DOC_ITEMS,
_KEY_ORIGIN,
]

@field_validator(_KEY_VERSION)
@classmethod
def check_version_is_compatible(cls, v: str) -> str:
"""Check if this meta item version is compatible with current version."""
current_match = re.match(VERSION_PATTERN, _VERSION)
doc_match = re.match(VERSION_PATTERN, v)
if (
doc_match is None
or current_match is None
or doc_match["major"] != current_match["major"]
or doc_match["minor"] > current_match["minor"]
):
raise ValueError(f"incompatible version {v} with schema version {_VERSION}")
else:
return _VERSION


class DocChunk(BaseChunk):
Expand Down Expand Up @@ -129,6 +178,7 @@ def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]:
for k in sorted(heading_by_level)
]
or None,
origin=dl_doc.origin,
),
)
list_items = [] # reset
Expand Down Expand Up @@ -171,6 +221,7 @@ def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]:
headings=[heading_by_level[k] for k in sorted(heading_by_level)]
or None,
captions=captions,
origin=dl_doc.origin,
),
)
yield c
Expand All @@ -182,5 +233,6 @@ def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]:
doc_items=list_items,
headings=[heading_by_level[k] for k in sorted(heading_by_level)]
or None,
origin=dl_doc.origin,
),
)
Loading

0 comments on commit d09fe7e

Please sign in to comment.