diff --git a/docling_core/transforms/chunker/base.py b/docling_core/transforms/chunker/base.py
index 17650e7..c77a59a 100644
--- a/docling_core/transforms/chunker/base.py
+++ b/docling_core/transforms/chunker/base.py
@@ -7,16 +7,39 @@
 from abc import ABC, abstractmethod
 from typing import Iterator, Optional
 
-from pydantic import BaseModel
+from pydantic import BaseModel, model_validator
 
 from docling_core.types import BoundingBox, Document
 
 
+def _create_path(pos: int, path_prefix: str = "main-text") -> str:
+    return f"#/{path_prefix}/{pos}"
+
+
 class Chunk(BaseModel):
     """Data model for Chunk."""
 
     path: str
     text: str
+    heading: Optional[str] = None
+
+    @model_validator(mode="before")
+    @classmethod
+    def _json_pointer_from_json_path(cls, data):
+        """Convert a legacy JSON Path (``$.prefix[pos]``) to a JSON Pointer."""
+        # A "before" validator can receive non-dict input, and "path" may be
+        # missing or not a string; leave those cases to field validation.
+        path = data.get("path") if isinstance(data, dict) else None
+        if isinstance(path, str) and path.startswith("$."):
+            prefix, _, index = path[2:].partition("[")
+            # Only rewrite well-formed legacy paths; anything else is kept as is.
+            if index.endswith("]") and index[:-1].isdecimal():
+                # Copy so that the caller's dict is not mutated.
+                data = {
+                    **data,
+                    "path": _create_path(pos=int(index[:-1]), path_prefix=prefix),
+                }
+        return data
 
 
 class ChunkWithMetadata(Chunk):
@@ -24,7 +47,6 @@ class ChunkWithMetadata(Chunk):
 
     page: Optional[int] = None
     bbox: Optional[BoundingBox] = None
-    heading: Optional[str] = None
 
 
 class BaseChunker(BaseModel, ABC):
@@ -44,3 +66,10 @@ def chunk(self, dl_doc: Document, **kwargs) -> Iterator[Chunk]:
             Iterator[Chunk]: iterator over extracted chunks
         """
         raise NotImplementedError()
+
+    @classmethod
+    def _create_path(cls, pos: int, path_prefix: str = "main-text") -> str:
+        return _create_path(
+            pos=pos,
+            path_prefix=path_prefix,
+        )
diff --git a/docling_core/transforms/chunker/hierarchical_chunker.py b/docling_core/transforms/chunker/hierarchical_chunker.py
index 5b1831d..2ad17d0 100644
--- a/docling_core/transforms/chunker/hierarchical_chunker.py
+++ b/docling_core/transforms/chunker/hierarchical_chunker.py
@@ -12,7 +12,7 @@
 from typing import Any, Iterator, Optional, Union
 
 import pandas as pd
-from pydantic import BaseModel, PositiveInt
+from pydantic import BaseModel, Field, PositiveInt
 
 from docling_core.transforms.chunker import BaseChunker, Chunk,
ChunkWithMetadata from docling_core.types import BaseText @@ -25,9 +25,17 @@ class HierarchicalChunker(BaseChunker): """Chunker implementation leveraging the document layout.""" - include_metadata: bool = True - heading_as_metadata: bool = False - min_chunk_len: PositiveInt = 64 + heading_as_metadata: bool = Field( + default=False, + description="Whether heading should be in metadata (instead of text)", + ) + include_metadata: bool = Field( + default=True, + description="Whether to include extras in the metadata", + ) + min_chunk_len: PositiveInt = Field( + default=64, description="Minimum chunk text length to consider (in chars)" + ) class _NodeType(str, Enum): PARAGRAPH = "paragraph" @@ -83,10 +91,6 @@ def _triplet_serialize(cls, table) -> Optional[str]: return output_text - @classmethod - def _create_path(cls, pos: int, path_prefix: str = "main-text") -> str: - return f"$.{path_prefix}[{pos}]" - class _MainTextItemNode(BaseModel): parent: Optional[int] = None children: list[int] = [] @@ -304,14 +308,15 @@ def _build_chunk( return ChunkWithMetadata( text=concat, path=path, + heading=heading, page=item.prov[0].page if item.prov else None, bbox=item.prov[0].bbox if item.prov else None, - heading=heading, ) else: return Chunk( text=concat, path=path, + heading=heading, ) else: return None @@ -327,11 +332,6 @@ def chunk(self, dl_doc: DLDocument, delim="\n", **kwargs: Any) -> Iterator[Chunk Yields: Iterator[Chunk]: iterator over extracted chunks """ - if (not self.include_metadata) and self.heading_as_metadata: - raise RuntimeError( - "To enable `heading_as_metadata`, also `include_metadata` must be True." - ) - if dl_doc.main_text: # extract doc structure incl. metadata for # each item (e.g. 
parent, children) diff --git a/test/data/chunker/0_out_chunks_with_meta_incl_heading.json b/test/data/chunker/0_out_chunks_heading_in_meta_with_extras.json similarity index 79% rename from test/data/chunker/0_out_chunks_with_meta_incl_heading.json rename to test/data/chunker/0_out_chunks_heading_in_meta_with_extras.json index 6c44109..af62962 100644 --- a/test/data/chunker/0_out_chunks_with_meta_incl_heading.json +++ b/test/data/chunker/0_out_chunks_heading_in_meta_with_extras.json @@ -1,7 +1,7 @@ { "root": [ { - "path": "$.main-text[0]", + "path": "#/main-text/0", "text": "This paragraph is marginally long enough for getting accepted as a chunk.", "page": 1, "bbox": [ @@ -12,40 +12,40 @@ ] }, { - "path": "$.main-text[4]", + "path": "#/main-text/4", "text": "This one should also include the subtitle above since it is long enough.", + "heading": "Some subtitle", "page": 3, "bbox": [ 5.0, 6.0, 7.0, 8.0 - ], - "heading": "Some subtitle" + ] }, { - "path": "$.tables[0]", + "path": "#/tables/0", "text": "Atomic Vision, Business = Website design. Atomic Vision, Country = United States. Delix Computer GmbH, Business = Computers and software. 
Delix Computer GmbH, Country = Germany", + "heading": "Acquisitions", "page": 4, "bbox": [ 8.0, 9.0, 10.0, 11.0 - ], - "heading": "Acquisitions" + ] }, { - "path": "$.main-text[8]", + "path": "#/main-text/8", "text": "This paragraph is right before the list.\nSome first bullet content here.\nAnd then some second bullet content here.", + "heading": "Acquisitions", "page": 4, "bbox": [ 8.0, 9.0, 10.0, 11.0 - ], - "heading": "Acquisitions" + ] } ] } diff --git a/test/data/chunker/0_out_chunks_heading_in_meta_wout_extras.json b/test/data/chunker/0_out_chunks_heading_in_meta_wout_extras.json new file mode 100644 index 0000000..d45de94 --- /dev/null +++ b/test/data/chunker/0_out_chunks_heading_in_meta_wout_extras.json @@ -0,0 +1,23 @@ +{ + "root": [ + { + "path": "#/main-text/0", + "text": "This paragraph is marginally long enough for getting accepted as a chunk." + }, + { + "path": "#/main-text/4", + "text": "This one should also include the subtitle above since it is long enough.", + "heading": "Some subtitle" + }, + { + "path": "#/tables/0", + "text": "Atomic Vision, Business = Website design. Atomic Vision, Country = United States. Delix Computer GmbH, Business = Computers and software. 
Delix Computer GmbH, Country = Germany", + "heading": "Acquisitions" + }, + { + "path": "#/main-text/8", + "text": "This paragraph is right before the list.\nSome first bullet content here.\nAnd then some second bullet content here.", + "heading": "Acquisitions" + } + ] +} diff --git a/test/data/chunker/0_out_chunks_with_meta_heading_in_text.json b/test/data/chunker/0_out_chunks_heading_in_text_with_extras.json similarity index 89% rename from test/data/chunker/0_out_chunks_with_meta_heading_in_text.json rename to test/data/chunker/0_out_chunks_heading_in_text_with_extras.json index c33fc21..054bbc5 100644 --- a/test/data/chunker/0_out_chunks_with_meta_heading_in_text.json +++ b/test/data/chunker/0_out_chunks_heading_in_text_with_extras.json @@ -1,7 +1,7 @@ { "root": [ { - "path": "$.main-text[0]", + "path": "#/main-text/0", "text": "This paragraph is marginally long enough for getting accepted as a chunk.", "page": 1, "bbox": [ @@ -12,7 +12,7 @@ ] }, { - "path": "$.main-text[4]", + "path": "#/main-text/4", "text": "Some subtitle\nThis one should also include the subtitle above since it is long enough.", "page": 3, "bbox": [ @@ -23,7 +23,7 @@ ] }, { - "path": "$.tables[0]", + "path": "#/tables/0", "text": "Acquisitions\nAtomic Vision, Business = Website design. Atomic Vision, Country = United States. Delix Computer GmbH, Business = Computers and software. 
Delix Computer GmbH, Country = Germany", "page": 4, "bbox": [ @@ -34,7 +34,7 @@ ] }, { - "path": "$.main-text[7]", + "path": "#/main-text/7", "text": "Acquisitions\nThis paragraph should actually include the latest subtitle.", "page": 4, "bbox": [ @@ -45,7 +45,7 @@ ] }, { - "path": "$.main-text[8]", + "path": "#/main-text/8", "text": "Acquisitions\nThis paragraph is right before the list.\nSome first bullet content here.\nAnd then some second bullet content here.", "page": 4, "bbox": [ diff --git a/test/data/chunker/0_out_chunks_wout_meta.json b/test/data/chunker/0_out_chunks_heading_in_text_wout_extras.json similarity index 80% rename from test/data/chunker/0_out_chunks_wout_meta.json rename to test/data/chunker/0_out_chunks_heading_in_text_wout_extras.json index 5bd0da8..994b19b 100644 --- a/test/data/chunker/0_out_chunks_wout_meta.json +++ b/test/data/chunker/0_out_chunks_heading_in_text_wout_extras.json @@ -1,23 +1,23 @@ { "root": [ { - "path": "$.main-text[0]", + "path": "#/main-text/0", "text": "This paragraph is marginally long enough for getting accepted as a chunk." }, { - "path": "$.main-text[4]", + "path": "#/main-text/4", "text": "Some subtitle\nThis one should also include the subtitle above since it is long enough." }, { - "path": "$.tables[0]", + "path": "#/tables/0", "text": "Acquisitions\nAtomic Vision, Business = Website design. Atomic Vision, Country = United States. Delix Computer GmbH, Business = Computers and software. Delix Computer GmbH, Country = Germany" }, { - "path": "$.main-text[7]", + "path": "#/main-text/7", "text": "Acquisitions\nThis paragraph should actually include the latest subtitle." }, { - "path": "$.main-text[8]", + "path": "#/main-text/8", "text": "Acquisitions\nThis paragraph is right before the list.\nSome first bullet content here.\nAnd then some second bullet content here." 
} ] diff --git a/test/test_hierarchical_chunker.py b/test/test_hierarchical_chunker.py index 00ef9d5..fe9cba1 100644 --- a/test/test_hierarchical_chunker.py +++ b/test/test_hierarchical_chunker.py @@ -9,37 +9,49 @@ from docling_core.types import Document as DLDocument -def test_chunk_without_metadata(): +def test_chunk_heading_in_text_wout_extras(): with open("test/data/chunker/0_inp_dl_doc.json") as f: data_json = f.read() dl_doc = DLDocument.model_validate_json(data_json) - chunker = HierarchicalChunker(include_metadata=False) + chunker = HierarchicalChunker(heading_as_metadata=False, include_metadata=False) chunks = chunker.chunk(dl_doc=dl_doc) - act_data = dict(root=[n.model_dump() for n in chunks]) - with open("test/data/chunker/0_out_chunks_wout_meta.json") as f: + act_data = dict(root=[n.model_dump(exclude_none=True) for n in chunks]) + with open("test/data/chunker/0_out_chunks_heading_in_text_wout_extras.json") as f: + exp_data = json.load(fp=f) + assert exp_data == act_data + + +def test_chunk_heading_in_text_with_extras(): + with open("test/data/chunker/0_inp_dl_doc.json") as f: + data_json = f.read() + dl_doc = DLDocument.model_validate_json(data_json) + chunker = HierarchicalChunker(heading_as_metadata=False, include_metadata=True) + chunks = chunker.chunk(dl_doc=dl_doc) + act_data = dict(root=[n.model_dump(exclude_none=True) for n in chunks]) + with open("test/data/chunker/0_out_chunks_heading_in_text_with_extras.json") as f: exp_data = json.load(fp=f) assert exp_data == act_data -def test_chunk_with_metadata_heading_in_text(): +def test_chunk_heading_in_meta_wout_extras(): with open("test/data/chunker/0_inp_dl_doc.json") as f: data_json = f.read() dl_doc = DLDocument.model_validate_json(data_json) - chunker = HierarchicalChunker(include_metadata=True, heading_as_metadata=False) + chunker = HierarchicalChunker(heading_as_metadata=True, include_metadata=False) chunks = chunker.chunk(dl_doc=dl_doc) act_data = dict(root=[n.model_dump(exclude_none=True) 
for n in chunks]) - with open("test/data/chunker/0_out_chunks_with_meta_heading_in_text.json") as f: + with open("test/data/chunker/0_out_chunks_heading_in_meta_wout_extras.json") as f: exp_data = json.load(fp=f) assert exp_data == act_data -def test_chunk_with_metadata_incl_heading(): +def test_chunk_heading_in_meta_with_extras(): with open("test/data/chunker/0_inp_dl_doc.json") as f: data_json = f.read() dl_doc = DLDocument.model_validate_json(data_json) - chunker = HierarchicalChunker(include_metadata=True, heading_as_metadata=True) + chunker = HierarchicalChunker(heading_as_metadata=True, include_metadata=True) chunks = chunker.chunk(dl_doc=dl_doc) act_data = dict(root=[n.model_dump(exclude_none=True) for n in chunks]) - with open("test/data/chunker/0_out_chunks_with_meta_incl_heading.json") as f: + with open("test/data/chunker/0_out_chunks_heading_in_meta_with_extras.json") as f: exp_data = json.load(fp=f) assert exp_data == act_data