From d09fe7ed44282b286f9c2588482e515bf40e0fca Mon Sep 17 00:00:00 2001 From: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Date: Tue, 22 Oct 2024 11:08:04 +0200 Subject: [PATCH] feat: extend chunk meta with schema, version, origin (#49) Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> --- docling_core/transforms/chunker/__init__.py | 5 +- docling_core/transforms/chunker/base.py | 2 +- .../chunker/hierarchical_chunker.py | 64 +- test/data/chunker/0_out_chunks.json | 936 ++++++++++++-- test/data/chunker/1_out_chunks.json | 1109 +++++++++++++++-- 5 files changed, 1880 insertions(+), 236 deletions(-) diff --git a/docling_core/transforms/chunker/__init__.py b/docling_core/transforms/chunker/__init__.py index 3407614..8545738 100644 --- a/docling_core/transforms/chunker/__init__.py +++ b/docling_core/transforms/chunker/__init__.py @@ -6,4 +6,7 @@ """Define the chunker types.""" from docling_core.transforms.chunker.base import BaseChunk, BaseChunker, BaseMeta -from docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChunker +from docling_core.transforms.chunker.hierarchical_chunker import ( + DocMeta, + HierarchicalChunker, +) diff --git a/docling_core/transforms/chunker/base.py b/docling_core/transforms/chunker/base.py index 1bc3a36..92ba0b7 100644 --- a/docling_core/transforms/chunker/base.py +++ b/docling_core/transforms/chunker/base.py @@ -13,7 +13,7 @@ class BaseMeta(BaseModel): - """Metadata base class.""" + """Chunk metadata base class.""" excluded_embed: ClassVar[list[str]] = [] excluded_llm: ClassVar[list[str]] = [] diff --git a/docling_core/transforms/chunker/hierarchical_chunker.py b/docling_core/transforms/chunker/hierarchical_chunker.py index f2b7cc4..524783b 100644 --- a/docling_core/transforms/chunker/hierarchical_chunker.py +++ b/docling_core/transforms/chunker/hierarchical_chunker.py @@ -8,15 +8,19 @@ from __future__ import annotations import logging -from typing import Any, ClassVar, Iterator, Optional +import re +from typing import Any, ClassVar, Final, Iterator, Literal, Optional from pandas import DataFrame -from pydantic import Field +from pydantic import Field, StringConstraints, field_validator +from typing_extensions import Annotated +from docling_core.search.package import VERSION_PATTERN from docling_core.transforms.chunker import BaseChunk, BaseChunker, BaseMeta -from docling_core.types.doc import DoclingDocument as DLDocument +from docling_core.types import DoclingDocument as DLDocument from docling_core.types.doc.document import ( DocItem, + DocumentOrigin, LevelNumber, ListItem, SectionHeaderItem, @@ -25,16 +29,31 @@ ) from docling_core.types.doc.labels import DocItemLabel +_VERSION: Final = "1.0.0" + +_KEY_SCHEMA_NAME = "schema_name" +_KEY_VERSION = "version" _KEY_DOC_ITEMS = "doc_items" _KEY_HEADINGS = "headings" _KEY_CAPTIONS = "captions" +_KEY_ORIGIN = "origin" _logger = logging.getLogger(__name__) class DocMeta(BaseMeta): - """Data model for Hierarchical Chunker metadata.""" + """Data model for Hierarchical Chunker chunk metadata.""" + schema_name: Literal["docling_core.transforms.chunker.DocMeta"] = Field( + default="docling_core.transforms.chunker.DocMeta", + alias=_KEY_SCHEMA_NAME, + ) + version: Annotated[str, StringConstraints(pattern=VERSION_PATTERN, strict=True)] = ( + Field( + default=_VERSION, + alias=_KEY_VERSION, + ) + ) doc_items: list[DocItem] = Field( alias=_KEY_DOC_ITEMS, min_length=1, @@ -49,9 +68,39 @@ class DocMeta(BaseMeta): alias=_KEY_CAPTIONS, min_length=1, ) + origin: 
Optional[DocumentOrigin] = Field( + default=None, + alias=_KEY_ORIGIN, + ) - excluded_embed: ClassVar[list[str]] = [_KEY_DOC_ITEMS] - excluded_llm: ClassVar[list[str]] = [_KEY_DOC_ITEMS] + excluded_embed: ClassVar[list[str]] = [ + _KEY_SCHEMA_NAME, + _KEY_VERSION, + _KEY_DOC_ITEMS, + _KEY_ORIGIN, + ] + excluded_llm: ClassVar[list[str]] = [ + _KEY_SCHEMA_NAME, + _KEY_VERSION, + _KEY_DOC_ITEMS, + _KEY_ORIGIN, + ] + + @field_validator(_KEY_VERSION) + @classmethod + def check_version_is_compatible(cls, v: str) -> str: + """Check if this meta item version is compatible with current version.""" + current_match = re.match(VERSION_PATTERN, _VERSION) + doc_match = re.match(VERSION_PATTERN, v) + if ( + doc_match is None + or current_match is None + or doc_match["major"] != current_match["major"] + or doc_match["minor"] > current_match["minor"] + ): + raise ValueError(f"incompatible version {v} with schema version {_VERSION}") + else: + return _VERSION class DocChunk(BaseChunk): @@ -129,6 +178,7 @@ def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]: for k in sorted(heading_by_level) ] or None, + origin=dl_doc.origin, ), ) list_items = [] # reset @@ -171,6 +221,7 @@ def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]: headings=[heading_by_level[k] for k in sorted(heading_by_level)] or None, captions=captions, + origin=dl_doc.origin, ), ) yield c @@ -182,5 +233,6 @@ def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]: doc_items=list_items, headings=[heading_by_level[k] for k in sorted(heading_by_level)] or None, + origin=dl_doc.origin, ), ) diff --git a/test/data/chunker/0_out_chunks.json b/test/data/chunker/0_out_chunks.json index 3261e1d..4c7bf36 100644 --- a/test/data/chunker/0_out_chunks.json +++ b/test/data/chunker/0_out_chunks.json @@ -3,6 +3,8 @@ { "text": "arXiv:2408.09869v3 [cs.CL] 30 Aug 2024", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/0", @@ -28,12 +30,19 @@ } ] } - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Version 1.0", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/2", @@ -62,12 +71,19 @@ ], "headings": [ "Docling Technical Report" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. 
Staar", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/3", @@ -96,12 +112,19 @@ ], "headings": [ "Docling Technical Report" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "AI4K Group, IBM Research Ruschlikon, Switzerland", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/4", @@ -130,12 +153,19 @@ ], "headings": [ "Docling Technical Report" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "This technical report introduces Docling , an easy to use, self-contained, MITlicensed open-source package for PDF document conversion. It is powered by state-of-the-art specialized AI models for layout analysis (DocLayNet) and table structure recognition (TableFormer), and runs efficiently on commodity hardware in a small resource budget. The code interface allows for easy extensibility and addition of new features and models.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/6", @@ -165,12 +195,19 @@ "headings": [ "Docling Technical Report", "Abstract" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Converting PDF documents back into a machine-processable format has been a major challenge for decades due to their huge variability in formats, weak standardization and printing-optimized characteristic, which discards most structural features and metadata. With the advent of LLMs and popular application patterns such as retrieval-augmented generation (RAG), leveraging the rich content embedded in PDFs has become ever more relevant. In the past decade, several powerful document understanding solutions have emerged on the market, most of which are commercial software, cloud offerings [3] and most recently, multi-modal vision-language models. As of today, only a handful of open-source tools cover PDF conversion, leaving a significant feature and quality gap to proprietary solutions.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/8", @@ -199,12 +236,19 @@ ], "headings": [ "1 Introduction" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "With Docling , we open-source a very capable and efficient document conversion tool which builds on the powerful, specialized AI models and datasets for layout analysis and table structure recognition we developed and presented in the recent past [12, 13, 9]. Docling is designed as a simple, self-contained python library with permissive license, running entirely locally on commodity hardware. 
Its code architecture allows for easy extensibility and addition of new features and models.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/9", @@ -233,12 +277,19 @@ ], "headings": [ "1 Introduction" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Docling Technical Report", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/10", @@ -267,12 +318,19 @@ ], "headings": [ "1 Introduction" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "1", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/11", @@ -301,12 +359,19 @@ ], "headings": [ "1 Introduction" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Here is what Docling delivers today:", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/12", @@ -335,12 +400,19 @@ ], "headings": [ "1 Introduction" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "\u00b7 Converts PDF documents to JSON or Markdown format, stable and lightning fast\n\u00b7 Understands detailed page layout, reading order, locates figures and recovers table structures\n\u00b7 Extracts metadata from the document, such as title, authors, references and language\n\u00b7 Optionally applies OCR, e.g. for scanned PDFs\n\u00b7 Can be configured to be optimal for batch-mode (i.e high throughput, low time-to-solution) or interactive mode (compromise on efficiency, low time-to-solution)\n\u00b7 Can leverage different accelerators (GPU, MPS, etc).", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/13", @@ -489,12 +561,19 @@ ], "headings": [ "1 Introduction" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "To use Docling, you can simply install the docling package from PyPI. Documentation and examples are available in our GitHub repository at github.com/DS4SD/docling. All required model assets 1 are downloaded to a local huggingface datasets cache on first use, unless you choose to pre-install the model assets in advance.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/20", @@ -523,12 +602,19 @@ ], "headings": [ "2 Getting Started" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Docling provides an easy code interface to convert PDF documents from file system, URLs or binary streams, and retrieve the output in either JSON or Markdown format. For convenience, separate methods are offered to convert single documents or batches of documents. A basic usage example is illustrated below. 
Further examples are available in the Doclign code repository.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/21", @@ -557,12 +643,19 @@ ], "headings": [ "2 Getting Started" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "from docling.document_converter import DocumentConverter source = \"https :// arxiv.org/pdf /2206.01062\" # PDF path or URL converter = DocumentConverter () result = converter.convert_single(source) print(result.render_as_markdown ()) # output: \"## DocLayNet: A Large Human -Annotated Dataset for Document -Layout Analysis [...]\"", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/22", @@ -591,12 +684,19 @@ ], "headings": [ "2 Getting Started" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Optionally, you can configure custom pipeline features and runtime options, such as turning on or off features (e.g. OCR, table structure recognition), enforcing limits on the input document size, and defining the budget of CPU threads. Advanced usage examples and options are documented in the README file. Docling also provides a Dockerfile to demonstrate how to install and run it inside a container.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/23", @@ -625,12 +725,19 @@ ], "headings": [ "2 Getting Started" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Docling implements a linear pipeline of operations, which execute sequentially on each given document (see Fig. 1). Each document is first parsed by a PDF backend, which retrieves the programmatic text tokens, consisting of string content and its coordinates on the page, and also renders a bitmap image of each page to support downstream operations. Then, the standard model pipeline applies a sequence of AI models independently on every page in the document to extract features and content, such as layout and table structures. Finally, the results from all pages are aggregated and passed through a post-processing stage, which augments metadata, detects the document language, infers reading-order and eventually assembles a typed document object which can be serialized to JSON or Markdown.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/25", @@ -659,12 +766,19 @@ ], "headings": [ "3 Processing pipeline" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Two basic requirements to process PDF documents in our pipeline are a) to retrieve all text content and their geometric coordinates on each page and b) to render the visual representation of each page as it would appear in a PDF viewer. Both these requirements are encapsulated in Docling's PDF backend interface. 
While there are several open-source PDF parsing libraries available for python, we faced major obstacles with all of them for different reasons, among which were restrictive", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/27", @@ -693,12 +807,19 @@ ], "headings": [ "3.1 PDF backends" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "$^{1}$see huggingface.co/ds4sd/docling-models/", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/28", @@ -727,12 +848,19 @@ ], "headings": [ "3.1 PDF backends" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "2", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/29", @@ -761,12 +889,19 @@ ], "headings": [ "3.1 PDF backends" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Figure 1: Sketch of Docling's default processing pipeline. The inner part of the model pipeline is easily customizable and extensible.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/30", @@ -795,12 +930,19 @@ ], "headings": [ "3.1 PDF backends" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "licensing (e.g. pymupdf [7]), poor speed or unrecoverable quality issues, such as merged text cells across far-apart text tokens or table columns (pypdfium, PyPDF) [15, 14].", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/31", @@ -829,12 +971,19 @@ ], "headings": [ "3.1 PDF backends" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "We therefore decided to provide multiple backend choices, and additionally open-source a custombuilt PDF parser, which is based on the low-level qpdf [4] library. It is made available in a separate package named docling-parse and powers the default PDF backend in Docling. As an alternative, we provide a PDF backend relying on pypdfium , which may be a safe backup choice in certain cases, e.g. if issues are seen with particular font encodings.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/32", @@ -863,12 +1012,19 @@ ], "headings": [ "3.1 PDF backends" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "As part of Docling, we initially release two highly capable AI models to the open-source community, which have been developed and published recently by our team. The first model is a layout analysis model, an accurate object-detector for page elements [13]. The second model is TableFormer [12, 9], a state-of-the-art table structure recognition model. We provide the pre-trained weights (hosted on huggingface) and a separate package for the inference code as docling-ibm-models . 
Both models are also powering the open-access deepsearch-experience, our cloud-native service for knowledge exploration tasks.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/34", @@ -897,12 +1053,19 @@ ], "headings": [ "3.2 AI models" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Our layout analysis model is an object-detector which predicts the bounding-boxes and classes of various elements on the image of a given page. Its architecture is derived from RT-DETR [16] and re-trained on DocLayNet [13], our popular human-annotated dataset for document-layout analysis, among other proprietary datasets. For inference, our implementation relies on the onnxruntime [5].", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/36", @@ -931,12 +1094,19 @@ ], "headings": [ "Layout Analysis Model" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "The Docling pipeline feeds page images at 72 dpi resolution, which can be processed on a single CPU with sub-second latency. All predicted bounding-box proposals for document elements are post-processed to remove overlapping proposals based on confidence and size, and then intersected with the text tokens in the PDF to group them into meaningful and complete units such as paragraphs, section titles, list items, captions, figures or tables.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/37", @@ -965,12 +1135,19 @@ ], "headings": [ "Layout Analysis Model" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "The TableFormer model [12], first published in 2022 and since refined with a custom structure token language [9], is a vision-transformer model for table structure recovery. It can predict the logical row and column structure of a given table based on an input image, and determine which table cells belong to column headers, row headers or the table body. Compared to earlier approaches, TableFormer handles many characteristics of tables, such as partial or no borderlines, empty cells, rows or columns, cell spans and hierarchy both on column-heading or row-heading level, tables with inconsistent indentation or alignment and other complexities. 
For inference, our implementation relies on PyTorch [2].", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/39", @@ -999,12 +1176,19 @@ ], "headings": [ "Table Structure Recognition" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "3", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/40", @@ -1033,12 +1217,19 @@ ], "headings": [ "Table Structure Recognition" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "The Docling pipeline feeds all table objects detected in the layout analysis to the TableFormer model, by providing an image-crop of the table and the included text cells. TableFormer structure predictions are matched back to the PDF cells in post-processing to avoid expensive re-transcription text in the table image. Typical tables require between 2 and 6 seconds to be processed on a standard CPU, strongly depending on the amount of included table cells.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/41", @@ -1067,12 +1258,19 @@ ], "headings": [ "Table Structure Recognition" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Docling provides optional support for OCR, for example to cover scanned PDFs or content in bitmaps images embedded on a page. In our initial release, we rely on EasyOCR [1], a popular thirdparty OCR library with support for many languages. Docling, by default, feeds a high-resolution page image (216 dpi) to the OCR engine, to allow capturing small print detail in decent quality. While EasyOCR delivers reasonable transcription quality, we observe that it runs fairly slow on CPU (upwards of 30 seconds per page).", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/43", @@ -1101,12 +1299,19 @@ ], "headings": [ "OCR" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "We are actively seeking collaboration from the open-source community to extend Docling with additional OCR backends and speed improvements.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/44", @@ -1135,12 +1340,19 @@ ], "headings": [ "OCR" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "In the final pipeline stage, Docling assembles all prediction results produced on each page into a well-defined datatype that encapsulates a converted document, as defined in the auxiliary package docling-core . The generated document object is passed through a post-processing model which leverages several algorithms to augment features, such as detection of the document language, correcting the reading order, matching figures with captions and labelling metadata such as title, authors and references. 
The final output can then be serialized to JSON or transformed into a Markdown representation at the users request.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/46", @@ -1169,12 +1381,19 @@ ], "headings": [ "3.3 Assembly" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Docling provides a straight-forward interface to extend its capabilities, namely the model pipeline. A model pipeline constitutes the central part in the processing, following initial document parsing and preceding output assembly, and can be fully customized by sub-classing from an abstract baseclass ( BaseModelPipeline ) or cloning the default model pipeline. This effectively allows to fully customize the chain of models, add or replace models, and introduce additional pipeline configuration parameters. To use a custom model pipeline, the custom pipeline class to instantiate can be provided as an argument to the main document conversion methods. We invite everyone in the community to propose additional or alternative models and improvements.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/48", @@ -1203,12 +1422,19 @@ ], "headings": [ "3.4 Extensibility" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Implementations of model classes must satisfy the python Callable interface. The __call__ method must accept an iterator over page objects, and produce another iterator over the page objects which were augmented with the additional features predicted by the model, by extending the provided PagePredictions data model accordingly.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/49", @@ -1237,12 +1463,19 @@ ], "headings": [ "3.4 Extensibility" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "In this section, we establish some reference numbers for the processing speed of Docling and the resource budget it requires. All tests in this section are run with default options on our standard test set distributed with Docling, which consists of three papers from arXiv and two IBM Redbooks, with a total of 225 pages. Measurements were taken using both available PDF backends on two different hardware systems: one MacBook Pro M3 Max, and one bare-metal server running Ubuntu 20.04 LTS on an Intel Xeon E5-2690 CPU. For reproducibility, we fixed the thread budget (through setting OMP NUM THREADS environment variable ) once to 4 (Docling default) and once to 16 (equal to full core count on the test hardware). All results are shown in Table 1.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/51", @@ -1271,12 +1504,19 @@ ], "headings": [ "4 Performance" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "If you need to run Docling in very low-resource environments, please consider configuring the pypdfium backend. 
While it is faster and more memory efficient than the default docling-parse backend, it will come at the expense of worse quality results, especially in table structure recovery.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/52", @@ -1305,12 +1545,19 @@ ], "headings": [ "4 Performance" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Establishing GPU acceleration support for the AI models is currently work-in-progress and largely untested, but may work implicitly when CUDA is available and discovered by the onnxruntime and", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/53", @@ -1339,12 +1586,19 @@ ], "headings": [ "4 Performance" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "4", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/54", @@ -1373,12 +1627,19 @@ ], "headings": [ "4 Performance" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "torch runtimes backing the Docling pipeline. We will deliver updates on this topic at in a future version of this report.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/55", @@ -1407,12 +1668,19 @@ ], "headings": [ "4 Performance" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Table 1: Runtime characteristics of Docling with the standard model pipeline and settings, on our test dataset of 225 pages, on two different systems. OCR is disabled. We show the time-to-solution (TTS), computed throughput in pages per second, and the peak memory used (resident set size) for both the Docling-native PDF backend and for the pypdfium backend, using 4 and 16 threads.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/56", @@ -1441,12 +1709,19 @@ ], "headings": [ "4 Performance" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Apple M3 Max, Thread budget.Thread budget = 4. Apple M3 Max, native backend.TTS = 177 s. Apple M3 Max, native backend.Pages/s = 1.27. Apple M3 Max, native backend.Mem = 6.20 GB. Apple M3 Max, pypdfium backend.TTS = 103 s. Apple M3 Max, pypdfium backend.Pages/s = 2.18. Apple M3 Max, pypdfium backend.Mem = 2.56 GB. (16 cores), Thread budget.Thread budget = 16. (16 cores), native backend.TTS = 167 s. (16 cores), native backend.Pages/s = 1.34. (16 cores), native backend.Mem = 6.20 GB. (16 cores), pypdfium backend.TTS = 92 s. (16 cores), pypdfium backend.Pages/s = 2.45. (16 cores), pypdfium backend.Mem = 2.56 GB. Intel(R) Xeon E5-2690, Thread budget.Thread budget = 4 16. Intel(R) Xeon E5-2690, native backend.TTS = 375 s 244 s. Intel(R) Xeon E5-2690, native backend.Pages/s = 0.60 0.92. Intel(R) Xeon E5-2690, native backend.Mem = 6.16 GB. Intel(R) Xeon E5-2690, pypdfium backend.TTS = 239 s 143 s. Intel(R) Xeon E5-2690, pypdfium backend.Pages/s = 0.94 1.57. 
Intel(R) Xeon E5-2690, pypdfium backend.Mem = 2.42 GB", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/tables/0", @@ -1478,12 +1753,19 @@ ], "captions": [ "Table 1: Runtime characteristics of Docling with the standard model pipeline and settings, on our test dataset of 225 pages, on two different systems. OCR is disabled. We show the time-to-solution (TTS), computed throughput in pages per second, and the peak memory used (resident set size) for both the Docling-native PDF backend and for the pypdfium backend, using 4 and 16 threads." - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Thanks to the high-quality, richly structured document conversion achieved by Docling, its output qualifies for numerous downstream applications. For example, Docling can provide a base for detailed enterprise document search, passage retrieval or classification use-cases, or support knowledge extraction pipelines, allowing specific treatment of different structures in the document, such as tables, figures, section structure or references. For popular generative AI application patterns, such as retrieval-augmented generation (RAG), we provide quackling , an open-source package which capitalizes on Docling's feature-rich document output to enable document-native optimized vector embedding and chunking. It plugs in seamlessly with LLM frameworks such as LlamaIndex [8]. Since Docling is fast, stable and cheap to run, it also makes for an excellent choice to build document-derived datasets. With its powerful table structure recognition, it provides significant benefit to automated knowledge-base construction [11, 10]. Docling is also integrated within the open IBM data prep kit [6], which implements scalable data transforms to build large-scale multi-modal training datasets.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/58", @@ -1512,12 +1794,19 @@ ], "headings": [ "5 Applications" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Docling is designed to allow easy extension of the model library and pipelines. In the future, we plan to extend Docling with several more models, such as a figure-classifier model, an equationrecognition model, a code-recognition model and more. This will help improve the quality of conversion for specific types of content, as well as augment extracted document metadata with additional information. Further investment into testing and optimizing GPU acceleration as well as improving the Docling-native PDF backend are on our roadmap, too.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/60", @@ -1546,12 +1835,19 @@ ], "headings": [ "6 Future work and contributions" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "We encourage everyone to propose or implement additional features and models, and will gladly take your inputs and contributions under review . The codebase of Docling is open for use and contribution, under the MIT license agreement and in alignment with our contributing guidelines included in the Docling repository. 
If you use Docling in your projects, please consider citing this technical report.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/61", @@ -1580,12 +1876,19 @@ ], "headings": [ "6 Future work and contributions" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "[1] J. AI. Easyocr: Ready-to-use ocr with 80+ supported languages. https://github.com/ JaidedAI/EasyOCR , 2024. Version: 1.7.0.\n[2] J. Ansel, E. Yang, H. He, N. Gimelshein, A. Jain, M. Voznesensky, B. Bao, P. Bell, D. Berard, E. Burovski, G. Chauhan, A. Chourdia, W. Constable, A. Desmaison, Z. DeVito, E. Ellison, W. Feng, J. Gong, M. Gschwind, B. Hirsh, S. Huang, K. Kalambarkar, L. Kirsch, M. Lazos, M. Lezcano, Y. Liang, J. Liang, Y. Lu, C. Luk, B. Maher, Y. Pan, C. Puhrsch, M. Reso, M. Saroufim, M. Y. Siraichi, H. Suk, M. Suo, P. Tillet, E. Wang, X. Wang, W. Wen, S. Zhang, X. Zhao, K. Zhou, R. Zou, A. Mathews, G. Chanan, P. Wu, and S. Chintala. Pytorch 2: Faster", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/63", @@ -1638,12 +1941,19 @@ ], "headings": [ "References" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "5", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/65", @@ -1672,12 +1982,19 @@ ], "headings": [ "References" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "machine learning through dynamic python bytecode transformation and graph compilation. In Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2 (ASPLOS '24) . ACM, 4 2024. doi: 10.1145/3620665.3640366. URL https://pytorch.org/assets/pytorch2-2.pdf .", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/66", @@ -1706,12 +2023,19 @@ ], "headings": [ "References" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "[3] C. Auer, M. Dolfi, A. Carvalho, C. B. Ramis, and P. W. Staar. Delivering document conversion as a cloud service with high throughput and responsiveness. In 2022 IEEE 15th International Conference on Cloud Computing (CLOUD) , pages 363-373. IEEE, 2022.\n[4] J. Berkenbilt. Qpdf: A content-preserving pdf document transformer, 2024. URL https: //github.com/qpdf/qpdf .\n[5] O. R. developers. Onnx runtime. https://onnxruntime.ai/ , 2024. Version: 1.18.1.\n[6] IBM. Data Prep Kit: a community project to democratize and accelerate unstructured data preparation for LLM app developers, 2024. URL https://github.com/IBM/ data-prep-kit .\n[7] A. S. Inc. PyMuPDF, 2024. URL https://github.com/pymupdf/PyMuPDF .\n[8] J. Liu. LlamaIndex, 11 2022. URL https://github.com/jerryjliu/llama_index .\n[9] M. Lysak, A. Nassar, N. Livathinos, C. Auer, and P. Staar. Optimized Table Tokenization for Table Structure Recognition. 
In Document Analysis and Recognition - ICDAR 2023: 17th International Conference, San Jos'e, CA, USA, August 21-26, 2023, Proceedings, Part II , pages 37-50, Berlin, Heidelberg, Aug. 2023. Springer-Verlag. ISBN 978-3-031-41678-1. doi: 10. 1007/978-3-031-41679-8 3. URL https://doi.org/10.1007/978-3-031-41679-8_3 .\n[10] L. Mishra, S. Dhibi, Y. Kim, C. Berrospi Ramis, S. Gupta, M. Dolfi, and P. Staar. Statements: Universal information extraction from tables with large language models for ESG KPIs. In D. Stammbach, J. Ni, T. Schimanski, K. Dutia, A. Singh, J. Bingler, C. Christiaen, N. Kushwaha, V. Muccione, S. A. Vaghefi, and M. Leippold, editors, Proceedings of the 1st Workshop on Natural Language Processing Meets Climate Change (ClimateNLP 2024) , pages 193-214, Bangkok, Thailand, Aug. 2024. Association for Computational Linguistics. URL https://aclanthology.org/2024.climatenlp-1.15 .\n[11] L. Morin, V. Weber, G. I. Meijer, F. Yu, and P. W. J. Staar. Patcid: an open-access dataset of chemical structures in patent documents. Nature Communications , 15(1):6532, August 2024. ISSN 2041-1723. doi: 10.1038/s41467-024-50779-y. URL https://doi.org/10.1038/ s41467-024-50779-y .\n[12] A. Nassar, N. Livathinos, M. Lysak, and P. Staar. Tableformer: Table structure understanding with transformers. In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition , pages 4614-4623, 2022.\n[13] B. Pfitzmann, C. Auer, M. Dolfi, A. S. Nassar, and P. Staar. Doclaynet: a large humanannotated dataset for document-layout segmentation. pages 3743-3751, 2022.\n[14] pypdf Maintainers. pypdf: A Pure-Python PDF Library, 2024. URL https://github.com/ py-pdf/pypdf .\n[15] P. Team. PyPDFium2: Python bindings for PDFium, 2024. URL https://github.com/ pypdfium2-team/pypdfium2 .\n[16] Y. Zhao, W. Lv, S. Xu, J. Wei, G. Wang, Q. Dang, Y. Liu, and J. Chen. 
Detrs beat yolos on real-time object detection, 2023.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/67", @@ -2052,12 +2376,19 @@ ], "headings": [ "References" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "6", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/81", @@ -2086,12 +2417,19 @@ ], "headings": [ "References" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "In this section, we illustrate a few examples of Docling' s output in Markdown and JSON.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/83", @@ -2120,12 +2458,19 @@ ], "headings": [ "Appendix" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Birgit Pfitzmann IBM Research Rueschlikon, Switzerland bpf@zurich.ibm.com", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/85", @@ -2154,12 +2499,19 @@ ], "headings": [ "DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Christoph Auer IBM Research Rueschlikon, Switzerland cau@zurich.ibm.com", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/86", @@ -2188,12 +2540,19 @@ ], "headings": [ "DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Michele Dolfi IBM Research Rueschlikon, Switzerland dol@zurich.ibm.com", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/87", @@ -2222,12 +2581,19 @@ ], "headings": [ "DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Ahmed S. 
Nassar IBM Research Rueschlikon, Switzerland ahn@zurich.ibm.com", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/88", @@ -2256,12 +2622,19 @@ ], "headings": [ "DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Peter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/89", @@ -2290,12 +2663,19 @@ ], "headings": [ "DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Accurate document layout analysis is a key requirement for highquality PDF document conversion. With the recent availability of public, large ground-truth datasets such as PubLayNet and DocBank, deep-learning models have proven to be very effective at layout detection and segmentation. While these datasets are of adequate size to train such models, they severely lack in layout variability since they are sourced from scientific article repositories such as PubMed and arXiv only. Consequently, the accuracy of the layout segmentation drops significantly when these models are applied on more challenging and diverse layouts. In this paper, we present $_{DocLayNet}$, a new, publicly available, document-layout annotation dataset in COCO format. It contains 80863 manually annotated pages from diverse data sources to represent a wide variability in layouts. For each PDF page, the layout annotations provide labelled bounding-boxes with a choice of 11 distinct classes. DocLayNet also provides a subset of double- and triple-annotated pages to determine the inter-annotator agreement. In multiple experiments, we provide baseline accuracy scores (in mAP) for a set of popular object detection models. We also demonstrate that these models fall approximately 10% behind the inter-annotator agreement. Furthermore, we provide evidence that DocLayNet is of sufficient size. 
Lastly, we compare models trained on PubLayNet, DocBank and DocLayNet, showing that layout predictions of the DocLayNettrained models are more robust and thus the preferred choice for general-purpose document-layout analysis.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/91", @@ -2324,12 +2704,19 @@ ], "headings": [ "ABSTRACT" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "\u00b7 Information systems \u2192 Document structure ; \u00b7 Applied computing \u2192 Document analysis ; \u00b7 Computing methodologies \u2192 Machine learning ; Computer vision ; $_{Object detection}$;", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/93", @@ -2358,12 +2745,19 @@ ], "headings": [ "CCS CONCEPTS" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Permission to make digital or hard copies of part or all of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profi t or commercial advantage and that copies bear this notice and the full citation on thefirst page. Copyrights for third-party components of this work must be honored. For all other uses, contact the owner/author(s). KDD '22, August 14-18, 2022, Washington, DC, USA \u00a9 2022 Copyright held by the owner/author(s). ACM ISBN 978-1-4503-9385-0/22/08. https://doi.org/10.1145/3534678.3539043", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/94", @@ -2392,12 +2786,19 @@ ], "headings": [ "CCS CONCEPTS" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Birgit Pfitzmann IBM Research Rueschlikon, Switzerland bpf@zurich.ibm.com", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/96", @@ -2426,12 +2827,19 @@ ], "headings": [ "DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Christoph Auer IBM Research Rueschlikon, Switzerland cau@zurich.ibm.com", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/97", @@ -2460,12 +2868,19 @@ ], "headings": [ "DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Michele Dolfi IBM Research Rueschlikon, Switzerland dol@zurich.ibm.com", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/98", @@ -2494,12 +2909,19 @@ ], "headings": [ "DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Ahmed S. 
Nassar IBM Research Rueschlikon, Switzerland ahn@zurich.ibm.com", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/99", @@ -2528,12 +2950,19 @@ ], "headings": [ "DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Peter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/100", @@ -2562,12 +2991,19 @@ ], "headings": [ "DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Accurate document layout analysis is a key requirement for highquality PDF document conversion. With the recent availability of public, large groundtruth datasets such as PubLayNet and DocBank, deep-learning models have proven to be very effective at layout detection and segmentation. While these datasets are of adequate size to train such models, they severely lack in layout variability since they are sourced from scientific article repositories such as PubMed and arXiv only. Consequently, the accuracy of the layout segmentation drops significantly when these models are applied on more challenging and diverse layouts. In this paper, we present DocLayNet , a new, publicly available, document-layout annotation dataset in COCO format. It contains 80863 manually annotated pages from diverse data sources to represent a wide variability in layouts. For each PDF page, the layout annotations provide labelled bounding-boxes with a choice of 11 distinct classes. DocLayNet also provides a subset of double- and triple-annotated pages to determine the inter-annotator agreement. In multiple experiments, we provide baseline accuracy scores (in mAP) for a set of popular object detection models. We also demonstrate that these models fall approximately 10% behind the inter-annotator agreement. Furthermore, we provide evidence that DocLayNet is of sufficient size. 
Lastly, we compare models trained on PubLayNet, DocBank and DocLayNet, showing that layout predictions of the DocLayNettrained models are more robust and thus the preferred choice for general-purpose document-layout analysis.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/102", @@ -2596,12 +3032,19 @@ ], "headings": [ "ABSTRACT" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "$_{\u00b7 Information systems }$\u2192$_{ Document structure ; \u00b7 Applied computing }$ \u2192$_{ Document analysis ; \u00b7 Computing methodologies }$\u2192$_{ Machine learning ;}$ Computer vision ; Object detection ;", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/104", @@ -2630,12 +3073,19 @@ ], "headings": [ "CCS CONCEPTS" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Permission to make digital or hard copies of part or all of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. Copyrights for third-party components of this work must be honored. For all other uses, contact the owner/author(s).", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/105", @@ -2664,12 +3114,19 @@ ], "headings": [ "CCS CONCEPTS" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "KDD '22, August 14-18, 2022, Washington, DC, USA \u00a9 2022 Copyright held by the owner/author(s). ACM ISBN 978-1-4503-9385-0/22/08. https://doi.org/10.1145/3534678.3539043", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/106", @@ -2698,12 +3155,19 @@ ], "headings": [ "CCS CONCEPTS" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Figure 1: Four examples of complex page layouts across different document categories", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/107", @@ -2732,12 +3196,19 @@ ], "headings": [ "CCS CONCEPTS" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "PDF document conversion, layout segmentation, object-detection, data set, Machine Learning", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/109", @@ -2766,12 +3237,19 @@ ], "headings": [ "KEYWORDS" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar. 2022. DocLayNet: A Large Human-Annotated Dataset for DocumentLayout Analysis. In Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD '22), August 14-18, 2022, Washington, DC, USA. 
ACM, New York, NY, USA, 9 pages. https://doi.org/10.1145/ 3534678.3539043", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/111", @@ -2800,12 +3278,19 @@ ], "headings": [ "ACM Reference Format:" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Figure 1: Four examples of complex page layouts across different document categories", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/112", @@ -2834,12 +3319,19 @@ ], "headings": [ "ACM Reference Format:" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "PDF document conversion, layout segmentation, object-detection, data set, Machine Learning", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/114", @@ -2868,12 +3360,19 @@ ], "headings": [ "KEYWORDS" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Birgit Pfitzmann, Christoph Auer, Michele Dolfi , Ahmed S. Nassar, and Peter Staar. 2022. DocLayNet: A Large Human-Annotated Dataset for DocumentLayout Analysis. In Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD '22), August 14-18, 2022, Wash-$_{ington, DC, USA.}$ACM, New York, NY, USA, 9 pages. https://doi.org/10.1145/ 3534678.3539043", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/116", @@ -2902,12 +3401,19 @@ ], "headings": [ "ACM Reference Format:" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "1 INTRODUCTION", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/117", @@ -2936,12 +3442,19 @@ ], "headings": [ "ACM Reference Format:" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Despite the substantial improvements achieved with machine-learning (ML) approaches and deep neural networks in recent years, document conversion remains a challenging problem, as demonstrated by the numerous public competitions held on this topic [1-4]. The challenge originates from the huge variability in PDF documents regarding layout, language and formats (scanned, programmatic or a combination of both). Engineering a single ML model that can be applied on all types of documents and provides high-quality layout segmentation remains to this day extremely challenging [5]. To highlight the variability in document layouts, we show a few example documents from the DocLayNet dataset in Figure 1. Figure 2: Title page of the DocLayNet paper (arxiv .org/pdf/2206.01062) - left PDF, right rendered Markdown. If recognized, metadata such as authors are appearing first under the title. 
Text content inside figures is currently dropped, the caption is retained and linked to the figure in the JSON representation (not shown).", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/118", @@ -2970,12 +3483,19 @@ ], "headings": [ "ACM Reference Format:" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "7", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/119", @@ -3004,12 +3524,19 @@ ], "headings": [ "ACM Reference Format:" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "arXiv:2206.01062v1 [cs.CV] 2 Jun 2022", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/120", @@ -3038,12 +3565,19 @@ ], "headings": [ "ACM Reference Format:" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "KDD '22, August 14-18, 2022, Washington, DC, USA Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/121", @@ -3072,12 +3606,19 @@ ], "headings": [ "ACM Reference Format:" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Table 2: Prediction performance (mAP@0.5-0.95) of object detection networks on DocLayNet test set. The MRCNN (Mask R-CNN) and FRCNN (Faster R-CNN) models with ResNet-50 or ResNet-101 backbone were trained based on the network architectures from the detectron2 model zoo (Mask R-CNN R50, R101-FPN 3x, Faster R-CNN R101-FPN 3x), with default configurations. The YOLO implementation utilized was YOLOv5x6 [13]. All models were initialised using pre-trained weights from the COCO 2017 dataset.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/122", @@ -3106,12 +3647,19 @@ ], "headings": [ "ACM Reference Format:" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Caption, human.human = 84-89. Caption, MRCNN.R50 = 68.4. Caption, MRCNN.R101 = 71.5. Caption, FRCNN.R101 = 70.1. Caption, YOLO.v5x6 = 77.7. Footnote, human.human = 83-91. Footnote, MRCNN.R50 = 70.9. Footnote, MRCNN.R101 = 71.8. Footnote, FRCNN.R101 = 73.7. Footnote, YOLO.v5x6 = 77.2. Formula, human.human = 83-85. Formula, MRCNN.R50 = 60.1. Formula, MRCNN.R101 = 63.4. Formula, FRCNN.R101 = 63.5. Formula, YOLO.v5x6 = 66.2. List-item, human.human = 87-88. List-item, MRCNN.R50 = 81.2. List-item, MRCNN.R101 = 80.8. List-item, FRCNN.R101 = 81.0. List-item, YOLO.v5x6 = 86.2. Page-footer, human.human = 93-94. Page-footer, MRCNN.R50 = 61.6. Page-footer, MRCNN.R101 = 59.3. Page-footer, FRCNN.R101 = 58.9. Page-footer, YOLO.v5x6 = 61.1. Page-header, human.human = 85-89. Page-header, MRCNN.R50 = 71.9. Page-header, MRCNN.R101 = 70.0. Page-header, FRCNN.R101 = 72.0. Page-header, YOLO.v5x6 = 67.9. Picture, human.human = 69-71. Picture, MRCNN.R50 = 71.7. Picture, MRCNN.R101 = 72.7. 
Picture, FRCNN.R101 = . Picture, YOLO.v5x6 = 77.1. Section-header, human.human = 83-84. Section-header, MRCNN.R50 = 67.6. Section-header, MRCNN.R101 = 69.3. Section-header, FRCNN.R101 = 68.4. Section-header, YOLO.v5x6 = 74.6. Table, human.human = 77-81. Table, MRCNN.R50 = 82.2. Table, MRCNN.R101 = 82.9. Table, FRCNN.R101 = 82.2. Table, YOLO.v5x6 = 86.3. Text, human.human = 84-86. Text, MRCNN.R50 = 84.6. Text, MRCNN.R101 = 85.8. Text, FRCNN.R101 = 85.4. Text, YOLO.v5x6 = . , human.human = . , MRCNN.R50 = 76.7. , MRCNN.R101 = 80.4. , FRCNN.R101 = 79.9. , YOLO.v5x6 = 88.1. Title, human.human = 60-72. Title, MRCNN.R50 = . Title, MRCNN.R101 = . Title, FRCNN.R101 = . Title, YOLO.v5x6 = 82.7. All, human.human = 82-83. All, MRCNN.R50 = 72.4. All, MRCNN.R101 = 73.5. All, FRCNN.R101 = 73.4. All, YOLO.v5x6 = 76.8", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/tables/1", @@ -3143,12 +3691,19 @@ ], "captions": [ "Table 2: Prediction performance (mAP@0.5-0.95) of object detection networks on DocLayNet test set. The MRCNN (Mask R-CNN) and FRCNN (Faster R-CNN) models with ResNet-50 or ResNet-101 backbone were trained based on the network architectures from the detectron2 model zoo (Mask R-CNN R50, R101-FPN 3x, Faster R-CNN R101-FPN 3x), with default configurations. The YOLO implementation utilized was YOLOv5x6 [13]. All models were initialised using pre-trained weights from the COCO 2017 dataset." - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "to avoid this at any cost in order to have clear, unbiased baseline numbers for human document-layout annotation. Third, we introduced the feature of snapping boxes around text segments to obtain a pixel-accurate annotation and again reduce time and effort. The CCS annotation tool automatically shrinks every user-drawn box to the minimum bounding-box around the enclosed text-cells for all purely text-based segments, which excludes only Table and $_{Picture}$. For the latter, we instructed annotation staffto minimise inclusion of surrounding whitespace while including all graphical lines. A downside of snapping boxes to enclosed text cells is that some wrongly parsed PDF pages cannot be annotated correctly and need to be skipped. Fourth, we established a way toflag pages as rejected for cases where no valid annotation according to the label guidelines could be achieved. Example cases for this would be PDF pages that render incorrectly or contain layouts that are impossible to capture with non-overlapping rectangles. Such rejected pages are not contained in thefinal dataset. With all these measures in place, experienced annotation staffmanaged to annotate a single page in a typical timeframe of 20s to 60s, depending on its complexity.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/123", @@ -3177,12 +3732,19 @@ ], "headings": [ "ACM Reference Format:" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "The primary goal of DocLayNet is to obtain high-quality ML models capable of accurate document-layout analysis on a wide variety of challenging layouts. 
As discussed in Section 2, object detection models are currently the easiest to use, due to the standardisation of ground-truth data in COCO format [16] and the availability of general frameworks such as detectron2 [17]. Furthermore, baseline numbers in PubLayNet and DocBank were obtained using standard object detection models such as Mask R-CNN and Faster R-CNN. As such, we will relate to these object detection methods in this", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/125", @@ -3211,12 +3773,19 @@ ], "headings": [ "5 EXPERIMENTS" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Figure 3: Page 6 of the DocLayNet paper. If recognized, metadata such as authors are appearing first under the title. Elements recognized as page headers or footers are suppressed in Markdown to deliver uninterrupted content in reading order. Tables are inserted in reading order. The paragraph in \"5. Experiments\" wrapping over the column end is broken up in two and interrupted by the table.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/126", @@ -3245,12 +3814,19 @@ ], "headings": [ "5 EXPERIMENTS" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Figure 5: Prediction performance (mAP@0.5-0.95) of a Mask R-CNN network with ResNet50 backbone trained on increasing fractions of the DocLayNet dataset. The learning curv eflattens around the 80% mark, indicating that increasing the size of the DocLayNet dataset with similar data will not yield significantly better predictions.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/127", @@ -3279,12 +3855,19 @@ ], "headings": [ "5 EXPERIMENTS" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "paper and leave the detailed evaluation of more recent methods mentioned in Section 2 for future work.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/128", @@ -3313,12 +3896,19 @@ ], "headings": [ "5 EXPERIMENTS" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "In this section, we will present several aspects related to the performance of object detection models on DocLayNet. Similarly as in PubLayNet, we will evaluate the quality of their predictions using mean average precision (mAP) with 10 overlaps that range from 0.5 to 0.95 in steps of 0.05 (mAP@0.5-0.95). These scores are computed by leveraging the evaluation code provided by the COCO API [16].", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/129", @@ -3347,12 +3937,19 @@ ], "headings": [ "5 EXPERIMENTS" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "In Table 2, we present baseline experiments (given in mAP) on Mask R-CNN [12], Faster R-CNN [11], and YOLOv5 [13]. 
Both training and evaluation were performed on RGB images with dimensions of $^{1025}$\u00d71025 pixels. For training, we only used one annotation in case of redundantly annotated pages. As one can observe, the variation in mAP between the models is rather low, but overall between 6 and 10% lower than the mAP computed from the pairwise human annotations on triple-annotated pages. This gives a good indication that the DocLayNet dataset poses a worthwhile challenge for the research community to close the gap between human recognition and ML approaches. It is interesting to see that Mask R-CNN and Faster R-CNN produce very comparable mAP scores, indicating that pixel-based image segmentation derived from bounding-boxes does not help to obtain better predictions. On the other hand, the more recent Yolov5x model does very well and even out-performs humans on selected labels such as $_{Text}$, Table and $_{Picture}$. This is not entirely surprising, as and Picture are abundant and the most visually distinctive in a document.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/131", @@ -3381,12 +3978,19 @@ ], "headings": [ "Baselines for Object Detection" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Prediclion Derormance (up80.5-0.85 ohobeci detecion lalo ks Doclaynal Lest saL Ine VACNN (Mask R-CNNI and FACNN (Faster A-CNM) modcs mith PosNc: 50 PosNo: 101 backtone woro trainod based on Enc nchwwcrk achrocturos tom Ihc Oeronhroase a-CNn aso rioi-Fpn Jx, FasieA-Cnn a1o1-FPN Jx), wilh delaui conlwuralions The YoUg mpomorcabon utilzod w2s YoloSyb(13| modos woro inbalsod usino cro-trunodmonhts hron Coco 2017 datasor", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/132", @@ -3415,12 +4019,19 @@ ], "headings": [ "Baselines for Object Detection" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Gaoon, noun = . Gaoon, Mrcnn = . Gaoon, MaCNN = . Gaoon, Frcne = . Gaoon, Yolo = . Foomolo, noun = . Foomolo, Mrcnn = . Foomolo, MaCNN = . Foomolo, Frcne = . Foomolo, Yolo = . Foula, noun = . Foula, Mrcnn = . Foula, MaCNN = . Foula, Frcne = . Foula, Yolo = . Ust-lern, noun = . Ust-lern, Mrcnn = . Ust-lern, MaCNN = . Ust-lern, Frcne = . Ust-lern, Yolo = . Page-locer, noun = . Page-locer, Mrcnn = . Page-locer, MaCNN = . Page-locer, Frcne = . Page-locer, Yolo = . Faqe-haje, noun = . Faqe-haje, Mrcnn = . Faqe-haje, MaCNN = . Faqe-haje, Frcne = . Faqe-haje, Yolo = . Pxlu, noun = . Pxlu, Mrcnn = . Pxlu, MaCNN = . Pxlu, Frcne = . Pxlu, Yolo = . Sonhoade, noun = . Sonhoade, Mrcnn = . Sonhoade, MaCNN = . Sonhoade, Frcne = . 
Sonhoade, Yolo = ", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/tables/2", @@ -3449,12 +4060,19 @@ ], "headings": [ "Baselines for Object Detection" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "iD avod Ihbs arcost cha unbasndbasolino numoc human cocumnnt-Laycut annotalion; Thrd Inirooucod leatura 0i snapoina Doxes around lerl scainunis cblan & pixel-accuiale annolaton and aJan feduce Bifre and elonThe CCS annoinbon aloMalca shruks Ovory Usor-drawnboro mnmum boundino-borarounaIho onclosod coxt-colls Purolytort basud scoitontwhich uxclldcs Ort Tatlo and Picluo latsor Inssucicdannjlabon sha mnim so inclusion Suitcurding mlospeco whloIncvon Oenoncang doans d0 oisnaocmnbors Onchse Ihal So10 wioogly Daisoc Pogcs Cannol be annotalcd coTcCEY and nccd supocd Foudn Oshdned Wuyio(aq Dagcs (ccclod Cases whcion valid anncuabon eccofding abeiqu Oelines coukbe acheneu Eamnole Case, flis wouk PDF peoe3 Ihal rendernnccrrecUy contanlavuta hat Imnosshk cantra milh Vananonnyogannio{ Suchiceciodoaoos not coralnon Ihofnn hr Aroknacoarreehetyn annollca slall nluuocd unnoln sina \" Puou lypical Lmnetamre 0l 20s 10 605 cecendnc conoanty", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/133", @@ -3483,12 +4101,19 @@ ], "headings": [ "Baselines for Object Detection" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Ine crimary goal OocVAYNo cblan hion-quality Modols AccuaiodoaMoiuvana4s WMeVanalon chalcnonglayoul: Cecurdg echon Doicdi Delccion modcb rtene Casistlo Usc, Quulo Hhndandiubon ground-vuth data COCO lornat [16] and avaladloy enetal Irarnenoiks uch derectrcnz7] Furnemmcre, baseline nmnoe < I Putun Notand DocBank calanodusnsundad coict dosnchonmodols such Mas< A CNN and Fasior A CNN SuEna blraomhdelecfa nonInr Canacle", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/135", @@ -3517,12 +4142,19 @@ ], "headings": [ "5 EXPERIMENTS" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Fauri Prco chon ocrloianC( 005-095) ola Mask A-CNN ncthoik ilh AcsNciSo backbono brainod on incrcasing Iracbons oi DocLaynei calasot Tne loannp auro altons around Ih0 \u20ac03 noicahino Ihal inxreasing /e 520 Q Iho DocL\u00f8y Nel dalasot Amardaen nol Ycid sn: dorOocC Chons LAD", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/136", @@ -3551,12 +4183,19 @@ ], "headings": [ "5 EXPERIMENTS" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "pangrandloave detallod evalvallon %moro rcoarimolhods monionan Secilg Jorhlure work", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/137", @@ -3585,12 +4224,19 @@ ], "headings": [ "5 EXPERIMENTS" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Inuhs sechon All Deseni seur8/ asoecis reles00 Perormanoe ouieci celec 
on DoxclayNet Simamtas In PLoLaynnt oyuato tnn qualmy cuthnlr crodictionsusiramnanavnna prncisicn (TTAP) wch IDovrdaos that rangn trom 0 5ta 005 (nap,o6-00: Ml olue Fnoula Cvurbar uvalaion coou piayIed DY Ihu COCO API/161 ook", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/138", @@ -3619,12 +4265,19 @@ ], "headings": [ "5 EXPERIMENTS" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "ptesenl baselne expenrnenls (Qvenin MAF) on Mas< R-CNN /121 Fasler F-CNN [11] an1 YOLOvS [13] Bou1 brann anavailang woropomormod AGa Imnoos vith dimonsions 1025 chxrols For tralring onN usodomannolatln Incaso ohcuunourfhunnolulco Dac3 Ohenn Vuruhoninptalunhamagny usnaroA en hn 10?7 loworrnannomap conoutec paicaisehuman anncrbons Aoo-amculeopnnos Ins Cves nacaton thatrhe DocLayNot daasci DOfo s mornwro clagnoo [csoarcncomrurt gap bctwoon human focogniticn and VL aporoaces nlelesuio IharNaska-CNNead Fasler GNincroova comnanen Maseoes nnocauna Ulbi AICBasodnanc scomrorubon oormvod Irom bounon)ooros Ooo{ abuin totcrorcochons Ontho chornnno Mcrocconi YolavSrmrodel does verywell und even Dul-Perdorins selectedlubels such Tedle undpcturl enbeh surcrisio Ta oloandPchre poincant amimemostasiaIN ishinsine documen: Ouau hnne", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/140", @@ -3653,12 +4306,19 @@ ], "headings": [ "Baselines for Object Detection" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "8", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/141", @@ -3687,12 +4347,19 @@ ], "headings": [ "Baselines for Object Detection" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "KDD '22, August 14-18, 2022, Washington, DC, USA Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/142", @@ -3721,12 +4388,19 @@ ], "headings": [ "Baselines for Object Detection" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Table 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as % of row \"Total\") in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric between pairwise annotations from the tripleannotated pages, from which we obtain accuracy ranges. Table 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurr ence (as % of row \"Total\") in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric between pairwise annotations from the triple-annotated pages, from which we obtain accuracy ranges. 
B", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/143", @@ -3755,12 +4429,19 @@ ], "headings": [ "Baselines for Object Detection" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Caption, Count. = 22524. Caption, % of Total.Train = 2.04. Caption, % of Total.Test = 1.77. Caption, % of Total.Val = 2.32. Caption, triple inter-annotator mAP @ 0.5-0.95 (%).All = 84-89. Caption, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = 40-61. Caption, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 86-92. Caption, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 94-99. Caption, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 95-99. Caption, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = 69-78. Caption, triple inter-annotator mAP @ 0.5-0.95 (%).T en = n/a. Footnote, Count. = 6318. Footnote, % of Total.Train = 0.60. Footnote, % of Total.Test = 0.31. Footnote, % of Total.Val = 0.58. Footnote, triple inter-annotator mAP @ 0.5-0.95 (%).All = 83-91. Footnote, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = n/a. Footnote, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 100. Footnote, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 62-88. Footnote, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 85-94. Footnote, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = n/a. Footnote, triple inter-annotator mAP @ 0.5-0.95 (%).T en = 82-97. Formula, Count. = 25027. Formula, % of Total.Train = 2.25. Formula, % of Total.Test = 1.90. Formula, % of Total.Val = 2.96. Formula, triple inter-annotator mAP @ 0.5-0.95 (%).All = 83-85. Formula, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = . Formula, triple inter-annotator mAP @ 0.5-0.95 (%).Man = n/a. Formula, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 84-87. Formula, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 86-96. Formula, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = . Formula, triple inter-annotator mAP @ 0.5-0.95 (%).T en = n/a. List-item, Count. = 185660. List-item, % of Total.Train = 17.19. List-item, % of Total.Test = 13.34. List-item, % of Total.Val = 15.82. List-item, triple inter-annotator mAP @ 0.5-0.95 (%).All = 87-88. List-item, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = 74-83. List-item, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 90-92. List-item, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 97-97. List-item, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 81-85. List-item, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = 75-88. List-item, triple inter-annotator mAP @ 0.5-0.95 (%).T en = 93-95. Page-footer, Count. = 70878. Page-footer, % of Total.Train = 6.51. Page-footer, % of Total.Test = 5.58. Page-footer, % of Total.Val = 6.00. Page-footer, triple inter-annotator mAP @ 0.5-0.95 (%).All = 93-94. Page-footer, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = 88-90. Page-footer, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 95-96. Page-footer, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 100. Page-footer, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 92-97. Page-footer, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = 100. Page-footer, triple inter-annotator mAP @ 0.5-0.95 (%).T en = 96-98. Page-header, Count. = 58022. Page-header, % of Total.Train = 5.10. Page-header, % of Total.Test = 6.70. Page-header, % of Total.Val = 5.06. Page-header, triple inter-annotator mAP @ 0.5-0.95 (%).All = 85-89. 
Page-header, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = 66-76. Page-header, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 90-94. Page-header, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 98-100. Page-header, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 91-92. Page-header, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = 97-99. Page-header, triple inter-annotator mAP @ 0.5-0.95 (%).T en = 81-86. Picture, Count. = 45976. Picture, % of Total.Train = 4.21. Picture, % of Total.Test = 2.78. Picture, % of Total.Val = 5.31. Picture, triple inter-annotator mAP @ 0.5-0.95 (%).All = 69-71. Picture, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = 56-59. Picture, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 82-86. Picture, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 69-82. Picture, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 80-95. Picture, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = 66-71. Picture, triple inter-annotator mAP @ 0.5-0.95 (%).T en = 59-76. Section-header, Count. = 142884. Section-header, % of Total.Train = 12.60. Section-header, % of Total.Test = 15.77. Section-header, % of Total.Val = 12.85. Section-header, triple inter-annotator mAP @ 0.5-0.95 (%).All = 83-84. Section-header, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = 76-81. Section-header, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 90-92. Section-header, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 94-95. Section-header, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 87-94. Section-header, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = 69-73. Section-header, triple inter-annotator mAP @ 0.5-0.95 (%).T en = 78-86. Table, Count. = 34733. Table, % of Total.Train = 3.20. Table, % of Total.Test = 2.27. Table, % of Total.Val = 3.60. Table, triple inter-annotator mAP @ 0.5-0.95 (%).All = 77-81. Table, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = 75-80. Table, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 83-86. Table, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 98-99. Table, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 58-80. Table, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = 79-84. Table, triple inter-annotator mAP @ 0.5-0.95 (%).T en = 70-85. Text, Count. = 510377. Text, % of Total.Train = 45.82. Text, % of Total.Test = 49.28. Text, % of Total.Val = 45.00. Text, triple inter-annotator mAP @ 0.5-0.95 (%).All = 84-86. Text, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = 81-86. Text, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 88-93. Text, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 89-93. Text, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 87-92. Text, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = 71-79. Text, triple inter-annotator mAP @ 0.5-0.95 (%).T en = 87-95. Title, Count. = 5071. Title, % of Total.Train = 0.47. Title, % of Total.Test = 0.30. Title, % of Total.Val = 0.50. Title, triple inter-annotator mAP @ 0.5-0.95 (%).All = 60-72. Title, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = 24-63. Title, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 50-63. Title, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 94-100. Title, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 82-96. Title, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = 68-79. Title, triple inter-annotator mAP @ 0.5-0.95 (%).T en = 24-56. Total, Count. = 1107470. Total, % of Total.Train = 941123. Total, % of Total.Test = 99816. Total, % of Total.Val = 66531. Total, triple inter-annotator mAP @ 0.5-0.95 (%).All = 82-83. Total, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = 71-74. 
Total, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 79-81. Total, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 89-94. Total, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 86-91. Total, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = 71-76. Total, triple inter-annotator mAP @ 0.5-0.95 (%).T en = 68-85", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/tables/3", @@ -3792,12 +4473,19 @@ ], "captions": [ "Table 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as % of row \"Total\") in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric between pairwise annotations from the tripleannotated pages, from which we obtain accuracy ranges. Table 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurr ence (as % of row \"Total\") in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric between pairwise annotations from the triple-annotated pages, from which we obtain accuracy ranges. B" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Figure 3: face. The laid te be drawn the respe", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/144", @@ -3826,12 +4514,19 @@ ], "headings": [ "Baselines for Object Detection" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "we distribute d the annotation workload and performed continuous quality contr ols. Phase one and two required a small team of experts only. For phases three and four, a group of 40 dedicated annotators were assembled and supervised. Phase 1: Data selection and preparation. Our inclusion cri-", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/145", @@ -3860,12 +4555,19 @@ ], "headings": [ "Baselines for Object Detection" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "of pages ed by seerties. For cument figur es or object how", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/146", @@ -3894,12 +4596,19 @@ ], "headings": [ "Baselines for Object Detection" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "d the colfealayout labels. Pageand $_{Title}$. class cificity ed for of the ambiguous, while coverage ensures that all meaningful items on a page can be annotated. We refrained from class labels that are very specific to a document category, such as Abstract in the Scientific Articles category. We also avoided class labels that are tightly linked to the semantics of the text. 
Labels such as Author and $_{Affiliation}$, as seen", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/147", @@ -3928,12 +4637,19 @@ ], "headings": [ "Baselines for Object Detection" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "teria for documents were described in Section 3. A large effort went into ensuring that all documents are free to use. The data sources in DocBank, are often only distinguishable by discriminating on $^{3}$https://arxiv.org/ Figure 4: Table 1 from the DocLayNet paper in the original PDF (A), as rendered Markdown (B) and in JSON representation (C). Spanning table cells, such as the multi-column header \"triple interannotator mAP@0.5-0.95 (%)\", is repeated for each column in the Markdown representation (B), which guarantees that every data point can be traced back to row and column headings only by its grid coordinates in the table. In the JSON representation, the span information is reflected in the fields of each table cell (C).", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/148", @@ -3962,12 +4678,19 @@ ], "headings": [ "Baselines for Object Detection" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "9", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/149", @@ -3996,7 +4719,12 @@ ], "headings": [ "Baselines for Object Detection" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } } ] diff --git a/test/data/chunker/1_out_chunks.json b/test/data/chunker/1_out_chunks.json index 43130ac..066055e 100644 --- a/test/data/chunker/1_out_chunks.json +++ b/test/data/chunker/1_out_chunks.json @@ -3,6 +3,8 @@ { "text": "arXiv:2408.09869v3 [cs.CL] 30 Aug 2024", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/0", @@ -28,12 +30,19 @@ } ] } - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Version 1.0", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/2", @@ -62,12 +71,19 @@ ], "headings": [ "Docling Technical Report" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. 
Staar", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/3", @@ -96,12 +112,19 @@ ], "headings": [ "Docling Technical Report" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "AI4K Group, IBM Research Ruschlikon, Switzerland", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/4", @@ -130,12 +153,19 @@ ], "headings": [ "Docling Technical Report" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "This technical report introduces Docling , an easy to use, self-contained, MITlicensed open-source package for PDF document conversion. It is powered by state-of-the-art specialized AI models for layout analysis (DocLayNet) and table structure recognition (TableFormer), and runs efficiently on commodity hardware in a small resource budget. The code interface allows for easy extensibility and addition of new features and models.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/6", @@ -165,12 +195,19 @@ "headings": [ "Docling Technical Report", "Abstract" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Converting PDF documents back into a machine-processable format has been a major challenge for decades due to their huge variability in formats, weak standardization and printing-optimized characteristic, which discards most structural features and metadata. With the advent of LLMs and popular application patterns such as retrieval-augmented generation (RAG), leveraging the rich content embedded in PDFs has become ever more relevant. In the past decade, several powerful document understanding solutions have emerged on the market, most of which are commercial software, cloud offerings [3] and most recently, multi-modal vision-language models. As of today, only a handful of open-source tools cover PDF conversion, leaving a significant feature and quality gap to proprietary solutions.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/8", @@ -199,12 +236,19 @@ ], "headings": [ "1 Introduction" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "With Docling , we open-source a very capable and efficient document conversion tool which builds on the powerful, specialized AI models and datasets for layout analysis and table structure recognition we developed and presented in the recent past [12, 13, 9]. Docling is designed as a simple, self-contained python library with permissive license, running entirely locally on commodity hardware. 
Its code architecture allows for easy extensibility and addition of new features and models.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/9", @@ -233,12 +277,19 @@ ], "headings": [ "1 Introduction" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Docling Technical Report", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/10", @@ -267,12 +318,19 @@ ], "headings": [ "1 Introduction" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "1", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/11", @@ -301,12 +359,19 @@ ], "headings": [ "1 Introduction" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Here is what Docling delivers today:", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/12", @@ -335,12 +400,19 @@ ], "headings": [ "1 Introduction" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "\u00b7 Converts PDF documents to JSON or Markdown format, stable and lightning fast", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/13", @@ -369,12 +441,19 @@ ], "headings": [ "1 Introduction" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "\u00b7 Understands detailed page layout, reading order, locates figures and recovers table structures", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/14", @@ -403,12 +482,19 @@ ], "headings": [ "1 Introduction" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "\u00b7 Extracts metadata from the document, such as title, authors, references and language", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/15", @@ -437,12 +523,19 @@ ], "headings": [ "1 Introduction" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "\u00b7 Optionally applies OCR, e.g. 
for scanned PDFs", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/16", @@ -471,12 +564,19 @@ ], "headings": [ "1 Introduction" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "\u00b7 Can be configured to be optimal for batch-mode (i.e high throughput, low time-to-solution) or interactive mode (compromise on efficiency, low time-to-solution)", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/17", @@ -505,12 +605,19 @@ ], "headings": [ "1 Introduction" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "\u00b7 Can leverage different accelerators (GPU, MPS, etc).", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/18", @@ -539,12 +646,19 @@ ], "headings": [ "1 Introduction" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "To use Docling, you can simply install the docling package from PyPI. Documentation and examples are available in our GitHub repository at github.com/DS4SD/docling. All required model assets 1 are downloaded to a local huggingface datasets cache on first use, unless you choose to pre-install the model assets in advance.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/20", @@ -573,12 +687,19 @@ ], "headings": [ "2 Getting Started" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Docling provides an easy code interface to convert PDF documents from file system, URLs or binary streams, and retrieve the output in either JSON or Markdown format. For convenience, separate methods are offered to convert single documents or batches of documents. A basic usage example is illustrated below. Further examples are available in the Doclign code repository.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/21", @@ -607,12 +728,19 @@ ], "headings": [ "2 Getting Started" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "from docling.document_converter import DocumentConverter source = \"https :// arxiv.org/pdf /2206.01062\" # PDF path or URL converter = DocumentConverter () result = converter.convert_single(source) print(result.render_as_markdown ()) # output: \"## DocLayNet: A Large Human -Annotated Dataset for Document -Layout Analysis [...]\"", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/22", @@ -641,12 +769,19 @@ ], "headings": [ "2 Getting Started" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Optionally, you can configure custom pipeline features and runtime options, such as turning on or off features (e.g. 
OCR, table structure recognition), enforcing limits on the input document size, and defining the budget of CPU threads. Advanced usage examples and options are documented in the README file. Docling also provides a Dockerfile to demonstrate how to install and run it inside a container.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/23", @@ -675,12 +810,19 @@ ], "headings": [ "2 Getting Started" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Docling implements a linear pipeline of operations, which execute sequentially on each given document (see Fig. 1). Each document is first parsed by a PDF backend, which retrieves the programmatic text tokens, consisting of string content and its coordinates on the page, and also renders a bitmap image of each page to support downstream operations. Then, the standard model pipeline applies a sequence of AI models independently on every page in the document to extract features and content, such as layout and table structures. Finally, the results from all pages are aggregated and passed through a post-processing stage, which augments metadata, detects the document language, infers reading-order and eventually assembles a typed document object which can be serialized to JSON or Markdown.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/25", @@ -709,12 +851,19 @@ ], "headings": [ "3 Processing pipeline" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Two basic requirements to process PDF documents in our pipeline are a) to retrieve all text content and their geometric coordinates on each page and b) to render the visual representation of each page as it would appear in a PDF viewer. Both these requirements are encapsulated in Docling's PDF backend interface. While there are several open-source PDF parsing libraries available for python, we faced major obstacles with all of them for different reasons, among which were restrictive", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/27", @@ -743,12 +892,19 @@ ], "headings": [ "3.1 PDF backends" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "$^{1}$see huggingface.co/ds4sd/docling-models/", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/28", @@ -777,12 +933,19 @@ ], "headings": [ "3.1 PDF backends" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "2", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/29", @@ -811,12 +974,19 @@ ], "headings": [ "3.1 PDF backends" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Figure 1: Sketch of Docling's default processing pipeline. 
The inner part of the model pipeline is easily customizable and extensible.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/30", @@ -845,12 +1015,19 @@ ], "headings": [ "3.1 PDF backends" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "licensing (e.g. pymupdf [7]), poor speed or unrecoverable quality issues, such as merged text cells across far-apart text tokens or table columns (pypdfium, PyPDF) [15, 14].", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/31", @@ -879,12 +1056,19 @@ ], "headings": [ "3.1 PDF backends" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "We therefore decided to provide multiple backend choices, and additionally open-source a custombuilt PDF parser, which is based on the low-level qpdf [4] library. It is made available in a separate package named docling-parse and powers the default PDF backend in Docling. As an alternative, we provide a PDF backend relying on pypdfium , which may be a safe backup choice in certain cases, e.g. if issues are seen with particular font encodings.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/32", @@ -913,12 +1097,19 @@ ], "headings": [ "3.1 PDF backends" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "As part of Docling, we initially release two highly capable AI models to the open-source community, which have been developed and published recently by our team. The first model is a layout analysis model, an accurate object-detector for page elements [13]. The second model is TableFormer [12, 9], a state-of-the-art table structure recognition model. We provide the pre-trained weights (hosted on huggingface) and a separate package for the inference code as docling-ibm-models . Both models are also powering the open-access deepsearch-experience, our cloud-native service for knowledge exploration tasks.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/34", @@ -947,12 +1138,19 @@ ], "headings": [ "3.2 AI models" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Our layout analysis model is an object-detector which predicts the bounding-boxes and classes of various elements on the image of a given page. Its architecture is derived from RT-DETR [16] and re-trained on DocLayNet [13], our popular human-annotated dataset for document-layout analysis, among other proprietary datasets. For inference, our implementation relies on the onnxruntime [5].", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/36", @@ -981,12 +1179,19 @@ ], "headings": [ "Layout Analysis Model" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "The Docling pipeline feeds page images at 72 dpi resolution, which can be processed on a single CPU with sub-second latency. 
All predicted bounding-box proposals for document elements are post-processed to remove overlapping proposals based on confidence and size, and then intersected with the text tokens in the PDF to group them into meaningful and complete units such as paragraphs, section titles, list items, captions, figures or tables.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/37", @@ -1015,12 +1220,19 @@ ], "headings": [ "Layout Analysis Model" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "The TableFormer model [12], first published in 2022 and since refined with a custom structure token language [9], is a vision-transformer model for table structure recovery. It can predict the logical row and column structure of a given table based on an input image, and determine which table cells belong to column headers, row headers or the table body. Compared to earlier approaches, TableFormer handles many characteristics of tables, such as partial or no borderlines, empty cells, rows or columns, cell spans and hierarchy both on column-heading or row-heading level, tables with inconsistent indentation or alignment and other complexities. For inference, our implementation relies on PyTorch [2].", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/39", @@ -1049,12 +1261,19 @@ ], "headings": [ "Table Structure Recognition" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "3", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/40", @@ -1083,12 +1302,19 @@ ], "headings": [ "Table Structure Recognition" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "The Docling pipeline feeds all table objects detected in the layout analysis to the TableFormer model, by providing an image-crop of the table and the included text cells. TableFormer structure predictions are matched back to the PDF cells in post-processing to avoid expensive re-transcription text in the table image. Typical tables require between 2 and 6 seconds to be processed on a standard CPU, strongly depending on the amount of included table cells.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/41", @@ -1117,12 +1343,19 @@ ], "headings": [ "Table Structure Recognition" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Docling provides optional support for OCR, for example to cover scanned PDFs or content in bitmaps images embedded on a page. In our initial release, we rely on EasyOCR [1], a popular thirdparty OCR library with support for many languages. Docling, by default, feeds a high-resolution page image (216 dpi) to the OCR engine, to allow capturing small print detail in decent quality. 
While EasyOCR delivers reasonable transcription quality, we observe that it runs fairly slow on CPU (upwards of 30 seconds per page).", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/43", @@ -1151,12 +1384,19 @@ ], "headings": [ "OCR" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "We are actively seeking collaboration from the open-source community to extend Docling with additional OCR backends and speed improvements.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/44", @@ -1185,12 +1425,19 @@ ], "headings": [ "OCR" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "In the final pipeline stage, Docling assembles all prediction results produced on each page into a well-defined datatype that encapsulates a converted document, as defined in the auxiliary package docling-core . The generated document object is passed through a post-processing model which leverages several algorithms to augment features, such as detection of the document language, correcting the reading order, matching figures with captions and labelling metadata such as title, authors and references. The final output can then be serialized to JSON or transformed into a Markdown representation at the users request.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/46", @@ -1219,12 +1466,19 @@ ], "headings": [ "3.3 Assembly" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Docling provides a straight-forward interface to extend its capabilities, namely the model pipeline. A model pipeline constitutes the central part in the processing, following initial document parsing and preceding output assembly, and can be fully customized by sub-classing from an abstract baseclass ( BaseModelPipeline ) or cloning the default model pipeline. This effectively allows to fully customize the chain of models, add or replace models, and introduce additional pipeline configuration parameters. To use a custom model pipeline, the custom pipeline class to instantiate can be provided as an argument to the main document conversion methods. We invite everyone in the community to propose additional or alternative models and improvements.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/48", @@ -1253,12 +1507,19 @@ ], "headings": [ "3.4 Extensibility" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Implementations of model classes must satisfy the python Callable interface. 
The __call__ method must accept an iterator over page objects, and produce another iterator over the page objects which were augmented with the additional features predicted by the model, by extending the provided PagePredictions data model accordingly.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/49", @@ -1287,12 +1548,19 @@ ], "headings": [ "3.4 Extensibility" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "In this section, we establish some reference numbers for the processing speed of Docling and the resource budget it requires. All tests in this section are run with default options on our standard test set distributed with Docling, which consists of three papers from arXiv and two IBM Redbooks, with a total of 225 pages. Measurements were taken using both available PDF backends on two different hardware systems: one MacBook Pro M3 Max, and one bare-metal server running Ubuntu 20.04 LTS on an Intel Xeon E5-2690 CPU. For reproducibility, we fixed the thread budget (through setting OMP NUM THREADS environment variable ) once to 4 (Docling default) and once to 16 (equal to full core count on the test hardware). All results are shown in Table 1.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/51", @@ -1321,12 +1589,19 @@ ], "headings": [ "4 Performance" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "If you need to run Docling in very low-resource environments, please consider configuring the pypdfium backend. While it is faster and more memory efficient than the default docling-parse backend, it will come at the expense of worse quality results, especially in table structure recovery.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/52", @@ -1355,12 +1630,19 @@ ], "headings": [ "4 Performance" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Establishing GPU acceleration support for the AI models is currently work-in-progress and largely untested, but may work implicitly when CUDA is available and discovered by the onnxruntime and", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/53", @@ -1389,12 +1671,19 @@ ], "headings": [ "4 Performance" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "4", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/54", @@ -1423,12 +1712,19 @@ ], "headings": [ "4 Performance" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "torch runtimes backing the Docling pipeline. 
We will deliver updates on this topic at in a future version of this report.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/55", @@ -1457,12 +1753,19 @@ ], "headings": [ "4 Performance" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Table 1: Runtime characteristics of Docling with the standard model pipeline and settings, on our test dataset of 225 pages, on two different systems. OCR is disabled. We show the time-to-solution (TTS), computed throughput in pages per second, and the peak memory used (resident set size) for both the Docling-native PDF backend and for the pypdfium backend, using 4 and 16 threads.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/56", @@ -1491,12 +1794,19 @@ ], "headings": [ "4 Performance" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Apple M3 Max, Thread budget.Thread budget = 4. Apple M3 Max, native backend.TTS = 177 s. Apple M3 Max, native backend.Pages/s = 1.27. Apple M3 Max, native backend.Mem = 6.20 GB. Apple M3 Max, pypdfium backend.TTS = 103 s. Apple M3 Max, pypdfium backend.Pages/s = 2.18. Apple M3 Max, pypdfium backend.Mem = 2.56 GB. (16 cores), Thread budget.Thread budget = 16. (16 cores), native backend.TTS = 167 s. (16 cores), native backend.Pages/s = 1.34. (16 cores), native backend.Mem = 6.20 GB. (16 cores), pypdfium backend.TTS = 92 s. (16 cores), pypdfium backend.Pages/s = 2.45. (16 cores), pypdfium backend.Mem = 2.56 GB. Intel(R) Xeon E5-2690, Thread budget.Thread budget = 4 16. Intel(R) Xeon E5-2690, native backend.TTS = 375 s 244 s. Intel(R) Xeon E5-2690, native backend.Pages/s = 0.60 0.92. Intel(R) Xeon E5-2690, native backend.Mem = 6.16 GB. Intel(R) Xeon E5-2690, pypdfium backend.TTS = 239 s 143 s. Intel(R) Xeon E5-2690, pypdfium backend.Pages/s = 0.94 1.57. Intel(R) Xeon E5-2690, pypdfium backend.Mem = 2.42 GB", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/tables/0", @@ -1528,12 +1838,19 @@ ], "captions": [ "Table 1: Runtime characteristics of Docling with the standard model pipeline and settings, on our test dataset of 225 pages, on two different systems. OCR is disabled. We show the time-to-solution (TTS), computed throughput in pages per second, and the peak memory used (resident set size) for both the Docling-native PDF backend and for the pypdfium backend, using 4 and 16 threads." - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Thanks to the high-quality, richly structured document conversion achieved by Docling, its output qualifies for numerous downstream applications. For example, Docling can provide a base for detailed enterprise document search, passage retrieval or classification use-cases, or support knowledge extraction pipelines, allowing specific treatment of different structures in the document, such as tables, figures, section structure or references. 
For popular generative AI application patterns, such as retrieval-augmented generation (RAG), we provide quackling , an open-source package which capitalizes on Docling's feature-rich document output to enable document-native optimized vector embedding and chunking. It plugs in seamlessly with LLM frameworks such as LlamaIndex [8]. Since Docling is fast, stable and cheap to run, it also makes for an excellent choice to build document-derived datasets. With its powerful table structure recognition, it provides significant benefit to automated knowledge-base construction [11, 10]. Docling is also integrated within the open IBM data prep kit [6], which implements scalable data transforms to build large-scale multi-modal training datasets.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/58", @@ -1562,12 +1879,19 @@ ], "headings": [ "5 Applications" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Docling is designed to allow easy extension of the model library and pipelines. In the future, we plan to extend Docling with several more models, such as a figure-classifier model, an equationrecognition model, a code-recognition model and more. This will help improve the quality of conversion for specific types of content, as well as augment extracted document metadata with additional information. Further investment into testing and optimizing GPU acceleration as well as improving the Docling-native PDF backend are on our roadmap, too.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/60", @@ -1596,12 +1920,19 @@ ], "headings": [ "6 Future work and contributions" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "We encourage everyone to propose or implement additional features and models, and will gladly take your inputs and contributions under review . The codebase of Docling is open for use and contribution, under the MIT license agreement and in alignment with our contributing guidelines included in the Docling repository. If you use Docling in your projects, please consider citing this technical report.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/61", @@ -1630,12 +1961,19 @@ ], "headings": [ "6 Future work and contributions" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "[1] J. AI. Easyocr: Ready-to-use ocr with 80+ supported languages. https://github.com/ JaidedAI/EasyOCR , 2024. Version: 1.7.0.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/63", @@ -1664,12 +2002,19 @@ ], "headings": [ "References" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "[2] J. Ansel, E. Yang, H. He, N. Gimelshein, A. Jain, M. Voznesensky, B. Bao, P. Bell, D. Berard, E. Burovski, G. Chauhan, A. Chourdia, W. Constable, A. Desmaison, Z. DeVito, E. Ellison, W. Feng, J. Gong, M. Gschwind, B. Hirsh, S. Huang, K. Kalambarkar, L. Kirsch, M. Lazos, M. Lezcano, Y. Liang, J. Liang, Y. Lu, C. Luk, B. 
Maher, Y. Pan, C. Puhrsch, M. Reso, M. Saroufim, M. Y. Siraichi, H. Suk, M. Suo, P. Tillet, E. Wang, X. Wang, W. Wen, S. Zhang, X. Zhao, K. Zhou, R. Zou, A. Mathews, G. Chanan, P. Wu, and S. Chintala. Pytorch 2: Faster", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/64", @@ -1698,12 +2043,19 @@ ], "headings": [ "References" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "5", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/65", @@ -1732,12 +2084,19 @@ ], "headings": [ "References" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "machine learning through dynamic python bytecode transformation and graph compilation. In Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2 (ASPLOS '24) . ACM, 4 2024. doi: 10.1145/3620665.3640366. URL https://pytorch.org/assets/pytorch2-2.pdf .", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/66", @@ -1766,12 +2125,19 @@ ], "headings": [ "References" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "[3] C. Auer, M. Dolfi, A. Carvalho, C. B. Ramis, and P. W. Staar. Delivering document conversion as a cloud service with high throughput and responsiveness. In 2022 IEEE 15th International Conference on Cloud Computing (CLOUD) , pages 363-373. IEEE, 2022.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/67", @@ -1800,12 +2166,19 @@ ], "headings": [ "References" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "[4] J. Berkenbilt. Qpdf: A content-preserving pdf document transformer, 2024. URL https: //github.com/qpdf/qpdf .", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/68", @@ -1834,12 +2207,19 @@ ], "headings": [ "References" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "[5] O. R. developers. Onnx runtime. https://onnxruntime.ai/ , 2024. Version: 1.18.1.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/69", @@ -1868,12 +2248,19 @@ ], "headings": [ "References" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "[6] IBM. Data Prep Kit: a community project to democratize and accelerate unstructured data preparation for LLM app developers, 2024. 
URL https://github.com/IBM/ data-prep-kit .", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/70", @@ -1902,12 +2289,19 @@ ], "headings": [ "References" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "[7] A. S. Inc. PyMuPDF, 2024. URL https://github.com/pymupdf/PyMuPDF .", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/71", @@ -1936,12 +2330,19 @@ ], "headings": [ "References" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "[8] J. Liu. LlamaIndex, 11 2022. URL https://github.com/jerryjliu/llama_index .", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/72", @@ -1970,12 +2371,19 @@ ], "headings": [ "References" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "[9] M. Lysak, A. Nassar, N. Livathinos, C. Auer, and P. Staar. Optimized Table Tokenization for Table Structure Recognition. In Document Analysis and Recognition - ICDAR 2023: 17th International Conference, San Jos'e, CA, USA, August 21-26, 2023, Proceedings, Part II , pages 37-50, Berlin, Heidelberg, Aug. 2023. Springer-Verlag. ISBN 978-3-031-41678-1. doi: 10. 1007/978-3-031-41679-8 3. URL https://doi.org/10.1007/978-3-031-41679-8_3 .", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/73", @@ -2004,12 +2412,19 @@ ], "headings": [ "References" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "[10] L. Mishra, S. Dhibi, Y. Kim, C. Berrospi Ramis, S. Gupta, M. Dolfi, and P. Staar. Statements: Universal information extraction from tables with large language models for ESG KPIs. In D. Stammbach, J. Ni, T. Schimanski, K. Dutia, A. Singh, J. Bingler, C. Christiaen, N. Kushwaha, V. Muccione, S. A. Vaghefi, and M. Leippold, editors, Proceedings of the 1st Workshop on Natural Language Processing Meets Climate Change (ClimateNLP 2024) , pages 193-214, Bangkok, Thailand, Aug. 2024. Association for Computational Linguistics. URL https://aclanthology.org/2024.climatenlp-1.15 .", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/74", @@ -2038,12 +2453,19 @@ ], "headings": [ "References" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "[11] L. Morin, V. Weber, G. I. Meijer, F. Yu, and P. W. J. Staar. Patcid: an open-access dataset of chemical structures in patent documents. Nature Communications , 15(1):6532, August 2024. ISSN 2041-1723. doi: 10.1038/s41467-024-50779-y. URL https://doi.org/10.1038/ s41467-024-50779-y .", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/75", @@ -2072,12 +2494,19 @@ ], "headings": [ "References" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "[12] A. 
Nassar, N. Livathinos, M. Lysak, and P. Staar. Tableformer: Table structure understanding with transformers. In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition , pages 4614-4623, 2022.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/76", @@ -2106,12 +2535,19 @@ ], "headings": [ "References" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "[13] B. Pfitzmann, C. Auer, M. Dolfi, A. S. Nassar, and P. Staar. Doclaynet: a large humanannotated dataset for document-layout segmentation. pages 3743-3751, 2022.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/77", @@ -2140,12 +2576,19 @@ ], "headings": [ "References" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "[14] pypdf Maintainers. pypdf: A Pure-Python PDF Library, 2024. URL https://github.com/ py-pdf/pypdf .", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/78", @@ -2174,12 +2617,19 @@ ], "headings": [ "References" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "[15] P. Team. PyPDFium2: Python bindings for PDFium, 2024. URL https://github.com/ pypdfium2-team/pypdfium2 .", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/79", @@ -2208,12 +2658,19 @@ ], "headings": [ "References" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "[16] Y. Zhao, W. Lv, S. Xu, J. Wei, G. Wang, Q. Dang, Y. Liu, and J. Chen. 
Detrs beat yolos on real-time object detection, 2023.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/80", @@ -2242,12 +2699,19 @@ ], "headings": [ "References" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "6", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/81", @@ -2276,12 +2740,19 @@ ], "headings": [ "References" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "In this section, we illustrate a few examples of Docling' s output in Markdown and JSON.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/83", @@ -2310,12 +2781,19 @@ ], "headings": [ "Appendix" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Birgit Pfitzmann IBM Research Rueschlikon, Switzerland bpf@zurich.ibm.com", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/85", @@ -2344,12 +2822,19 @@ ], "headings": [ "DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Christoph Auer IBM Research Rueschlikon, Switzerland cau@zurich.ibm.com", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/86", @@ -2378,12 +2863,19 @@ ], "headings": [ "DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Michele Dolfi IBM Research Rueschlikon, Switzerland dol@zurich.ibm.com", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/87", @@ -2412,12 +2904,19 @@ ], "headings": [ "DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Ahmed S. 
Nassar IBM Research Rueschlikon, Switzerland ahn@zurich.ibm.com", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/88", @@ -2446,12 +2945,19 @@ ], "headings": [ "DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Peter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/89", @@ -2480,12 +2986,19 @@ ], "headings": [ "DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Accurate document layout analysis is a key requirement for highquality PDF document conversion. With the recent availability of public, large ground-truth datasets such as PubLayNet and DocBank, deep-learning models have proven to be very effective at layout detection and segmentation. While these datasets are of adequate size to train such models, they severely lack in layout variability since they are sourced from scientific article repositories such as PubMed and arXiv only. Consequently, the accuracy of the layout segmentation drops significantly when these models are applied on more challenging and diverse layouts. In this paper, we present $_{DocLayNet}$, a new, publicly available, document-layout annotation dataset in COCO format. It contains 80863 manually annotated pages from diverse data sources to represent a wide variability in layouts. For each PDF page, the layout annotations provide labelled bounding-boxes with a choice of 11 distinct classes. DocLayNet also provides a subset of double- and triple-annotated pages to determine the inter-annotator agreement. In multiple experiments, we provide baseline accuracy scores (in mAP) for a set of popular object detection models. We also demonstrate that these models fall approximately 10% behind the inter-annotator agreement. Furthermore, we provide evidence that DocLayNet is of sufficient size. 
Lastly, we compare models trained on PubLayNet, DocBank and DocLayNet, showing that layout predictions of the DocLayNettrained models are more robust and thus the preferred choice for general-purpose document-layout analysis.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/91", @@ -2514,12 +3027,19 @@ ], "headings": [ "ABSTRACT" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "\u00b7 Information systems \u2192 Document structure ; \u00b7 Applied computing \u2192 Document analysis ; \u00b7 Computing methodologies \u2192 Machine learning ; Computer vision ; $_{Object detection}$;", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/93", @@ -2548,12 +3068,19 @@ ], "headings": [ "CCS CONCEPTS" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Permission to make digital or hard copies of part or all of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profi t or commercial advantage and that copies bear this notice and the full citation on thefirst page. Copyrights for third-party components of this work must be honored. For all other uses, contact the owner/author(s). KDD '22, August 14-18, 2022, Washington, DC, USA \u00a9 2022 Copyright held by the owner/author(s). ACM ISBN 978-1-4503-9385-0/22/08. https://doi.org/10.1145/3534678.3539043", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/94", @@ -2582,12 +3109,19 @@ ], "headings": [ "CCS CONCEPTS" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Birgit Pfitzmann IBM Research Rueschlikon, Switzerland bpf@zurich.ibm.com", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/96", @@ -2616,12 +3150,19 @@ ], "headings": [ "DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Christoph Auer IBM Research Rueschlikon, Switzerland cau@zurich.ibm.com", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/97", @@ -2650,12 +3191,19 @@ ], "headings": [ "DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Michele Dolfi IBM Research Rueschlikon, Switzerland dol@zurich.ibm.com", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/98", @@ -2684,12 +3232,19 @@ ], "headings": [ "DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Ahmed S. 
Nassar IBM Research Rueschlikon, Switzerland ahn@zurich.ibm.com", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/99", @@ -2718,12 +3273,19 @@ ], "headings": [ "DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Peter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/100", @@ -2752,12 +3314,19 @@ ], "headings": [ "DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Accurate document layout analysis is a key requirement for highquality PDF document conversion. With the recent availability of public, large groundtruth datasets such as PubLayNet and DocBank, deep-learning models have proven to be very effective at layout detection and segmentation. While these datasets are of adequate size to train such models, they severely lack in layout variability since they are sourced from scientific article repositories such as PubMed and arXiv only. Consequently, the accuracy of the layout segmentation drops significantly when these models are applied on more challenging and diverse layouts. In this paper, we present DocLayNet , a new, publicly available, document-layout annotation dataset in COCO format. It contains 80863 manually annotated pages from diverse data sources to represent a wide variability in layouts. For each PDF page, the layout annotations provide labelled bounding-boxes with a choice of 11 distinct classes. DocLayNet also provides a subset of double- and triple-annotated pages to determine the inter-annotator agreement. In multiple experiments, we provide baseline accuracy scores (in mAP) for a set of popular object detection models. We also demonstrate that these models fall approximately 10% behind the inter-annotator agreement. Furthermore, we provide evidence that DocLayNet is of sufficient size. 
Lastly, we compare models trained on PubLayNet, DocBank and DocLayNet, showing that layout predictions of the DocLayNettrained models are more robust and thus the preferred choice for general-purpose document-layout analysis.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/102", @@ -2786,12 +3355,19 @@ ], "headings": [ "ABSTRACT" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "$_{\u00b7 Information systems }$\u2192$_{ Document structure ; \u00b7 Applied computing }$ \u2192$_{ Document analysis ; \u00b7 Computing methodologies }$\u2192$_{ Machine learning ;}$ Computer vision ; Object detection ;", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/104", @@ -2820,12 +3396,19 @@ ], "headings": [ "CCS CONCEPTS" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Permission to make digital or hard copies of part or all of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. Copyrights for third-party components of this work must be honored. For all other uses, contact the owner/author(s).", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/105", @@ -2854,12 +3437,19 @@ ], "headings": [ "CCS CONCEPTS" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "KDD '22, August 14-18, 2022, Washington, DC, USA \u00a9 2022 Copyright held by the owner/author(s). ACM ISBN 978-1-4503-9385-0/22/08. https://doi.org/10.1145/3534678.3539043", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/106", @@ -2888,12 +3478,19 @@ ], "headings": [ "CCS CONCEPTS" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Figure 1: Four examples of complex page layouts across different document categories", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/107", @@ -2922,12 +3519,19 @@ ], "headings": [ "CCS CONCEPTS" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "PDF document conversion, layout segmentation, object-detection, data set, Machine Learning", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/109", @@ -2956,12 +3560,19 @@ ], "headings": [ "KEYWORDS" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar. 2022. DocLayNet: A Large Human-Annotated Dataset for DocumentLayout Analysis. In Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD '22), August 14-18, 2022, Washington, DC, USA. 
ACM, New York, NY, USA, 9 pages. https://doi.org/10.1145/ 3534678.3539043", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/111", @@ -2990,12 +3601,19 @@ ], "headings": [ "ACM Reference Format:" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Figure 1: Four examples of complex page layouts across different document categories", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/112", @@ -3024,12 +3642,19 @@ ], "headings": [ "ACM Reference Format:" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "PDF document conversion, layout segmentation, object-detection, data set, Machine Learning", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/114", @@ -3058,12 +3683,19 @@ ], "headings": [ "KEYWORDS" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Birgit Pfitzmann, Christoph Auer, Michele Dolfi , Ahmed S. Nassar, and Peter Staar. 2022. DocLayNet: A Large Human-Annotated Dataset for DocumentLayout Analysis. In Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD '22), August 14-18, 2022, Wash-$_{ington, DC, USA.}$ACM, New York, NY, USA, 9 pages. https://doi.org/10.1145/ 3534678.3539043", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/116", @@ -3092,12 +3724,19 @@ ], "headings": [ "ACM Reference Format:" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "1 INTRODUCTION", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/117", @@ -3126,12 +3765,19 @@ ], "headings": [ "ACM Reference Format:" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Despite the substantial improvements achieved with machine-learning (ML) approaches and deep neural networks in recent years, document conversion remains a challenging problem, as demonstrated by the numerous public competitions held on this topic [1-4]. The challenge originates from the huge variability in PDF documents regarding layout, language and formats (scanned, programmatic or a combination of both). Engineering a single ML model that can be applied on all types of documents and provides high-quality layout segmentation remains to this day extremely challenging [5]. To highlight the variability in document layouts, we show a few example documents from the DocLayNet dataset in Figure 1. Figure 2: Title page of the DocLayNet paper (arxiv .org/pdf/2206.01062) - left PDF, right rendered Markdown. If recognized, metadata such as authors are appearing first under the title. 
Text content inside figures is currently dropped, the caption is retained and linked to the figure in the JSON representation (not shown).", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/118", @@ -3160,12 +3806,19 @@ ], "headings": [ "ACM Reference Format:" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "7", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/119", @@ -3194,12 +3847,19 @@ ], "headings": [ "ACM Reference Format:" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "arXiv:2206.01062v1 [cs.CV] 2 Jun 2022", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/120", @@ -3228,12 +3888,19 @@ ], "headings": [ "ACM Reference Format:" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "KDD '22, August 14-18, 2022, Washington, DC, USA Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/121", @@ -3262,12 +3929,19 @@ ], "headings": [ "ACM Reference Format:" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Table 2: Prediction performance (mAP@0.5-0.95) of object detection networks on DocLayNet test set. The MRCNN (Mask R-CNN) and FRCNN (Faster R-CNN) models with ResNet-50 or ResNet-101 backbone were trained based on the network architectures from the detectron2 model zoo (Mask R-CNN R50, R101-FPN 3x, Faster R-CNN R101-FPN 3x), with default configurations. The YOLO implementation utilized was YOLOv5x6 [13]. All models were initialised using pre-trained weights from the COCO 2017 dataset.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/122", @@ -3296,12 +3970,19 @@ ], "headings": [ "ACM Reference Format:" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Caption, human.human = 84-89. Caption, MRCNN.R50 = 68.4. Caption, MRCNN.R101 = 71.5. Caption, FRCNN.R101 = 70.1. Caption, YOLO.v5x6 = 77.7. Footnote, human.human = 83-91. Footnote, MRCNN.R50 = 70.9. Footnote, MRCNN.R101 = 71.8. Footnote, FRCNN.R101 = 73.7. Footnote, YOLO.v5x6 = 77.2. Formula, human.human = 83-85. Formula, MRCNN.R50 = 60.1. Formula, MRCNN.R101 = 63.4. Formula, FRCNN.R101 = 63.5. Formula, YOLO.v5x6 = 66.2. List-item, human.human = 87-88. List-item, MRCNN.R50 = 81.2. List-item, MRCNN.R101 = 80.8. List-item, FRCNN.R101 = 81.0. List-item, YOLO.v5x6 = 86.2. Page-footer, human.human = 93-94. Page-footer, MRCNN.R50 = 61.6. Page-footer, MRCNN.R101 = 59.3. Page-footer, FRCNN.R101 = 58.9. Page-footer, YOLO.v5x6 = 61.1. Page-header, human.human = 85-89. Page-header, MRCNN.R50 = 71.9. Page-header, MRCNN.R101 = 70.0. Page-header, FRCNN.R101 = 72.0. Page-header, YOLO.v5x6 = 67.9. Picture, human.human = 69-71. Picture, MRCNN.R50 = 71.7. Picture, MRCNN.R101 = 72.7. 
Picture, FRCNN.R101 = . Picture, YOLO.v5x6 = 77.1. Section-header, human.human = 83-84. Section-header, MRCNN.R50 = 67.6. Section-header, MRCNN.R101 = 69.3. Section-header, FRCNN.R101 = 68.4. Section-header, YOLO.v5x6 = 74.6. Table, human.human = 77-81. Table, MRCNN.R50 = 82.2. Table, MRCNN.R101 = 82.9. Table, FRCNN.R101 = 82.2. Table, YOLO.v5x6 = 86.3. Text, human.human = 84-86. Text, MRCNN.R50 = 84.6. Text, MRCNN.R101 = 85.8. Text, FRCNN.R101 = 85.4. Text, YOLO.v5x6 = . , human.human = . , MRCNN.R50 = 76.7. , MRCNN.R101 = 80.4. , FRCNN.R101 = 79.9. , YOLO.v5x6 = 88.1. Title, human.human = 60-72. Title, MRCNN.R50 = . Title, MRCNN.R101 = . Title, FRCNN.R101 = . Title, YOLO.v5x6 = 82.7. All, human.human = 82-83. All, MRCNN.R50 = 72.4. All, MRCNN.R101 = 73.5. All, FRCNN.R101 = 73.4. All, YOLO.v5x6 = 76.8", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/tables/1", @@ -3333,12 +4014,19 @@ ], "captions": [ "Table 2: Prediction performance (mAP@0.5-0.95) of object detection networks on DocLayNet test set. The MRCNN (Mask R-CNN) and FRCNN (Faster R-CNN) models with ResNet-50 or ResNet-101 backbone were trained based on the network architectures from the detectron2 model zoo (Mask R-CNN R50, R101-FPN 3x, Faster R-CNN R101-FPN 3x), with default configurations. The YOLO implementation utilized was YOLOv5x6 [13]. All models were initialised using pre-trained weights from the COCO 2017 dataset." - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "to avoid this at any cost in order to have clear, unbiased baseline numbers for human document-layout annotation. Third, we introduced the feature of snapping boxes around text segments to obtain a pixel-accurate annotation and again reduce time and effort. The CCS annotation tool automatically shrinks every user-drawn box to the minimum bounding-box around the enclosed text-cells for all purely text-based segments, which excludes only Table and $_{Picture}$. For the latter, we instructed annotation staffto minimise inclusion of surrounding whitespace while including all graphical lines. A downside of snapping boxes to enclosed text cells is that some wrongly parsed PDF pages cannot be annotated correctly and need to be skipped. Fourth, we established a way toflag pages as rejected for cases where no valid annotation according to the label guidelines could be achieved. Example cases for this would be PDF pages that render incorrectly or contain layouts that are impossible to capture with non-overlapping rectangles. Such rejected pages are not contained in thefinal dataset. With all these measures in place, experienced annotation staffmanaged to annotate a single page in a typical timeframe of 20s to 60s, depending on its complexity.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/123", @@ -3367,12 +4055,19 @@ ], "headings": [ "ACM Reference Format:" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "The primary goal of DocLayNet is to obtain high-quality ML models capable of accurate document-layout analysis on a wide variety of challenging layouts. 
As discussed in Section 2, object detection models are currently the easiest to use, due to the standardisation of ground-truth data in COCO format [16] and the availability of general frameworks such as detectron2 [17]. Furthermore, baseline numbers in PubLayNet and DocBank were obtained using standard object detection models such as Mask R-CNN and Faster R-CNN. As such, we will relate to these object detection methods in this", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/125", @@ -3401,12 +4096,19 @@ ], "headings": [ "5 EXPERIMENTS" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Figure 3: Page 6 of the DocLayNet paper. If recognized, metadata such as authors are appearing first under the title. Elements recognized as page headers or footers are suppressed in Markdown to deliver uninterrupted content in reading order. Tables are inserted in reading order. The paragraph in \"5. Experiments\" wrapping over the column end is broken up in two and interrupted by the table.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/126", @@ -3435,12 +4137,19 @@ ], "headings": [ "5 EXPERIMENTS" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Figure 5: Prediction performance (mAP@0.5-0.95) of a Mask R-CNN network with ResNet50 backbone trained on increasing fractions of the DocLayNet dataset. The learning curv eflattens around the 80% mark, indicating that increasing the size of the DocLayNet dataset with similar data will not yield significantly better predictions.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/127", @@ -3469,12 +4178,19 @@ ], "headings": [ "5 EXPERIMENTS" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "paper and leave the detailed evaluation of more recent methods mentioned in Section 2 for future work.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/128", @@ -3503,12 +4219,19 @@ ], "headings": [ "5 EXPERIMENTS" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "In this section, we will present several aspects related to the performance of object detection models on DocLayNet. Similarly as in PubLayNet, we will evaluate the quality of their predictions using mean average precision (mAP) with 10 overlaps that range from 0.5 to 0.95 in steps of 0.05 (mAP@0.5-0.95). These scores are computed by leveraging the evaluation code provided by the COCO API [16].", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/129", @@ -3537,12 +4260,19 @@ ], "headings": [ "5 EXPERIMENTS" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "In Table 2, we present baseline experiments (given in mAP) on Mask R-CNN [12], Faster R-CNN [11], and YOLOv5 [13]. 
Both training and evaluation were performed on RGB images with dimensions of $^{1025}$\u00d71025 pixels. For training, we only used one annotation in case of redundantly annotated pages. As one can observe, the variation in mAP between the models is rather low, but overall between 6 and 10% lower than the mAP computed from the pairwise human annotations on triple-annotated pages. This gives a good indication that the DocLayNet dataset poses a worthwhile challenge for the research community to close the gap between human recognition and ML approaches. It is interesting to see that Mask R-CNN and Faster R-CNN produce very comparable mAP scores, indicating that pixel-based image segmentation derived from bounding-boxes does not help to obtain better predictions. On the other hand, the more recent Yolov5x model does very well and even out-performs humans on selected labels such as $_{Text}$, Table and $_{Picture}$. This is not entirely surprising, as and Picture are abundant and the most visually distinctive in a document.", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/131", @@ -3571,12 +4301,19 @@ ], "headings": [ "Baselines for Object Detection" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Prediclion Derormance (up80.5-0.85 ohobeci detecion lalo ks Doclaynal Lest saL Ine VACNN (Mask R-CNNI and FACNN (Faster A-CNM) modcs mith PosNc: 50 PosNo: 101 backtone woro trainod based on Enc nchwwcrk achrocturos tom Ihc Oeronhroase a-CNn aso rioi-Fpn Jx, FasieA-Cnn a1o1-FPN Jx), wilh delaui conlwuralions The YoUg mpomorcabon utilzod w2s YoloSyb(13| modos woro inbalsod usino cro-trunodmonhts hron Coco 2017 datasor", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/132", @@ -3605,12 +4342,19 @@ ], "headings": [ "Baselines for Object Detection" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Gaoon, noun = . Gaoon, Mrcnn = . Gaoon, MaCNN = . Gaoon, Frcne = . Gaoon, Yolo = . Foomolo, noun = . Foomolo, Mrcnn = . Foomolo, MaCNN = . Foomolo, Frcne = . Foomolo, Yolo = . Foula, noun = . Foula, Mrcnn = . Foula, MaCNN = . Foula, Frcne = . Foula, Yolo = . Ust-lern, noun = . Ust-lern, Mrcnn = . Ust-lern, MaCNN = . Ust-lern, Frcne = . Ust-lern, Yolo = . Page-locer, noun = . Page-locer, Mrcnn = . Page-locer, MaCNN = . Page-locer, Frcne = . Page-locer, Yolo = . Faqe-haje, noun = . Faqe-haje, Mrcnn = . Faqe-haje, MaCNN = . Faqe-haje, Frcne = . Faqe-haje, Yolo = . Pxlu, noun = . Pxlu, Mrcnn = . Pxlu, MaCNN = . Pxlu, Frcne = . Pxlu, Yolo = . Sonhoade, noun = . Sonhoade, Mrcnn = . Sonhoade, MaCNN = . Sonhoade, Frcne = . 
Sonhoade, Yolo = ", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/tables/2", @@ -3639,12 +4383,19 @@ ], "headings": [ "Baselines for Object Detection" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "iD avod Ihbs arcost cha unbasndbasolino numoc human cocumnnt-Laycut annotalion; Thrd Inirooucod leatura 0i snapoina Doxes around lerl scainunis cblan & pixel-accuiale annolaton and aJan feduce Bifre and elonThe CCS annoinbon aloMalca shruks Ovory Usor-drawnboro mnmum boundino-borarounaIho onclosod coxt-colls Purolytort basud scoitontwhich uxclldcs Ort Tatlo and Picluo latsor Inssucicdannjlabon sha mnim so inclusion Suitcurding mlospeco whloIncvon Oenoncang doans d0 oisnaocmnbors Onchse Ihal So10 wioogly Daisoc Pogcs Cannol be annotalcd coTcCEY and nccd supocd Foudn Oshdned Wuyio(aq Dagcs (ccclod Cases whcion valid anncuabon eccofding abeiqu Oelines coukbe acheneu Eamnole Case, flis wouk PDF peoe3 Ihal rendernnccrrecUy contanlavuta hat Imnosshk cantra milh Vananonnyogannio{ Suchiceciodoaoos not coralnon Ihofnn hr Aroknacoarreehetyn annollca slall nluuocd unnoln sina \" Puou lypical Lmnetamre 0l 20s 10 605 cecendnc conoanty", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/133", @@ -3673,12 +4424,19 @@ ], "headings": [ "Baselines for Object Detection" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Ine crimary goal OocVAYNo cblan hion-quality Modols AccuaiodoaMoiuvana4s WMeVanalon chalcnonglayoul: Cecurdg echon Doicdi Delccion modcb rtene Casistlo Usc, Quulo Hhndandiubon ground-vuth data COCO lornat [16] and avaladloy enetal Irarnenoiks uch derectrcnz7] Furnemmcre, baseline nmnoe < I Putun Notand DocBank calanodusnsundad coict dosnchonmodols such Mas< A CNN and Fasior A CNN SuEna blraomhdelecfa nonInr Canacle", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/135", @@ -3707,12 +4465,19 @@ ], "headings": [ "5 EXPERIMENTS" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Fauri Prco chon ocrloianC( 005-095) ola Mask A-CNN ncthoik ilh AcsNciSo backbono brainod on incrcasing Iracbons oi DocLaynei calasot Tne loannp auro altons around Ih0 \u20ac03 noicahino Ihal inxreasing /e 520 Q Iho DocL\u00f8y Nel dalasot Amardaen nol Ycid sn: dorOocC Chons LAD", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/136", @@ -3741,12 +4506,19 @@ ], "headings": [ "5 EXPERIMENTS" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "pangrandloave detallod evalvallon %moro rcoarimolhods monionan Secilg Jorhlure work", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/137", @@ -3775,12 +4547,19 @@ ], "headings": [ "5 EXPERIMENTS" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Inuhs sechon All Deseni seur8/ asoecis reles00 Perormanoe ouieci celec 
on DoxclayNet Simamtas In PLoLaynnt oyuato tnn qualmy cuthnlr crodictionsusiramnanavnna prncisicn (TTAP) wch IDovrdaos that rangn trom 0 5ta 005 (nap,o6-00: Ml olue Fnoula Cvurbar uvalaion coou piayIed DY Ihu COCO API/161 ook", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/138", @@ -3809,12 +4588,19 @@ ], "headings": [ "5 EXPERIMENTS" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "ptesenl baselne expenrnenls (Qvenin MAF) on Mas< R-CNN /121 Fasler F-CNN [11] an1 YOLOvS [13] Bou1 brann anavailang woropomormod AGa Imnoos vith dimonsions 1025 chxrols For tralring onN usodomannolatln Incaso ohcuunourfhunnolulco Dac3 Ohenn Vuruhoninptalunhamagny usnaroA en hn 10?7 loworrnannomap conoutec paicaisehuman anncrbons Aoo-amculeopnnos Ins Cves nacaton thatrhe DocLayNot daasci DOfo s mornwro clagnoo [csoarcncomrurt gap bctwoon human focogniticn and VL aporoaces nlelesuio IharNaska-CNNead Fasler GNincroova comnanen Maseoes nnocauna Ulbi AICBasodnanc scomrorubon oormvod Irom bounon)ooros Ooo{ abuin totcrorcochons Ontho chornnno Mcrocconi YolavSrmrodel does verywell und even Dul-Perdorins selectedlubels such Tedle undpcturl enbeh surcrisio Ta oloandPchre poincant amimemostasiaIN ishinsine documen: Ouau hnne", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/140", @@ -3843,12 +4629,19 @@ ], "headings": [ "Baselines for Object Detection" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "8", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/141", @@ -3877,12 +4670,19 @@ ], "headings": [ "Baselines for Object Detection" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "KDD '22, August 14-18, 2022, Washington, DC, USA Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/142", @@ -3911,12 +4711,19 @@ ], "headings": [ "Baselines for Object Detection" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Table 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as % of row \"Total\") in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric between pairwise annotations from the tripleannotated pages, from which we obtain accuracy ranges. Table 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurr ence (as % of row \"Total\") in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric between pairwise annotations from the triple-annotated pages, from which we obtain accuracy ranges. 
B", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/143", @@ -3945,12 +4752,19 @@ ], "headings": [ "Baselines for Object Detection" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Caption, Count. = 22524. Caption, % of Total.Train = 2.04. Caption, % of Total.Test = 1.77. Caption, % of Total.Val = 2.32. Caption, triple inter-annotator mAP @ 0.5-0.95 (%).All = 84-89. Caption, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = 40-61. Caption, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 86-92. Caption, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 94-99. Caption, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 95-99. Caption, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = 69-78. Caption, triple inter-annotator mAP @ 0.5-0.95 (%).T en = n/a. Footnote, Count. = 6318. Footnote, % of Total.Train = 0.60. Footnote, % of Total.Test = 0.31. Footnote, % of Total.Val = 0.58. Footnote, triple inter-annotator mAP @ 0.5-0.95 (%).All = 83-91. Footnote, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = n/a. Footnote, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 100. Footnote, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 62-88. Footnote, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 85-94. Footnote, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = n/a. Footnote, triple inter-annotator mAP @ 0.5-0.95 (%).T en = 82-97. Formula, Count. = 25027. Formula, % of Total.Train = 2.25. Formula, % of Total.Test = 1.90. Formula, % of Total.Val = 2.96. Formula, triple inter-annotator mAP @ 0.5-0.95 (%).All = 83-85. Formula, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = . Formula, triple inter-annotator mAP @ 0.5-0.95 (%).Man = n/a. Formula, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 84-87. Formula, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 86-96. Formula, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = . Formula, triple inter-annotator mAP @ 0.5-0.95 (%).T en = n/a. List-item, Count. = 185660. List-item, % of Total.Train = 17.19. List-item, % of Total.Test = 13.34. List-item, % of Total.Val = 15.82. List-item, triple inter-annotator mAP @ 0.5-0.95 (%).All = 87-88. List-item, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = 74-83. List-item, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 90-92. List-item, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 97-97. List-item, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 81-85. List-item, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = 75-88. List-item, triple inter-annotator mAP @ 0.5-0.95 (%).T en = 93-95. Page-footer, Count. = 70878. Page-footer, % of Total.Train = 6.51. Page-footer, % of Total.Test = 5.58. Page-footer, % of Total.Val = 6.00. Page-footer, triple inter-annotator mAP @ 0.5-0.95 (%).All = 93-94. Page-footer, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = 88-90. Page-footer, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 95-96. Page-footer, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 100. Page-footer, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 92-97. Page-footer, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = 100. Page-footer, triple inter-annotator mAP @ 0.5-0.95 (%).T en = 96-98. Page-header, Count. = 58022. Page-header, % of Total.Train = 5.10. Page-header, % of Total.Test = 6.70. Page-header, % of Total.Val = 5.06. Page-header, triple inter-annotator mAP @ 0.5-0.95 (%).All = 85-89. 
Page-header, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = 66-76. Page-header, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 90-94. Page-header, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 98-100. Page-header, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 91-92. Page-header, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = 97-99. Page-header, triple inter-annotator mAP @ 0.5-0.95 (%).T en = 81-86. Picture, Count. = 45976. Picture, % of Total.Train = 4.21. Picture, % of Total.Test = 2.78. Picture, % of Total.Val = 5.31. Picture, triple inter-annotator mAP @ 0.5-0.95 (%).All = 69-71. Picture, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = 56-59. Picture, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 82-86. Picture, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 69-82. Picture, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 80-95. Picture, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = 66-71. Picture, triple inter-annotator mAP @ 0.5-0.95 (%).T en = 59-76. Section-header, Count. = 142884. Section-header, % of Total.Train = 12.60. Section-header, % of Total.Test = 15.77. Section-header, % of Total.Val = 12.85. Section-header, triple inter-annotator mAP @ 0.5-0.95 (%).All = 83-84. Section-header, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = 76-81. Section-header, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 90-92. Section-header, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 94-95. Section-header, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 87-94. Section-header, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = 69-73. Section-header, triple inter-annotator mAP @ 0.5-0.95 (%).T en = 78-86. Table, Count. = 34733. Table, % of Total.Train = 3.20. Table, % of Total.Test = 2.27. Table, % of Total.Val = 3.60. Table, triple inter-annotator mAP @ 0.5-0.95 (%).All = 77-81. Table, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = 75-80. Table, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 83-86. Table, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 98-99. Table, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 58-80. Table, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = 79-84. Table, triple inter-annotator mAP @ 0.5-0.95 (%).T en = 70-85. Text, Count. = 510377. Text, % of Total.Train = 45.82. Text, % of Total.Test = 49.28. Text, % of Total.Val = 45.00. Text, triple inter-annotator mAP @ 0.5-0.95 (%).All = 84-86. Text, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = 81-86. Text, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 88-93. Text, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 89-93. Text, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 87-92. Text, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = 71-79. Text, triple inter-annotator mAP @ 0.5-0.95 (%).T en = 87-95. Title, Count. = 5071. Title, % of Total.Train = 0.47. Title, % of Total.Test = 0.30. Title, % of Total.Val = 0.50. Title, triple inter-annotator mAP @ 0.5-0.95 (%).All = 60-72. Title, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = 24-63. Title, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 50-63. Title, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 94-100. Title, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 82-96. Title, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = 68-79. Title, triple inter-annotator mAP @ 0.5-0.95 (%).T en = 24-56. Total, Count. = 1107470. Total, % of Total.Train = 941123. Total, % of Total.Test = 99816. Total, % of Total.Val = 66531. Total, triple inter-annotator mAP @ 0.5-0.95 (%).All = 82-83. Total, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = 71-74. 
Total, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 79-81. Total, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 89-94. Total, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 86-91. Total, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = 71-76. Total, triple inter-annotator mAP @ 0.5-0.95 (%).T en = 68-85", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/tables/3", @@ -3982,12 +4796,19 @@ ], "captions": [ "Table 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as % of row \"Total\") in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric between pairwise annotations from the tripleannotated pages, from which we obtain accuracy ranges. Table 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurr ence (as % of row \"Total\") in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric between pairwise annotations from the triple-annotated pages, from which we obtain accuracy ranges. B" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "Figure 3: face. The laid te be drawn the respe", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/144", @@ -4016,12 +4837,19 @@ ], "headings": [ "Baselines for Object Detection" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "we distribute d the annotation workload and performed continuous quality contr ols. Phase one and two required a small team of experts only. For phases three and four, a group of 40 dedicated annotators were assembled and supervised. Phase 1: Data selection and preparation. Our inclusion cri-", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/145", @@ -4050,12 +4878,19 @@ ], "headings": [ "Baselines for Object Detection" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "of pages ed by seerties. For cument figur es or object how", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/146", @@ -4084,12 +4919,19 @@ ], "headings": [ "Baselines for Object Detection" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "d the colfealayout labels. Pageand $_{Title}$. class cificity ed for of the ambiguous, while coverage ensures that all meaningful items on a page can be annotated. We refrained from class labels that are very specific to a document category, such as Abstract in the Scientific Articles category. We also avoided class labels that are tightly linked to the semantics of the text. 
Labels such as Author and $_{Affiliation}$, as seen", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/147", @@ -4118,12 +4960,19 @@ ], "headings": [ "Baselines for Object Detection" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "teria for documents were described in Section 3. A large effort went into ensuring that all documents are free to use. The data sources in DocBank, are often only distinguishable by discriminating on $^{3}$https://arxiv.org/ Figure 4: Table 1 from the DocLayNet paper in the original PDF (A), as rendered Markdown (B) and in JSON representation (C). Spanning table cells, such as the multi-column header \"triple interannotator mAP@0.5-0.95 (%)\", is repeated for each column in the Markdown representation (B), which guarantees that every data point can be traced back to row and column headings only by its grid coordinates in the table. In the JSON representation, the span information is reflected in the fields of each table cell (C).", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/148", @@ -4152,12 +5001,19 @@ ], "headings": [ "Baselines for Object Detection" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } }, { "text": "9", "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "doc_items": [ { "self_ref": "#/texts/149", @@ -4186,8 +5042,13 @@ ], "headings": [ "Baselines for Object Detection" - ] + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } } } ] -} +} \ No newline at end of file
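Note (not part of the patch): a minimal sketch of how the extended chunk metadata covered by these fixtures surfaces at runtime. The input path is hypothetical, and it assumes a DoclingDocument was previously serialized to JSON; the field values shown in the comments mirror the fixture data above.

    from pathlib import Path

    from docling_core.transforms.chunker import HierarchicalChunker
    from docling_core.types import DoclingDocument

    # Load a previously serialized DoclingDocument (path is hypothetical).
    doc = DoclingDocument.model_validate_json(Path("2408.09869v3.json").read_text())

    chunker = HierarchicalChunker()
    for chunk in chunker.chunk(dl_doc=doc):
        # Fields introduced by this change, as exercised by the fixtures above:
        print(chunk.meta.schema_name)  # "docling_core.transforms.chunker.DocMeta"
        print(chunk.meta.version)      # "1.0.0"
        print(chunk.meta.origin)       # DocumentOrigin (mimetype, binary_hash, filename) or None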