From 9d7e831fb23c5069361bcb6be8d562f36393398b Mon Sep 17 00:00:00 2001 From: Shubham Gupta <26436285+sh-gupta@users.noreply.github.com> Date: Mon, 18 Nov 2024 13:34:16 +0100 Subject: [PATCH] feat: add get_image for all DocItem (#67) * Moved image attribute from FloatingItem to DocItem Signed-off-by: Shubham Gupta <26436285+sh-gupta@users.noreply.github.com> * Revert "Moved image attribute from FloatingItem to DocItem" This reverts commit e48cd47d3743a25ce17bbaf31b3152ed53365fba. Signed-off-by: Shubham Gupta <26436285+sh-gupta@users.noreply.github.com> * Added get_image to DocItem and FloatingItem Signed-off-by: Shubham Gupta <26436285+sh-gupta@users.noreply.github.com> * Added tests for get_image in DocItem and FloatingItem Signed-off-by: Shubham Gupta <26436285+sh-gupta@users.noreply.github.com> * Updated get_image docstring in DocItem and FloatingItem Signed-off-by: Shubham Gupta <26436285+sh-gupta@users.noreply.github.com> --------- Signed-off-by: Shubham Gupta <26436285+sh-gupta@users.noreply.github.com> Co-authored-by: Shubham Gupta --- docling_core/types/doc/document.py | 36 ++++++++ test/test_docling_doc.py | 128 +++++++++++++++++++++++++++++ 2 files changed, 164 insertions(+) diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index 9611ee7..6a47598 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -551,6 +551,28 @@ def get_location_tokens( return location + def get_image(self, doc: "DoclingDocument") -> Optional[PILImage.Image]: + """Returns the image of this DocItem. + + The function returns None if this DocItem has no valid provenance or + if a valid image of the page containing this DocItem is not available + in doc. + """ + if not len(self.prov): + return None + + page = doc.pages.get(self.prov[0].page_no) + if page is None or page.size is None or page.image is None: + return None + + page_image = page.image.pil_image + crop_bbox = ( + self.prov[0] + .bbox.to_top_left_origin(page_height=page.size.height) + .scaled(scale=page_image.height / page.size.height) + ) + return page_image.crop(crop_bbox.as_tuple()) + class TextItem(DocItem): """TextItem.""" @@ -633,6 +655,20 @@ def caption_text(self, doc: "DoclingDocument") -> str: text += cap.resolve(doc).text return text + def get_image(self, doc: "DoclingDocument") -> Optional[PILImage.Image]: + """Returns the image corresponding to this FloatingItem. + + This function returns the PIL image from self.image if one is available. + Otherwise, it uses DocItem.get_image to get an image of this FloatingItem. + + In particular, when self.image is None, the function returns None if this + FloatingItem has no valid provenance or the doc does not contain a valid image + for the required page. + """ + if self.image is not None: + return self.image.pil_image + return super().get_image(doc=doc) + class PictureItem(FloatingItem): """PictureItem.""" diff --git a/test/test_docling_doc.py b/test/test_docling_doc.py index 04128ba..18adb20 100644 --- a/test/test_docling_doc.py +++ b/test/test_docling_doc.py @@ -1,4 +1,5 @@ from collections import deque +from unittest.mock import Mock import pytest import yaml @@ -7,6 +8,7 @@ from docling_core.types.doc.document import ( CURRENT_VERSION, + BoundingBox, DocItem, DoclingDocument, DocumentOrigin, @@ -15,7 +17,9 @@ KeyValueItem, ListItem, PictureItem, + ProvenanceItem, SectionHeaderItem, + Size, TableCell, TableData, TableItem, @@ -407,3 +411,127 @@ def test_version_doc(): comp_version = f"{major_split[0]}.{minor_split[0]}.{int(patch_split[0]) + 1}" doc = DoclingDocument(name="Untitled 1", version=comp_version) assert doc.version == CURRENT_VERSION + + +def test_docitem_get_image(): + # Prepare the document + doc = DoclingDocument(name="Dummy") + + page1_image = PILImage.new(mode="RGB", size=(200, 400), color=(0, 0, 0)) + doc_item_image = PILImage.new(mode="RGB", size=(20, 40), color=(255, 0, 0)) + page1_image.paste(doc_item_image, box=(20, 40)) + + doc.add_page( # With image + page_no=1, + size=Size(width=20, height=40), + image=ImageRef.from_pil(page1_image, dpi=72), + ) + doc.add_page(page_no=2, size=Size(width=20, height=40), image=None) # Without image + + # DocItem with no provenance + doc_item = DocItem(self_ref="#", label=DocItemLabel.TEXT, prov=[]) + assert doc_item.get_image(doc=doc) is None + + # DocItem on an invalid page + doc_item = DocItem( + self_ref="#", + label=DocItemLabel.TEXT, + prov=[ProvenanceItem(page_no=3, bbox=Mock(spec=BoundingBox), charspan=(1, 2))], + ) + assert doc_item.get_image(doc=doc) is None + + # DocItem on a page without page image + doc_item = DocItem( + self_ref="#", + label=DocItemLabel.TEXT, + prov=[ProvenanceItem(page_no=2, bbox=Mock(spec=BoundingBox), charspan=(1, 2))], + ) + assert doc_item.get_image(doc=doc) is None + + # DocItem on a page with valid page image + doc_item = DocItem( + self_ref="#", + label=DocItemLabel.TEXT, + prov=[ + ProvenanceItem( + page_no=1, bbox=BoundingBox(l=2, t=4, r=4, b=8), charspan=(1, 2) + ) + ], + ) + returned_doc_item_image = doc_item.get_image(doc=doc) + assert ( + returned_doc_item_image is not None + and returned_doc_item_image.tobytes() == doc_item_image.tobytes() + ) + + +def test_floatingitem_get_image(): + # Prepare the document + doc = DoclingDocument(name="Dummy") + + page1_image = PILImage.new(mode="RGB", size=(200, 400), color=(0, 0, 0)) + floating_item_image = PILImage.new(mode="RGB", size=(20, 40), color=(255, 0, 0)) + page1_image.paste(floating_item_image, box=(20, 40)) + + doc.add_page( # With image + page_no=1, + size=Size(width=20, height=40), + image=ImageRef.from_pil(page1_image, dpi=72), + ) + doc.add_page(page_no=2, size=Size(width=20, height=40), image=None) # Without image + + # FloatingItem with explicit image different from image based on provenance + new_image = PILImage.new(mode="RGB", size=(40, 80), color=(0, 255, 0)) + floating_item = FloatingItem( + self_ref="#", + label=DocItemLabel.PICTURE, + prov=[ + ProvenanceItem( + page_no=1, bbox=BoundingBox(l=2, t=4, r=6, b=12), charspan=(1, 2) + ) + ], + image=ImageRef.from_pil(image=new_image, dpi=72), + ) + retured_image = floating_item.get_image(doc=doc) + assert retured_image is not None and retured_image.tobytes() == new_image.tobytes() + + # FloatingItem without explicit image and no provenance + floating_item = FloatingItem( + self_ref="#", label=DocItemLabel.PICTURE, prov=[], image=None + ) + assert floating_item.get_image(doc=doc) is None + + # FloatingItem without explicit image on invalid page + floating_item = FloatingItem( + self_ref="#", + label=DocItemLabel.PICTURE, + prov=[ProvenanceItem(page_no=3, bbox=Mock(spec=BoundingBox), charspan=(1, 2))], + image=None, + ) + assert floating_item.get_image(doc=doc) is None + + # FloatingItem without explicit image on a page without page image + floating_item = FloatingItem( + self_ref="#", + label=DocItemLabel.PICTURE, + prov=[ProvenanceItem(page_no=2, bbox=Mock(spec=BoundingBox), charspan=(1, 2))], + image=None, + ) + assert floating_item.get_image(doc=doc) is None + + # FloatingItem without explicit image on a page with page image + floating_item = FloatingItem( + self_ref="#", + label=DocItemLabel.PICTURE, + prov=[ + ProvenanceItem( + page_no=1, bbox=BoundingBox(l=2, t=4, r=4, b=8), charspan=(1, 2) + ) + ], + image=None, + ) + retured_image = floating_item.get_image(doc=doc) + assert ( + retured_image is not None + and retured_image.tobytes() == floating_item_image.tobytes() + )