From 9d7e831fb23c5069361bcb6be8d562f36393398b Mon Sep 17 00:00:00 2001
From: Shubham Gupta <26436285+sh-gupta@users.noreply.github.com>
Date: Mon, 18 Nov 2024 13:34:16 +0100
Subject: [PATCH] feat: add get_image for all DocItem (#67)

* Moved image attribute from FloatingItem to DocItem

Signed-off-by: Shubham Gupta <26436285+sh-gupta@users.noreply.github.com>

* Revert "Moved image attribute from FloatingItem to DocItem"

This reverts commit e48cd47d3743a25ce17bbaf31b3152ed53365fba.

Signed-off-by: Shubham Gupta <26436285+sh-gupta@users.noreply.github.com>

* Added get_image to DocItem and FloatingItem

Signed-off-by: Shubham Gupta <26436285+sh-gupta@users.noreply.github.com>

* Added tests for get_image in DocItem and FloatingItem

Signed-off-by: Shubham Gupta <26436285+sh-gupta@users.noreply.github.com>

* Updated get_image docstring in DocItem and FloatingItem

Signed-off-by: Shubham Gupta <26436285+sh-gupta@users.noreply.github.com>

---------

Signed-off-by: Shubham Gupta <26436285+sh-gupta@users.noreply.github.com>
Co-authored-by: Shubham Gupta <Shubham.Gupta1@ibm.com>
---
 docling_core/types/doc/document.py |  36 ++++++++
 test/test_docling_doc.py           | 128 +++++++++++++++++++++++++++++
 2 files changed, 164 insertions(+)

diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
index 9611ee7..6a47598 100644
--- a/docling_core/types/doc/document.py
+++ b/docling_core/types/doc/document.py
@@ -551,6 +551,28 @@ def get_location_tokens(
 
         return location
 
+    def get_image(self, doc: "DoclingDocument") -> Optional[PILImage.Image]:
+        """Returns the image of this DocItem, cropped from its page image.
+
+        Only the first provenance entry is used. The function returns None if
+        this DocItem has no provenance or if doc has no page, page size or page
+        image for the page referenced by that entry.
+        """
+        if not self.prov:
+            return None
+
+        page = doc.pages.get(self.prov[0].page_no)
+        if page is None or page.size is None or page.image is None:
+            return None
+        # Top-left-origin bbox in pixels; the height ratio is applied to both axes.
+        page_image = page.image.pil_image
+        crop_bbox = (
+            self.prov[0]
+            .bbox.to_top_left_origin(page_height=page.size.height)
+            .scaled(scale=page_image.height / page.size.height)
+        )
+        return page_image.crop(crop_bbox.as_tuple())
+
 
 class TextItem(DocItem):
     """TextItem."""
@@ -633,6 +655,20 @@ def caption_text(self, doc: "DoclingDocument") -> str:
             text += cap.resolve(doc).text
         return text
 
+    def get_image(self, doc: "DoclingDocument") -> Optional[PILImage.Image]:
+        """Returns the image that represents this FloatingItem.
+
+        If self.image is set, its PIL image is returned directly. If self.image
+        is None, the result of DocItem.get_image is returned instead.
+
+        The fallback yields None when this FloatingItem has no valid provenance
+        or when doc does not contain a valid image of the page that the
+        provenance refers to.
+        """
+        if self.image is None:
+            return super().get_image(doc=doc)
+        return self.image.pil_image
+
 
 class PictureItem(FloatingItem):
     """PictureItem."""
diff --git a/test/test_docling_doc.py b/test/test_docling_doc.py
index 04128ba..18adb20 100644
--- a/test/test_docling_doc.py
+++ b/test/test_docling_doc.py
@@ -1,4 +1,5 @@
 from collections import deque
+from unittest.mock import Mock
 
 import pytest
 import yaml
@@ -7,6 +8,7 @@
 
 from docling_core.types.doc.document import (
     CURRENT_VERSION,
+    BoundingBox,
     DocItem,
     DoclingDocument,
     DocumentOrigin,
@@ -15,7 +17,9 @@
     KeyValueItem,
     ListItem,
     PictureItem,
+    ProvenanceItem,
     SectionHeaderItem,
+    Size,
     TableCell,
     TableData,
     TableItem,
@@ -407,3 +411,127 @@ def test_version_doc():
     comp_version = f"{major_split[0]}.{minor_split[0]}.{int(patch_split[0]) + 1}"
     doc = DoclingDocument(name="Untitled 1", version=comp_version)
     assert doc.version == CURRENT_VERSION
+
+
+def test_docitem_get_image():
+    # Prepare a document: page 1 carries a 200x400 px image, page 2 has none
+    doc = DoclingDocument(name="Dummy")
+
+    page1_image = PILImage.new(mode="RGB", size=(200, 400), color=(0, 0, 0))
+    doc_item_image = PILImage.new(mode="RGB", size=(20, 40), color=(255, 0, 0))
+    page1_image.paste(doc_item_image, box=(20, 40))
+
+    doc.add_page(  # With image, 10 px per page unit
+        page_no=1,
+        size=Size(width=20, height=40),
+        image=ImageRef.from_pil(page1_image, dpi=72),
+    )
+    doc.add_page(page_no=2, size=Size(width=20, height=40), image=None)  # Without image
+
+    # DocItem with no provenance: nothing to crop, expect None
+    doc_item = DocItem(self_ref="#", label=DocItemLabel.TEXT, prov=[])
+    assert doc_item.get_image(doc=doc) is None
+
+    # DocItem whose provenance points to page 3, which is not in the doc
+    doc_item = DocItem(
+        self_ref="#",
+        label=DocItemLabel.TEXT,
+        prov=[ProvenanceItem(page_no=3, bbox=Mock(spec=BoundingBox), charspan=(1, 2))],
+    )
+    assert doc_item.get_image(doc=doc) is None
+
+    # DocItem on page 2, which has a size but no page image
+    doc_item = DocItem(
+        self_ref="#",
+        label=DocItemLabel.TEXT,
+        prov=[ProvenanceItem(page_no=2, bbox=Mock(spec=BoundingBox), charspan=(1, 2))],
+    )
+    assert doc_item.get_image(doc=doc) is None
+
+    # DocItem on page 1: the crop is expected to equal the pasted red patch
+    doc_item = DocItem(
+        self_ref="#",
+        label=DocItemLabel.TEXT,
+        prov=[
+            ProvenanceItem(
+                page_no=1, bbox=BoundingBox(l=2, t=4, r=4, b=8), charspan=(1, 2)
+            )
+        ],
+    )
+    returned_doc_item_image = doc_item.get_image(doc=doc)
+    assert (
+        returned_doc_item_image is not None
+        and returned_doc_item_image.tobytes() == doc_item_image.tobytes()
+    )
+
+
+def test_floatingitem_get_image():
+    # Prepare a document: page 1 carries a 200x400 px image, page 2 has none
+    doc = DoclingDocument(name="Dummy")
+
+    page1_image = PILImage.new(mode="RGB", size=(200, 400), color=(0, 0, 0))
+    floating_item_image = PILImage.new(mode="RGB", size=(20, 40), color=(255, 0, 0))
+    page1_image.paste(floating_item_image, box=(20, 40))
+
+    doc.add_page(  # With image, 10 px per page unit
+        page_no=1,
+        size=Size(width=20, height=40),
+        image=ImageRef.from_pil(page1_image, dpi=72),
+    )
+    doc.add_page(page_no=2, size=Size(width=20, height=40), image=None)  # Without image
+
+    # FloatingItem with an explicit image: it must win over the provenance crop
+    new_image = PILImage.new(mode="RGB", size=(40, 80), color=(0, 255, 0))
+    floating_item = FloatingItem(
+        self_ref="#",
+        label=DocItemLabel.PICTURE,
+        prov=[
+            ProvenanceItem(
+                page_no=1, bbox=BoundingBox(l=2, t=4, r=6, b=12), charspan=(1, 2)
+            )
+        ],
+        image=ImageRef.from_pil(image=new_image, dpi=72),
+    )
+    result_image = floating_item.get_image(doc=doc)
+    assert result_image is not None and result_image.tobytes() == new_image.tobytes()
+
+    # FloatingItem without explicit image and no provenance: expect None
+    floating_item = FloatingItem(
+        self_ref="#", label=DocItemLabel.PICTURE, prov=[], image=None
+    )
+    assert floating_item.get_image(doc=doc) is None
+
+    # FloatingItem without explicit image on page 3, which is not in the doc
+    floating_item = FloatingItem(
+        self_ref="#",
+        label=DocItemLabel.PICTURE,
+        prov=[ProvenanceItem(page_no=3, bbox=Mock(spec=BoundingBox), charspan=(1, 2))],
+        image=None,
+    )
+    assert floating_item.get_image(doc=doc) is None
+
+    # FloatingItem without explicit image on page 2, which has no page image
+    floating_item = FloatingItem(
+        self_ref="#",
+        label=DocItemLabel.PICTURE,
+        prov=[ProvenanceItem(page_no=2, bbox=Mock(spec=BoundingBox), charspan=(1, 2))],
+        image=None,
+    )
+    assert floating_item.get_image(doc=doc) is None
+
+    # FloatingItem without explicit image on page 1: expect the pasted red patch
+    floating_item = FloatingItem(
+        self_ref="#",
+        label=DocItemLabel.PICTURE,
+        prov=[
+            ProvenanceItem(
+                page_no=1, bbox=BoundingBox(l=2, t=4, r=4, b=8), charspan=(1, 2)
+            )
+        ],
+        image=None,
+    )
+    result_image = floating_item.get_image(doc=doc)
+    assert (
+        result_image is not None
+        and result_image.tobytes() == floating_item_image.tobytes()
+    )