From 616d27bc931d5bd172dcb42cc186fe087dc4a806 Mon Sep 17 00:00:00 2001
From: "Jorj X. McKie" <jorj.x.mckie@outlook.de>
Date: Mon, 2 Sep 2024 09:59:35 -0400
Subject: [PATCH] Fixing #312

Output images are always in PNG format, therefore any pixmap whose colorspace.n is not in (1, 3) must be converted to RGB.
---
 pdf2docx/image/ImagesExtractor.py | 181 ++++++++++++++++--------------
 1 file changed, 95 insertions(+), 86 deletions(-)

diff --git a/pdf2docx/image/ImagesExtractor.py b/pdf2docx/image/ImagesExtractor.py
index 4100fb1..6a82b3b 100644
--- a/pdf2docx/image/ImagesExtractor.py
+++ b/pdf2docx/image/ImagesExtractor.py
@@ -1,4 +1,4 @@
-'''Extract images from PDF.
+"""Extract images from PDF.
 
 Both raster images and vector graphics are considered:
 
@@ -6,32 +6,30 @@
   and ``Page.get_images()``. Note the process for png images with alpha channel.
 * Vector graphics are actually composed of a group of paths, represented by operators like
   ``re``, ``m``, ``l`` and ``c``. They're detected by finding the contours with ``opencv``.
-'''
+"""
 
 import logging
 import fitz
 from ..common.Collection import Collection
 from ..common.share import BlockType
-from ..common.algorithm import (recursive_xy_cut, inner_contours, xy_project_profile)
+from ..common.algorithm import recursive_xy_cut, inner_contours, xy_project_profile
 
 
 class ImagesExtractor:
-    '''Extract images from PDF.'''
+    """Extract images from PDF."""
 
-    def __init__(self, page:fitz.Page) -> None:
-        '''Extract images from PDF page.
+    def __init__(self, page: fitz.Page) -> None:
+        """Extract images from PDF page.
 
         Args:
             page (fitz.Page): pdf page to extract images.
-        '''
+        """
         self._page = page
 
-
-    def clip_page_to_pixmap(self,
-                            bbox:fitz.Rect=None,
-                            rm_image:bool=False,
-                            zoom:float=3.0):
-        '''Clip page pixmap according to ``bbox``.
+    def clip_page_to_pixmap(
+        self, bbox: fitz.Rect = None, rm_image: bool = False, zoom: float = 3.0
+    ):
+        """Clip page pixmap according to ``bbox``.
 
         Args:
             bbox (fitz.Rect, optional): Target area to clip. Defaults to None, i.e. entire page.
@@ -42,9 +40,11 @@ def clip_page_to_pixmap(self,
 
         Returns:
             fitz.Pixmap: The extracted pixmap.
-        '''
+        """
         # hide text and images
-        stream_dict = self._hide_page_text_and_images(self._page, rm_text=True, rm_image=rm_image)
+        stream_dict = self._hide_page_text_and_images(
+            self._page, rm_text=True, rm_image=rm_image
+        )
 
         if bbox is None:
             clip_bbox = self._page.rect
@@ -56,26 +56,28 @@ def clip_page_to_pixmap(self,
         else:
             clip_bbox = bbox
 
-        clip_bbox =  self._page.rect & clip_bbox
+        clip_bbox = self._page.rect & clip_bbox
 
         # improve resolution
         # - https://pymupdf.readthedocs.io/en/latest/faq.html#how-to-increase-image-resolution
         # - https://github.com/pymupdf/PyMuPDF/issues/181
         matrix = fitz.Matrix(zoom, zoom)
-        pix = self._page.get_pixmap(clip=clip_bbox, matrix=matrix) # type: fitz.Pixmap
+        pix = self._page.get_pixmap(clip=clip_bbox, matrix=matrix)  # type: fitz.Pixmap
 
         # recovery page if hide text
         doc = self._page.parent
-        for xref, stream in stream_dict.items(): doc.update_stream(xref, stream)
+        for xref, stream in stream_dict.items():
+            doc.update_stream(xref, stream)
 
         return pix
 
-
-    def clip_page_to_dict(self,
-                          bbox:fitz.Rect=None,
-                          rm_image:bool=False,
-                          clip_image_res_ratio:float=3.0):
-        '''Clip page pixmap (without text) according to ``bbox`` and convert to source image.
+    def clip_page_to_dict(
+        self,
+        bbox: fitz.Rect = None,
+        rm_image: bool = False,
+        clip_image_res_ratio: float = 3.0,
+    ):
+        """Clip page pixmap (without text) according to ``bbox`` and convert to source image.
 
         Args:
             bbox (fitz.Rect, optional): Target area to clip. Defaults to None, i.e. entire page.
@@ -85,13 +87,14 @@ def clip_page_to_dict(self,
 
         Returns:
             list: A list of image raw dict.
-        '''
-        pix = self.clip_page_to_pixmap(bbox=bbox, rm_image=rm_image, zoom=clip_image_res_ratio)
+        """
+        pix = self.clip_page_to_pixmap(
+            bbox=bbox, rm_image=rm_image, zoom=clip_image_res_ratio
+        )
         return self._to_raw_dict(pix, bbox)
 
-
-    def extract_images(self, clip_image_res_ratio:float=3.0):
-        '''Extract normal images with ``Page.get_images()``.
+    def extract_images(self, clip_image_res_ratio: float = 3.0):
+        """Extract normal images with ``Page.get_images()``.
 
         Args:
             clip_image_res_ratio (float, optional): Resolution ratio of clipped bitmap.
@@ -103,7 +106,7 @@ def extract_images(self, clip_image_res_ratio:float=3.0):
         .. note::
             ``Page.get_images()`` contains each image only once, which may less than the
             real count of images in a page.
-        '''
+        """
         # pdf document
         doc = self._page.parent
         rotation = self._page.rotation
@@ -122,13 +125,15 @@ def extract_images(self, clip_image_res_ratio:float=3.0):
 
             # find all occurrences referenced to this image
             rects = self._page.get_image_rects(item)
-            unrotated_page_bbox = self._page.cropbox # note the difference to page.rect
+            unrotated_page_bbox = self._page.cropbox  # note the difference to page.rect
             for bbox in rects:
                 # ignore small images
-                if bbox.get_area()<=4: continue
+                if bbox.get_area() <= 4:
+                    continue
 
                 # ignore images outside page
-                if not unrotated_page_bbox.intersects(bbox): continue
+                if not unrotated_page_bbox.intersects(bbox):
+                    continue
 
                 # collect images
                 ic.append((bbox, item))
@@ -143,8 +148,11 @@ def extract_images(self, clip_image_res_ratio:float=3.0):
             # clip page with the union bbox of all intersected images
             if len(group) > 1:
                 clip_bbox = fitz.Rect()
-                for (bbox, item) in group: clip_bbox |= bbox
-                raw_dict = self.clip_page_to_dict(clip_bbox, False, clip_image_res_ratio)
+                for bbox, item in group:
+                    clip_bbox |= bbox
+                raw_dict = self.clip_page_to_dict(
+                    clip_bbox, False, clip_image_res_ratio
+                )
 
             else:
                 bbox, item = group[0]
@@ -166,7 +174,7 @@ def extract_images(self, clip_image_res_ratio:float=3.0):
                 # (21, 0, 331, 369, 1, '', '', 'Im3', 'CCITTFaxDecode', 0)
                 # (22, 25, 1265, 1303, 8, 'DeviceGray', '', 'Im4', 'DCTDecode', 0)
                 # (23, 0, 1731, 1331, 8, 'DeviceGray', '', 'Im5', 'DCTDecode', 0)
-                if item[5]=='':
+                if item[5] == "":
                     raw_dict = self.clip_page_to_dict(bbox, False, clip_image_res_ratio)
 
                 # normal images
@@ -177,19 +185,16 @@ def extract_images(self, clip_image_res_ratio:float=3.0):
                     # rotate image with opencv if page is rotated
                     raw_dict = self._to_raw_dict(pix, bbox)
                     if rotation:
-                        raw_dict['image'] = self._rotate_image(pix, -rotation)
+                        raw_dict["image"] = self._rotate_image(pix, -rotation)
 
             images.append(raw_dict)
 
         return images
 
-
-    def detect_svg_contours(self,
-                            min_svg_gap_dx:float,
-                            min_svg_gap_dy:float,
-                            min_w:float,
-                            min_h:float):
-        '''Find contour of potential vector graphics.
+    def detect_svg_contours(
+        self, min_svg_gap_dx: float, min_svg_gap_dy: float, min_w: float, min_h: float
+    ):
+        """Find contour of potential vector graphics.
 
         Args:
             min_svg_gap_dx (float): Merge svg if the horizontal gap is less than this value.
@@ -199,7 +204,7 @@ def detect_svg_contours(self,
 
         Returns:
             list: A list of potential svg region: (external_bbox, inner_bboxes:list).
-        '''
+        """
         import cv2 as cv
 
         # clip page and convert to opencv image
@@ -211,11 +216,14 @@ def detect_svg_contours(self,
         _, binary = cv.threshold(gray, 253, 255, cv.THRESH_BINARY_INV)
 
         # external bbox: split images with recursive xy cut
-        external_bboxes = recursive_xy_cut(binary, min_dx=min_svg_gap_dx, min_dy=min_svg_gap_dy)
+        external_bboxes = recursive_xy_cut(
+            binary, min_dx=min_svg_gap_dx, min_dy=min_svg_gap_dy
+        )
 
         # inner contours
-        grouped_inner_bboxes = [inner_contours(binary, bbox, min_w, min_h)
-                                for bbox in external_bboxes]
+        grouped_inner_bboxes = [
+            inner_contours(binary, bbox, min_w, min_h) for bbox in external_bboxes
+        ]
 
         # combined external and inner contours
         groups = list(zip(external_bboxes, grouped_inner_bboxes))
@@ -226,26 +234,25 @@ def detect_svg_contours(self,
             # plot projection profile for each sub-image
             for i, (x0, y0, x1, y1) in enumerate(external_bboxes):
                 arr = xy_project_profile(src[y0:y1, x0:x1, :], binary[y0:y1, x0:x1])
-                cv.imshow(f'sub-image-{i}', arr)
+                cv.imshow(f"sub-image-{i}", arr)
 
             for bbox, inner_bboxes in groups:
                 # plot external bbox
                 x0, y0, x1, y1 = bbox
-                cv.rectangle(src, (x0, y0), (x1, y1), (255,0,0), 1)
+                cv.rectangle(src, (x0, y0), (x1, y1), (255, 0, 0), 1)
 
                 # plot inner bbox
                 for u0, v0, u1, v1 in inner_bboxes:
-                    cv.rectangle(src, (u0, v0), (u1, v1), (0,0,255), 1)
+                    cv.rectangle(src, (u0, v0), (u1, v1), (0, 0, 255), 1)
 
             cv.imshow("img", src)
             cv.waitKey(0)
 
         return groups
 
-
     @staticmethod
-    def _to_raw_dict(image:fitz.Pixmap, bbox:fitz.Rect):
-        '''Store Pixmap ``image`` to raw dict.
+    def _to_raw_dict(image: fitz.Pixmap, bbox: fitz.Rect):
+        """Store Pixmap ``image`` to raw dict.
 
         Args:
             image (fitz.Pixmap): Pixmap to store.
@@ -253,35 +260,36 @@ def _to_raw_dict(image:fitz.Pixmap, bbox:fitz.Rect):
 
         Returns:
             dict: Raw dict of the pixmap.
-        '''
+        """
+        if image.colorspace.n > 3:  # must convert: we only support PNG
+            image = fitz.Pixmap(fitz.csRGB, image)
         return {
-            'type': BlockType.IMAGE.value,
-            'bbox': tuple(bbox),
-            'width': image.width,
-            'height': image.height,
-            'image': image.tobytes()
+            "type": BlockType.IMAGE.value,
+            "bbox": tuple(bbox),
+            "width": image.width,
+            "height": image.height,
+            "image": image.tobytes(),
         }
 
-
     @staticmethod
-    def _rotate_image(pixmap:fitz.Pixmap, rotation:int):
-        '''Rotate image represented by image bytes.
+    def _rotate_image(pixmap: fitz.Pixmap, rotation: int):
+        """Rotate image represented by image bytes.
 
         Args:
             pixmap (fitz.Pixmap): Image to rotate.
             rotation (int): Rotation angle.
 
         Return: image bytes.
-        '''
+        """
         import cv2 as cv
         import numpy as np
 
         # convert to opencv image
         img = ImagesExtractor._pixmap_to_cv_image(pixmap)
-        h, w = img.shape[:2] # get image height, width
+        h, w = img.shape[:2]  # get image height, width
 
         # calculate the center of the image
-        x0, y0 = w//2, h//2
+        x0, y0 = w // 2, h // 2
 
         # default scale value for now -> might be extracted from PDF page property
         scale = 1.0
@@ -305,13 +313,12 @@ def _rotate_image(pixmap:fitz.Pixmap, rotation:int):
         rotated_img = cv.warpAffine(img, matrix, (W, H))
 
         # convert back to bytes
-        _, im_png = cv.imencode('.png', rotated_img)
+        _, im_png = cv.imencode(".png", rotated_img)
         return im_png.tobytes()
 
-
     @staticmethod
-    def _hide_page_text_and_images(page:fitz.Page, rm_text:bool, rm_image:bool):
-        '''Hide page text and images.'''
+    def _hide_page_text_and_images(page: fitz.Page, rm_text: bool, rm_image: bool):
+        """Hide page text and images."""
         # NOTE: text might exist in both content stream and form object stream
         # - content stream, i.e. direct page content
         # - form object, i.e. contents referenced by this page
@@ -328,11 +335,11 @@ def hide_text(stream):
             res = stream
             found = False
             # set 3 Tr to text block
-            for k in ['BT', 'Tm', 'Td', '2 Tr']:
+            for k in ["BT", "Tm", "Td", "2 Tr"]:
                 bk = k.encode()
                 if bk in stream:
                     found = True
-                    res = res.replace(bk, f'{k} 3 Tr'.encode())
+                    res = res.replace(bk, f"{k} 3 Tr".encode())
             return res, found
 
         # (2) hide image
@@ -343,13 +350,13 @@ def hide_images(stream):
             # image names, e.g. [[270, 0, 261, 115, 8, 'DeviceRGB', '', 'Im1', 'DCTDecode']]
             img_names = [item[7] for item in page.get_images(full=True)]
             for k in img_names:
-                bk = f'/{k} Do'.encode()
+                bk = f"/{k} Do".encode()
                 if bk in stream:
                     found = True
-                    res = res.replace(bk, b'')
+                    res = res.replace(bk, b"")
             return res, found
 
-        doc = page.parent # type: fitz.Document
+        doc = page.parent  # type: fitz.Document
         source = {}
         for xref in xref_list:
             src = doc.xref_stream(xref)
@@ -362,18 +369,17 @@ def hide_images(stream):
 
             if found_text or found_images:
                 doc.update_stream(xref, stream)
-                source[xref] = src # save original stream
+                source[xref] = src  # save original stream
 
         return source
 
-
     @staticmethod
-    def _recover_pixmap(doc:fitz.Document, item:list):
+    def _recover_pixmap(doc: fitz.Document, item: list):
         """Restore pixmap with soft mask considered.
 
         References:
 
-            * https://pymupdf.readthedocs.io/en/latest/document.html#Document.getPageImageList        
+            * https://pymupdf.readthedocs.io/en/latest/document.html#Document.getPageImageList
             * https://pymupdf.readthedocs.io/en/latest/faq.html#how-to-handle-stencil-masks
             * https://github.com/pymupdf/PyMuPDF/issues/670
 
@@ -401,28 +407,31 @@ def _recover_pixmap(doc:fitz.Document, item:list):
                 pix = temp
 
             # check dimension
-            if pix.width==mask.width and pix.height==mask.height:
+            if pix.width == mask.width and pix.height == mask.height:
                 pix = fitz.Pixmap(pix, mask)  # now compose final pixmap
             else:
-                logging.warning('Ignore image due to inconsistent size of color and mask pixmaps: %s', item)
+                logging.warning(
+                    "Ignore image due to inconsistent size of color and mask pixmaps: %s",
+                    item,
+                )
 
         # we may need to adjust something for CMYK pixmaps here ->
         # recreate pixmap in RGB color space if necessary
         # NOTE: pix.colorspace may be None for images with alpha channel values only
-        if 'CMYK' in item[5].upper():
+        if "CMYK" in item[5].upper():
             pix = fitz.Pixmap(fitz.csRGB, pix)
 
         return pix
 
-
     @staticmethod
-    def _pixmap_to_cv_image(pixmap:fitz.Pixmap):
-        '''Convert fitz Pixmap to opencv image.
+    def _pixmap_to_cv_image(pixmap: fitz.Pixmap):
+        """Convert fitz Pixmap to opencv image.
 
         Args:
             pixmap (fitz.Pixmap): PyMuPDF Pixmap.
-        '''
+        """
         import cv2 as cv
         import numpy as np
+
         img_byte = pixmap.tobytes()
         return cv.imdecode(np.frombuffer(img_byte, np.uint8), cv.IMREAD_COLOR)