First cut at propagating page boxes

This would fix the immediate issue, but does not address an offset mediabox.
ferdiga · Feb 11, 2024 · 11f53fe · 11f53fe
1 parent 123c0c7
commit 11f53fe
Show file tree

Hide file tree

Showing 4 changed files with 59 additions and 5 deletions.
diff --git a/docs/api.rst b/docs/api.rst
@@ -112,7 +112,9 @@ OCRmyPDF is strict about not writing to standard output so that
 users can safely use it in a pipeline and produce a valid output
 file. A caller application will have to ensure it does not write to
 standard output either, if it wants to be compatible with this
-behavior and support piping to a file.
+behavior and support piping to a file. Another benefit of running
+OCRmyPDF in a child process, as recommended above, is that it will
+not interfere with the parent process's standard output.
 
 Exceptions
 ----------

diff --git a/src/ocrmypdf/_pipeline.py b/src/ocrmypdf/_pipeline.py
@@ -12,6 +12,7 @@
 import sys
 from collections.abc import Iterable, Iterator, Sequence
 from contextlib import suppress
+from io import BytesIO
 from pathlib import Path
 from shutil import copyfileobj, copystat
 from typing import Any, BinaryIO, TypeVar, cast
@@ -713,19 +714,30 @@ def create_pdf_page_from_image(
     pageinfo = page_context.pageinfo
     pagesize = 72.0 * float(pageinfo.width_inches), 72.0 * float(pageinfo.height_inches)
     effective_rotation = (pageinfo.rotation - orientation_correction) % 360
-    if effective_rotation % 180 == 90:
+    swap_axis = effective_rotation % 180 == 90
+    if swap_axis:
         pagesize = pagesize[1], pagesize[0]
 
+    bio = BytesIO()
+
     # This create a single page PDF
-    with open(image, 'rb') as imfile, open(output_file, 'wb') as pdf:
+    with open(image, 'rb') as imfile:
         log.debug('convert')
 
         layout_fun = img2pdf.get_layout_fun(pagesize)
         img2pdf.convert(
-            imfile, layout_fun=layout_fun, outputstream=pdf, **IMG2PDF_KWARGS
+            imfile,
+            layout_fun=layout_fun,
+            outputstream=bio,
+            engine=img2pdf.Engine.pikepdf,
+            rotation=img2pdf.Rotation.ifvalid,
         )
         log.debug('convert done')
 
+    # img2pdf does not generate boxes correctly, so we fix them
+    bio.seek(0)
+    fix_pagepdf_boxes(bio, output_file, page_context, swap_axis=swap_axis)
+
     output_file = page_context.plugin_manager.hook.filter_pdf_page(
         page=page_context, image_filename=image, output_pdf=output_file
     )
@@ -780,7 +792,29 @@ def ocr_engine_textonly_pdf(
         output_text=output_text,
         options=options,
     )
-    return (output_pdf, output_text)
+    return output_pdf, output_text
+
+
+def fix_pagepdf_boxes(
+    infile: Path | BinaryIO,
+    out_file: Path,
+    page_context: PageContext,
+    swap_axis: bool = False,
+) -> Path:
+    """Set CropBox/TrimBox from page_context.pageinfo; save and return out_file."""
+    with pikepdf.open(infile) as pdf:
+        for page in pdf.pages:
+            # TODO: BleedBox and ArtBox are not propagated yet.
+            cropbox = page_context.pageinfo.cropbox
+            trimbox = page_context.pageinfo.trimbox
+            if swap_axis:
+                # Caller transposed the page size, so exchange x and y here too.
+                cropbox = cropbox[1], cropbox[0], cropbox[3], cropbox[2]
+                trimbox = trimbox[1], trimbox[0], trimbox[3], trimbox[2]
+            page.CropBox = cropbox
+            page.TrimBox = trimbox
+        pdf.save(out_file)
+    return out_file
 
 
 def generate_postscript_stub(context: PdfContext) -> Path:

diff --git a/src/ocrmypdf/_pipelines/ocr.py b/src/ocrmypdf/_pipelines/ocr.py
@@ -22,6 +22,7 @@
 from ocrmypdf._jobcontext import PageContext, PdfContext
 from ocrmypdf._pipeline import (
     copy_final,
+    fix_pagepdf_boxes,
     get_pdfinfo,
     is_ocr_required,
     merge_sidecars,
@@ -81,6 +82,8 @@ def _exec_page_sync(page_context: PageContext) -> PageResult:
     )
     ocr_out, text_out = _image_to_ocr_text(page_context, ocr_image_out)
 
+    # fix_pagepdf_boxes(ocr_out, page_context)
+
     return PageResult(
         pageno=page_context.pageno,
         pdf_page_from_image=pdf_page_from_image_out,

diff --git a/src/ocrmypdf/pdfinfo/info.py b/src/ocrmypdf/pdfinfo/info.py
@@ -854,6 +854,11 @@ def _gather_pageinfo(
         width_pt = mediabox[2] - mediabox[0]
         height_pt = mediabox[3] - mediabox[1]
 
+        # self._artbox = [float(d) for d in page.artbox.as_list()]
+        # self._bleedbox = [float(d) for d in page.bleedbox.as_list()]
+        self._cropbox = [float(d) for d in page.cropbox.as_list()]
+        self._trimbox = [float(d) for d in page.trimbox.as_list()]
+
         check_this_page = pageno in check_pages
 
         if check_this_page and detailed_analysis:
@@ -970,6 +975,16 @@ def rotation(self, value):
         else:
             raise ValueError("rotation must be a cardinal angle")
 
+    @property
+    def cropbox(self) -> FloatRect:
+        """Return the CropBox of the page in PDF coordinates."""
+        return self._cropbox
+
+    @property
+    def trimbox(self) -> FloatRect:
+        """Return the TrimBox of the page in PDF coordinates."""
+        return self._trimbox
+
     @property
     def images(self) -> list[ImageInfo]:
         """Return images."""