From 11f53fe9a96daf12938495521914fa2215a9bc20 Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Tue, 31 Oct 2023 00:12:15 -0700 Subject: [PATCH] First cut at propagating page boxes This would fix the immediate issue, but does not address an offset mediabox. --- docs/api.rst | 4 +++- src/ocrmypdf/_pipeline.py | 42 ++++++++++++++++++++++++++++++---- src/ocrmypdf/_pipelines/ocr.py | 3 +++ src/ocrmypdf/pdfinfo/info.py | 15 ++++++++++++ 4 files changed, 59 insertions(+), 5 deletions(-) diff --git a/docs/api.rst b/docs/api.rst index 4bb4e6db1..abdbfda70 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -112,7 +112,9 @@ OCRmyPDF is strict about not writing to standard output so that users can safely use it in a pipeline and produce a valid output file. A caller application will have to ensure it does not write to standard output either, if it wants to be compatible with this -behavior and support piping to a file. +behavior and support piping to a file. Another benefit of running +OCRmyPDF in a child process, as recommended above, is that it will +not interfere with the parent process's standard output. 
Exceptions ---------- diff --git a/src/ocrmypdf/_pipeline.py b/src/ocrmypdf/_pipeline.py index cd67eafae..bc58bd07c 100644 --- a/src/ocrmypdf/_pipeline.py +++ b/src/ocrmypdf/_pipeline.py @@ -12,6 +12,7 @@ import sys from collections.abc import Iterable, Iterator, Sequence from contextlib import suppress +from io import BytesIO from pathlib import Path from shutil import copyfileobj, copystat from typing import Any, BinaryIO, TypeVar, cast @@ -713,19 +714,30 @@ def create_pdf_page_from_image( pageinfo = page_context.pageinfo pagesize = 72.0 * float(pageinfo.width_inches), 72.0 * float(pageinfo.height_inches) effective_rotation = (pageinfo.rotation - orientation_correction) % 360 - if effective_rotation % 180 == 90: + swap_axis = effective_rotation % 180 == 90 + if swap_axis: pagesize = pagesize[1], pagesize[0] + bio = BytesIO() + # This create a single page PDF - with open(image, 'rb') as imfile, open(output_file, 'wb') as pdf: + with open(image, 'rb') as imfile: log.debug('convert') layout_fun = img2pdf.get_layout_fun(pagesize) img2pdf.convert( - imfile, layout_fun=layout_fun, outputstream=pdf, **IMG2PDF_KWARGS + imfile, + layout_fun=layout_fun, + outputstream=bio, + engine=img2pdf.Engine.pikepdf, + rotation=img2pdf.Rotation.ifvalid, ) log.debug('convert done') + # img2pdf does not generate boxes correctly, so we fix them + bio.seek(0) + fix_pagepdf_boxes(bio, output_file, page_context, swap_axis=swap_axis) + output_file = page_context.plugin_manager.hook.filter_pdf_page( page=page_context, image_filename=image, output_pdf=output_file ) @@ -780,7 +792,29 @@ def ocr_engine_textonly_pdf( output_text=output_text, options=options, ) - return (output_pdf, output_text) + return output_pdf, output_text + + +def fix_pagepdf_boxes( + infile: Path | BinaryIO, + out_file: Path, + page_context: PageContext, + swap_axis: bool = False, +) -> Path: + """Fix the bounding boxes in a single page PDF.""" + with pikepdf.open(infile) as pdf: + for page in pdf.pages: + # page.BleedBox = 
page_context.pageinfo.bleedbox + # page.ArtBox = page_context.pageinfo.artbox + cropbox = page_context.pageinfo.cropbox + trimbox = page_context.pageinfo.trimbox + if swap_axis: + cropbox = cropbox[1], cropbox[0], cropbox[3], cropbox[2] + trimbox = trimbox[1], trimbox[0], trimbox[3], trimbox[2] + page.CropBox = cropbox + page.TrimBox = trimbox + pdf.save(out_file) + return out_file def generate_postscript_stub(context: PdfContext) -> Path: diff --git a/src/ocrmypdf/_pipelines/ocr.py b/src/ocrmypdf/_pipelines/ocr.py index 374445edb..6d143038d 100644 --- a/src/ocrmypdf/_pipelines/ocr.py +++ b/src/ocrmypdf/_pipelines/ocr.py @@ -22,6 +22,7 @@ from ocrmypdf._jobcontext import PageContext, PdfContext from ocrmypdf._pipeline import ( copy_final, + fix_pagepdf_boxes, get_pdfinfo, is_ocr_required, merge_sidecars, @@ -81,6 +82,8 @@ def _exec_page_sync(page_context: PageContext) -> PageResult: ) ocr_out, text_out = _image_to_ocr_text(page_context, ocr_image_out) + # fix_pagepdf_boxes(ocr_out, page_context) + return PageResult( pageno=page_context.pageno, pdf_page_from_image=pdf_page_from_image_out, diff --git a/src/ocrmypdf/pdfinfo/info.py b/src/ocrmypdf/pdfinfo/info.py index e6b6f19dd..b3c452127 100644 --- a/src/ocrmypdf/pdfinfo/info.py +++ b/src/ocrmypdf/pdfinfo/info.py @@ -854,6 +854,11 @@ def _gather_pageinfo( width_pt = mediabox[2] - mediabox[0] height_pt = mediabox[3] - mediabox[1] + # self._artbox = [float(d) for d in page.artbox.as_list()] + # self._bleedbox = [float(d) for d in page.bleedbox.as_list()] + self._cropbox = [float(d) for d in page.cropbox.as_list()] + self._trimbox = [float(d) for d in page.trimbox.as_list()] + check_this_page = pageno in check_pages if check_this_page and detailed_analysis: @@ -970,6 +975,16 @@ def rotation(self, value): else: raise ValueError("rotation must be a cardinal angle") + @property + def cropbox(self) -> FloatRect: + """Return cropbox of page in PDF coordinates.""" + return self._cropbox + + @property + def trimbox(self) -> 
FloatRect: + """Return trimbox of page in PDF coordinates.""" + return self._trimbox + @property def images(self) -> list[ImageInfo]: """Return images."""