Skip to content

Commit

Permalink
First cut at propagating page boxes
Browse files Browse the repository at this point in the history
This would fix the immediate issue, but does not address an offset mediabox.
  • Loading branch information
jbarlow83 committed Feb 11, 2024
1 parent 123c0c7 commit 11f53fe
Show file tree
Hide file tree
Showing 4 changed files with 59 additions and 5 deletions.
4 changes: 3 additions & 1 deletion docs/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,9 @@ OCRmyPDF is strict about not writing to standard output so that
users can safely use it in a pipeline and produce a valid output
file. A caller application will have to ensure it does not write to
standard output either, if it wants to be compatible with this
behavior and support piping to a file.
behavior and support piping to a file. Another benefit of running
OCRmyPDF in a child process, as recommended above, is that it will
not interfere with the parent process's standard output.

Exceptions
----------
Expand Down
42 changes: 38 additions & 4 deletions src/ocrmypdf/_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import sys
from collections.abc import Iterable, Iterator, Sequence
from contextlib import suppress
from io import BytesIO
from pathlib import Path
from shutil import copyfileobj, copystat
from typing import Any, BinaryIO, TypeVar, cast
Expand Down Expand Up @@ -713,19 +714,30 @@ def create_pdf_page_from_image(
pageinfo = page_context.pageinfo
pagesize = 72.0 * float(pageinfo.width_inches), 72.0 * float(pageinfo.height_inches)
effective_rotation = (pageinfo.rotation - orientation_correction) % 360
if effective_rotation % 180 == 90:
swap_axis = effective_rotation % 180 == 90
if swap_axis:
pagesize = pagesize[1], pagesize[0]

bio = BytesIO()

# This creates a single-page PDF
with open(image, 'rb') as imfile, open(output_file, 'wb') as pdf:
with open(image, 'rb') as imfile:
log.debug('convert')

layout_fun = img2pdf.get_layout_fun(pagesize)
img2pdf.convert(
imfile, layout_fun=layout_fun, outputstream=pdf, **IMG2PDF_KWARGS
imfile,
layout_fun=layout_fun,
outputstream=bio,
engine=img2pdf.Engine.pikepdf,
rotation=img2pdf.Rotation.ifvalid,
)
log.debug('convert done')

# img2pdf does not generate boxes correctly, so we fix them
bio.seek(0)
fix_pagepdf_boxes(bio, output_file, page_context, swap_axis=swap_axis)

output_file = page_context.plugin_manager.hook.filter_pdf_page(
page=page_context, image_filename=image, output_pdf=output_file
)
Expand Down Expand Up @@ -780,7 +792,29 @@ def ocr_engine_textonly_pdf(
output_text=output_text,
options=options,
)
return (output_pdf, output_text)
return output_pdf, output_text


def fix_pagepdf_boxes(
    infile: Path | BinaryIO,
    out_file: Path,
    page_context: PageContext,
    swap_axis: bool = False,
) -> Path:
    """Fix the bounding boxes in a single page PDF.

    Opens ``infile`` with pikepdf, sets the ``/CropBox`` and ``/TrimBox`` of
    every page to the values recorded in ``page_context.pageinfo``, and saves
    the result to ``out_file``.

    Args:
        infile: Path or binary stream containing the PDF to fix.
        out_file: Destination the corrected PDF is saved to.
        page_context: Supplies ``pageinfo.cropbox`` and ``pageinfo.trimbox``,
            each indexable as a four-element rectangle.
        swap_axis: If True, exchange the x and y coordinate of each corner of
            both boxes before writing them.

    Returns:
        ``out_file``, the path the corrected PDF was written to.
    """

    def _oriented(box):
        # Exchange x and y of both corners when the page axes are swapped;
        # otherwise pass the box through untouched.
        if swap_axis:
            return box[1], box[0], box[3], box[2]
        return box

    # The boxes come from the page context, not from the PDF being edited, so
    # they are the same for every page and can be computed once up front.
    pageinfo = page_context.pageinfo
    cropbox = _oriented(pageinfo.cropbox)
    trimbox = _oriented(pageinfo.trimbox)

    with pikepdf.open(infile) as pdf:
        for page in pdf.pages:
            # NOTE: only CropBox and TrimBox are propagated; ArtBox and
            # BleedBox are not handled here, and the boxes are copied without
            # any adjustment relative to the page's MediaBox.
            page.CropBox = cropbox
            page.TrimBox = trimbox
        # Save while the Pdf is still open; it is closed on leaving the block.
        pdf.save(out_file)
    # Return the output path (matching the annotation), not the closed Pdf.
    return out_file


def generate_postscript_stub(context: PdfContext) -> Path:
Expand Down
3 changes: 3 additions & 0 deletions src/ocrmypdf/_pipelines/ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from ocrmypdf._jobcontext import PageContext, PdfContext
from ocrmypdf._pipeline import (
copy_final,
fix_pagepdf_boxes,
get_pdfinfo,
is_ocr_required,
merge_sidecars,
Expand Down Expand Up @@ -81,6 +82,8 @@ def _exec_page_sync(page_context: PageContext) -> PageResult:
)
ocr_out, text_out = _image_to_ocr_text(page_context, ocr_image_out)

# fix_pagepdf_boxes(ocr_out, page_context)

return PageResult(
pageno=page_context.pageno,
pdf_page_from_image=pdf_page_from_image_out,
Expand Down
15 changes: 15 additions & 0 deletions src/ocrmypdf/pdfinfo/info.py
Original file line number Diff line number Diff line change
Expand Up @@ -854,6 +854,11 @@ def _gather_pageinfo(
width_pt = mediabox[2] - mediabox[0]
height_pt = mediabox[3] - mediabox[1]

# self._artbox = [float(d) for d in page.artbox.as_list()]
# self._bleedbox = [float(d) for d in page.bleedbox.as_list()]
self._cropbox = [float(d) for d in page.cropbox.as_list()]
self._trimbox = [float(d) for d in page.trimbox.as_list()]

check_this_page = pageno in check_pages

if check_this_page and detailed_analysis:
Expand Down Expand Up @@ -970,6 +975,16 @@ def rotation(self, value):
else:
raise ValueError("rotation must be a cardinal angle")

@property
def cropbox(self) -> FloatRect:
    """Return cropbox of page in PDF coordinates.

    The stored ``_cropbox`` value is returned as-is.
    """
    return self._cropbox

@property
def trimbox(self) -> FloatRect:
    """Return trimbox of page in PDF coordinates.

    The stored ``_trimbox`` value is returned as-is.
    """
    return self._trimbox

@property
def images(self) -> list[ImageInfo]:
"""Return images."""
Expand Down

0 comments on commit 11f53fe

Please sign in to comment.