diff --git a/src/ocrmypdf/_pipeline.py b/src/ocrmypdf/_pipeline.py index a129532dc..b12e4cb00 100644 --- a/src/ocrmypdf/_pipeline.py +++ b/src/ocrmypdf/_pipeline.py @@ -743,11 +743,13 @@ def render_hocr_page(hocr: Path, page_context: PageContext) -> Path: dpi = get_page_square_dpi(page_context, calculate_image_dpi(page_context)) debug_mode = options.pdf_renderer == 'hocrdebug' - hocrtransform = HocrTransform(hocr_filename=hocr, dpi=dpi.to_scalar()) # square - hocrtransform.to_pdf( + HocrTransform( + hocr_filename=hocr, + dpi=dpi.to_scalar(), # square + debug=debug_mode, + ).to_pdf( out_filename=output_file, image_filename=None, - invisible_text=True if not debug_mode else False, ) return output_file diff --git a/src/ocrmypdf/hocrtransform/_hocr.py b/src/ocrmypdf/hocrtransform/_hocr.py index 480da05fe..2c1e7c441 100644 --- a/src/ocrmypdf/hocrtransform/_hocr.py +++ b/src/ocrmypdf/hocrtransform/_hocr.py @@ -70,7 +70,7 @@ class HocrTransform: re.VERBOSE, ) - def __init__(self, *, hocr_filename: str | Path, dpi: float): + def __init__(self, *, hocr_filename: str | Path, dpi: float, debug: bool = False): """Initialize the HocrTransform object.""" self.dpi = dpi self.hocr = ElementTree.parse(os.fspath(hocr_filename)) @@ -91,12 +91,12 @@ def __init__(self, *, hocr_filename: str | Path, dpi: float): # Stop after first div that has page coordinates break self.render_options = DebugRenderOptions( - render_baseline=False, - render_triangle=False, - render_line_bbox=False, - render_word_bbox=False, - render_paragraph_bbox=False, - render_space_bbox=False, + render_baseline=debug, + render_triangle=debug, + render_line_bbox=debug, + render_word_bbox=debug, + render_paragraph_bbox=debug, + render_space_bbox=debug, ) def _get_element_text(self, element: Element): @@ -148,7 +148,7 @@ def to_pdf( out_filename: Path, image_filename: Path | None = None, fontname: str = "Helvetica", - invisible_text: bool = False, + invisible_text: bool = True, ) -> None: """Creates a PDF file with an image superimposed on top of the text. @@ -189,13 +189,16 @@ def to_pdf( in {'ocr_header', 'ocr_line', 'ocr_textfloat'} ): found_lines = True - direction = ( - TextDirection.RTL - if par.attrib.get('dir', 'ltr') == 'rtl' - else TextDirection.LTR - ) + direction = self._get_text_direction(par) + inject_word_breaks = self._get_inject_word_breaks(par) self._do_line( - canvas, line, "ocrx_word", fontname, invisible_text, direction + canvas, + line, + "ocrx_word", + fontname, + invisible_text, + direction, + inject_word_breaks, ) if not found_lines: @@ -208,6 +211,7 @@ def to_pdf( fontname, invisible_text, TextDirection.LTR, + True, ) # put the image on the page, scaled to fill the page if image_filename is not None: @@ -218,6 +222,30 @@ def to_pdf( # finish up the page and save it canvas.save(out_filename) + def _get_text_direction(self, par): + """Get the text direction of the paragraph. + + Arabic, Hebrew, Persian, are right-to-left languages. + """ + return ( + TextDirection.RTL + if par.attrib.get('dir', 'ltr') == 'rtl' + else TextDirection.LTR + ) + + def _get_inject_word_breaks(self, par): + """Determine whether word breaks should be injected. + + In Chinese, Japanese, and Korean, word breaks are not injected, because + words are usually one or two characters and separators are usually explicit. + In all other languages, we inject word breaks to help word segmentation. + """ + lang = par.attrib.get('lang', '') + log.debug(lang) + if lang in {'chi_sim', 'chi_tra', 'jpn', 'kor'}: + return False + return True + @classmethod def polyval(cls, poly, x): # pragma: no cover """Calculate the value of a polynomial at a point.""" @@ -231,6 +259,7 @@ def _do_line( fontname: str, invisible_text: bool, text_direction: TextDirection, + inject_word_breaks: bool, ): """Render the text for a given line. @@ -292,6 +321,7 @@ def _do_line( elem, next_elem, text_direction, + inject_word_breaks, ) canvas.do.draw_text(text) @@ -305,6 +335,7 @@ def _do_line_word( elem: Element, next_elem: Element | None, text_direction: TextDirection, + inject_word_breaks: bool, ): """Render the text for a single word.""" if elem is None: @@ -339,6 +370,8 @@ def _do_line_word( # PDF viewers identify the word break, and horizontally scaling it to # occupy the space the between the words helps the PDF viewer # avoid combiningthewordstogether. + if not inject_word_breaks: + return next_box = line_matrix.inverse().transform(hocr_next_box) if text_direction == TextDirection.LTR: space_box = Rectangle(box.urx, box.lly, next_box.llx, next_box.ury)