Make hocrdebug work, and try to handle CJK spacing better

ocrmypdf · Nov 21, 2023 · d217856 · d217856
1 parent e2be457
commit d217856
Show file tree

Hide file tree

Showing 2 changed files with 52 additions and 17 deletions.
diff --git a/src/ocrmypdf/_pipeline.py b/src/ocrmypdf/_pipeline.py
@@ -743,11 +743,13 @@ def render_hocr_page(hocr: Path, page_context: PageContext) -> Path:
     dpi = get_page_square_dpi(page_context, calculate_image_dpi(page_context))
     debug_mode = options.pdf_renderer == 'hocrdebug'
 
-    hocrtransform = HocrTransform(hocr_filename=hocr, dpi=dpi.to_scalar())  # square
-    hocrtransform.to_pdf(
+    HocrTransform(
+        hocr_filename=hocr,
+        dpi=dpi.to_scalar(),  # square
+        debug=debug_mode,
+    ).to_pdf(
         out_filename=output_file,
         image_filename=None,
-        invisible_text=True if not debug_mode else False,
     )
     return output_file
 

diff --git a/src/ocrmypdf/hocrtransform/_hocr.py b/src/ocrmypdf/hocrtransform/_hocr.py
@@ -70,7 +70,7 @@ class HocrTransform:
         re.VERBOSE,
     )
 
-    def __init__(self, *, hocr_filename: str | Path, dpi: float):
+    def __init__(self, *, hocr_filename: str | Path, dpi: float, debug: bool = False):
         """Initialize the HocrTransform object."""
         self.dpi = dpi
         self.hocr = ElementTree.parse(os.fspath(hocr_filename))
@@ -91,12 +91,12 @@ def __init__(self, *, hocr_filename: str | Path, dpi: float):
             # Stop after first div that has page coordinates
             break
         self.render_options = DebugRenderOptions(
-            render_baseline=False,
-            render_triangle=False,
-            render_line_bbox=False,
-            render_word_bbox=False,
-            render_paragraph_bbox=False,
-            render_space_bbox=False,
+            render_baseline=debug,
+            render_triangle=debug,
+            render_line_bbox=debug,
+            render_word_bbox=debug,
+            render_paragraph_bbox=debug,
+            render_space_bbox=debug,
         )
 
     def _get_element_text(self, element: Element):
@@ -148,7 +148,7 @@ def to_pdf(
         out_filename: Path,
         image_filename: Path | None = None,
         fontname: str = "Helvetica",
-        invisible_text: bool = False,
+        invisible_text: bool = True,
     ) -> None:
         """Creates a PDF file with an image superimposed on top of the text.
 
@@ -189,13 +189,16 @@ def to_pdf(
                     in {'ocr_header', 'ocr_line', 'ocr_textfloat'}
                 ):
                     found_lines = True
-                    direction = (
-                        TextDirection.RTL
-                        if par.attrib.get('dir', 'ltr') == 'rtl'
-                        else TextDirection.LTR
-                    )
+                    direction = self._get_text_direction(par)
+                    inject_word_breaks = self._get_inject_word_breaks(par)
                     self._do_line(
-                        canvas, line, "ocrx_word", fontname, invisible_text, direction
+                        canvas,
+                        line,
+                        "ocrx_word",
+                        fontname,
+                        invisible_text,
+                        direction,
+                        inject_word_breaks,
                     )
 
             if not found_lines:
@@ -208,6 +211,7 @@ def to_pdf(
                     fontname,
                     invisible_text,
                     TextDirection.LTR,
+                    True,
                 )
         # put the image on the page, scaled to fill the page
         if image_filename is not None:
@@ -218,6 +222,30 @@ def to_pdf(
         # finish up the page and save it
         canvas.save(out_filename)
 
+    def _get_text_direction(self, par):
+        """Get the text direction of the paragraph.
+
+        Arabic, Hebrew, and Persian are right-to-left languages.
+        """
+        return (
+            TextDirection.RTL
+            if par.attrib.get('dir', 'ltr') == 'rtl'
+            else TextDirection.LTR
+        )
+
+    def _get_inject_word_breaks(self, par):
+        """Determine whether word breaks should be injected.
+
+        In Chinese, Japanese, and Korean, word breaks are not injected, because
+        words are usually one or two characters and separators are usually explicit.
+        In all other languages, we inject word breaks to help word segmentation.
+        """
+        lang = par.attrib.get('lang', '')
+        log.debug(lang)
+        if lang in {'chi_sim', 'chi_tra', 'jpn', 'kor'}:
+            return False
+        return True
+
     @classmethod
     def polyval(cls, poly, x):  # pragma: no cover
         """Calculate the value of a polynomial at a point."""
@@ -231,6 +259,7 @@ def _do_line(
         fontname: str,
         invisible_text: bool,
         text_direction: TextDirection,
+        inject_word_breaks: bool,
     ):
         """Render the text for a given line.
 
@@ -292,6 +321,7 @@ def _do_line(
                     elem,
                     next_elem,
                     text_direction,
+                    inject_word_breaks,
                 )
             canvas.do.draw_text(text)
 
@@ -305,6 +335,7 @@ def _do_line_word(
         elem: Element,
         next_elem: Element | None,
         text_direction: TextDirection,
+        inject_word_breaks: bool,
     ):
         """Render the text for a single word."""
         if elem is None:
@@ -339,6 +370,8 @@ def _do_line_word(
         # PDF viewers identify the word break, and horizontally scaling it to
         # occupy the space between the words helps the PDF viewer
         # avoid combiningthewordstogether.
+        if not inject_word_breaks:
+            return
         next_box = line_matrix.inverse().transform(hocr_next_box)
         if text_direction == TextDirection.LTR:
             space_box = Rectangle(box.urx, box.lly, next_box.llx, next_box.ury)