Fix use_threads logic for get_pdfinfo

Some debug code was left in place that forced pdfinfo to run with only one worker when --use-threads was issued. A single worker is in fact the right outcome for the threaded case, since threaded pdfinfo workers just fight over the GIL and there is no sense in parallelizing them. However, the user's --use-threads or --no-use-threads would otherwise be ignored in the case of pdfinfo, and by setting max_workers=1 we also disabled worker processes. This fixes how that decision is made (putting it in the relevant code, which knows its constraints) and allows the user to influence the thread/process decision again.
ferdiga · Oct 24, 2023 · 53c953a · 53c953a
1 parent c278fec
commit 53c953a
Show file tree

Hide file tree

Showing 5 changed files with 25 additions and 8 deletions.
diff --git a/src/ocrmypdf/_pipeline.py b/src/ocrmypdf/_pipeline.py
@@ -165,6 +165,7 @@ def get_pdfinfo(
     detailed_analysis: bool = False,
     progbar: bool = False,
     max_workers: int | None = None,
+    use_threads: bool = True,
     check_pages=None,
 ) -> PdfInfo:
     """Get the PDF info."""
@@ -174,6 +175,7 @@ def get_pdfinfo(
             detailed_analysis=detailed_analysis,
             progbar=progbar,
             max_workers=max_workers,
+            use_threads=use_threads,
             check_pages=check_pages,
             executor=executor,
         )

diff --git a/src/ocrmypdf/_pipelines/hocr_to_ocr_pdf.py b/src/ocrmypdf/_pipelines/hocr_to_ocr_pdf.py
@@ -125,7 +125,8 @@ def run_hocr_to_ocr_pdf_pipeline(
             executor=executor,
             detailed_analysis=options.redo_ocr,
             progbar=options.progress_bar,
-            max_workers=options.jobs if not options.use_threads else 1,  # To help debug
+            max_workers=options.jobs,
+            use_threads=options.use_threads,
             check_pages=options.pages,
         )
         context = PdfContext(options, work_folder, origin_pdf, pdfinfo, plugin_manager)

diff --git a/src/ocrmypdf/_pipelines/pdf_to_hocr.py b/src/ocrmypdf/_pipelines/pdf_to_hocr.py
@@ -108,7 +108,8 @@ def run_hocr_pipeline(
             executor=executor,
             detailed_analysis=options.redo_ocr,
             progbar=options.progress_bar,
-            max_workers=options.jobs if not options.use_threads else 1,  # To help debug
+            max_workers=options.jobs,
+            use_threads=options.use_threads,
             check_pages=options.pages,
         )
         context = PdfContext(

diff --git a/src/ocrmypdf/_pipelines/standard.py b/src/ocrmypdf/_pipelines/standard.py
@@ -185,7 +185,8 @@ def _run_pipeline(
             executor=executor,
             detailed_analysis=options.redo_ocr,
             progbar=options.progress_bar,
-            max_workers=options.jobs if not options.use_threads else 1,  # To help debug
+            max_workers=options.jobs,
+            use_threads=options.use_threads,
             check_pages=options.pages,
         )
 

diff --git a/src/ocrmypdf/pdfinfo/info.py b/src/ocrmypdf/pdfinfo/info.py
@@ -706,11 +706,12 @@ def _pdf_pageinfo_sync(args):
 def _pdf_pageinfo_concurrent(
     pdf,
     executor: Executor,
+    max_workers: int,
+    use_threads: bool,
     infile,
     progbar,
-    max_workers,
     check_pages,
-    detailed_analysis=False,
+    detailed_analysis: bool = False,
 ) -> Sequence[PageInfo | None]:
     pages: Sequence[PageInfo | None] = [None] * len(pdf.pages)
 
@@ -726,13 +727,17 @@ def update_pageinfo(result, pbar):
 
     total = len(pdf.pages)
 
-    use_threads = False  # No performance gain if threaded due to GIL
     n_workers = min(1 + len(pages) // 4, max_workers)
     if n_workers == 1:
-        # But if we decided on only one worker, there is no point in using
+        # If we decided on only one worker, there is no point in using
         # a separate process.
         use_threads = True
 
+    if use_threads and n_workers > 1:
+        # If we are using threads, there is no point in using more than one
+        # worker thread - they will just fight over the GIL.
+        n_workers = 1
+
     # If we use a thread, we can pass the already-open Pdf for them to use
     # If we use processes, we pass a None which tells the init function to open its
     # own
@@ -742,6 +747,11 @@ def update_pageinfo(result, pbar):
         (n, initial_pdf, infile, check_pages, detailed_analysis) for n in range(total)
     )
     assert n_workers == 1 if use_threads else n_workers >= 1, "Not multithreadable"
+    logger.debug(
+        f"Gathering info with {n_workers} "
+        + ('thread' if use_threads else 'process')
+        + " workers"
+    )
     executor(
         use_threads=use_threads,
         max_workers=n_workers,
@@ -1055,6 +1065,7 @@ def __init__(
         detailed_analysis: bool = False,
         progbar: bool = False,
         max_workers: int | None = None,
+        use_threads: bool = True,
         check_pages=None,
         executor: Executor = DEFAULT_EXECUTOR,
     ):
@@ -1069,9 +1080,10 @@ def __init__(
             self._pages = _pdf_pageinfo_concurrent(
                 pdf,
                 executor,
+                max_workers,
+                use_threads,
                 infile,
                 progbar,
-                max_workers,
                 check_pages=check_pages,
                 detailed_analysis=detailed_analysis,
             )