Skip to content

Commit

Permalink
Fix use_threads logic for get_pdfinfo
Browse files Browse the repository at this point in the history
Some debug code was level in place that forced pdfinfo to run with only
one worker when --use-threads was issued. That is how it ought to be,
since threaded pdfinfo workers just fight over the GIL and there is no
sense in parallelizing them.

Also, the user's --use-threads or --no-use-threads would be ignored in
the case of pdfinfo. By setting max_workers=1 we disabled worker processes.

This fixes how that decision is made (putting it in the relevant code, which
knows its constraints) and allows the user to influence the
thread/process decision again.
  • Loading branch information
jbarlow83 committed Oct 24, 2023
1 parent c278fec commit 53c953a
Show file tree
Hide file tree
Showing 5 changed files with 25 additions and 8 deletions.
2 changes: 2 additions & 0 deletions src/ocrmypdf/_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,7 @@ def get_pdfinfo(
detailed_analysis: bool = False,
progbar: bool = False,
max_workers: int | None = None,
use_threads: bool = True,
check_pages=None,
) -> PdfInfo:
"""Get the PDF info."""
Expand All @@ -174,6 +175,7 @@ def get_pdfinfo(
detailed_analysis=detailed_analysis,
progbar=progbar,
max_workers=max_workers,
use_threads=use_threads,
check_pages=check_pages,
executor=executor,
)
Expand Down
3 changes: 2 additions & 1 deletion src/ocrmypdf/_pipelines/hocr_to_ocr_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,8 @@ def run_hocr_to_ocr_pdf_pipeline(
executor=executor,
detailed_analysis=options.redo_ocr,
progbar=options.progress_bar,
max_workers=options.jobs if not options.use_threads else 1, # To help debug
max_workers=options.jobs,
use_threads=options.use_threads,
check_pages=options.pages,
)
context = PdfContext(options, work_folder, origin_pdf, pdfinfo, plugin_manager)
Expand Down
3 changes: 2 additions & 1 deletion src/ocrmypdf/_pipelines/pdf_to_hocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,8 @@ def run_hocr_pipeline(
executor=executor,
detailed_analysis=options.redo_ocr,
progbar=options.progress_bar,
max_workers=options.jobs if not options.use_threads else 1, # To help debug
max_workers=options.jobs,
use_threads=options.use_threads,
check_pages=options.pages,
)
context = PdfContext(
Expand Down
3 changes: 2 additions & 1 deletion src/ocrmypdf/_pipelines/standard.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,8 @@ def _run_pipeline(
executor=executor,
detailed_analysis=options.redo_ocr,
progbar=options.progress_bar,
max_workers=options.jobs if not options.use_threads else 1, # To help debug
max_workers=options.jobs,
use_threads=options.use_threads,
check_pages=options.pages,
)

Expand Down
22 changes: 17 additions & 5 deletions src/ocrmypdf/pdfinfo/info.py
Original file line number Diff line number Diff line change
Expand Up @@ -706,11 +706,12 @@ def _pdf_pageinfo_sync(args):
def _pdf_pageinfo_concurrent(
pdf,
executor: Executor,
max_workers: int,
use_threads: bool,
infile,
progbar,
max_workers,
check_pages,
detailed_analysis=False,
detailed_analysis: bool = False,
) -> Sequence[PageInfo | None]:
pages: Sequence[PageInfo | None] = [None] * len(pdf.pages)

Expand All @@ -726,13 +727,17 @@ def update_pageinfo(result, pbar):

total = len(pdf.pages)

use_threads = False # No performance gain if threaded due to GIL
n_workers = min(1 + len(pages) // 4, max_workers)
if n_workers == 1:
# But if we decided on only one worker, there is no point in using
# If we decided on only one worker, there is no point in using
# a separate process.
use_threads = True

if use_threads and n_workers > 1:
# If we are using threads, there is no point in using more than one
# worker thread - they will just fight over the GIL.
n_workers = 1

# If we use a thread, we can pass the already-open Pdf for them to use
# If we use processes, we pass a None which tells the init function to open its
# own
Expand All @@ -742,6 +747,11 @@ def update_pageinfo(result, pbar):
(n, initial_pdf, infile, check_pages, detailed_analysis) for n in range(total)
)
assert n_workers == 1 if use_threads else n_workers >= 1, "Not multithreadable"
logger.debug(
f"Gathering info with {n_workers} "
+ ('thread' if use_threads else 'process')
+ " workers"
)
executor(
use_threads=use_threads,
max_workers=n_workers,
Expand Down Expand Up @@ -1055,6 +1065,7 @@ def __init__(
detailed_analysis: bool = False,
progbar: bool = False,
max_workers: int | None = None,
use_threads: bool = True,
check_pages=None,
executor: Executor = DEFAULT_EXECUTOR,
):
Expand All @@ -1069,9 +1080,10 @@ def __init__(
self._pages = _pdf_pageinfo_concurrent(
pdf,
executor,
max_workers,
use_threads,
infile,
progbar,
max_workers,
check_pages=check_pages,
detailed_analysis=detailed_analysis,
)
Expand Down

0 comments on commit 53c953a

Please sign in to comment.