Skip to content

Commit

Permalink
Improve documentation of new public hOCR APIs
Browse files Browse the repository at this point in the history
  • Loading branch information
jbarlow83 committed Oct 24, 2023
1 parent 16eb562 commit f238e72
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 27 deletions.
11 changes: 0 additions & 11 deletions docs/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -124,14 +124,3 @@ handler. OCRmyPDF will clean up its temporary files and worker processes
automatically when an exception occurs.

When OCRmyPDF succeeds conditionally, it returns an integer exit code.

Reference
---------

.. autofunction:: ocrmypdf.ocr

.. autoclass:: ocrmypdf.Verbosity
:members:
:undoc-members:

.. autofunction:: ocrmypdf.configure_logging
12 changes: 12 additions & 0 deletions docs/apiref.rst
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,18 @@ ocrmypdf
.. autoclass:: ocrmypdf.PdfContext
:members:

.. autoclass:: ocrmypdf.Verbosity
:members:
:undoc-members:

.. autofunction:: ocrmypdf.configure_logging

.. autofunction:: ocrmypdf.ocr

.. autofunction:: ocrmypdf.pdf_to_hocr

.. autofunction:: ocrmypdf.hocr_to_ocr_pdf

ocrmypdf.exceptions
===================

Expand Down
53 changes: 37 additions & 16 deletions src/ocrmypdf/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@
StrPath = Union[Path, AnyStr]
PathOrIO = Union[BinaryIO, StrPath]

# Installing plugins affects the global state of the Python interpreter,
# so we need to use a lock to prevent multiple threads from installing
# plugins at the same time.
_api_lock = threading.Lock()


Expand Down Expand Up @@ -354,10 +357,6 @@ def ocr( # noqa: D417

parser = get_parser()
with _api_lock:
# We can't allow multiple ocrmypdf.ocr() threads to run in parallel, because
# they might install different plugins, and generally speaking we have areas
# of code that use global state.

if not plugin_manager:
plugin_manager = get_plugin_manager(plugins)
plugin_manager.hook.add_options(parser=parser) # pylint: disable=no-member
Expand All @@ -375,7 +374,7 @@ def ocr( # noqa: D417
return run_pipeline(options=options, plugin_manager=plugin_manager)


def pdf_to_hocr(
def pdf_to_hocr( # noqa: D417
input_pdf: Path,
output_folder: Path,
*,
Expand Down Expand Up @@ -419,7 +418,24 @@ def pdf_to_hocr(
keep_temporary_files: bool | None = None,
**kwargs,
):
"""Run OCRmyPDF and produces an output folder containing hOCR files."""
"""Partially run OCRmyPDF and produce an output folder containing hOCR files.
Given a PDF file, this function will run OCRmyPDF up to the point where
the PDF is rasterized to images, OCRed, and the hOCR files are produced,
all of which are saved to the output folder. This is useful for applications
that want to provide an interface for users to edit the text before
rendering the final PDF.
Use :func:`hocr_to_ocr_pdf` to produce the final PDF.
For arguments not explicitly documented here, see documentation for the
equivalent command line parameter.
Args:
input_pdf: Input PDF file path.
output_folder: Output folder path.
**kwargs: Keyword arguments.
"""
# No new variable names should be assigned until these two steps are run
create_options_kwargs = {
k: v
Expand All @@ -431,10 +447,6 @@ def pdf_to_hocr(
parser = get_parser()

with _api_lock:
# We can't allow multiple ocrmypdf.ocr() threads to run in parallel, because
# they might install different plugins, and generally speaking we have areas
# of code that use global state.

if not plugin_manager:
plugin_manager = get_plugin_manager(plugins)
plugin_manager.hook.add_options(parser=parser) # pylint: disable=no-member
Expand All @@ -455,7 +467,7 @@ def pdf_to_hocr(
return run_hocr_pipeline(options=options, plugin_manager=plugin_manager)


def hocr_to_ocr_pdf(
def hocr_to_ocr_pdf( # noqa: D417
work_folder: Path,
output_file: Path,
*,
Expand All @@ -474,7 +486,20 @@ def hocr_to_ocr_pdf(
plugins: Iterable[StrPath] | None = None,
**kwargs,
):
"""Run OCRmyPDF and produces an output folder containing hOCR files."""
"""Run OCRmyPDF on a work folder and produce an output PDF.
After running :func:`pdf_to_hocr`, this function will run OCRmyPDF on the work
folder to produce an output PDF. This function consolidates any changes made
to the hOCR files in the work folder and produces a final PDF.
For arguments not explicitly documented here, see documentation for the
equivalent command line parameter.
Args:
work_folder: Work folder path, as generated by :func:`pdf_to_hocr`.
output_file: Output PDF file path.
**kwargs: Keyword arguments.
"""
# No new variable names should be assigned until these two steps are run
create_options_kwargs = {
k: v
Expand All @@ -486,10 +511,6 @@ def hocr_to_ocr_pdf(
parser = get_parser()

with _api_lock:
# We can't allow multiple ocrmypdf.ocr() threads to run in parallel, because
# they might install different plugins, and generally speaking we have areas
# of code that use global state.

if not plugin_manager:
plugin_manager = get_plugin_manager(plugins)
plugin_manager.hook.add_options(parser=parser) # pylint: disable=no-member
Expand Down

0 comments on commit f238e72

Please sign in to comment.