Skip to content

Commit

Permalink
Merge branch 'master' into warn-empty-page
Browse files Browse the repository at this point in the history
  • Loading branch information
kba committed Sep 11, 2023
2 parents 15a2c08 + 1812061 commit 6b36e48
Show file tree
Hide file tree
Showing 26 changed files with 480 additions and 252 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,12 @@ Added:
* Web API: Lock pages output file groups of a workspace to prevent simultaneous non-additive access to workspaces, #1069
* Web API: Support job dependency for caching complete fully-deterministic workflows, #1069
* Web API: Processing server will start all ready requests, not just the first one, #1069
* `ocrd_utils.config` to collect all configuration based on environment variables in one place, #1081

Changed:

* Processors now have `worker` and `server` subcommands, with separate --help, for starting processing worker/processor server, #1087
* Move `tf_disable_interactive_logs` (to silence keras/tensorflow print statements) to `ocrd_utils.logging` and do not call on module-level, #1090, #1091

## [2.53.0] - 2023-08-21

Expand Down
81 changes: 28 additions & 53 deletions ocrd/ocrd/cli/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,69 +8,44 @@
import re
import click

from ocrd_utils import config

__all__ = ['cli']

_epilog = """
_epilog = f"""
\b
\b
Variables:
PATH Search path for processor executables
(affects `ocrd process` and `ocrd resmgr`)
HOME Directory to look for `ocrd_logging.conf`,
fallback for unset XDG variables.
\b
XDG_CONFIG_HOME
Directory to look for `./ocrd/resources.yml`
(i.e. `ocrd resmgr` user database) - defaults to
`$HOME/.config`.
XDG_DATA_HOME
Directory to look for `./ocrd-resources/*`
(i.e. `ocrd resmgr` data location) - defaults to
`$HOME/.local/share`.
PATH
Search path for processor executables
(affects `ocrd process` and `ocrd resmgr`)
\b
OCRD_DOWNLOAD_RETRIES
Number of times to retry failed attempts
for downloads of workspace files.
OCRD_DOWNLOAD_TIMEOUT
Timeout in seconds for connecting or reading
(comma-separated) when downloading.
{config.describe('HOME')}
\b
OCRD_METS_CACHING
Whether to enable in-memory storage of OcrdMets
data structures for speedup during processing or
workspace operations.
{config.describe('XDG_CONFIG_HOME')}
\b
OCRD_MAX_PROCESSOR_CACHE
Maximum number of processor instances
(for each set of parameters) to be kept
in memory (including loaded models) for
processing workers or processor servers.
{config.describe('XDG_DATA_HOME')}
\b
OCRD_NETWORK_SERVER_ADDR_PROCESSING
Default address of Processing Server to connect to
(for `ocrd network client processing`).
OCRD_NETWORK_SERVER_ADDR_WORKFLOW
Default address of Workflow Server to connect to
(for `ocrd network client workflow`).
OCRD_NETWORK_SERVER_ADDR_WORKSPACE
Default address of Workspace Server to connect to
(for `ocrd network client workspace`).
{config.describe('OCRD_DOWNLOAD_RETRIES')}
\b
{config.describe('OCRD_DOWNLOAD_TIMEOUT')}
\b
{config.describe('OCRD_METS_CACHING')}
\b
{config.describe('OCRD_MAX_PROCESSOR_CACHE')}
\b
{config.describe('OCRD_NETWORK_SERVER_ADDR_PROCESSING')}
\b
{config.describe('OCRD_NETWORK_SERVER_ADDR_WORKFLOW')}
\b
{config.describe('OCRD_NETWORK_SERVER_ADDR_WORKSPACE')}
\b
{config.describe('OCRD_PROFILE_FILE')}
\b
OCRD_PROFILE
Whether to enable gathering runtime statistics
on the `ocrd.profile` logger (comma-separated):
- `CPU`: yields CPU and wall-time,
- `RSS`: also yields peak memory (resident set size)
- `PSS`: also yields peak memory (proportional set size)
OCRD_PROFILE_FILE
When profiling is enabled, data file to write to
(for external analysis tools like snakeviz).
{config.describe('OCRD_PROFILE', wrap_text=False)}
"""

def command_with_replaced_help(*replacements):
Expand Down
9 changes: 6 additions & 3 deletions ocrd/ocrd/cli/ocrd_tool.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
:nested: full
"""
from inspect import getmodule
from json import dumps
import codecs
import sys
Expand Down Expand Up @@ -115,16 +116,18 @@ def moduledir(self):
show_resource=res_name)

@ocrd_tool_tool.command('help', help="Generate help for processors")
@click.argument('subcommand', required=False)
@pass_ocrd_tool
def ocrd_tool_tool_params_help(ctx):
def ocrd_tool_tool_params_help(ctx, subcommand):
class BashProcessor(Processor):
# set docstrings to empty
# fixme: override the module-level docstring, too
__doc__ = None
# HACK: override the module-level docstring, too
getmodule(OcrdToolCtx).__doc__ = None
def process(self):
return super()
BashProcessor(None, ocrd_tool=ctx.json['tools'][ctx.tool_name],
show_help=True)
show_help=True, subcommand=subcommand)

# ----------------------------------------------------------------------
# ocrd ocrd-tool tool categories
Expand Down
1 change: 0 additions & 1 deletion ocrd/ocrd/cli/resmgr.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
:nested: full
"""
import sys
from os import environ
from pathlib import Path
from distutils.spawn import find_executable as which
from yaml import safe_load, safe_dump
Expand Down
112 changes: 53 additions & 59 deletions ocrd/ocrd/decorators/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from os import environ
import sys

from ocrd_utils import (
Expand All @@ -7,7 +6,7 @@
set_json_key_value_overrides,
)

from ocrd_utils import getLogger, initLogging, parse_json_string_with_comments
from ocrd_utils import getLogger, initLogging, parse_json_string_with_comments, config
from ocrd_validators import WorkspaceValidator

from ocrd_network import ProcessingWorker, ProcessorServer
Expand All @@ -20,6 +19,8 @@
from .ocrd_cli_options import ocrd_cli_options
from .mets_find_options import mets_find_options

SUBCOMMANDS = ['worker', 'server']

def ocrd_cli_wrap_processor(
processorClass,
mets=None,
Expand All @@ -35,8 +36,8 @@ def ocrd_cli_wrap_processor(
show_resource=None,
list_resources=False,
# ocrd_network params start #
agent_type=None,
agent_address=None,
subcommand=None,
address=None,
queue=None,
database=None,
# ocrd_network params end #
Expand All @@ -51,17 +52,20 @@ def ocrd_cli_wrap_processor(
dump_json=dump_json,
dump_module_dir=dump_module_dir,
show_help=help,
subcommand=subcommand,
show_version=version,
show_resource=show_resource,
list_resources=list_resources
)
sys.exit()
if subcommand:
    # Used for checking/starting network agents for the WebAPI architecture
    check_and_run_network_agent(processorClass, subcommand, address, database, queue)
elif address or queue or database:
    # The network options are meaningless without a 'worker'/'server'
    # subcommand, so reject them instead of silently ignoring them.
    raise ValueError("Subcommand options --address --queue and --database are only valid for subcommands 'worker' or 'server'")

initLogging()

# Used for checking/starting network agents for the WebAPI architecture
# Has no side effects if none of the 4 ocrd_network parameters is passed
check_and_run_network_agent(processorClass, agent_type, agent_address, database, queue)
initLogging()

LOG = getLogger('ocrd_cli_wrap_processor')
# LOG.info('kwargs=%s' % kwargs)
Expand Down Expand Up @@ -102,11 +106,10 @@ def ocrd_cli_wrap_processor(
if not report.is_valid:
raise Exception("Invalid input/output file grps:\n\t%s" % '\n\t'.join(report.errors))
# Set up profiling behavior from environment variables/flags
if not profile and 'OCRD_PROFILE' in environ:
if 'CPU' in environ['OCRD_PROFILE']:
profile = True
if not profile_file and 'OCRD_PROFILE_FILE' in environ:
profile_file = environ['OCRD_PROFILE_FILE']
if not profile and 'CPU' in config.OCRD_PROFILE:
profile = True
if not profile_file and config.is_set('OCRD_PROFILE_FILE'):
profile_file = config.OCRD_PROFILE_FILE
if profile or profile_file:
import cProfile
import pstats
Expand All @@ -128,59 +131,50 @@ def exit():
run_processor(processorClass, mets_url=mets, workspace=workspace, **kwargs)


def check_and_run_network_agent(ProcessorClass, agent_type: str, agent_address: str, database: str, queue: str):
if not agent_type and (agent_address or database or queue):
raise ValueError("Options '--database', '--queue', and '--address' are valid only with '--type'")
if not agent_type:
return
def check_and_run_network_agent(ProcessorClass, subcommand: str, address: str, database: str, queue: str):
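"""
Validate the network agent options for ``subcommand`` and start the agent.

Raises ``ValueError`` if ``subcommand`` is not one of ``SUBCOMMANDS``, if
``database`` is missing, or if ``address``/``queue`` do not fit the
subcommand: ``server`` needs ``address`` and no ``queue``, ``worker`` needs
``queue`` and no ``address``. Otherwise starts a ``ProcessingWorker``
(``worker``) or a ``ProcessorServer`` (``server``) and finally calls
``sys.exit(0)``.
"""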
if subcommand not in SUBCOMMANDS:
raise ValueError(f"SUBCOMMAND can only be one of {SUBCOMMANDS}")

if not database:
raise ValueError("Options '--type' and '--database' are mutually inclusive")
allowed_agent_types = ['server', 'worker']
if agent_type not in allowed_agent_types:
agents_str = ', '.join(allowed_agent_types)
raise ValueError(f"Wrong type parameter. Allowed types: {agents_str}")
if agent_type == 'server':
if not agent_address:
raise ValueError("Options '--type=server' and '--address' are mutually inclusive")
# Reached only when no database was given: the option is missing, not invalid.
raise ValueError(f"Option '--database' required for subcommand {subcommand}")

if subcommand == 'server':
if not address:
raise ValueError(f"Option '--address' required for subcommand {subcommand}")
if queue:
raise ValueError("Options '--type=server' and '--queue' are mutually exclusive")
if agent_type == 'worker':
raise ValueError(f"Option '--queue' invalid for subcommand {subcommand}")
if subcommand == 'worker':
if address:
raise ValueError(f"Option '--address' invalid for subcommand {subcommand}")
if not queue:
raise ValueError("Options '--type=worker' and '--queue' are mutually inclusive")
if agent_address:
raise ValueError("Options '--type=worker' and '--address' are mutually exclusive")
raise ValueError(f"Option '--queue' required for subcommand {subcommand}")

import logging
logging.getLogger('ocrd.network').setLevel(logging.DEBUG)

processor = ProcessorClass(workspace=None, dump_json=True)
if agent_type == 'worker':
try:
# TODO: Passing processor_name and ocrd_tool is redundant
processing_worker = ProcessingWorker(
rabbitmq_addr=queue,
mongodb_addr=database,
processor_name=processor.ocrd_tool['executable'],
ocrd_tool=processor.ocrd_tool,
processor_class=ProcessorClass,
)
# The RMQConsumer is initialized and a connection to the RabbitMQ is performed
processing_worker.connect_consumer()
# Start consuming from the queue with name `processor_name`
processing_worker.start_consuming()
except Exception as e:
sys.exit(f"Processing worker has failed with error: {e}")
if agent_type == 'server':
try:
# TODO: Better validate that inside the ProcessorServer itself
host, port = agent_address.split(':')
processor_server = ProcessorServer(
mongodb_addr=database,
processor_name=processor.ocrd_tool['executable'],
processor_class=ProcessorClass,
)
processor_server.run_server(host=host, port=int(port))
except Exception as e:
sys.exit(f"Processor server has failed with error: {e}")
if subcommand == 'worker':
# TODO: Passing processor_name and ocrd_tool is redundant
processing_worker = ProcessingWorker(
rabbitmq_addr=queue,
mongodb_addr=database,
processor_name=processor.ocrd_tool['executable'],
ocrd_tool=processor.ocrd_tool,
processor_class=ProcessorClass,
)
# The RMQConsumer is initialized and a connection to the RabbitMQ is performed
processing_worker.connect_consumer()
# Start consuming from the queue with name `processor_name`
processing_worker.start_consuming()
elif subcommand == 'server':
# TODO: Better validate that inside the ProcessorServer itself
host, port = address.split(':')
processor_server = ProcessorServer(
mongodb_addr=database,
processor_name=processor.ocrd_tool['executable'],
processor_class=ProcessorClass,
)
processor_server.run_server(host=host, port=int(port))
sys.exit(0)
11 changes: 8 additions & 3 deletions ocrd/ocrd/decorators/ocrd_cli_options.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import click
from click import option, Path
from click import option, Path, group, command, argument
from .parameter_option import parameter_option, parameter_override_option
from .loglevel_option import loglevel_option
from ocrd_network import (
Expand Down Expand Up @@ -39,8 +39,7 @@ def cli(mets_url):
parameter_option,
parameter_override_option,
loglevel_option,
option('--type', 'agent_type', type=click.Choice(['worker', 'server'])),
option('--address', 'agent_address', type=ServerAddressParamType()),
option('--address', type=ServerAddressParamType()),
option('--queue', type=QueueServerParamType()),
option('--database', type=DatabaseParamType()),
option('-C', '--show-resource'),
Expand All @@ -49,6 +48,12 @@ def cli(mets_url):
option('-D', '--dump-module-dir', is_flag=True, default=False),
option('-h', '--help', is_flag=True, default=False),
option('-V', '--version', is_flag=True, default=False),
# Subcommand, only used for 'worker'/'server'. Cannot be handled in
# click because processors use the @command decorator and even if they
# were using `group`, you cannot combine a command with
# subcommands. So we have to work around that by creating a
# pseudo-subcommand handled in ocrd_cli_wrap_processor
argument('subcommand', nargs=1, required=False, type=click.Choice(['worker', 'server'])),
]
for param in params:
param(f)
Expand Down
Loading

0 comments on commit 6b36e48

Please sign in to comment.