Skip to content

Commit

Permalink
ocrd-dummy: make copying optional and disable by default, #803
Browse files Browse the repository at this point in the history
  • Loading branch information
kba committed Mar 7, 2022
1 parent decb06f commit 4097791
Show file tree
Hide file tree
Showing 4 changed files with 30 additions and 18 deletions.
11 changes: 9 additions & 2 deletions ocrd/ocrd/processor/builtin/dummy/ocrd-tool.json
Original file line number Diff line number Diff line change
@@ -1,8 +1,15 @@
{
"executable": "ocrd-dummy",
"description": "Bare-bones processor that copies file from input group to output group",
"description": "Bare-bones processor creates PAGE-XML and optionally copies file from input group to output group",
"steps": ["preprocessing/optimization"],
"categories": ["Image preprocessing"],
"input_file_grp": "DUMMY_INPUT",
"output_file_grp": "DUMMY_OUTPUT"
"output_file_grp": "DUMMY_OUTPUT",
"parameters": {
"copy_files": {
"type": "boolean",
"default": false,
"description": "Whether to actually copy files (true) or just create PAGE-XML as a side effect (false)"
}
}
}
31 changes: 17 additions & 14 deletions ocrd/ocrd/processor/builtin/dummy_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,14 @@

class DummyProcessor(Processor):
"""
Bare-bones processor that only copies mets:file from input group to output group.
Bare-bones processor that creates PAGE-XML and optionally copies files from the input group to the output group.
"""

def process(self):
LOG = getLogger('ocrd.dummy')
assert_file_grp_cardinality(self.input_file_grp, 1)
assert_file_grp_cardinality(self.output_file_grp, 1)
copy_files = self.parameter['copy_files']
for input_file in self.input_files:
input_file = self.workspace.download_file(input_file)
file_id = make_file_id(input_file, self.output_file_grp)
Expand All @@ -39,24 +40,26 @@ def process(self):
LOG.info("cp %s %s # %s -> %s", input_file.url, local_filename, input_file.ID, file_id)
if input_file.mimetype == MIMETYPE_PAGE:
# Source file is PAGE-XML: Write out in-memory PcGtsType
self.workspace.add_file(
ID=file_id,
file_grp=self.output_file_grp,
pageId=input_file.pageId,
mimetype=input_file.mimetype,
local_filename=local_filename,
content=to_xml(pcgts).encode('utf-8'))
else:
# Source file is not PAGE-XML: Copy byte-by-byte
with open(input_file.local_filename, 'rb') as f:
content = f.read()
if copy_files:
self.workspace.add_file(
ID=file_id,
file_grp=self.output_file_grp,
pageId=input_file.pageId,
mimetype=input_file.mimetype,
local_filename=local_filename,
content=content)
content=to_xml(pcgts).encode('utf-8'))
else:
# Source file is not PAGE-XML: Copy byte-by-byte
if copy_files:
with open(input_file.local_filename, 'rb') as f:
content = f.read()
self.workspace.add_file(
ID=file_id,
file_grp=self.output_file_grp,
pageId=input_file.pageId,
mimetype=input_file.mimetype,
local_filename=local_filename,
content=content)
if input_file.mimetype.startswith('image/'):
# write out the PAGE-XML representation for this image
page_file_id = file_id + '_PAGE'
Expand All @@ -76,7 +79,7 @@ def process(self):

def __init__(self, *args, **kwargs):
kwargs['ocrd_tool'] = OCRD_TOOL
kwargs['version'] = '0.0.2'
kwargs['version'] = '0.0.3'
super(DummyProcessor, self).__init__(*args, **kwargs)

@click.command()
Expand Down
2 changes: 2 additions & 0 deletions tests/processor/test_ocrd_dummy.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ def test_copies_ok(self):
DummyProcessor,
input_file_grp='OCR-D-IMG',
output_file_grp='OUTPUT',
parameter={'copy_files': True},
workspace=workspace
)
output_files = workspace.mets.find_all_files(fileGrp='OUTPUT')
Expand All @@ -38,6 +39,7 @@ def test_copies_ok(self):
DummyProcessor,
input_file_grp='OUTPUT',
output_file_grp='OUTPUT2',
parameter={'copy_files': True},
workspace=workspace
)
output2_files = workspace.mets.find_all_files(fileGrp='OUTPUT2')
Expand Down
4 changes: 2 additions & 2 deletions tests/test_task_sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,8 +142,8 @@ def test_task_run(self):
ws.save_mets()
files_before = len(ws.mets.find_all_files())
run_tasks('mets.xml', 'DEBUG', None, [
"dummy -I OCR-D-IMG -O GRP1",
"dummy -I GRP1 -O GRP2",
"dummy -I OCR-D-IMG -O GRP1 -P copy_files true",
"dummy -I GRP1 -O GRP2 -P copy_files true",
])
ws.reload_mets()
# step 1: 2 images in OCR-D-IMG -> 2 images + 2 PAGE-XML files in GRP1
Expand Down

0 comments on commit 4097791

Please sign in to comment.