diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 6107688bc..7c3d83e85 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -18,6 +18,7 @@ import sys import tarfile import io +from ocrd.workspace import Workspace from ocrd_utils import ( VERSION as OCRD_VERSION, @@ -49,7 +50,7 @@ class Processor(): def __init__( self, - workspace, + workspace : Workspace, ocrd_tool=None, parameter=None, # TODO OCR-D/core#274 @@ -163,7 +164,7 @@ def verify(self): """ return True - def process(self): + def process(self) -> None: """ Process the :py:attr:`workspace` from the given :py:attr:`input_file_grp` diff --git a/src/ocrd/processor/builtin/dummy_processor.py b/src/ocrd/processor/builtin/dummy_processor.py index 0fb00af53..774332a73 100644 --- a/src/ocrd/processor/builtin/dummy_processor.py +++ b/src/ocrd/processor/builtin/dummy_processor.py @@ -24,7 +24,7 @@ class DummyProcessor(Processor): Bare-bones processor creates PAGE-XML and optionally copies file from input group to output group """ - def process(self): + def process(self) -> None: LOG = getLogger('ocrd.dummy') assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) diff --git a/src/ocrd/workspace.py b/src/ocrd/workspace.py index 633e45acf..61c78c2e9 100644 --- a/src/ocrd/workspace.py +++ b/src/ocrd/workspace.py @@ -5,6 +5,7 @@ from re import sub from tempfile import NamedTemporaryFile from contextlib import contextmanager +from typing import Optional, Union from cv2 import COLOR_GRAY2BGR, COLOR_RGB2BGR, cvtColor from PIL import Image @@ -13,6 +14,7 @@ import requests from ocrd_models import OcrdMets, OcrdFile +from ocrd_models.ocrd_file import ClientSideOcrdFile from ocrd_models.ocrd_page import parse, BorderType, to_xml from ocrd_modelfactory import exif_from_filename, page_from_file from ocrd_utils import ( @@ -71,7 +73,16 @@ class Workspace(): baseurl (string) : Base URL to prefix to relative URL. """ - def __init__(self, resolver, directory, mets=None, mets_basename=DEFAULT_METS_BASENAME, automatic_backup=False, baseurl=None, mets_server_url=None): + def __init__( + self, + resolver, + directory, + mets : Optional[Union[OcrdMets, ClientSideOcrdMets]] = None, + mets_basename=DEFAULT_METS_BASENAME, + automatic_backup=False, + baseurl=None, + mets_server_url=None + ): self.resolver = resolver self.directory = directory self.mets_target = str(Path(directory, mets_basename)) @@ -328,14 +339,16 @@ def rename_file_group(self, old, new): local_filename_replacements = {} log.info("Moving files") for mets_file in self.mets.find_files(fileGrp=old, local_only=True): - new_local_filename = old_local_filename = str(mets_file.local_filename) + new_local_filename = old_local_filename = mets_file.local_filename + assert new_local_filename + assert old_local_filename # Directory part new_local_filename = sub(r'^%s/' % old, r'%s/' % new, new_local_filename) # File part new_local_filename = sub(r'/%s' % old, r'/%s' % new, new_local_filename) local_filename_replacements[str(mets_file.local_filename)] = new_local_filename # move file from ``old`` to ``new`` - mets_file.local_filename.rename(new_local_filename) + Path(old_local_filename).rename(new_local_filename) # change the url of ``mets:file`` mets_file.local_filename = new_local_filename # change the file ID and update structMap @@ -375,7 +388,7 @@ def rename_file_group(self, old, new): @deprecated_alias(pageId="page_id") @deprecated_alias(ID="file_id") - def add_file(self, file_grp, content=None, **kwargs): + def add_file(self, file_grp, content=None, **kwargs) -> Union[OcrdFile, ClientSideOcrdFile]: """ Add a file to the :py:class:`ocrd_models.ocrd_mets.OcrdMets` of the workspace. diff --git a/src/ocrd_modelfactory/__init__.py b/src/ocrd_modelfactory/__init__.py index e345ee061..7afc5b176 100644 --- a/src/ocrd_modelfactory/__init__.py +++ b/src/ocrd_modelfactory/__init__.py @@ -5,6 +5,7 @@ """ from datetime import datetime from pathlib import Path +from typing import Tuple, Union from yaml import safe_load, safe_dump from PIL import Image @@ -78,7 +79,7 @@ def page_from_image(input_file, with_tree=False): revmap = dict(((node, element) for element, node in mapping.items())) return pcgts, etree, mapping, revmap -def page_from_file(input_file, with_tree=False): +def page_from_file(input_file, with_tree=False) -> Union[PcGtsType, Tuple[PcGtsType, ET.Element, dict, dict]]: """ Create :py:class:`~ocrd_models.ocrd_page.OcrdPage` from an :py:class:`~ocrd_models.ocrd_file.OcrdFile` or a file path diff --git a/src/ocrd_models/ocrd_file.py b/src/ocrd_models/ocrd_file.py index e8205a33d..963da2f96 100644 --- a/src/ocrd_models/ocrd_file.py +++ b/src/ocrd_models/ocrd_file.py @@ -1,13 +1,13 @@ """ API to ``mets:file`` """ -from os.path import splitext, basename from pathlib import Path +from typing import Any, List, Optional, Union from ocrd_utils import deprecation_warning -from .ocrd_xml_base import ET -from .constants import NAMESPACES as NS, TAG_METS_FLOCAT, TAG_METS_FILE +from .ocrd_xml_base import ET # type: ignore +from .constants import NAMESPACES as NS, TAG_METS_FLOCAT class OcrdFile(): """ @@ -22,9 +22,8 @@ def __init__(self, el, mimetype=None, pageId=None, local_filename=None, mets=Non mets (OcrdMets): Containing :py:class:`ocrd_models.ocrd_mets.OcrdMets`. mimetype (string): ``@MIMETYPE`` of this ``mets:file`` pageId (string): ``@ID`` of the physical ``mets:structMap`` entry corresponding to this ``mets:file`` - local_filename (string): Local filename url (string): original ``@xlink:href`` of this ``mets:file`` - local_filename (Path): ``@xlink:href`` pointing to the locally cached version of the file in the workspace + local_filename (string): ``@xlink:href`` pointing to the locally cached version of the file in the workspace ID (string): ``@ID`` of this ``mets:file`` loctype (string): DEPRECATED do not use """ @@ -39,7 +38,7 @@ def __init__(self, el, mimetype=None, pageId=None, local_filename=None, mets=Non self.pageId = pageId if local_filename: - self.local_filename = Path(local_filename) + self.local_filename = local_filename if url: self.url = url @@ -70,38 +69,38 @@ def __eq__(self, other): # self.fileGrp == other.fileGrp @property - def basename(self): + def basename(self) -> str: """ Get the ``.name`` of the local file """ if not self.local_filename: - return - return self.local_filename.name + return '' + return Path(self.local_filename).name @property - def extension(self): + def extension(self) -> str: if not self.local_filename: - return - return ''.join(self.local_filename.suffixes) + return '' + return ''.join(Path(self.local_filename).suffixes) @property - def basename_without_extension(self): + def basename_without_extension(self) -> str: """ Get the ``os.path.basename`` of the local file, if any, with extension removed. """ if not self.local_filename: - return - return self.local_filename.name[:-len(self.extension)] + return '' + return Path(self.local_filename).name[:-len(self.extension)] @property - def ID(self): + def ID(self) -> str: """ Get the ``@ID`` of the ``mets:file``. """ return self._el.get('ID') @ID.setter - def ID(self, ID): + def ID(self, ID : Optional[str]) -> None: """ Set the ``@ID`` of the ``mets:file`` to :py:attr:`ID`. """ @@ -116,7 +115,7 @@ def ID(self, ID): self.pageId = pageId @property - def pageId(self): + def pageId(self) -> str: """ Get the ``@ID`` of the physical ``mets:structMap`` entry corresponding to this ``mets:file`` (physical page manifestation). """ @@ -125,7 +124,7 @@ def pageId(self): return self.mets.get_physical_page_for_file(self) @pageId.setter - def pageId(self, pageId): + def pageId(self, pageId : Optional[str]) -> None: """ Get the ``@ID`` of the physical ``mets:structMap`` entry corresponding to this ``mets:file`` (physical page manifestation) to :py:attr:`pageId`. """ @@ -136,21 +135,21 @@ def pageId(self, pageId): self.mets.set_physical_page_for_file(pageId, self) @property - def loctypes(self): + def loctypes(self) -> List[str]: """ Get the ``@LOCTYPE``s of the ``mets:file``. """ return [x.get('LOCTYPE') for x in self._el.findall('mets:FLocat', NS)] @property - def mimetype(self): + def mimetype(self) -> str: """ Get the ``@MIMETYPE`` of the ``mets:file``. """ return self._el.get('MIMETYPE') @mimetype.setter - def mimetype(self, mimetype): + def mimetype(self, mimetype : Optional[str]) -> None: """ Set the ``@MIMETYPE`` of the ``mets:file`` to :py:attr:`mimetype`. """ @@ -159,7 +158,7 @@ def mimetype(self, mimetype): self._el.set('MIMETYPE', mimetype) @property - def fileGrp(self): + def fileGrp(self) -> str: """ The ``@USE`` of the containing ``mets:fileGrp`` """ @@ -169,7 +168,7 @@ def fileGrp(self): raise ValueError("OcrdFile not related to METS") @property - def url(self): + def url(self) -> str: """ Get the remote/original URL ``@xlink:href`` of this ``mets:file``. """ @@ -179,7 +178,7 @@ def url(self): return '' @url.setter - def url(self, url): + def url(self, url : Optional[str]) -> None: """ Set the remote/original URL ``@xlink:href`` of this ``mets:file`` to :py:attr:`url`. """ @@ -194,16 +193,17 @@ def url(self, url): el_FLocat.set("LOCTYPE", "URL") @property - def local_filename(self): + def local_filename(self) -> Optional[str]: """ Get the local/cached ``@xlink:href`` of this ``mets:file``. """ el_FLocat = self._el.find('mets:FLocat[@LOCTYPE="OTHER"][@OTHERLOCTYPE="FILE"]', NS) if el_FLocat is not None: - return Path(el_FLocat.get("{%s}href" % NS["xlink"])) + return el_FLocat.get("{%s}href" % NS["xlink"]) + return None @local_filename.setter - def local_filename(self, fname): + def local_filename(self, fname : Optional[Union[Path, str]]): """ Set the local/cached ``@xlink:href`` of this ``mets:file`` to :py:attr:`local_filename`. """ @@ -212,9 +212,11 @@ def local_filename(self, fname): if el_FLocat is not None: self._el.remove(el_FLocat) return + else: + fname = str(fname) if el_FLocat is None: el_FLocat = ET.SubElement(self._el, TAG_METS_FLOCAT) - el_FLocat.set("{%s}href" % NS["xlink"], str(fname)) + el_FLocat.set("{%s}href" % NS["xlink"], fname) el_FLocat.set("LOCTYPE", "OTHER") el_FLocat.set("OTHERLOCTYPE", "FILE") @@ -226,7 +228,18 @@ class ClientSideOcrdFile: this represents the response of the :py:class:`ocrd.mets_server.OcrdMetsServer`. """ - def __init__(self, el, mimetype=None, pageId=None, loctype='OTHER', local_filename=None, mets=None, url=None, ID=None, fileGrp=None): + def __init__( + self, + el, + mimetype: str = '', + pageId: str = '', + loctype: str ='OTHER', + local_filename: Optional[str] = None, + mets : Any = None, + url: str = '', + ID: str = '', + fileGrp: str = '' + ): """ Args: el (): ignored diff --git a/src/ocrd_models/ocrd_mets.py b/src/ocrd_models/ocrd_mets.py index ae4d75e29..19ccfb80e 100644 --- a/src/ocrd_models/ocrd_mets.py +++ b/src/ocrd_models/ocrd_mets.py @@ -4,7 +4,8 @@ from datetime import datetime import re import typing -from lxml import etree as ET +from typing_extensions import Optional +from lxml import etree as ET # type: ignore from copy import deepcopy from warnings import warn @@ -34,7 +35,7 @@ METS_XML_EMPTY, ) -from .ocrd_xml_base import OcrdXmlDocument, ET +from .ocrd_xml_base import OcrdXmlDocument from .ocrd_file import OcrdFile from .ocrd_agent import OcrdAgent @@ -134,9 +135,9 @@ def _clear_caches(self): Deallocates the caches """ - self._file_cache = None - self._page_cache = None - self._fptr_cache = None + self._file_cache = {} + self._page_cache = {} + self._fptr_cache = {} def refresh_caches(self): if self._cache_flag: @@ -243,7 +244,7 @@ def find_files( pageId=None, mimetype=None, url=None, - local_filename=None, + local_filename : Optional[str] = None, local_only=False, include_fileGrp=None, exclude_fileGrp=None, diff --git a/src/ocrd_models/ocrd_xml_base.py b/src/ocrd_models/ocrd_xml_base.py index 7faefbad9..0617158b1 100644 --- a/src/ocrd_models/ocrd_xml_base.py +++ b/src/ocrd_models/ocrd_xml_base.py @@ -2,7 +2,7 @@ Base class for XML documents loaded from either content or filename. """ from os.path import exists -from lxml import etree as ET +from lxml import etree as ET # type: ignore from .constants import NAMESPACES from .utils import xmllint_format diff --git a/tests/cli/test_workspace.py b/tests/cli/test_workspace.py index e96a39136..cf7eef3d0 100644 --- a/tests/cli/test_workspace.py +++ b/tests/cli/test_workspace.py @@ -461,7 +461,7 @@ def test_bulk_add0(self): assert len(ws.mets.find_all_files(ID='//FILE_OCR-D-IMG_000.*')) == 10 assert len(ws.mets.find_all_files(ID='//FILE_.*_000.*')) == 20 assert len(ws.mets.find_all_files(pageId='PHYS_0001')) == 2 - assert ws.mets.find_all_files(ID='FILE_OCR-D-PAGE_0001')[0].local_filename == Path('OCR-D-PAGE/FILE_0001.xml') + assert ws.mets.find_all_files(ID='FILE_OCR-D-PAGE_0001')[0].local_filename == 'OCR-D-PAGE/FILE_0001.xml' def test_bulk_add_missing_param(self): with pushd_popd(tempdir=True) as wsdir: @@ -498,7 +498,7 @@ def test_bulk_add_gen_id(self): ws.reload_mets() print(out) assert next(ws.mets.find_files()).ID == 'b_c' - assert next(ws.mets.find_files()).local_filename == Path('d') + assert next(ws.mets.find_files()).local_filename == 'd' assert next(ws.mets.find_files()).url == 'https://host/b/d' def test_bulk_add_derive_local_filename(self): @@ -517,7 +517,7 @@ def test_bulk_add_derive_local_filename(self): # print('out', out) # print('err', err) ws.reload_mets() - assert next(ws.mets.find_files()).local_filename == Path('srcdir/src.xml') + assert next(ws.mets.find_files()).local_filename == 'srcdir/src.xml' def test_bulk_add_stdin(self): resolver = Resolver() @@ -550,7 +550,7 @@ def test_bulk_add_stdin(self): f = next(ws.mets.find_files()) assert f.mimetype == 'image/png' assert f.ID == 'FILE_0001_BIN.IMG-wolf' - assert f.local_filename == Path('BIN/FILE_0001_BIN.IMG-wolf.png') + assert f.local_filename == 'BIN/FILE_0001_BIN.IMG-wolf.png' assert f.url == 'https://host/FILE_0001_BIN.IMG-wolf/BIN/FILE_0001_BIN.IMG-wolf.png' def test_list_page(self): diff --git a/tests/model/test_ocrd_file.py b/tests/model/test_ocrd_file.py index 98b9fa424..e057be784 100644 --- a/tests/model/test_ocrd_file.py +++ b/tests/model/test_ocrd_file.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- +from pathlib import Path import pytest from tests.base import ( @@ -61,7 +62,7 @@ def test_basename_from_url(): Changed behavior, basename no longer derived from f.url """ f = create_ocrd_file_with_defaults(url="http://foo.bar/quux") - assert f.basename == None + assert f.basename == '' @pytest.mark.parametrize("local_filename,extension", @@ -74,6 +75,8 @@ def test_create_ocrd_file_with_defaults_extension(local_filename, extension): f = create_ocrd_file_with_defaults(local_filename=local_filename) assert f.extension == extension + assert not isinstance(f.local_filename, Path) + assert isinstance(f.local_filename, str) @pytest.mark.parametrize("local_filename,wo_extension", diff --git a/tests/processor/test_ocrd_dummy.py b/tests/processor/test_ocrd_dummy.py index 1f4ecc055..41b585c6b 100644 --- a/tests/processor/test_ocrd_dummy.py +++ b/tests/processor/test_ocrd_dummy.py @@ -3,7 +3,6 @@ from io import BytesIO import os -from pathlib import Path from PIL import Image @@ -33,9 +32,8 @@ def test_copies_ok(self): ) output_files = workspace.mets.find_all_files(fileGrp='OUTPUT') output_files.sort(key=lambda x: x.url) - print([str(s) for s in output_files]) - assert output_files[0].local_filename == Path('OUTPUT/OUTPUT_PHYS_0001.tif') - assert output_files[1].local_filename == Path('OUTPUT/OUTPUT_PHYS_0001.xml') + assert output_files[0].local_filename == 'OUTPUT/OUTPUT_PHYS_0001.tif' + assert output_files[1].local_filename == 'OUTPUT/OUTPUT_PHYS_0001.xml' self.assertEqual(page_from_file(output_files[1]).pcGtsId, output_files[1].ID) assert page_from_file(output_files[1]).get_Page().imageFilename == str(output_files[0].local_filename) self.assertEqual(len(output_files), 6) diff --git a/tests/test_mets_server.py b/tests/test_mets_server.py index 59ee50e08..f61de4baf 100644 --- a/tests/test_mets_server.py +++ b/tests/test_mets_server.py @@ -1,3 +1,5 @@ +from collections.abc import Generator +from typing import Iterable, Tuple from pytest import fixture, raises from tests.base import assets @@ -22,7 +24,7 @@ TRANSPORTS = ['/tmp/ocrd-mets-server.sock', 'http://127.0.0.1:12345'] @fixture(scope='function', name='start_mets_server', params=TRANSPORTS) -def fixture_start_mets_server(request): +def fixture_start_mets_server(request) -> Iterable[Tuple[str, Workspace]]: def _start_mets_server(*args, **kwargs): mets_server = OcrdMetsServer(*args, **kwargs) mets_server.startup() @@ -99,7 +101,6 @@ def test_mets_server_add_file(start_mets_server): 'FOO' ]) assert len(workspace_server.mets.find_all_files(fileGrp='FOO')) == NO_FILES - assert len(workspace_server.mets.find_all_files(file_grp='FOO')) == NO_FILES # not yet synced workspace_file = Workspace(Resolver(), WORKSPACE_DIR) @@ -184,7 +185,7 @@ def test_mets_server_socket_stop(start_mets_server): # make sure the socket file was deleted on shutdown assert not Path(mets_server_url).exists() -def test_find_all_files(start_mets_server): +def test_find_all_files(start_mets_server : Tuple[str, Workspace]): _, workspace_server = start_mets_server assert len(workspace_server.mets.find_all_files()) == 35, '35 files total' assert len(workspace_server.mets.find_all_files(fileGrp='OCR-D-IMG')) == 3, '3 files in "OCR-D-IMG"' @@ -203,7 +204,7 @@ def test_find_all_files(start_mets_server): assert len(workspace_server.mets.find_all_files(pageId='//PHYS_0001,//PHYS_0005')) == 18, '18 files in PHYS_001 and PHYS_0005 (two regexes)' assert len(workspace_server.mets.find_all_files(pageId='//PHYS_0005,PHYS_0001..PHYS_0002')) == 35, '35 files in //PHYS_0005,PHYS_0001..PHYS_0002' -def test_reload(start_mets_server): +def test_reload(start_mets_server : Tuple[str, Workspace]): _, workspace_server = start_mets_server workspace_server_copy = Workspace(Resolver(), workspace_server.directory) assert len(workspace_server.mets.find_all_files()) == 35, '35 files total' diff --git a/tests/test_resolver.py b/tests/test_resolver.py index bf9f4bb72..c162feb21 100644 --- a/tests/test_resolver.py +++ b/tests/test_resolver.py @@ -174,7 +174,7 @@ def test_workspace_from_url0(): # assert assert '%s.tif' % f.ID == 'FILE_0001_IMAGE.tif' - assert f.local_filename == Path('OCR-D-IMG/FILE_0001_IMAGE.tif') + assert f.local_filename == 'OCR-D-IMG/FILE_0001_IMAGE.tif' def test_resolve_image0(): diff --git a/tests/test_workspace.py b/tests/test_workspace.py index 18dd23331..df1818131 100644 --- a/tests/test_workspace.py +++ b/tests/test_workspace.py @@ -76,8 +76,8 @@ def test_workspace_add_file(plain_workspace): assert f.ID == 'ID1' assert f.mimetype == 'image/tiff' assert not f.url - assert f.local_filename == fpath - assert f.local_filename.exists() + assert f.local_filename == str(fpath) + assert Path(f.local_filename).exists() def test_workspace_add_file_overwrite(plain_workspace): @@ -97,7 +97,7 @@ def test_workspace_add_file_overwrite(plain_workspace): assert f.ID == 'ID1' assert f.mimetype == 'image/tiff' assert not f.url - assert f.local_filename == fpath + assert f.local_filename == str(fpath) assert f.pageId == 'phys1' assert fpath.exists() @@ -317,7 +317,7 @@ def test_rename_file_group(tmp_path): # act workspace.rename_file_group('OCR-D-IMG', 'FOOBAR') next_ocrd_file = next(workspace.mets.find_files(ID='OCR-D-GT-SEG-WORD_0001')) - next_ocrd_file.local_filename = tmp_path / relative_name + next_ocrd_file.local_filename = str(tmp_path / relative_name) pcgts_after = page_from_file(next_ocrd_file) # assert @@ -358,12 +358,12 @@ def test_remove_file_group_flat(plain_workspace): """ # act - added_res = plain_workspace.add_file('FOO', file_id='foo', mimetype='foo/bar', local_filename='file.ext', content='foo', page_id=None).url + added_res = plain_workspace.add_file('FOO', file_id='foo', mimetype='foo/bar', local_filename='file.ext', content='foo', page_id=None).local_filename # requires additional prepending of current path because not pushd_popd-magic at work - added_path = Path(join(plain_workspace.directory, added_res)) + added_filename = join(plain_workspace.directory, added_res) # assert - assert added_path.exists() + assert Path(added_filename).exists() plain_workspace.remove_file_group('FOO', recursive=True) @@ -408,8 +408,8 @@ def test_download_to_directory_from_workspace_download_file(plain_workspace): plain_workspace.download_file(f1) plain_workspace.download_file(f2) - assert f1.local_filename == Path('test.tif') - assert f2.local_filename == Path('test.xml') + assert f1.local_filename == 'test.tif' + assert f2.local_filename == 'test.xml' def test_save_image_file_invalid_mimetype_raises_exception(plain_workspace): @@ -652,12 +652,12 @@ def test_merge_no_copy_files(tmp_path): ws1.merge(ws2, copy_files=False, fileId_mapping={'f1': 'f1_copy_files'}) - assert next(ws1.mets.find_files(ID='f1_copy_files')).local_filename == Path('ws2/GRP2/f1') + assert next(ws1.mets.find_files(ID='f1_copy_files')).local_filename == 'ws2/GRP2/f1' with pytest.raises(FileExistsError): ws1.merge(ws2, copy_files=True, fileId_mapping={'f1': 'f1_copy_files'}) ws1.merge(ws2, copy_files=True, fileId_mapping={'f1': 'f1_copy_files'}, force=True) - assert next(ws1.mets.find_files(ID='f1_copy_files')).local_filename == Path('GRP2/f1') + assert next(ws1.mets.find_files(ID='f1_copy_files')).local_filename == 'GRP2/f1' def test_merge_overwrite(tmp_path): # arrange