From 28ca20d2b0770456fb47a6f95dc544786f5693ce Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 21 Dec 2022 16:56:33 +0100 Subject: [PATCH 01/12] restart mets:FLocat handling --- Makefile | 2 +- ocrd/ocrd/workspace.py | 25 ++++---- ocrd_models/ocrd_models/ocrd_file.py | 89 ++++++++++++---------------- repo/spec | 2 +- tests/model/test_ocrd_file.py | 19 ++---- tests/test_workspace.py | 12 +--- 6 files changed, 57 insertions(+), 92 deletions(-) diff --git a/Makefile b/Makefile index 6df7d5fd0..7141c023a 100644 --- a/Makefile +++ b/Makefile @@ -196,7 +196,7 @@ assets: repo/assets .PHONY: test # Run all unit tests test: assets - $(PYTHON) -m pytest --continue-on-collection-errors --durations=10\ + $(PYTHON) -m pytest $(PYTEST_ARGS) --durations=10\ --ignore=$(TESTDIR)/test_logging.py \ --ignore=$(TESTDIR)/test_logging_conf.py \ --ignore-glob="$(TESTDIR)/**/*bench*.py" \ diff --git a/ocrd/ocrd/workspace.py b/ocrd/ocrd/workspace.py index b6752f990..f56f48723 100644 --- a/ocrd/ocrd/workspace.py +++ b/ocrd/ocrd/workspace.py @@ -167,23 +167,22 @@ def download_file(self, f, _recursion_count=0): log.debug('download_file %s [_recursion_count=%s]' % (f, _recursion_count)) with pushd_popd(self.directory): try: - # If the f.url is already a file path, and is within self.directory, do nothing - url_path = Path(f.url).absolute() - if not (url_path.exists() and url_path.relative_to(str(Path(self.directory).resolve()))): - raise Exception("Not already downloaded, moving on") - except Exception as e: + # If the f.local_filename is already a file path, and is within self.directory, do nothing + file_path = Path(f.local_filename).absolute() + if not (file_path.exists() and file_path.relative_to(str(Path(self.directory).resolve()))): + raise FileNotFoundError("Not already downloaded, moving on") + except FileNotFoundError as e: basename = '%s%s' % (f.ID, MIME_TO_EXT.get(f.mimetype, '')) if f.ID else f.basename try: - f.url = self.resolver.download_to_directory(self.directory, f.url, subdir=f.fileGrp, basename=basename) + f.local_filename = self.resolver.download_to_directory(self.directory, f.url, subdir=f.fileGrp, basename=basename) except FileNotFoundError as e: if not self.baseurl: raise Exception("No baseurl defined by workspace. Cannot retrieve '%s'" % f.url) if _recursion_count >= 1: raise FileNotFoundError("Already tried prepending baseurl '%s'. Cannot retrieve '%s'" % (self.baseurl, f.url)) log.debug("First run of resolver.download_to_directory(%s) failed, try prepending baseurl '%s': %s", f.url, self.baseurl, e) - f.url = '%s/%s' % (self.baseurl, f.url) - f.url = self.download_file(f, _recursion_count + 1).local_filename - f.local_filename = f.url + f.local_filename = '%s/%s' % (self.baseurl, f.url) + f.local_filename = self.download_file(f, _recursion_count + 1).local_filename return f def remove_file(self, file_id, force=False, keep_file=False, page_recursive=False, page_same_group=False): @@ -307,16 +306,16 @@ def rename_file_group(self, old, new): url_replacements = {} log.info("Moving files") for mets_file in self.mets.find_files(fileGrp=old, local_only=True): - new_url = old_url = mets_file.url + new_url = old_url = mets_file.local_filename # Directory part new_url = sub(r'^%s/' % old, r'%s/' % new, new_url) # File part new_url = sub(r'/%s' % old, r'/%s' % new, new_url) - url_replacements[mets_file.url] = new_url + url_replacements[mets_file.local_filename] = new_url # move file from ``old`` to ``new`` - move(mets_file.url, new_url) + move(mets_file.local_filename, new_url) # change the url of ``mets:file`` - mets_file.url = new_url + mets_file.local_filename = new_url # change the file ID and update structMap # change the file ID and update structMap new_id = sub(r'^%s' % old, r'%s' % new, mets_file.ID) diff --git a/ocrd_models/ocrd_models/ocrd_file.py b/ocrd_models/ocrd_models/ocrd_file.py index 8b0217659..156e6a64a 100644 --- a/ocrd_models/ocrd_models/ocrd_file.py +++ b/ocrd_models/ocrd_models/ocrd_file.py @@ -13,11 +13,7 @@ class OcrdFile(): Represents a single ``mets:file/mets:FLocat`` (METS file entry). """ - # @staticmethod - # def create(mimetype, ID, url, local_filename): - # el_fileGrp.SubElement('file') - - def __init__(self, el, mimetype=None, pageId=None, loctype='OTHER', local_filename=None, mets=None, url=None, ID=None): + def __init__(self, el, mimetype=None, pageId=None, local_filename=None, mets=None, url=None, ID=None): """ Args: el (LxmlElement): etree Element of the ``mets:file`` this represents. Create new if not provided @@ -25,9 +21,9 @@ def __init__(self, el, mimetype=None, pageId=None, loctype='OTHER', local_filena mets (OcrdMets): Containing :py:class:`ocrd_models.ocrd_mets.OcrdMets`. mimetype (string): ``@MIMETYPE`` of this ``mets:file`` pageId (string): ``@ID`` of the physical ``mets:structMap`` entry corresponding to this ``mets:file`` - loctype (string): ``@LOCTYPE`` of this ``mets:file`` local_filename (string): Local filename - url (string): ``@xlink:href`` of this ``mets:file`` + url (string): original ``@xlink:href`` of this ``mets:file`` + local_filename (string): ``@xlink:href`` pointing to the locally cached version of the file in the workspace ID (string): ``@ID`` of this ``mets:file`` """ if el is None: @@ -37,16 +33,11 @@ def __init__(self, el, mimetype=None, pageId=None, loctype='OTHER', local_filena self.ID = ID self.mimetype = mimetype self.local_filename = local_filename - self.loctype = loctype self.pageId = pageId if url: self.url = url - if not(local_filename): - if self.url and is_local_filename(self.url): - self.local_filename = get_local_filename(self.url) - def __str__(self): """ String representation of this ``mets:file``. @@ -139,46 +130,11 @@ def pageId(self, pageId): self.mets.set_physical_page_for_file(pageId, self) @property - def loctype(self): + def loctypes(self): """ - Get the ``@LOCTYPE`` of the ``mets:file``. - """ - el_FLocat = self._el.find('mets:FLocat', NS) - return '' if el_FLocat is None else el_FLocat.get('LOCTYPE') - - @loctype.setter - def loctype(self, loctype): + Get the ``@LOCTYPE``s of the ``mets:file``. """ - Set the ``@LOCTYPE`` of the ``mets:file`` to :py:attr:`loctype`. - """ - if loctype is None: - return - loctype = loctype.upper() - el_FLocat = self._el.find('mets:FLocat', NS) - if el_FLocat is None: - el_FLocat = ET.SubElement(self._el, TAG_METS_FLOCAT) - el_FLocat.set('LOCTYPE', loctype) - if loctype == 'OTHER': - self.otherloctype = 'FILE' - else: - self.otherloctype = None - - @property - def otherloctype(self): - el_FLocat = self._el.find('mets:FLocat', NS) - return '' if el_FLocat is None else el_FLocat.get('OTHERLOCTYPE') - - @otherloctype.setter - def otherloctype(self, otherloctype): - el_FLocat = self._el.find('mets:FLocat', NS) - if el_FLocat is None: - el_FLocat = ET.SubElement(self._el, TAG_METS_FLOCAT) - if not otherloctype: - if 'OTHERLOCTYPE' in el_FLocat.attrib: - del el_FLocat.attrib['OTHERLOCTYPE'] - else: - el_FLocat.set('LOCTYPE', 'OTHER') - el_FLocat.set('OTHERLOCTYPE', otherloctype) + return [x.get('LOCTYPE') for x in self._el.findall('mets:FLocat', NS)] @property def mimetype(self): @@ -211,7 +167,7 @@ def url(self): """ Get the ``@xlink:href`` of this ``mets:file``. """ - el_FLocat = self._el.find(TAG_METS_FLOCAT) + el_FLocat = self._el.find('mets:FLocat[@LOCTYPE="URL"]', NS) if el_FLocat is not None: return el_FLocat.get("{%s}href" % NS["xlink"]) return '' @@ -221,9 +177,38 @@ def url(self, url): """ Set the ``@xlink:href`` of this ``mets:file`` to :py:attr:`url`. """ + el_FLocat = self._el.find('mets:FLocat[@LOCTYPE="URL"]', NS) if url is None: + if el_FLocat: + self._el.remove(el_FLocat) return - el_FLocat = self._el.find('mets:FLocat', NS) if el_FLocat is None: el_FLocat = ET.SubElement(self._el, TAG_METS_FLOCAT) el_FLocat.set("{%s}href" % NS["xlink"], url) + el_FLocat.set("LOCTYPE", "URL") + + @property + def local_filename(self): + """ + Get the ``@xlink:href`` of this ``mets:file``. + """ + el_FLocat = self._el.find('mets:FLocat[@LOCTYPE="OTHER"][@OTHERLOCTYPE="FILE"]', NS) + if el_FLocat is not None: + return el_FLocat.get("{%s}href" % NS["xlink"]) + return '' + + @local_filename.setter + def local_filename(self, fname): + """ + Set the ``@xlink:href`` of this ``mets:file`` to :py:attr:`local_filename`. + """ + el_FLocat = self._el.find('mets:FLocat[@LOCTYPE="URL"][@OTHERLOCTYPE="FILE"]', NS) + if fname is None: + if el_FLocat: + self._el.remove(el_FLocat) + return + if el_FLocat is None: + el_FLocat = ET.SubElement(self._el, TAG_METS_FLOCAT) + el_FLocat.set("{%s}href" % NS["xlink"], fname) + el_FLocat.set("LOCTYPE", "OTHER") + el_FLocat.set("OTHERLOCTYPE", "FILE") diff --git a/repo/spec b/repo/spec index 506b33936..1194310c1 160000 --- a/repo/spec +++ b/repo/spec @@ -1 +1 @@ -Subproject commit 506b33936d89080a683fa8a26837f2a23b23e5e2 +Subproject commit 1194310c18d90b280c380bdc3cb04adb6a41120f diff --git a/tests/model/test_ocrd_file.py b/tests/model/test_ocrd_file.py index cdaaffcfa..b73185d3c 100644 --- a/tests/model/test_ocrd_file.py +++ b/tests/model/test_ocrd_file.py @@ -26,18 +26,9 @@ def test_ocrd_file_without_filegrp(): assert "set fileGrp" in str(val_err.value) -def test_set_loctype(): - f = create_ocrd_file_with_defaults() - assert f.loctype == 'OTHER' - assert f.otherloctype == 'FILE' - f.otherloctype = 'foo' - assert f.otherloctype == 'foo' - f.loctype = 'URN' - assert f.loctype == 'URN' - assert f.otherloctype == None - f.otherloctype = 'foo' - assert f.loctype, 'OTHER' - +def test_get_loctypes(): + f = create_ocrd_file_with_defaults(local_filename='foo', url='bar') + assert f.loctypes == ['OTHER', 'URL'] def test_set_url(): f = create_ocrd_file_with_defaults() @@ -50,8 +41,7 @@ def test_set_url(): def test_constructor_url(): f = create_ocrd_file_with_defaults(url="foo") assert f.url == 'foo' - assert f.local_filename == 'foo' - + assert f.local_filename == '' def test_set_id_none(): f = create_ocrd_file_with_defaults() @@ -120,7 +110,6 @@ def test_ocrd_file_equality(): f5 = mets.add_file('TEMP', ID='TEMP_1', mimetype='image/tiff') assert f3 == f5 - def test_fptr_changed_for_change_id(): mets = OcrdMets.empty_mets() f1 = mets.add_file('FOO', ID='FOO_1', mimetype='image/tiff', pageId='p0001') diff --git a/tests/test_workspace.py b/tests/test_workspace.py index 5a47f3365..2f57274ef 100644 --- a/tests/test_workspace.py +++ b/tests/test_workspace.py @@ -105,10 +105,7 @@ def test_workspace_add_file_overwrite(plain_workspace): def test_workspace_add_file_basename_no_content(plain_workspace): plain_workspace.add_file('GRP', file_id='ID1', mimetype='image/tiff', page_id=None) f = next(plain_workspace.mets.find_files()) - - # assert - assert f.url == None - + assert f.url == '' def test_workspace_add_file_binary_content(plain_workspace): fpath = join(plain_workspace.directory, 'subdir', 'ID1.tif') @@ -164,15 +161,10 @@ def _url_to_file(the_path): def test_download_very_self_file(plain_workspace): - - # arrange with some dummy stuff the_file = _url_to_file(abspath(__file__)) - - # act fn = plain_workspace.download_file(the_file) - - # assert assert fn, join('DEPRECATED', basename(__file__)) + assert fn == the_file.local_filename def test_download_url_without_baseurl_raises_exception(tmp_path): From 143ac6c8a5231cea9a10e94a2f16d2c478c2e67d Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Fri, 18 Aug 2023 18:00:20 +0200 Subject: [PATCH 02/12] update assets to loctype-file branch --- repo/assets | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/repo/assets b/repo/assets index bcfa982e8..4c7396455 160000 --- a/repo/assets +++ b/repo/assets @@ -1 +1 @@ -Subproject commit bcfa982e81319513a13ae58ab2c216e014e52bd7 +Subproject commit 4c7396455ebc0e7734d2408ca736ef642c4dd32a From 4f0ab46ead465e2c0c5c666c9e35bb2e22bf2aa9 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Fri, 18 Aug 2023 18:34:56 +0200 Subject: [PATCH 03/12] flocat: fix OcrdMets.find_files --- Makefile | 6 +++++- ocrd_models/ocrd_models/ocrd_file.py | 15 +++++++------ ocrd_models/ocrd_models/ocrd_mets.py | 32 ++++++++++++++++++---------- ocrd_utils/ocrd_utils/__init__.py | 6 +++++- ocrd_utils/ocrd_utils/deprecate.py | 10 +++++---- ocrd_utils/ocrd_utils/str.py | 2 ++ tests/model/test_ocrd_mets.py | 3 ++- 7 files changed, 50 insertions(+), 24 deletions(-) diff --git a/Makefile b/Makefile index 7141c023a..9ebb2e47f 100644 --- a/Makefile +++ b/Makefile @@ -195,9 +195,13 @@ assets: repo/assets .PHONY: test # Run all unit tests +# XXX TODO need to fix this before merge, silencing the deprecation warnings about PEP420 and ignoring the long-running bashlib test test: assets - $(PYTHON) -m pytest $(PYTEST_ARGS) --durations=10\ + $(PYTHON) \ + -W 'ignore:Deprecated call to `pkg_r:DeprecationWarning' \ + -m pytest $(PYTEST_ARGS) --durations=10\ --ignore=$(TESTDIR)/test_logging.py \ + --ignore=$(TESTDIR)/cli/test_bashlib.py \ --ignore=$(TESTDIR)/test_logging_conf.py \ --ignore-glob="$(TESTDIR)/**/*bench*.py" \ $(TESTDIR) diff --git a/ocrd_models/ocrd_models/ocrd_file.py b/ocrd_models/ocrd_models/ocrd_file.py index 156e6a64a..9a6b0a210 100644 --- a/ocrd_models/ocrd_models/ocrd_file.py +++ b/ocrd_models/ocrd_models/ocrd_file.py @@ -3,7 +3,7 @@ """ from os.path import splitext, basename -from ocrd_utils import is_local_filename, get_local_filename, MIME_TO_EXT, EXT_TO_MIME +from ocrd_utils import deprecation_warning from .ocrd_xml_base import ET from .constants import NAMESPACES as NS, TAG_METS_FLOCAT, TAG_METS_FILE @@ -13,7 +13,7 @@ class OcrdFile(): Represents a single ``mets:file/mets:FLocat`` (METS file entry). """ - def __init__(self, el, mimetype=None, pageId=None, local_filename=None, mets=None, url=None, ID=None): + def __init__(self, el, mimetype=None, pageId=None, local_filename=None, mets=None, url=None, ID=None, loctype=None): """ Args: el (LxmlElement): etree Element of the ``mets:file`` this represents. Create new if not provided @@ -25,9 +25,12 @@ def __init__(self, el, mimetype=None, pageId=None, local_filename=None, mets=Non url (string): original ``@xlink:href`` of this ``mets:file`` local_filename (string): ``@xlink:href`` pointing to the locally cached version of the file in the workspace ID (string): ``@ID`` of this ``mets:file`` + loctype (string): DEPRECATED do not use """ if el is None: raise ValueError("Must provide mets:file element this OcrdFile represents") + if loctype: + deprecation_warning("'loctype' is not supported in OcrdFile anymore, use 'url' or 'local_filename'") self._el = el self.mets = mets self.ID = ID @@ -165,7 +168,7 @@ def fileGrp(self): @property def url(self): """ - Get the ``@xlink:href`` of this ``mets:file``. + Get the remote/original URL ``@xlink:href`` of this ``mets:file``. """ el_FLocat = self._el.find('mets:FLocat[@LOCTYPE="URL"]', NS) if el_FLocat is not None: @@ -175,7 +178,7 @@ def url(self): @url.setter def url(self, url): """ - Set the ``@xlink:href`` of this ``mets:file`` to :py:attr:`url`. + Set the remote/original URL ``@xlink:href`` of this ``mets:file`` to :py:attr:`url`. """ el_FLocat = self._el.find('mets:FLocat[@LOCTYPE="URL"]', NS) if url is None: @@ -190,7 +193,7 @@ def url(self, url): @property def local_filename(self): """ - Get the ``@xlink:href`` of this ``mets:file``. + Get the local/cached ``@xlink:href`` of this ``mets:file``. """ el_FLocat = self._el.find('mets:FLocat[@LOCTYPE="OTHER"][@OTHERLOCTYPE="FILE"]', NS) if el_FLocat is not None: @@ -200,7 +203,7 @@ def local_filename(self): @local_filename.setter def local_filename(self, fname): """ - Set the ``@xlink:href`` of this ``mets:file`` to :py:attr:`local_filename`. + Set the local/cached ``@xlink:href`` of this ``mets:file`` to :py:attr:`local_filename`. """ el_FLocat = self._el.find('mets:FLocat[@LOCTYPE="URL"][@OTHERLOCTYPE="FILE"]', NS) if fname is None: diff --git a/ocrd_models/ocrd_models/ocrd_mets.py b/ocrd_models/ocrd_models/ocrd_mets.py index dddf3f556..54dc9b4a4 100644 --- a/ocrd_models/ocrd_models/ocrd_mets.py +++ b/ocrd_models/ocrd_models/ocrd_mets.py @@ -9,8 +9,8 @@ from copy import deepcopy from ocrd_utils import ( - is_local_filename, getLogger, + deprecation_warning, generate_range, VERSION, REGEX_PREFIX, @@ -242,7 +242,7 @@ def find_all_files(self, *args, **kwargs): return list(self.find_files(*args, **kwargs)) # pylint: disable=multiple-statements - def find_files(self, ID=None, fileGrp=None, pageId=None, mimetype=None, url=None, local_only=False): + def find_files(self, ID=None, fileGrp=None, pageId=None, mimetype=None, url=None, local_filename=None, local_only=False): """ Search ``mets:file`` entries in this METS document and yield results. The :py:attr:`ID`, :py:attr:`pageId`, :py:attr:`fileGrp`, @@ -259,7 +259,8 @@ def find_files(self, ID=None, fileGrp=None, pageId=None, mimetype=None, url=None ID (string) : ``@ID`` of the ``mets:file`` fileGrp (string) : ``@USE`` of the ``mets:fileGrp`` to list files of pageId (string) : ``@ID`` of the corresponding physical ``mets:structMap`` entry (physical page) - url (string) : ``@xlink:href`` (URL or path) of ``mets:Flocat`` of ``mets:file`` + url (string) : ``@xlink:href`` remote/original URL of ``mets:Flocat`` of ``mets:file`` + local_filename (string) : ``@xlink:href`` local/cached filename of ``mets:Flocat`` of ``mets:file`` mimetype (string) : ``@MIMETYPE`` of ``mets:file`` local (boolean) : Whether to restrict results to local files in the filesystem Yields: @@ -331,7 +332,7 @@ def find_files(self, ID=None, fileGrp=None, pageId=None, mimetype=None, url=None if not mimetype.fullmatch(cand.get('MIMETYPE') or ''): continue if url: - cand_locat = cand.find('mets:FLocat', namespaces=NS) + cand_locat = cand.find('mets:FLocat[@LOCTYPE="URL"]', namespaces=NS) if cand_locat is None: continue cand_url = cand_locat.get('{%s}href' % NS['xlink']) @@ -340,14 +341,23 @@ def find_files(self, ID=None, fileGrp=None, pageId=None, mimetype=None, url=None else: if not url.fullmatch(cand_url): continue - # Note: why we instantiate a class only to find out that the local_only is set afterwards - # Checking local_only and url before instantiation should be better? - f = OcrdFile(cand, mets=self, loctype=cand.get('LOCTYPE')) + if local_filename: + cand_locat = cand.find('mets:FLocat[@LOCTYPE="OTHER"][@OTHERLOCTYPE="FILE"]', namespaces=NS) + if cand_locat is None: + continue + cand_local_filename = cand_locat.get('{%s}href' % NS['xlink']) + if isinstance(local_filename, str): + if cand_local_filename != local_filename: continue + else: + if not local_filename.fullmatch(cand_local_filename): continue - # If only local resources should be returned and f is not a file path: skip the file - if local_only and not is_local_filename(f.url): - continue - yield f + if local_only: + deprecation_warning("'local_only' is deprecated, use 'local_filename=\"//.+\"' instead") + is_local = cand.find('mets:FLocat[@LOCTYPE="OTHER"][@OTHERLOCTYPE="FILE"][@xlink:href]', namespaces=NS) + if is_local is None: + continue + + yield OcrdFile(cand, mets=self) def add_file_group(self, fileGrp): """ diff --git a/ocrd_utils/ocrd_utils/__init__.py b/ocrd_utils/ocrd_utils/__init__.py index 319bc852b..115fb8043 100644 --- a/ocrd_utils/ocrd_utils/__init__.py +++ b/ocrd_utils/ocrd_utils/__init__.py @@ -99,6 +99,8 @@ Exports of :py:mod:`ocrd_utils.logging` * :py:func:`deprecated_alias` +* :py:func:`rename_kwargs` +* :py:func:`deprecation_warning` Decorator to mark a kwarg as deprecated """ @@ -118,7 +120,9 @@ ) from .deprecate import ( - deprecated_alias) + deprecated_alias, + rename_kwargs, + deprecation_warning) from .image import ( adjust_canvas_to_rotation, diff --git a/ocrd_utils/ocrd_utils/deprecate.py b/ocrd_utils/ocrd_utils/deprecate.py index 6688fde5f..d17efeb58 100644 --- a/ocrd_utils/ocrd_utils/deprecate.py +++ b/ocrd_utils/ocrd_utils/deprecate.py @@ -1,10 +1,8 @@ import functools import warnings -""" -https://stackoverflow.com/questions/49802412/how-to-implement-deprecation-in-python-with-argument-alias -by user2357112 supports Monica -""" +def deprecation_warning(msg, stacklevel=2): + warnings.warn(msg, DeprecationWarning, stacklevel) def deprecated_alias(**aliases): """ @@ -19,6 +17,10 @@ def wrapper(*args, **kwargs): return deco def rename_kwargs(func_name, kwargs, aliases): + """ + https://stackoverflow.com/questions/49802412/how-to-implement-deprecation-in-python-with-argument-alias + by user2357112 supports Monica + """ for alias, new in aliases.items(): if alias in kwargs: if new in kwargs: diff --git a/ocrd_utils/ocrd_utils/str.py b/ocrd_utils/ocrd_utils/str.py index 7211699f0..241f7527e 100644 --- a/ocrd_utils/ocrd_utils/str.py +++ b/ocrd_utils/ocrd_utils/str.py @@ -5,6 +5,7 @@ import re import json from .constants import REGEX_FILE_ID +from .deprecate import deprecation_warning __all__ = [ 'assert_file_grp_cardinality', @@ -137,6 +138,7 @@ def is_local_filename(url): """ Whether a url is a local filename. """ + deprecation_warning("Deprecated so we spot inconsistent URL/file handling") return url.startswith('file://') or not('://' in url) def is_string(val): diff --git a/tests/model/test_ocrd_mets.py b/tests/model/test_ocrd_mets.py index ad72a6ebe..17ab0a8ed 100644 --- a/tests/model/test_ocrd_mets.py +++ b/tests/model/test_ocrd_mets.py @@ -85,7 +85,8 @@ def test_find_all_files(sbb_sample_01): assert len(sbb_sample_01.find_all_files(mimetype='image/tiff')) == 13, '13 image/tiff' assert len(sbb_sample_01.find_all_files(mimetype='//application/.*')) == 22, '22 application/.*' assert len(sbb_sample_01.find_all_files(mimetype=MIMETYPE_PAGE)) == 20, '20 ' + MIMETYPE_PAGE - assert len(sbb_sample_01.find_all_files(url='OCR-D-IMG/FILE_0005_IMAGE.tif')) == 1, '1 xlink:href="OCR-D-IMG/FILE_0005_IMAGE.tif"' + assert len(sbb_sample_01.find_all_files(local_filename='OCR-D-IMG/FILE_0005_IMAGE.tif')) == 1, '1 FILE xlink:href="OCR-D-IMG/FILE_0005_IMAGE.tif"' + assert len(sbb_sample_01.find_all_files(url='https://github.com/OCR-D/assets/raw/master/data/SBB0000F29300010000/00000001_DESKEW.tif')) == 1, '1 URL xlink:href="https://github.com/OCR-D/assets/raw/master/data/SBB0000F29300010000/00000001_DESKEW.tif"' assert len(sbb_sample_01.find_all_files(pageId='PHYS_0001..PHYS_0005')) == 35, '35 files for page "PHYS_0001..PHYS_0005"' assert len(sbb_sample_01.find_all_files(pageId='//PHYS_000(1|2)')) == 34, '34 files in PHYS_001 and PHYS_0002' assert len(sbb_sample_01.find_all_files(pageId='//PHYS_0001,//PHYS_0005')) == 18, '18 files in PHYS_001 and PHYS_0005 (two regexes)' From 58bb31b52c716a774288b07ec559a5dd96f919f3 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Fri, 18 Aug 2023 19:50:45 +0200 Subject: [PATCH 04/12] OcrdFile: return local_filename as pathlib.Path --- ocrd_models/ocrd_models/ocrd_file.py | 33 ++++++++++++++-------------- tests/model/test_ocrd_file.py | 7 ++++-- 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/ocrd_models/ocrd_models/ocrd_file.py b/ocrd_models/ocrd_models/ocrd_file.py index 9a6b0a210..8daa0681b 100644 --- a/ocrd_models/ocrd_models/ocrd_file.py +++ b/ocrd_models/ocrd_models/ocrd_file.py @@ -2,6 +2,7 @@ API to ``mets:file`` """ from os.path import splitext, basename +from pathlib import Path from ocrd_utils import deprecation_warning @@ -23,7 +24,7 @@ def __init__(self, el, mimetype=None, pageId=None, local_filename=None, mets=Non pageId (string): ``@ID`` of the physical ``mets:structMap`` entry corresponding to this ``mets:file`` local_filename (string): Local filename url (string): original ``@xlink:href`` of this ``mets:file`` - local_filename (string): ``@xlink:href`` pointing to the locally cached version of the file in the workspace + local_filename (Path): ``@xlink:href`` pointing to the locally cached version of the file in the workspace ID (string): ``@ID`` of this ``mets:file`` loctype (string): DEPRECATED do not use """ @@ -35,9 +36,10 @@ def __init__(self, el, mimetype=None, pageId=None, local_filename=None, mets=Non self.mets = mets self.ID = ID self.mimetype = mimetype - self.local_filename = local_filename self.pageId = pageId + if local_filename: + self.local_filename = Path(local_filename) if url: self.url = url @@ -51,7 +53,7 @@ def __str__(self): # ]) # return 'OcrdFile[' + '\n\t' + props + '\n\t]' props = ', '.join([ - '='.join([k, getattr(self, k) if getattr(self, k) else '---']) + '='.join([k, str(getattr(self, k)) if getattr(self, k) else '---']) for k in ['ID', 'mimetype', 'url', 'local_filename'] ]) try: @@ -69,26 +71,26 @@ def __eq__(self, other): @property def basename(self): """ - Get the ``os.path.basename`` of the local file, if any. + Get the ``.name`` of the local file """ - return basename(self.local_filename if self.local_filename else self.url) + if not self.local_filename: + return + return self.local_filename.name @property def extension(self): - _basename, ext = splitext(self.basename) - if _basename.endswith('.tar'): - ext = ".tar" + ext - return ext + if not self.local_filename: + return + return ''.join(self.local_filename.suffixes) @property def basename_without_extension(self): """ Get the ``os.path.basename`` of the local file, if any, with extension removed. """ - ret = self.basename.rsplit('.', 1)[0] - if ret.endswith('.tar'): - ret = ret[0:len(ret)-4] - return ret + if not self.local_filename: + return + return self.local_filename.name[:-len(self.extension)] @property def ID(self): @@ -197,8 +199,7 @@ def local_filename(self): """ el_FLocat = self._el.find('mets:FLocat[@LOCTYPE="OTHER"][@OTHERLOCTYPE="FILE"]', NS) if el_FLocat is not None: - return el_FLocat.get("{%s}href" % NS["xlink"]) - return '' + return Path(el_FLocat.get("{%s}href" % NS["xlink"])) @local_filename.setter def local_filename(self, fname): @@ -212,6 +213,6 @@ def local_filename(self, fname): return if el_FLocat is None: el_FLocat = ET.SubElement(self._el, TAG_METS_FLOCAT) - el_FLocat.set("{%s}href" % NS["xlink"], fname) + el_FLocat.set("{%s}href" % NS["xlink"], str(fname)) el_FLocat.set("LOCTYPE", "OTHER") el_FLocat.set("OTHERLOCTYPE", "FILE") diff --git a/tests/model/test_ocrd_file.py b/tests/model/test_ocrd_file.py index b73185d3c..98b9fa424 100644 --- a/tests/model/test_ocrd_file.py +++ b/tests/model/test_ocrd_file.py @@ -41,7 +41,7 @@ def test_set_url(): def test_constructor_url(): f = create_ocrd_file_with_defaults(url="foo") assert f.url == 'foo' - assert f.local_filename == '' + assert f.local_filename == None def test_set_id_none(): f = create_ocrd_file_with_defaults() @@ -57,8 +57,11 @@ def test_basename(): def test_basename_from_url(): + """ + Changed behavior, basename no longer derived from f.url + """ f = create_ocrd_file_with_defaults(url="http://foo.bar/quux") - assert f.basename == 'quux' + assert f.basename == None @pytest.mark.parametrize("local_filename,extension", From 6e7f534273f73f4237f21c93caee30fba766f458 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Sat, 19 Aug 2023 16:04:29 +0200 Subject: [PATCH 05/12] revise download mechanisms in workspace and resolver --- ocrd/ocrd/resolver.py | 40 +++++++++++++++++++++------------------ ocrd/ocrd/workspace.py | 43 ++++++++++++++++++++++++++---------------- tests/test_resolver.py | 11 ++++++----- 3 files changed, 55 insertions(+), 39 deletions(-) diff --git a/ocrd/ocrd/resolver.py b/ocrd/ocrd/resolver.py index 1293e43ea..c385ddf02 100644 --- a/ocrd/ocrd/resolver.py +++ b/ocrd/ocrd/resolver.py @@ -28,24 +28,29 @@ class Resolver(): def download_to_directory(self, directory, url, basename=None, if_exists='skip', subdir=None, retries=None, timeout=None): """ - Download a file to a directory. + Download a URL ``url`` to a local file in ``directory``. - Early Shortcut: If `url` is a local file and that file is already in the directory, keep it there. + If ``url`` looks like a file path, check whether that exists. + If it does exist and is within ``directory` already, return early. + If it does exist but is outside of ``directory``. copy it. + If ``url` does not appear to be a file path, try downloading via HTTP, retrying ``retries`` times with timeout ``timeout`` between calls. - If `basename` is not given but subdir is, assume user knows what she's doing and - use last URL segment as the basename. + If ``basename`` is not given but ``subdir`` is, set ``basename`` to the last path segment of ``url``. - If `basename` is not given and no subdir is given, use the alnum characters in the URL as the basename. + If the target file already exists within ``directory``, behavior depends on ``if_exists``: + - ``skip`` (default): do nothing and return early. Note that this + - ``overwrite``: overwrite the existing file + - ``raise``: raise a ``FileExistsError`` Args: directory (string): Directory to download files to url (string): URL to download from Keyword Args: - basename (string, None): basename part of the filename on disk. + basename (string, None): basename part of the filename on disk. Defaults to last path segment of ``url`` if unset. if_exists (string, "skip"): What to do if target file already exists. One of ``skip`` (default), ``overwrite`` or ``raise`` - subdir (string, None): Subdirectory to create within the directory. Think ``mets:fileGrp``. + subdir (string, None): Subdirectory to create within the directory. Think ``mets:fileGrp[@USE]``. retries (int, None): Number of retries to attempt on network failure. timeout (tuple, None): Timeout in seconds for establishing a connection and reading next chunk of data. @@ -56,17 +61,16 @@ def download_to_directory(self, directory, url, basename=None, if_exists='skip', log.debug("directory=|%s| url=|%s| basename=|%s| if_exists=|%s| subdir=|%s|", directory, url, basename, if_exists, subdir) if not url: - raise Exception("'url' must be a string") + raise ValueError(f"'url' must be a non-empty string, not '{url}'") if not directory: - raise Exception("'directory' must be a string") # actually Path would also work + raise ValueError(f"'directory' must be a non-empty string, not '{url}'") # actually Path would also work directory = Path(directory) directory.mkdir(parents=True, exist_ok=True) - directory = str(directory.resolve()) subdir_path = Path(subdir if subdir else '') basename_path = Path(basename if basename else nth_url_segment(url)) - ret = str(Path(subdir_path, basename_path)) + ret = Path(subdir_path, basename_path) dst_path = Path(directory, ret) # log.info("\n\tdst_path='%s \n\turl=%s", dst_path, url) @@ -80,32 +84,33 @@ def download_to_directory(self, directory, url, basename=None, if_exists='skip', src_path = None if is_local_filename(url): try: - # XXX this raises FNFE in Python 3.5 if src_path doesn't exist but not 3.6+ src_path = Path(get_local_filename(url)).resolve() except FileNotFoundError as e: log.error("Failed to resolve URL locally: %s --> '%s' which does not exist" % (url, src_path)) raise e if not src_path.exists(): - raise FileNotFoundError("File path passed as 'url' to download_to_directory does not exist: %s" % url) + raise FileNotFoundError(f"File path passed as 'url' to download_to_directory does not exist: '{url}") if src_path == dst_path: log.debug("Stop early, src_path and dst_path are the same: '%s' (url: '%s')" % (src_path, url)) - return ret + return str(ret) # Respect 'if_exists' arg if dst_path.exists(): if if_exists == 'skip': - return ret + return str(ret) if if_exists == 'raise': - raise FileExistsError("File already exists and if_exists == 'raise': %s" % (dst_path)) + raise FileExistsError(f"File already exists and if_exists == 'raise': {dst_path}") # Create dst_path parent dir dst_path.parent.mkdir(parents=True, exist_ok=True) # Copy files or download remote assets if src_path: + # src_path set, so it is a file source, we can copy directly log.debug("Copying file '%s' to '%s'", src_path, dst_path) dst_path.write_bytes(src_path.read_bytes()) else: + # src_path not set, it's an http URL, try to download log.debug("Downloading URL '%s' to '%s'", url, dst_path) if 'OCRD_DOWNLOAD_RETRIES' in environ: retries = retries or int(environ['OCRD_DOWNLOAD_RETRIES']) @@ -146,7 +151,7 @@ def download_to_directory(self, directory, url, basename=None, if_exists='skip', contents = handle_oai_response(response) dst_path.write_bytes(contents) - return ret + return str(ret) def workspace_from_url(self, mets_url, dst_dir=None, clobber_mets=False, mets_basename=None, download=False, src_baseurl=None): """ @@ -202,7 +207,6 @@ def workspace_from_url(self, mets_url, dst_dir=None, clobber_mets=False, mets_ba log.debug("workspace_from_url\nmets_basename='%s'\nmets_url='%s'\nsrc_baseurl='%s'\ndst_dir='%s'", mets_basename, mets_url, src_baseurl, dst_dir) - self.download_to_directory(dst_dir, mets_url, basename=mets_basename, if_exists='overwrite' if clobber_mets else 'skip') workspace = Workspace(self, dst_dir, mets_basename=mets_basename, baseurl=src_baseurl) diff --git a/ocrd/ocrd/workspace.py b/ocrd/ocrd/workspace.py index f56f48723..5ec1571cd 100644 --- a/ocrd/ocrd/workspace.py +++ b/ocrd/ocrd/workspace.py @@ -166,23 +166,34 @@ def download_file(self, f, _recursion_count=0): log = getLogger('ocrd.workspace.download_file') log.debug('download_file %s [_recursion_count=%s]' % (f, _recursion_count)) with pushd_popd(self.directory): - try: - # If the f.local_filename is already a file path, and is within self.directory, do nothing + if f.local_filename: file_path = Path(f.local_filename).absolute() - if not (file_path.exists() and file_path.relative_to(str(Path(self.directory).resolve()))): - raise FileNotFoundError("Not already downloaded, moving on") - except FileNotFoundError as e: + if file_path.exists(): + if file_path.relative_to(Path(self.directory).resolve()): + # If the f.local_filename exists and is within self.directory, nothing to do + log.info(f"'local_filename' {f.local_filename} already within {self.directory}, nothing to do") + return f + # f.local_filename exists, but not within self.directory, copy it + log.info("Copying 'local_filename' %s to workspace directory %s" % (f.local_filename, self.directory)) + f.local_filename = self.resolver.download_to_directory(self.directory, f.local_filename, subdir=f.fileGrp) + return f + else: + if f.url: + log.info("OcrdFile has 'local_filename' but it doesn't resolve, try to download from set 'url' %s", f.url) + elif self.baseurl: + log.info("OcrdFile has 'local_filename' but it doesn't resolve and no 'url', concatenate 'baseurl' %s and 'local_filename' %s", + self.baseurl, f.local_filename) + f.url = '%s/%s' % (self.baseurl, f.local_filename) + else: + raise FileNotFoundError(f"'local_filename' {f.local_filename} points to non-existing file," + "and no 'url' to download and no 'baseurl' set on workspace, nothing we can do.") + if f.url: + # If f.url is set, download the file to the workspace basename = '%s%s' % (f.ID, MIME_TO_EXT.get(f.mimetype, '')) if f.ID else f.basename - try: - f.local_filename = self.resolver.download_to_directory(self.directory, f.url, subdir=f.fileGrp, basename=basename) - except FileNotFoundError as e: - if not self.baseurl: - raise Exception("No baseurl defined by workspace. Cannot retrieve '%s'" % f.url) - if _recursion_count >= 1: - raise FileNotFoundError("Already tried prepending baseurl '%s'. Cannot retrieve '%s'" % (self.baseurl, f.url)) - log.debug("First run of resolver.download_to_directory(%s) failed, try prepending baseurl '%s': %s", f.url, self.baseurl, e) - f.local_filename = '%s/%s' % (self.baseurl, f.url) - f.local_filename = self.download_file(f, _recursion_count + 1).local_filename + f.local_filename = self.resolver.download_to_directory(self.directory, f.url, subdir=f.fileGrp, basename=basename) + else: + # If neither f.local_filename nor f.url is set, fail + raise ValueError("OcrdFile {f} has neither 'url' nor 'local_filename', so cannot be downloaded") return f def remove_file(self, file_id, force=False, keep_file=False, page_recursive=False, page_same_group=False): @@ -459,7 +470,7 @@ def _resolve_image_as_pil(self, image_url, coords=None): log = getLogger('ocrd.workspace._resolve_image_as_pil') with pushd_popd(self.directory): try: - f = next(self.mets.find_files(url=image_url)) + f = next(self.mets.find_files(local_filename=image_url)) pil_image = Image.open(self.download_file(f).local_filename) except StopIteration: with download_temporary_file(image_url) as f: diff --git a/tests/test_resolver.py b/tests/test_resolver.py index 32e64411a..84503edfe 100644 --- a/tests/test_resolver.py +++ b/tests/test_resolver.py @@ -181,9 +181,10 @@ def test_resolve_image0(): workspace = Resolver().workspace_from_url(METS_HEROLD) input_files = workspace.mets.find_all_files(fileGrp='OCR-D-IMG') f = input_files[0] - img_pil1 = workspace._resolve_image_as_pil(f.url) + print(f) + img_pil1 = workspace._resolve_image_as_pil(f.local_filename) assert img_pil1.size == (2875, 3749) - img_pil2 = workspace._resolve_image_as_pil(f.url, [[0, 0], [1, 1]]) + img_pil2 = workspace._resolve_image_as_pil(f.local_filename, [[0, 0], [1, 1]]) assert img_pil2.size == (1, 1) @@ -237,12 +238,12 @@ def test_workspace_from_nothing_noclobber(tmp_path): @pytest.mark.parametrize("url,basename,exc_msg", - [(None, None, "'url' must be a string"), - (None, 'foo', "'directory' must be a string")] + [(None, None, "'url' must be a non-empty string"), + (None, 'foo', "'directory' must be a non-empty string")] ) def test_download_to_directory_with_badargs(url, basename, exc_msg): - with pytest.raises(Exception) as exc: + with pytest.raises(ValueError) as exc: Resolver().download_to_directory(url, basename) # assert exception message contained From de514523dd87e5b1b14598864ec3c24a1a45546c Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Sat, 19 Aug 2023 18:38:55 +0200 Subject: [PATCH 06/12] continue to properly separate local_filename and url --- Makefile | 5 +- ocrd/ocrd/resolver.py | 17 +-- ocrd/ocrd/workspace.py | 116 +++++++++--------- .../ocrd_modelfactory/__init__.py | 2 +- ocrd_models/ocrd_models/ocrd_file.py | 7 +- ocrd_models/ocrd_models/ocrd_mets.py | 9 +- ocrd_utils/ocrd_utils/str.py | 6 +- .../ocrd_validators/workspace_validator.py | 6 +- tests/test_decorators.py | 2 +- tests/test_resolver.py | 2 +- tests/test_workspace.py | 73 +++++------ tests/validator/test_workspace_bagger.py | 2 +- 12 files changed, 129 insertions(+), 118 deletions(-) diff --git a/Makefile b/Makefile index 9ebb2e47f..efeb314e3 100644 --- a/Makefile +++ b/Makefile @@ -195,13 +195,14 @@ assets: repo/assets .PHONY: test # Run all unit tests -# XXX TODO need to fix this before merge, silencing the deprecation warnings about PEP420 and ignoring the long-running bashlib test +# XXX TODO need to fix this before merge, silencing the deprecation warnings about PEP420 and ignoring the long-running bashlib/resmgr test + #-W 'ignore:Deprecated call to `pkg_r:DeprecationWarning' test: assets $(PYTHON) \ - -W 'ignore:Deprecated call to `pkg_r:DeprecationWarning' \ -m pytest $(PYTEST_ARGS) --durations=10\ --ignore=$(TESTDIR)/test_logging.py \ --ignore=$(TESTDIR)/cli/test_bashlib.py \ + --ignore=$(TESTDIR)/test_resource_manager.py \ --ignore=$(TESTDIR)/test_logging_conf.py \ --ignore-glob="$(TESTDIR)/**/*bench*.py" \ $(TESTDIR) diff --git a/ocrd/ocrd/resolver.py b/ocrd/ocrd/resolver.py index c385ddf02..5e4c8f23a 100644 --- a/ocrd/ocrd/resolver.py +++ b/ocrd/ocrd/resolver.py @@ -61,10 +61,11 @@ def download_to_directory(self, directory, url, basename=None, if_exists='skip', log.debug("directory=|%s| url=|%s| basename=|%s| if_exists=|%s| subdir=|%s|", directory, url, basename, if_exists, subdir) if not url: - raise ValueError(f"'url' must be a non-empty string, not '{url}'") + raise ValueError(f"'url' must be a non-empty string, not '{url}'") # actually Path also ok if not directory: raise ValueError(f"'directory' must be a non-empty string, not '{url}'") # actually Path would also work + url = str(url) directory = Path(directory) directory.mkdir(parents=True, exist_ok=True) @@ -73,13 +74,13 @@ def download_to_directory(self, directory, url, basename=None, if_exists='skip', ret = Path(subdir_path, basename_path) dst_path = Path(directory, ret) - # log.info("\n\tdst_path='%s \n\turl=%s", dst_path, url) - # print('url=%s', url) - # print('directory=%s', directory) - # print('subdir_path=%s', subdir_path) - # print('basename_path=%s', basename_path) - # print('ret=%s', ret) - # print('dst_path=%s', dst_path) + # log.info("\n\tdst_path='%s \n\turl=%s", dst_path, url) + # print(f'>>> url={url}') + # print(f'>>> directory={directory}') + # print(f'>>> subdir_path={subdir_path}') + # print(f'>>> basename_path={basename_path}') + # print(f'>>> dst_path={dst_path}') + # print(f'>>> ret={ret}') src_path = None if is_local_filename(url): diff --git a/ocrd/ocrd/workspace.py b/ocrd/ocrd/workspace.py index 5ec1571cd..9ccb8ce99 100644 --- a/ocrd/ocrd/workspace.py +++ b/ocrd/ocrd/workspace.py @@ -114,18 +114,21 @@ def merge(self, other_workspace, copy_files=True, overwrite=False, **kwargs): """ def after_add_cb(f): """callback to run on merged OcrdFile instances in the destination""" + print(f) + if not f.local_filename: + # OcrdFile has no local_filename, so nothing to be copied + return if not copy_files: fpath_src = Path(other_workspace.directory).resolve() fpath_dst = Path(self.directory).resolve() dstprefix = fpath_src.relative_to(fpath_dst) # raises ValueError if not a subpath - if is_local_filename(f.url): - f.url = str(Path(dstprefix, f.url)) + f.local_filename = dstprefix / f.local_filename return - fpath_src = Path(other_workspace.directory, f.url) - fpath_dest = Path(self.directory, f.url) + fpath_src = Path(other_workspace.directory, f.local_filename) + fpath_dest = Path(self.directory, f.local_filename) if fpath_src.exists(): if fpath_dest.exists() and not overwrite: - raise Exception("Copying %s to %s would overwrite the latter" % (fpath_src, fpath_dest)) + raise FileExistsError("Copying %s to %s would overwrite the latter" % (fpath_src, fpath_dest)) if not fpath_dest.parent.is_dir(): makedirs(str(fpath_dest.parent)) with open(str(fpath_src), 'rb') as fstream_in, open(str(fpath_dest), 'wb') as fstream_out: @@ -164,28 +167,27 @@ def download_file(self, f, _recursion_count=0): Download a :py:class:`ocrd_models.ocrd_file.OcrdFile` to the workspace. """ log = getLogger('ocrd.workspace.download_file') - log.debug('download_file %s [_recursion_count=%s]' % (f, _recursion_count)) with pushd_popd(self.directory): if f.local_filename: file_path = Path(f.local_filename).absolute() if file_path.exists(): - if file_path.relative_to(Path(self.directory).resolve()): + try: + file_path.relative_to(Path(self.directory).resolve()) # raises ValueError if not relative # If the f.local_filename exists and is within self.directory, nothing to do log.info(f"'local_filename' {f.local_filename} already within {self.directory}, nothing to do") - return f - # f.local_filename exists, but not within self.directory, copy it - log.info("Copying 'local_filename' %s to workspace directory %s" % (f.local_filename, self.directory)) - f.local_filename = self.resolver.download_to_directory(self.directory, f.local_filename, subdir=f.fileGrp) + except ValueError: + # f.local_filename exists, but not within self.directory, copy it + log.info("Copying 'local_filename' %s to workspace directory %s" % (f.local_filename, self.directory)) + f.local_filename = self.resolver.download_to_directory(self.directory, f.local_filename, subdir=f.fileGrp) return f + if f.url: + log.info("OcrdFile has 'local_filename' but it doesn't resolve, try to download from set 'url' %s", f.url) + elif self.baseurl: + log.info("OcrdFile has 'local_filename' but it doesn't resolve and no 'url', concatenate 'baseurl' %s and 'local_filename' %s", + self.baseurl, f.local_filename) + f.url = '%s/%s' % (self.baseurl, f.local_filename) else: - if f.url: - log.info("OcrdFile has 'local_filename' but it doesn't resolve, try to download from set 'url' %s", f.url) - elif self.baseurl: - log.info("OcrdFile has 'local_filename' but it doesn't resolve and no 'url', concatenate 'baseurl' %s and 'local_filename' %s", - self.baseurl, f.local_filename) - f.url = '%s/%s' % (self.baseurl, f.local_filename) - else: - raise FileNotFoundError(f"'local_filename' {f.local_filename} points to non-existing file," + raise FileNotFoundError(f"'local_filename' {f.local_filename} points to non-existing file," "and no 'url' to download and no 'baseurl' set on workspace, nothing we can do.") if f.url: # If f.url is set, download the file to the workspace @@ -213,7 +215,7 @@ def remove_file(self, file_id, force=False, keep_file=False, page_recursive=Fals """ log = getLogger('ocrd.workspace.remove_file') log.debug('Deleting mets:file %s', file_id) - if not force and self.overwrite_mode: + if self.overwrite_mode: force = True if isinstance(file_id, OcrdFile): file_id = file_id.ID @@ -229,7 +231,7 @@ def remove_file(self, file_id, force=False, keep_file=False, page_recursive=Fals with pushd_popd(self.directory): ocrd_page = parse(self.download_file(ocrd_file).local_filename, silence=True) for img_url in ocrd_page.get_AllAlternativeImagePaths(): - img_kwargs = {'url': img_url} + img_kwargs = {'local_filename': img_url} if page_same_group: img_kwargs['fileGrp'] = ocrd_file.fileGrp for img_file in self.mets.find_files(**img_kwargs): @@ -305,34 +307,34 @@ def rename_file_group(self, old, new): log = getLogger('ocrd.workspace.rename_file_group') if old not in self.mets.file_groups: - raise ValueError("No such fileGrp: %s" % old) + raise ValueError(f"No such fileGrp: {old}") if new in self.mets.file_groups: - raise ValueError("fileGrp already exists %s" % new) + raise ValueError(f"fileGrp already exists {new}") with pushd_popd(self.directory): # create workspace dir ``new`` log.info("mkdir %s" % new) if not Path(new).is_dir(): Path(new).mkdir() - url_replacements = {} + local_filename_replacements = {} log.info("Moving files") for mets_file in self.mets.find_files(fileGrp=old, local_only=True): - new_url = old_url = mets_file.local_filename + new_local_filename = old_local_filename = str(mets_file.local_filename) # Directory part - new_url = sub(r'^%s/' % old, r'%s/' % new, new_url) + new_local_filename = sub(r'^%s/' % old, r'%s/' % new, new_local_filename) # File part - new_url = sub(r'/%s' % old, r'/%s' % new, new_url) - url_replacements[mets_file.local_filename] = new_url + new_local_filename = sub(r'/%s' % old, r'/%s' % new, new_local_filename) + local_filename_replacements[str(mets_file.local_filename)] = new_local_filename # move file from ``old`` to ``new`` - move(mets_file.local_filename, new_url) + mets_file.local_filename.rename(new_local_filename) # change the url of ``mets:file`` - mets_file.local_filename = new_url + mets_file.local_filename = new_local_filename # change the file ID and update structMap # change the file ID and update structMap new_id = sub(r'^%s' % old, r'%s' % new, mets_file.ID) try: next(self.mets.find_files(ID=new_id)) - log.warning("ID %s already exists, not changing ID while renaming %s -> %s" % (new_id, old_url, new_url)) + log.warning("ID %s already exists, not changing ID while renaming %s -> %s" % (new_id, old_local_filename, new_local_filename)) except StopIteration: mets_file.ID = new_id # change file paths in PAGE-XML imageFilename and filename attributes @@ -340,17 +342,17 @@ def rename_file_group(self, old, new): log.info("Renaming file references in PAGE-XML %s" % page_file) pcgts = page_from_file(page_file) changed = False - for old_url, new_url in url_replacements.items(): - if pcgts.get_Page().imageFilename == old_url: + for old_local_filename, new_local_filename in local_filename_replacements.items(): + if pcgts.get_Page().imageFilename == old_local_filename: changed = True - log.info("Rename pc:Page/@imageFilename: %s -> %s" % (old_url, new_url)) - pcgts.get_Page().imageFilename = new_url + log.info("Rename pc:Page/@imageFilename: %s -> %s" % (old_local_filename, new_local_filename)) + pcgts.get_Page().imageFilename = new_local_filename for ai in pcgts.get_Page().get_AllAlternativeImages(): - for old_url, new_url in url_replacements.items(): - if ai.filename == old_url: + for old_local_filename, new_local_filename in local_filename_replacements.items(): + if ai.filename == old_local_filename: changed = True - log.info("Rename pc:Page/../AlternativeImage: %s -> %s" % (old_url, new_url)) - ai.filename = new_url + log.info("Rename pc:Page/../AlternativeImage: %s -> %s" % (old_local_filename, new_local_filename)) + ai.filename = new_local_filename if changed: log.info("PAGE-XML changed, writing %s" % (page_file.local_filename)) with open(page_file.local_filename, 'w', encoding='utf-8') as f: @@ -393,11 +395,9 @@ def add_file(self, file_grp, content=None, **kwargs): with pushd_popd(self.directory): if kwargs.get('local_filename'): # If the local filename has folder components, create those folders - local_filename_dir = kwargs['local_filename'].rsplit('/', 1)[0] - if local_filename_dir != kwargs['local_filename'] and not Path(local_filename_dir).is_dir(): + local_filename_dir = str(kwargs['local_filename']).rsplit('/', 1)[0] + if local_filename_dir != str(kwargs['local_filename']) and not Path(local_filename_dir).is_dir(): makedirs(local_filename_dir) - if 'url' not in kwargs: - kwargs['url'] = kwargs['local_filename'] # print(kwargs) kwargs["pageId"] = kwargs.pop("page_id") @@ -437,15 +437,17 @@ def resolve_image_exif(self, image_url): """ if not image_url: # avoid "finding" just any file - raise Exception("Cannot resolve empty image path") + raise ValueError(f"'image_url' must be a non-empty string, not '{image_url}' ({type(image_url)})") try: - f = next(self.mets.find_files(url=image_url)) - image_filename = self.download_file(f).local_filename - ocrd_exif = exif_from_filename(image_filename) + f = next(self.mets.find_files(local_filename=str(image_url))) + return exif_from_filename(f.local_filename) except StopIteration: - with download_temporary_file(image_url) as f: - ocrd_exif = exif_from_filename(f.name) - return ocrd_exif + try: + f = next(self.mets.find_files(url=str(image_url))) + return exif_from_filename(self.download_file(f).local_filename) + except StopIteration: + with download_temporary_file(image_url) as f: + return exif_from_filename(f.name) @deprecated(version='1.0.0', reason="Use workspace.image_from_page and workspace.image_from_segment") def resolve_image_as_pil(self, image_url, coords=None): @@ -470,11 +472,15 @@ def _resolve_image_as_pil(self, image_url, coords=None): log = getLogger('ocrd.workspace._resolve_image_as_pil') with pushd_popd(self.directory): try: - f = next(self.mets.find_files(local_filename=image_url)) - pil_image = Image.open(self.download_file(f).local_filename) + f = next(self.mets.find_files(local_filename=str(image_url))) + pil_image = Image.open(f.local_filename) except StopIteration: - with download_temporary_file(image_url) as f: - pil_image = Image.open(f.name) + try: + f = next(self.mets.find_files(url=str(image_url))) + pil_image = Image.open(self.download_file(f).local_filename) + except StopIteration: + with download_temporary_file(image_url) as f: + pil_image = Image.open(f.name) pil_image.load() # alloc and give up the FD # Pillow does not properly support higher color depths @@ -1035,7 +1041,7 @@ def save_image_file(self, image, The (absolute) path of the created file. """ log = getLogger('ocrd.workspace.save_image_file') - if not force and self.overwrite_mode: + if self.overwrite_mode: force = True image_bytes = io.BytesIO() image.save(image_bytes, format=MIME_TO_PIL[mimetype]) diff --git a/ocrd_modelfactory/ocrd_modelfactory/__init__.py b/ocrd_modelfactory/ocrd_modelfactory/__init__.py index 9f7863c4d..2ec9398ec 100644 --- a/ocrd_modelfactory/ocrd_modelfactory/__init__.py +++ b/ocrd_modelfactory/ocrd_modelfactory/__init__.py @@ -67,7 +67,7 @@ def page_from_image(input_file, with_tree=False): imageWidth=exif.width, imageHeight=exif.height, # XXX brittle - imageFilename=input_file.url if input_file.url is not None else input_file.local_filename + imageFilename=str(input_file.local_filename) if input_file.local_filename else input_file.url ), pcGtsId=input_file.ID ) diff --git a/ocrd_models/ocrd_models/ocrd_file.py b/ocrd_models/ocrd_models/ocrd_file.py index 8daa0681b..b44524875 100644 --- a/ocrd_models/ocrd_models/ocrd_file.py +++ b/ocrd_models/ocrd_models/ocrd_file.py @@ -63,8 +63,9 @@ def __str__(self): return ' ' % (fileGrp, props) def __eq__(self, other): - return self.ID == other.ID # and \ - # self.url == other.url and \ + return self.ID == other.ID \ + and self.url == other.url \ + and self.local_filename == other.local_filename # EXT_TO_MIME[MIME_TO_EXT[self.mimetype]] == EXT_TO_MIME[MIME_TO_EXT[other.mimetype]] and \ # self.fileGrp == other.fileGrp @@ -206,7 +207,7 @@ def local_filename(self, fname): """ Set the local/cached ``@xlink:href`` of this ``mets:file`` to :py:attr:`local_filename`. """ - el_FLocat = self._el.find('mets:FLocat[@LOCTYPE="URL"][@OTHERLOCTYPE="FILE"]', NS) + el_FLocat = self._el.find('mets:FLocat[@LOCTYPE="OTHER"][@OTHERLOCTYPE="FILE"]', NS) if fname is None: if el_FLocat: self._el.remove(el_FLocat) diff --git a/ocrd_models/ocrd_models/ocrd_mets.py b/ocrd_models/ocrd_models/ocrd_mets.py index 54dc9b4a4..9e4b32e64 100644 --- a/ocrd_models/ocrd_models/ocrd_mets.py +++ b/ocrd_models/ocrd_models/ocrd_mets.py @@ -352,7 +352,7 @@ def find_files(self, ID=None, fileGrp=None, pageId=None, mimetype=None, url=None if not local_filename.fullmatch(cand_local_filename): continue if local_only: - deprecation_warning("'local_only' is deprecated, use 'local_filename=\"//.+\"' instead") + # deprecation_warning("'local_only' is deprecated, use 'local_filename=\"//.+\"' instead") is_local = cand.find('mets:FLocat[@LOCTYPE="OTHER"][@OTHERLOCTYPE="FILE"][@xlink:href]', namespaces=NS) if is_local is None: continue @@ -366,7 +366,7 @@ def add_file_group(self, fileGrp): fileGrp (string): ``@USE`` of the new ``mets:fileGrp``. """ if ',' in fileGrp: - raise Exception('fileGrp must not contain commas') + raise ValueError('fileGrp must not contain commas') el_fileSec = self._tree.getroot().find('mets:fileSec', NS) if el_fileSec is None: el_fileSec = ET.SubElement(self._tree.getroot(), TAG_METS_FILESEC) @@ -374,11 +374,11 @@ def add_file_group(self, fileGrp): if el_fileGrp is None: el_fileGrp = ET.SubElement(el_fileSec, TAG_METS_FILEGRP) el_fileGrp.set('USE', fileGrp) - + if self._cache_flag: # Assign an empty dictionary that will hold the files of the added fileGrp self._file_cache[fileGrp] = {} - + return el_fileGrp def rename_file_group(self, old, new): @@ -760,6 +760,7 @@ def merge(self, other_mets, force=False, fileGrp_mapping=None, fileId_mapping=No fileGrp_mapping.get(f_src.fileGrp, f_src.fileGrp), mimetype=f_src.mimetype, url=f_src.url, + local_filename=f_src.local_filename, ID=fileId_mapping.get(f_src.ID, f_src.ID), pageId=pageId_mapping.get(f_src.pageId, f_src.pageId), force=force) diff --git a/ocrd_utils/ocrd_utils/str.py b/ocrd_utils/ocrd_utils/str.py index 241f7527e..3c044e35b 100644 --- a/ocrd_utils/ocrd_utils/str.py +++ b/ocrd_utils/ocrd_utils/str.py @@ -120,7 +120,7 @@ def get_local_filename(url, start=None): start (string): Base path to remove from filename. Raise an exception if not a prefix of url """ if url.startswith('https://') or url.startswith('http:'): - raise Exception("Can't determine local filename of http(s) URL") + raise ValueError("Can't determine local filename of http(s) URL") if url.startswith('file://'): url = url[len('file://'):] # Goobi/Kitodo produces those, they are always absolute @@ -128,7 +128,7 @@ def get_local_filename(url, start=None): url = url[len('file:'):] if start: if not url.startswith(start): - raise Exception("Cannot remove prefix %s from url %s" % (start, url)) + raise ValueError("Cannot remove prefix %s from url %s" % (start, url)) if not start.endswith('/'): start += '/' url = url[len(start):] @@ -138,7 +138,7 @@ def is_local_filename(url): """ Whether a url is a local filename. """ - deprecation_warning("Deprecated so we spot inconsistent URL/file handling") + # deprecation_warning("Deprecated so we spot inconsistent URL/file handling") return url.startswith('file://') or not('://' in url) def is_string(val): diff --git a/ocrd_validators/ocrd_validators/workspace_validator.py b/ocrd_validators/ocrd_validators/workspace_validator.py index 5a408b2f9..f53bb0e41 100644 --- a/ocrd_validators/ocrd_validators/workspace_validator.py +++ b/ocrd_validators/ocrd_validators/workspace_validator.py @@ -288,10 +288,10 @@ def _validate_mets_files(self): for f in self.mets.find_files(): if f._el.get('GROUPID'): # pylint: disable=protected-access self.report.add_notice("File '%s' has GROUPID attribute - document might need an update" % f.ID) - if not f.url: - self.report.add_error("File '%s' has no mets:Flocat/@xlink:href" % f.ID) + if not f.url and not f.local_filename: + self.report.add_error("File '%s' has neither mets:Flocat[@LOCTYPE='URL']/@xlink:href nor mets:FLocat[@LOCTYPE='OTHER'][@OTHERLOCTYPE='FILE']/xlink:href" % f.ID) continue - if 'url' not in self.skip and ':/' in f.url: + if f.url and 'url' not in self.skip: if re.match(r'^file:/[^/]', f.url): self.report.add_error("File '%s' has an invalid (Java-specific) file URL '%s'" % (f.ID, f.url)) scheme = f.url[0:f.url.index(':')] diff --git a/tests/test_decorators.py b/tests/test_decorators.py index cd8085f8a..4f4f3ecc0 100644 --- a/tests/test_decorators.py +++ b/tests/test_decorators.py @@ -210,4 +210,4 @@ def test_parameter_override_wo_param(self): if __name__ == '__main__': - main() + main(__name__) diff --git a/tests/test_resolver.py b/tests/test_resolver.py index 84503edfe..4fb65439c 100644 --- a/tests/test_resolver.py +++ b/tests/test_resolver.py @@ -174,7 +174,7 @@ def test_workspace_from_url0(): # assert assert '%s.tif' % f.ID == 'FILE_0001_IMAGE.tif' - assert f.local_filename == 'OCR-D-IMG/FILE_0001_IMAGE.tif' + assert f.local_filename == Path('OCR-D-IMG/FILE_0001_IMAGE.tif') def test_resolve_image0(): diff --git a/tests/test_workspace.py b/tests/test_workspace.py index 2f57274ef..fecb9f5e3 100644 --- a/tests/test_workspace.py +++ b/tests/test_workspace.py @@ -59,7 +59,7 @@ def _fixture_plain_workspace(tmp_path): chdir(prev_dir) def test_workspace_add_file(plain_workspace): - fpath = str(plain_workspace.directory / 'ID1.tif') + fpath = plain_workspace.directory / 'ID1.tif' # act plain_workspace.add_file( @@ -75,31 +75,31 @@ def test_workspace_add_file(plain_workspace): # assert assert f.ID == 'ID1' assert f.mimetype == 'image/tiff' - assert f.url == fpath + assert not f.url assert f.local_filename == fpath - assert exists(fpath) + assert f.local_filename.exists() def test_workspace_add_file_overwrite(plain_workspace): - fpath = str(plain_workspace.directory / 'ID1.tif') + fpath = plain_workspace.directory / 'ID1.tif' # act - plain_workspace.add_file('GRP', ID='ID1', mimetype='image/tiff', content='CONTENT', pageId='phys1', local_filename=fpath) + plain_workspace.add_file('GRP', file_id='ID1', mimetype='image/tiff', content='CONTENT', page_id='phys1', local_filename=fpath) with pytest.raises(FileExistsError) as fn_exc: - plain_workspace.add_file('GRP', ID='ID1', mimetype='image/tiff', content='CONTENT', pageId=None, local_filename=fpath) - assert str(fn_exc.value) == "File with ID='ID1' already exists" + plain_workspace.add_file('GRP', file_id='ID1', mimetype='image/tiff', content='CONTENT', page_id=None, local_filename=fpath) + assert str(fn_exc.value) == "File with file_id='ID1' already exists" with pytest.raises(FileExistsError) as fn_exc: - plain_workspace.add_file('GRP', ID='ID1', mimetype='image/tiff', content='CONTENT', pageId='phys2', local_filename=fpath, force=True) + plain_workspace.add_file('GRP', file_id='ID1', mimetype='image/tiff', content='CONTENT', page_id='phys2', local_filename=fpath, force=True) assert 'cannot mitigate' in str(fn_exc.value) - plain_workspace.add_file('GRP', ID='ID1', mimetype='image/tiff', content='CONTENT2', pageId='phys1', local_filename=fpath, force=True) + plain_workspace.add_file('GRP', file_id='ID1', mimetype='image/tiff', content='CONTENT2', page_id='phys1', local_filename=fpath, force=True) f = plain_workspace.mets.find_all_files()[0] assert f.ID == 'ID1' assert f.mimetype == 'image/tiff' - assert f.url == fpath + assert not f.url assert f.local_filename == fpath assert f.pageId == 'phys1' - assert exists(fpath) + assert fpath.exists() def test_workspace_add_file_basename_no_content(plain_workspace): @@ -157,14 +157,14 @@ def test_workspace_backup(plain_workspace): def _url_to_file(the_path): dummy_mets = OcrdMets.empty_mets() dummy_url = abspath(the_path) - return dummy_mets.add_file('DEPRECATED', ID=Path(dummy_url).name, url=dummy_url) + return dummy_mets.add_file('TESTGRP', ID=Path(dummy_url).name, url=dummy_url) def test_download_very_self_file(plain_workspace): the_file = _url_to_file(abspath(__file__)) fn = plain_workspace.download_file(the_file) - assert fn, join('DEPRECATED', basename(__file__)) - assert fn == the_file.local_filename + assert fn, join('TESTGRP', basename(__file__)) + assert fn == the_file def test_download_url_without_baseurl_raises_exception(tmp_path): @@ -179,7 +179,7 @@ def test_download_url_without_baseurl_raises_exception(tmp_path): ws1.download_file(the_file) # assert exception message contents - assert "Already tried prepending baseurl '%s'" % str(tmp_path) in str(exc.value) + assert "File path passed as 'url' to download_to_directory does not exist:" in str(exc.value) def test_download_url_with_baseurl(tmp_path): @@ -200,7 +200,8 @@ def test_download_url_with_baseurl(tmp_path): f = Path(ws1.download_file(the_file).local_filename) # assert - assert str(f).endswith(join('OCR-D-IMG', '%s.tif' % SAMPLE_FILE_ID)) + assert f.name == f'{SAMPLE_FILE_ID}.tif' + assert f.parent.name == 'TESTGRP' assert Path(ws1.directory, f).exists() @@ -298,8 +299,8 @@ def test_remove_file_remote(plain_workspace): def test_rename_file_group(tmp_path): # arrange - copytree(assets.path_to('kant_aufklaerung_1784-page-region-line-word_glyph/data'), str(tmp_path)) - workspace = Workspace(Resolver(), directory=str(tmp_path)) + copytree(assets.path_to('kant_aufklaerung_1784-page-region-line-word_glyph/data'), tmp_path) + workspace = Workspace(Resolver(), directory=tmp_path) # before act # TODO clear semantics @@ -308,7 +309,7 @@ def test_rename_file_group(tmp_path): # which can be achieved with pushd_popd functionalities ocrd_file = next(workspace.mets.find_files(ID='OCR-D-GT-SEG-WORD_0001')) relative_name = ocrd_file.local_filename - ocrd_file.local_filename = join(tmp_path, relative_name) + ocrd_file.local_filename = tmp_path / relative_name pcgts_before = page_from_file(ocrd_file) # before assert assert pcgts_before.get_Page().imageFilename == 'OCR-D-IMG/INPUT_0017.tif' @@ -316,7 +317,7 @@ def test_rename_file_group(tmp_path): # act workspace.rename_file_group('OCR-D-IMG', 'FOOBAR') next_ocrd_file = next(workspace.mets.find_files(ID='OCR-D-GT-SEG-WORD_0001')) - next_ocrd_file.local_filename = join(tmp_path, relative_name) + next_ocrd_file.local_filename = tmp_path / relative_name pcgts_after = page_from_file(next_ocrd_file) # assert @@ -372,7 +373,7 @@ def _fixture_kant_complex(tmp_path): yield Workspace(Resolver, directory=tmp_path) -def test_remove_file_page_recursive(kant_complex_workspace): +def test_remove_file_page_recursive0(kant_complex_workspace): assert len(kant_complex_workspace.mets.find_all_files()) == 119 kant_complex_workspace.remove_file('OCR-D-OCR-OCRO-fraktur-SEG-LINE-tesseract-ocropy-DEWARP_0001', page_recursive=True, page_same_group=False, keep_file=True) assert len(kant_complex_workspace.mets.find_all_files()) == 83 @@ -400,15 +401,15 @@ def test_download_to_directory_from_workspace_download_file(plain_workspace): f1 = plain_workspace.add_file('IMG', file_id='page1_img', mimetype='image/tiff', local_filename='test.tif', content='', page_id=None) f2 = plain_workspace.add_file('GT', file_id='page1_gt', mimetype='text/xml', local_filename='test.xml', content='', page_id=None) - assert f1.url == 'test.tif' - assert f2.url == 'test.xml' + assert not f1.url + assert not f2.url # these should be no-ops plain_workspace.download_file(f1) plain_workspace.download_file(f2) - assert f1.url == 'test.tif' - assert f2.url == 'test.xml' + assert f1.local_filename == Path('test.tif') + assert f2.local_filename == Path('test.xml') def test_save_image_file_invalid_mimetype_raises_exception(plain_workspace): @@ -502,8 +503,8 @@ def _fixture_workspace_sample_features(tmp_path): def test_image_feature_selectoro(workspace_sample_features): # arrange - with open(join(str(workspace_sample_features.directory), 'image_features.page.xml'), 'r') as f: - pcgts = parseString(f.read().encode('utf8')) + with open(Path(workspace_sample_features.directory) / 'image_features.page.xml', 'r', encoding='utf-8') as f: + pcgts = parseString(f.read().encode('utf-8')) # richest feature set is not last: _, info, _ = workspace_sample_features.image_from_page(pcgts.get_Page(), page_id='page1', feature_selector='dewarped') @@ -586,13 +587,13 @@ def test_deskewing(plain_workspace): def test_downsample_16bit_image(plain_workspace): # arrange image - img_path = join(plain_workspace.directory, '16bit.tif') - with gzip_open(join(dirname(__file__), 'data/OCR-D-IMG_APBB_Mitteilungen_62.0002.tif.gz'), 'rb') as gzip_in: + img_path = Path(plain_workspace.directory, '16bit.tif') + with gzip_open(Path(__file__).parent / 'data/OCR-D-IMG_APBB_Mitteilungen_62.0002.tif.gz', 'rb') as gzip_in: with open(img_path, 'wb') as tif_out: tif_out.write(gzip_in.read()) # act - plain_workspace.add_file('IMG', file_id='foo', url=img_path, mimetype='image/tiff', page_id=None) + plain_workspace.add_file('IMG', file_id='foo', local_filename=img_path, mimetype='image/tiff', page_id=None) # assert pil_before = Image.open(img_path) @@ -612,7 +613,7 @@ def test_mets_permissions(plain_workspace): assert filemode(stat(mets_path).st_mode) == '-rwxrwxrwx' -def test_merge(tmp_path): +def test_merge0(tmp_path): # arrange dst_path1 = tmp_path / 'kant_aufklaerung' @@ -647,16 +648,16 @@ def test_merge_no_copy_files(tmp_path): ws1 = Resolver().workspace_from_nothing(directory=dst_path1) ws2 = Resolver().workspace_from_nothing(directory=dst_path2) - ws2.add_file('GRP2', pageId='p01', mimetype='text/plain', ID='f1', local_filename='GRP2/f1', content='ws2') + ws2.add_file('GRP2', page_id='p01', mimetype='text/plain', file_id='f1', local_filename='GRP2/f1', content='ws2') ws1.merge(ws2, copy_files=False, fileId_mapping={'f1': 'f1_copy_files'}) - assert next(ws1.mets.find_files(ID='f1_copy_files')).url == 'ws2/GRP2/f1' + assert next(ws1.mets.find_files(ID='f1_copy_files')).local_filename == Path('ws2/GRP2/f1') with pytest.raises(FileExistsError): ws1.merge(ws2, copy_files=True, fileId_mapping={'f1': 'f1_copy_files'}) ws1.merge(ws2, copy_files=True, fileId_mapping={'f1': 'f1_copy_files'}, force=True) - assert next(ws1.mets.find_files(ID='f1_copy_files')).url == 'GRP2/f1' + assert next(ws1.mets.find_files(ID='f1_copy_files')).local_filename == Path('GRP2/f1') def test_merge_overwrite(tmp_path): # arrange @@ -669,8 +670,8 @@ def test_merge_overwrite(tmp_path): ws2 = Resolver().workspace_from_nothing(directory=dst_path2) with pytest.raises(Exception) as exc: - ws1.add_file('X', pageId='X', mimetype='X', ID='id123', local_filename='X/X', content='ws1') - ws2.add_file('X', pageId='X', mimetype='X', ID='id456', local_filename='X/X', content='ws2') + ws1.add_file('X', page_id='X', mimetype='X', file_id='id123', local_filename='X/X', content='ws1') + ws2.add_file('X', page_id='X', mimetype='X', file_id='id456', local_filename='X/X', content='ws2') ws1.merge(ws2) assert "would overwrite" == str(exc.value) diff --git a/tests/validator/test_workspace_bagger.py b/tests/validator/test_workspace_bagger.py index fc3b03503..a8300b447 100644 --- a/tests/validator/test_workspace_bagger.py +++ b/tests/validator/test_workspace_bagger.py @@ -183,4 +183,4 @@ def test_recreate_checksums_zipped(self): if __name__ == '__main__': - main() + main(__name__) From 40fb94ede514e5e9c90b2f3b7dd5dfe83734499b Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Sat, 19 Aug 2023 21:25:27 +0200 Subject: [PATCH 07/12] fix workspace bagger --- ocrd/ocrd/workspace.py | 1 + ocrd/ocrd/workspace_bagger.py | 50 +++++++++++------------- ocrd_models/ocrd_models/ocrd_file.py | 4 +- tests/validator/test_workspace_bagger.py | 2 +- 4 files changed, 27 insertions(+), 30 deletions(-) diff --git a/ocrd/ocrd/workspace.py b/ocrd/ocrd/workspace.py index 9ccb8ce99..98089515f 100644 --- a/ocrd/ocrd/workspace.py +++ b/ocrd/ocrd/workspace.py @@ -168,6 +168,7 @@ def download_file(self, f, _recursion_count=0): """ log = getLogger('ocrd.workspace.download_file') with pushd_popd(self.directory): + print(f) if f.local_filename: file_path = Path(f.local_filename).absolute() if file_path.exists(): diff --git a/ocrd/ocrd/workspace_bagger.py b/ocrd/ocrd/workspace_bagger.py index 447f33d13..4c6129d89 100644 --- a/ocrd/ocrd/workspace_bagger.py +++ b/ocrd/ocrd/workspace_bagger.py @@ -14,6 +14,7 @@ from ocrd_utils import ( pushd_popd, getLogger, + MIME_TO_EXT, is_local_filename, unzip_file_to_dir, @@ -57,31 +58,28 @@ def _log_or_raise(self, msg): def _bag_mets_files(self, workspace, bagdir, ocrd_mets, processes): mets = workspace.mets - changed_urls = {} + changed_local_filenames = {} log = getLogger('ocrd.workspace_bagger') # TODO allow filtering by fileGrp@USE and such with pushd_popd(workspace.directory): - # URLs of the files before changing + # local_filenames of the files before changing for f in mets.find_files(): - log.info("Resolving %s", f.url) - if is_local_filename(f.url): - # nothing to do then - pass - elif not f.url.startswith('http'): - self._log_or_raise("Not an http URL: %s" % f.url) - continue - log.info("Resolved %s", f.url) - - file_grp_dir = join(bagdir, 'data', f.fileGrp) - if not isdir(file_grp_dir): - makedirs(file_grp_dir) - - _basename = "%s%s" % (f.ID, f.extension) - _relpath = join(f.fileGrp, _basename) - self.resolver.download_to_directory(file_grp_dir, f.url, basename=_basename) - changed_urls[f.url] = _relpath - f.url = _relpath + log.info("Handling OcrdFile %s", f) + + file_grp_dir = Path(bagdir, 'data', f.fileGrp) + if not file_grp_dir.is_dir(): + file_grp_dir.mkdir() + + if f.local_filename: + _relpath = join(f.fileGrp, f.basename) + self.resolver.download_to_directory(file_grp_dir, f.local_filename, basename=f.basename) + changed_local_filenames[str(f.local_filename)] = _relpath + else: + _relpath = join(f.fileGrp, f"{f.ID}{MIME_TO_EXT[f.mimetype]}") + self.resolver.download_to_directory(file_grp_dir, f.url, basename=f.basename) + changed_local_filenames[f.url] = _relpath + f.local_filename = _relpath # save mets.xml with open(join(bagdir, 'data', ocrd_mets), 'wb') as f: @@ -95,21 +93,19 @@ def _bag_mets_files(self, workspace, bagdir, ocrd_mets, processes): for page_file in bag_workspace.mets.find_files(mimetype=MIMETYPE_PAGE): pcgts = page_from_file(page_file) changed = False - # page_doc.set(imageFileName - # for old, new in changed_urls: - for old, new in changed_urls.items(): + for old, new in changed_local_filenames.items(): if pcgts.get_Page().imageFilename == old: pcgts.get_Page().imageFilename = new changed = True # TODO replace AlternativeImage, recursively... if changed: - with open(page_file.url, 'w') as out: + with open(page_file.local_filename, 'w') as out: out.write(to_xml(pcgts)) # log.info("Replace %s -> %s in %s" % (old, new, page_file)) - chdir(bagdir) - total_bytes, total_files = make_manifests('data', processes, algorithms=['sha512']) - log.info("New vs. old: %s" % changed_urls) + with pushd_popd(bagdir): + total_bytes, total_files = make_manifests('data', processes, algorithms=['sha512']) + log.info("New vs. old: %s" % changed_local_filenames) return total_bytes, total_files def _set_bag_info(self, bag, total_bytes, total_files, ocrd_identifier, ocrd_base_version_checksum, ocrd_mets='mets.xml'): diff --git a/ocrd_models/ocrd_models/ocrd_file.py b/ocrd_models/ocrd_models/ocrd_file.py index b44524875..468e88e50 100644 --- a/ocrd_models/ocrd_models/ocrd_file.py +++ b/ocrd_models/ocrd_models/ocrd_file.py @@ -208,8 +208,8 @@ def local_filename(self, fname): Set the local/cached ``@xlink:href`` of this ``mets:file`` to :py:attr:`local_filename`. """ el_FLocat = self._el.find('mets:FLocat[@LOCTYPE="OTHER"][@OTHERLOCTYPE="FILE"]', NS) - if fname is None: - if el_FLocat: + if not fname: + if el_FLocat is not None: self._el.remove(el_FLocat) return if el_FLocat is None: diff --git a/tests/validator/test_workspace_bagger.py b/tests/validator/test_workspace_bagger.py index a8300b447..837819fdf 100644 --- a/tests/validator/test_workspace_bagger.py +++ b/tests/validator/test_workspace_bagger.py @@ -60,7 +60,7 @@ def test_bag_full(self): f = self.workspace.mets.find_all_files(ID='INPUT_0017')[0] f.url = 'bad-scheme://foo' f.local_filename = None - with self.assertRaisesRegex(Exception, "Not an http URL"): + with self.assertRaisesRegex(Exception, "No connection adapters were found for 'bad-scheme://foo'"): self.bagger.bag(self.workspace, 'kant_aufklaerung_1784', skip_zip=False) self.bagger.strict = False From 0f281b0b803a09545163dbc47ec2c6f3d751d53c Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Sun, 20 Aug 2023 14:56:36 +0200 Subject: [PATCH 08/12] adapt tests to local_filename/url change --- ocrd/ocrd/cli/workspace.py | 4 +-- .../ocrd_validators/workspace_validator.py | 36 +++++++++---------- tests/cli/test_workspace.py | 5 +-- tests/model/test_ocrd_page.py | 4 +-- tests/processor/test_ocrd_dummy.py | 6 ++-- tests/validator/test_workspace_backup.py | 4 +-- tests/validator/test_workspace_validator.py | 9 +++-- 7 files changed, 36 insertions(+), 32 deletions(-) diff --git a/ocrd/ocrd/cli/workspace.py b/ocrd/ocrd/cli/workspace.py index a12cf983b..018aee643 100644 --- a/ocrd/ocrd/cli/workspace.py +++ b/ocrd/ocrd/cli/workspace.py @@ -372,7 +372,7 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, file_grp @workspace_cli.command('find') @mets_find_options @click.option('-k', '--output-field', help="Output field. Repeat for multiple fields, will be joined with tab", - default=['url'], + default=['local_filename', 'url'], multiple=True, type=click.Choice([ 'url', @@ -413,7 +413,7 @@ def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, down modified_mets = True if wait: time.sleep(wait) - ret.append([f.ID if field == 'pageId' else getattr(f, field) or '' + ret.append([f.ID if field == 'pageId' else str(getattr(f, field)) or '' for field in output_field]) if modified_mets: workspace.save_mets() diff --git a/ocrd_validators/ocrd_validators/workspace_validator.py b/ocrd_validators/ocrd_validators/workspace_validator.py index f53bb0e41..22a34c723 100644 --- a/ocrd_validators/ocrd_validators/workspace_validator.py +++ b/ocrd_validators/ocrd_validators/workspace_validator.py @@ -185,16 +185,16 @@ def _validate_imagefilename(self): """ self.log.debug('_validate_imagefilename') for f in self.mets.find_files(mimetype=MIMETYPE_PAGE): - if not is_local_filename(f.url) and not self.download: - self.log.warning("Won't download remote PAGE XML '%s'", f.url) + if not f.local_filename and not self.download: + self.log.warning("Not available locally and 'download' is not set: %s", f) continue self.workspace.download_file(f) page = page_from_file(f).get_Page() imageFilename = page.imageFilename if not self.mets.find_files(url=imageFilename): - self.report.add_error("PAGE-XML %s : imageFilename '%s' not found in METS" % (f.url, imageFilename)) + self.report.add_error("PAGE-XML %s : imageFilename '%s' not found in METS" % (f.local_filename, imageFilename)) if is_local_filename(imageFilename) and not Path(imageFilename).exists(): - self.report.add_warning("PAGE-XML %s : imageFilename '%s' points to non-existent local file" % (f.url, imageFilename)) + self.report.add_warning("PAGE-XML %s : imageFilename '%s' points to non-existent local file" % (f.local_filename, imageFilename)) def _validate_dimension(self): """ @@ -202,8 +202,8 @@ def _validate_dimension(self): """ self.log.info('_validate_dimension') for f in self.mets.find_files(mimetype=MIMETYPE_PAGE): - if not is_local_filename(f.url) and not self.download: - self.log.warning("Won't download remote PAGE XML '%s'", f.url) + if not f.local_filename and not self.download: + self.log.warning("Not available locally and 'download' is not set: %s", f) continue self.workspace.download_file(f) page = page_from_file(f).get_Page() @@ -221,16 +221,16 @@ def _validate_multipage(self): """ self.log.debug('_validate_multipage') for f in self.mets.find_files(mimetype='//image/.*'): - if not is_local_filename(f.url) and not self.download: - self.log.warning("Won't download remote image '%s'", f.url) + if not f.local_filename and not self.download: + self.log.warning("Not available locally and 'download' is not set: %s", f) continue self.workspace.download_file(f) try: - exif = self.workspace.resolve_image_exif(f.url) + exif = self.workspace.resolve_image_exif(f.local_filename) if exif.n_frames > 1: self.report.add_error("Image %s: More than 1 frame: %s" % (f.ID, exif.n_frames)) except FileNotFoundError: - self.report.add_error("Image %s: Could not retrieve %s" % (f.ID, f.url)) + self.report.add_error("Image %s: Could not retrieve %s (local_filename=%s, url=%s)" % (f.ID, f.local_filename, f.url)) return def _validate_pixel_density(self): @@ -241,11 +241,11 @@ def _validate_pixel_density(self): """ self.log.debug('_validate_pixel_density') for f in self.mets.find_files(mimetype='//image/.*'): - if not is_local_filename(f.url) and not self.download: - self.log.warning("Won't download remote image '%s'", f.url) + if not f.local_filename and not self.download: + self.log.warning("Not available locally and 'download' is not set: %s", f) continue self.workspace.download_file(f) - exif = self.workspace.resolve_image_exif(f.url) + exif = self.workspace.resolve_image_exif(f.local_filename) for k in ['xResolution', 'yResolution']: v = exif.__dict__.get(k) if v is None or v <= 72: @@ -288,7 +288,7 @@ def _validate_mets_files(self): for f in self.mets.find_files(): if f._el.get('GROUPID'): # pylint: disable=protected-access self.report.add_notice("File '%s' has GROUPID attribute - document might need an update" % f.ID) - if not f.url and not f.local_filename: + if not (f.url or f.local_filename): self.report.add_error("File '%s' has neither mets:Flocat[@LOCTYPE='URL']/@xlink:href nor mets:FLocat[@LOCTYPE='OTHER'][@OTHERLOCTYPE='FILE']/xlink:href" % f.ID) continue if f.url and 'url' not in self.skip: @@ -304,8 +304,8 @@ def _validate_page(self): """ self.log.debug('_validate_page') for f in self.mets.find_files(mimetype=MIMETYPE_PAGE): - if not is_local_filename(f.url) and not self.download: - self.log.warning("Won't download remote PAGE XML '%s'", f.url) + if not f.local_filename and not self.download: + self.log.warning("Not available locally and 'download' is not set: %s", f) continue self.workspace.download_file(f) page_report = PageValidator.validate(ocrd_file=f, @@ -323,8 +323,8 @@ def _validate_page_xsd(self): """ self.log.debug('_validate_page_xsd') for f in self.mets.find_files(mimetype=MIMETYPE_PAGE): - if not is_local_filename(f.url) and not self.download: - self.log.warning("Won't download remote PAGE XML '%s'", f.url) + if not f.local_filename and not self.download: + self.log.warning("Not available locally and 'download' is not set: %s", f) continue self.workspace.download_file(f) for err in XsdPageValidator.validate(Path(f.local_filename)).errors: diff --git a/tests/cli/test_workspace.py b/tests/cli/test_workspace.py index d00c47259..088432a02 100644 --- a/tests/cli/test_workspace.py +++ b/tests/cli/test_workspace.py @@ -384,10 +384,11 @@ def test_copy_vs_clone(self): def test_find_all_files_multiple_physical_pages_for_fileids(self): with copy_of_directory(assets.path_to('SBB0000F29300010000/data')) as tempdir: - result = self.runner.invoke(workspace_cli, ['-d', tempdir, 'find', '--page-id', 'PHYS_0005,PHYS_0005', '-k', 'url']) + result = self.runner.invoke(workspace_cli, ['-d', tempdir, 'find', '--page-id', 'PHYS_0005,PHYS_0005', '-k', 'local_filename']) + print(result.stdout) self.assertEqual(result.stdout, 'OCR-D-IMG/FILE_0005_IMAGE.tif\n') self.assertEqual(result.exit_code, 0) - result = self.runner.invoke(workspace_cli, ['-d', tempdir, 'find', '--page-id', 'PHYS_0005,PHYS_0001', '-k', 'url']) + result = self.runner.invoke(workspace_cli, ['-d', tempdir, 'find', '--page-id', 'PHYS_0005,PHYS_0001', '-k', 'local_filename']) self.assertEqual(len(result.stdout.split('\n')), 19) def test_mets_basename(self): diff --git a/tests/model/test_ocrd_page.py b/tests/model/test_ocrd_page.py index 5e073336e..82964e349 100644 --- a/tests/model/test_ocrd_page.py +++ b/tests/model/test_ocrd_page.py @@ -428,7 +428,7 @@ def test_serialize_no_empty_readingorder(): """ https://github.com/OCR-D/core/issues/602 """ - pcgts = page_from_image(create_ocrd_file_with_defaults(url=assets.path_to('kant_aufklaerung_1784/data/OCR-D-IMG/INPUT_0017.tif'))) + pcgts = page_from_image(create_ocrd_file_with_defaults(local_filename=assets.path_to('kant_aufklaerung_1784/data/OCR-D-IMG/INPUT_0017.tif'))) pcgts.get_Page().set_ReadingOrder(ReadingOrderType()) assert pcgts.get_Page().get_ReadingOrder() pcgts = parseString(to_xml(pcgts, skip_declaration=True)) @@ -439,7 +439,7 @@ def test_hashable(): """ https://github.com/OCR-D/ocrd_segment/issues/45 """ - pcgts = page_from_image(create_ocrd_file_with_defaults(url=assets.path_to('kant_aufklaerung_1784/data/OCR-D-IMG/INPUT_0017.tif'))) + pcgts = page_from_image(create_ocrd_file_with_defaults(local_filename=assets.path_to('kant_aufklaerung_1784/data/OCR-D-IMG/INPUT_0017.tif'))) page = pcgts.get_Page() testset = set() testset.add(pcgts) diff --git a/tests/processor/test_ocrd_dummy.py b/tests/processor/test_ocrd_dummy.py index c98a6d481..1f4ecc055 100644 --- a/tests/processor/test_ocrd_dummy.py +++ b/tests/processor/test_ocrd_dummy.py @@ -34,10 +34,10 @@ def test_copies_ok(self): output_files = workspace.mets.find_all_files(fileGrp='OUTPUT') output_files.sort(key=lambda x: x.url) print([str(s) for s in output_files]) - self.assertEqual(output_files[0].url, 'OUTPUT/OUTPUT_PHYS_0001.tif') - self.assertEqual(output_files[1].url, 'OUTPUT/OUTPUT_PHYS_0001.xml') + assert output_files[0].local_filename == Path('OUTPUT/OUTPUT_PHYS_0001.tif') + assert output_files[1].local_filename == Path('OUTPUT/OUTPUT_PHYS_0001.xml') self.assertEqual(page_from_file(output_files[1]).pcGtsId, output_files[1].ID) - self.assertEqual(page_from_file(output_files[1]).get_Page().imageFilename, output_files[0].url) + assert page_from_file(output_files[1]).get_Page().imageFilename == str(output_files[0].local_filename) self.assertEqual(len(output_files), 6) self.assertEqual(len(workspace.mets.find_all_files(ID='//OUTPUT.*')), 6) self.assertEqual(len(workspace.mets.find_all_files(ID='//OUTPUT.*_PAGE')), 3) diff --git a/tests/validator/test_workspace_backup.py b/tests/validator/test_workspace_backup.py index 9021c01b1..0d0bc141c 100644 --- a/tests/validator/test_workspace_backup.py +++ b/tests/validator/test_workspace_backup.py @@ -28,7 +28,7 @@ def test_backup_mgr(self): self.mgr.undo() self.assertEqual(self.mgr.list(), []) first = self.mgr.add() - self.assertEqual(first, '94d33aa8773bbbf78919f89a01f03392ad39bb295859cca065d2d8eb8a4811e9') + self.assertEqual(first, '55c54aa8e10b13e495eebe292bb00250478dc9f365ea21afc2b297fe889a5a0c') self.assertEqual(len(self.mgr.list()), 1) self.mgr.add() self.assertEqual(len(self.mgr.list()), 1) @@ -46,4 +46,4 @@ def test_backup_mgr(self): self.mgr.restore(first, choose_first=False) if __name__ == '__main__': - main() + main(__name__) diff --git a/tests/validator/test_workspace_validator.py b/tests/validator/test_workspace_validator.py index 9fa353f5e..39c03bd84 100644 --- a/tests/validator/test_workspace_validator.py +++ b/tests/validator/test_workspace_validator.py @@ -138,15 +138,18 @@ def test_validate_weird_urls(self): workspace.save_mets() report = WorkspaceValidator.validate(self.resolver, join(tempdir, 'mets.xml'), skip=['pixel_density']) assert not report.is_valid - assert len(report.errors) == 2 + assert len(report.errors) == 1 assert "invalid (Java-specific) file URL" in report.errors[0] + assert len(report.warnings) == 1 + assert "non-HTTP" in report.warnings[0] + @pytest.mark.skip("Not usable as such anymore, because we properly distinguish .url and .local_filename now. Requires a server to test") def test_validate_pixel_no_download(self): imgpath = assets.path_to('kant_aufklaerung_1784-binarized/data/OCR-D-IMG-BIN/BIN_0020.png') with TemporaryDirectory() as tempdir: workspace = self.resolver.workspace_from_nothing(directory=tempdir) workspace.mets.unique_identifier = 'foobar' - workspace.mets.add_file('OCR-D-GT-BIN', ID='file1', mimetype='image/png', pageId='page1', url=imgpath) + workspace.mets.add_file('OCR-D-GT-BIN', ID='file1', mimetype='image/png', pageId='page1', local_filename=imgpath) workspace.save_mets() report = WorkspaceValidator.validate(self.resolver, join(tempdir, 'mets.xml'), skip=[], download=False) self.assertEqual(len(report.errors), 0) @@ -158,7 +161,7 @@ def test_validate_pixel_density_too_low(self): with TemporaryDirectory() as tempdir: workspace = self.resolver.workspace_from_nothing(directory=tempdir) workspace.mets.unique_identifier = 'foobar' - workspace.mets.add_file('OCR-D-GT-BIN', ID='file1', mimetype='image/png', pageId='page1', url=imgpath) + workspace.mets.add_file('OCR-D-GT-BIN', ID='file1', mimetype='image/png', pageId='page1', local_filename=imgpath) workspace.save_mets() report = WorkspaceValidator.validate(self.resolver, join(tempdir, 'mets.xml'), skip=[], download=True) self.assertEqual(len(report.notices), 2) From 7446f5f25f483f45614203ec921455e32b0311b4 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Sun, 20 Aug 2023 15:36:47 +0200 Subject: [PATCH 09/12] reenable and fix bashlib tests --- Makefile | 4 ---- ocrd/ocrd/cli/bashlib.py | 8 ++++---- tests/cli/test_bashlib.py | 4 ++-- 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/Makefile b/Makefile index efeb314e3..e5c9ae202 100644 --- a/Makefile +++ b/Makefile @@ -195,14 +195,10 @@ assets: repo/assets .PHONY: test # Run all unit tests -# XXX TODO need to fix this before merge, silencing the deprecation warnings about PEP420 and ignoring the long-running bashlib/resmgr test - #-W 'ignore:Deprecated call to `pkg_r:DeprecationWarning' test: assets $(PYTHON) \ -m pytest $(PYTEST_ARGS) --durations=10\ --ignore=$(TESTDIR)/test_logging.py \ - --ignore=$(TESTDIR)/cli/test_bashlib.py \ - --ignore=$(TESTDIR)/test_resource_manager.py \ --ignore=$(TESTDIR)/test_logging_conf.py \ --ignore-glob="$(TESTDIR)/**/*bench*.py" \ $(TESTDIR) diff --git a/ocrd/ocrd/cli/bashlib.py b/ocrd/ocrd/cli/bashlib.py index 27f96ca1b..b387c4b5a 100644 --- a/ocrd/ocrd/cli/bashlib.py +++ b/ocrd/ocrd/cli/bashlib.py @@ -104,7 +104,7 @@ def bashlib_input_files(**kwargs): working_dir = kwargs.pop('working_dir') if is_local_filename(mets) and not isfile(get_local_filename(mets)): msg = "File does not exist: %s" % mets - raise Exception(msg) + raise FileNotFoundError(msg) resolver = Resolver() workspace = resolver.workspace_from_url(mets, working_dir) processor = Processor(workspace, @@ -113,11 +113,11 @@ def bashlib_input_files(**kwargs): input_file_grp=kwargs['input_file_grp'], output_file_grp=kwargs['output_file_grp']) for input_files in processor.zip_input_files(mimetype=None, on_error='abort'): - for field in ['url', 'ID', 'mimetype', 'pageId']: + for field in ['url', 'local_filename', 'ID', 'mimetype', 'pageId']: # make this bash-friendly (show initialization for associative array) if len(input_files) > 1: # single quotes allow us to preserve the list value inside the alist - print("[%s]='%s'" % (field, ' '.join(getattr(res, field) for res in input_files)), end=' ') + print("[%s]='%s'" % (field, ' '.join(str(getattr(res, field)) for res in input_files)), end=' ') else: - print("[%s]='%s'" % (field, getattr(input_files[0], field)), end=' ') + print("[%s]='%s'" % (field, str(getattr(input_files[0], field))), end=' ') print("[outputFileId]='%s'" % make_file_id(input_files[0], kwargs['output_file_grp'])) diff --git a/tests/cli/test_bashlib.py b/tests/cli/test_bashlib.py index 3d178dcdd..0e61e093e 100644 --- a/tests/cli/test_bashlib.py +++ b/tests/cli/test_bashlib.py @@ -88,7 +88,7 @@ def test_input_files(self): with copy_of_directory(assets.path_to('kant_aufklaerung_1784/data')) as wsdir: with pushd_popd(wsdir): _, out, err = self.invoke_cli(bashlib_cli, ['input-files', '-I', 'OCR-D-IMG']) - assert ("[url]='OCR-D-IMG/INPUT_0017.tif' [ID]='INPUT_0017' [mimetype]='image/tiff' " + assert ("[url]='' [local_filename]='OCR-D-IMG/INPUT_0017.tif' [ID]='INPUT_0017' [mimetype]='image/tiff' " "[pageId]='PHYS_0017' [outputFileId]='OUTPUT_PHYS_0017'") in out def test_bashlib_defs(self): @@ -147,7 +147,7 @@ def test_bashlib_cp_processor(self): cd "${ocrd__argv[working_dir]}" mets=$(basename ${ocrd__argv[mets_file]}) for ((n=0; n<${#ocrd__files[*]}; n++)); do - in_fpath=($(ocrd__input_file $n url)) + in_fpath=($(ocrd__input_file $n local_filename)) in_id=($(ocrd__input_file $n ID)) in_mimetype=($(ocrd__input_file $n mimetype)) in_pageId=($(ocrd__input_file $n pageId)) From 619d81e5997105e5bf6ed31897019fd0fbb6aeb1 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Sun, 20 Aug 2023 15:42:43 +0200 Subject: [PATCH 10/12] workspace find --undo-download --- ocrd/ocrd/cli/workspace.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/ocrd/ocrd/cli/workspace.py b/ocrd/ocrd/cli/workspace.py index 018aee643..2bed01856 100644 --- a/ocrd/ocrd/cli/workspace.py +++ b/ocrd/ocrd/cli/workspace.py @@ -372,7 +372,7 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, file_grp @workspace_cli.command('find') @mets_find_options @click.option('-k', '--output-field', help="Output field. Repeat for multiple fields, will be joined with tab", - default=['local_filename', 'url'], + default=['local_filename'], multiple=True, type=click.Choice([ 'url', @@ -388,9 +388,10 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, file_grp 'local_filename', ])) @click.option('--download', is_flag=True, help="Download found files to workspace and change location in METS file ") +@click.option('--undo-download', is_flag=True, help="Remove all downloaded files from the METS") @click.option('--wait', type=int, default=0, help="Wait this many seconds between download requests") @pass_workspace -def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, download, wait): +def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, download, undo_download, wait): """ Find files. @@ -413,6 +414,9 @@ def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, down modified_mets = True if wait: time.sleep(wait) + if undo_download and f.local_filename: + f.local_filename = None + modified_mets = True ret.append([f.ID if field == 'pageId' else str(getattr(f, field)) or '' for field in output_field]) if modified_mets: From a583a50a9323e8cc448b05371d8bbfa50b8dc238 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Fri, 8 Sep 2023 18:54:41 +0200 Subject: [PATCH 11/12] adapt mets_server to handle pathlib.Path local_filename --- ocrd/ocrd/mets_server.py | 57 +++++++++++++++++++++------------------ tests/test_mets_server.py | 12 +++++++-- 2 files changed, 41 insertions(+), 28 deletions(-) diff --git a/ocrd/ocrd/mets_server.py b/ocrd/ocrd/mets_server.py index 8b54371e5..44dec51c6 100644 --- a/ocrd/ocrd/mets_server.py +++ b/ocrd/ocrd/mets_server.py @@ -5,6 +5,7 @@ from os import environ, _exit from io import BytesIO from typing import Any, Dict, Optional, Union, List, Tuple +from pathlib import Path from urllib.parse import urlparse from fastapi import FastAPI, Request, File, Form, Response @@ -31,13 +32,13 @@ class OcrdFileModel(BaseModel): file_grp : str = Field() file_id : str = Field() mimetype : str = Field() - page_id : Union[str, None] = Field() - url : Union[str, None] = Field() - local_filename : Union[str, None] = Field() + page_id : Optional[str] = Field() + url : Optional[str] = Field() + local_filename : Optional[str] = Field() @staticmethod - def create(file_grp : str, file_id : str, page_id : Union[str, None], url : str, local_filename : str, mimetype : str): - return OcrdFileModel(file_grp=file_grp, file_id=file_id, page_id=page_id, mimetype=mimetype, url=url, local_filename=local_filename) + def create(file_grp : str, file_id : str, page_id : Optional[str], url : Optional[str], local_filename : Optional[Union[str, Path]], mimetype : str): + return OcrdFileModel(file_grp=file_grp, file_id=file_id, page_id=page_id, mimetype=mimetype, url=url, local_filename=str(local_filename)) class OcrdAgentModel(BaseModel): name : str = Field() @@ -57,9 +58,16 @@ class OcrdFileListModel(BaseModel): @staticmethod def create(files : List[OcrdFile]): - return OcrdFileListModel( - files=[OcrdFileModel.create(file_grp=f.fileGrp, file_id=f.ID, mimetype=f.mimetype, page_id=f.pageId, url=f.url, local_filename=f.local_filename) for f in files] - ) + ret = OcrdFileListModel( + files=[OcrdFileModel.create( + file_grp=f.fileGrp, + file_id=f.ID, + mimetype=f.mimetype, + page_id=f.pageId, + url=f.url, + local_filename=f.local_filename + ) for f in files]) + return ret class OcrdFileGroupListModel(BaseModel): file_groups : List[str] = Field() @@ -144,17 +152,14 @@ def file_groups(self): @deprecated_alias(pageId="page_id") @deprecated_alias(ID="file_id") def add_file(self, file_grp, content=None, file_id=None, url=None, local_filename=None, mimetype=None, page_id=None, **kwargs): - self.session.request( - 'POST', - f'{self.url}/file', - data=OcrdFileModel.create( - file_id=file_id, - file_grp=file_grp, - page_id=page_id, - mimetype=mimetype, - url=url, - local_filename=local_filename).dict(), - ) + data = OcrdFileModel.create( + file_id=file_id, + file_grp=file_grp, + page_id=page_id, + mimetype=mimetype, + url=url, + local_filename=local_filename) + r = self.session.request('POST', f'{self.url}/file', data=data.dict()) return ClientSideOcrdFile( None, ID=file_id, @@ -208,10 +213,10 @@ async def exception_handler_invalid_regex(request: Request, exc: re.error): @app.get("/file", response_model=OcrdFileListModel) async def find_files( - file_grp : Union[str, None] = None, - file_id : Union[str, None] = None, - page_id : Union[str, None] = None, - mimetype : Union[str, None] = None, + file_grp : Optional[str] = None, + file_id : Optional[str] = None, + page_id : Optional[str] = None, + mimetype : Optional[str] = None, ): """ Find files in the mets @@ -227,10 +232,10 @@ def save(): async def add_file( file_grp : str = Form(), file_id : str = Form(), - page_id : Union[str, None] = Form(), + page_id : Optional[str] = Form(), mimetype : str = Form(), - url : Union[str, None] = Form(), - local_filename : Union[str, None] = Form(), + url : Optional[str] = Form(None), + local_filename : Optional[str] = Form(None), ): """ Add a file diff --git a/tests/test_mets_server.py b/tests/test_mets_server.py index 80add438a..7013d5576 100644 --- a/tests/test_mets_server.py +++ b/tests/test_mets_server.py @@ -3,6 +3,7 @@ from itertools import repeat from multiprocessing import Process, Pool, set_start_method +# necessary for macos set_start_method("fork") from shutil import rmtree, copytree from os import remove @@ -43,7 +44,14 @@ def _start_mets_server(*args, **kwargs): def add_file_server(x): mets_server_url, i = x workspace_server = Workspace(resolver=Resolver(), directory=WORKSPACE_DIR, mets_server_url=mets_server_url) - workspace_server.add_file(local_filename=f'foo{i}', mimetype=MIMETYPE_PAGE, page_id=f'page{1}', file_grp='FOO', file_id=f'FOO_page{i}_foo{i}') + workspace_server.add_file( + local_filename=f'local_filename{i}', + mimetype=MIMETYPE_PAGE, + page_id=f'page{i}', + file_grp='FOO', + file_id=f'FOO_page{i}_foo{i}', + # url=f'url{i}' + ) def add_agent_server(x): mets_server_url, i = x @@ -108,7 +116,7 @@ def test_mets_server_str(start_mets_server): mets_server_url, workspace_server = start_mets_server workspace_server = Workspace(Resolver(), WORKSPACE_DIR, mets_server_url=mets_server_url) f = next(workspace_server.find_files()) - assert str(f) == '' + assert str(f) == '' a = workspace_server.mets.agents[0] assert str(a) == '' assert str(workspace_server.mets) == '' % ('http+unix://%2Ftmp%2Focrd-mets-server.sock' if mets_server_url == TRANSPORTS[0] else TRANSPORTS[1]) From 0f26809e9979306b2e4dad26cb34709025a1d38e Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Mon, 11 Sep 2023 13:41:11 +0200 Subject: [PATCH 12/12] ocrd workspace find --undo-download: more intuitive output --- ocrd/ocrd/cli/workspace.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ocrd/ocrd/cli/workspace.py b/ocrd/ocrd/cli/workspace.py index 4dd6bd42d..eb894bf7b 100644 --- a/ocrd/ocrd/cli/workspace.py +++ b/ocrd/ocrd/cli/workspace.py @@ -444,16 +444,17 @@ def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, down mimetype=mimetype, page_id=page_id, ): + ret_entry = [f.ID if field == 'pageId' else str(getattr(f, field)) or '' for field in output_field] if download and not f.local_filename: workspace.download_file(f) modified_mets = True if wait: time.sleep(wait) if undo_download and f.local_filename: + ret_entry = [f'Removed local_filename {f.local_filename}'] f.local_filename = None modified_mets = True - ret.append([f.ID if field == 'pageId' else str(getattr(f, field)) or '' - for field in output_field]) + ret.append(ret_entry) if modified_mets: workspace.save_mets() if 'pageId' in output_field: