diff --git a/Makefile b/Makefile
index 6df7d5fd0..e5c9ae202 100644
--- a/Makefile
+++ b/Makefile
@@ -196,7 +196,8 @@ assets: repo/assets
.PHONY: test
# Run all unit tests
test: assets
- $(PYTHON) -m pytest --continue-on-collection-errors --durations=10\
+ $(PYTHON) \
+ -m pytest $(PYTEST_ARGS) --durations=10\
--ignore=$(TESTDIR)/test_logging.py \
--ignore=$(TESTDIR)/test_logging_conf.py \
--ignore-glob="$(TESTDIR)/**/*bench*.py" \
diff --git a/ocrd/ocrd/cli/bashlib.py b/ocrd/ocrd/cli/bashlib.py
index 27f96ca1b..b387c4b5a 100644
--- a/ocrd/ocrd/cli/bashlib.py
+++ b/ocrd/ocrd/cli/bashlib.py
@@ -104,7 +104,7 @@ def bashlib_input_files(**kwargs):
working_dir = kwargs.pop('working_dir')
if is_local_filename(mets) and not isfile(get_local_filename(mets)):
msg = "File does not exist: %s" % mets
- raise Exception(msg)
+ raise FileNotFoundError(msg)
resolver = Resolver()
workspace = resolver.workspace_from_url(mets, working_dir)
processor = Processor(workspace,
@@ -113,11 +113,11 @@ def bashlib_input_files(**kwargs):
input_file_grp=kwargs['input_file_grp'],
output_file_grp=kwargs['output_file_grp'])
for input_files in processor.zip_input_files(mimetype=None, on_error='abort'):
- for field in ['url', 'ID', 'mimetype', 'pageId']:
+ for field in ['url', 'local_filename', 'ID', 'mimetype', 'pageId']:
# make this bash-friendly (show initialization for associative array)
if len(input_files) > 1:
# single quotes allow us to preserve the list value inside the alist
- print("[%s]='%s'" % (field, ' '.join(getattr(res, field) for res in input_files)), end=' ')
+ print("[%s]='%s'" % (field, ' '.join(str(getattr(res, field)) for res in input_files)), end=' ')
else:
- print("[%s]='%s'" % (field, getattr(input_files[0], field)), end=' ')
+ print("[%s]='%s'" % (field, str(getattr(input_files[0], field))), end=' ')
print("[outputFileId]='%s'" % make_file_id(input_files[0], kwargs['output_file_grp']))
diff --git a/ocrd/ocrd/cli/workspace.py b/ocrd/ocrd/cli/workspace.py
index 3585a2826..eb894bf7b 100644
--- a/ocrd/ocrd/cli/workspace.py
+++ b/ocrd/ocrd/cli/workspace.py
@@ -402,7 +402,7 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, file_grp
@workspace_cli.command('find')
@mets_find_options
@click.option('-k', '--output-field', help="Output field. Repeat for multiple fields, will be joined with tab",
- default=['url'],
+ default=['local_filename'],
multiple=True,
type=click.Choice([
'url',
@@ -418,9 +418,10 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, file_grp
'local_filename',
]))
@click.option('--download', is_flag=True, help="Download found files to workspace and change location in METS file ")
+@click.option('--undo-download', is_flag=True, help="Remove all downloaded files from the METS")
@click.option('--wait', type=int, default=0, help="Wait this many seconds between download requests")
@pass_workspace
-def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, download, wait):
+def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, download, undo_download, wait):
"""
Find files.
@@ -443,13 +444,17 @@ def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, down
mimetype=mimetype,
page_id=page_id,
):
+ ret_entry = [f.ID if field == 'pageId' else str(getattr(f, field) or '') for field in output_field]
if download and not f.local_filename:
workspace.download_file(f)
modified_mets = True
if wait:
time.sleep(wait)
- ret.append([f.ID if field == 'pageId' else getattr(f, field) or ''
- for field in output_field])
+ if undo_download and f.local_filename:
+ ret_entry = [f'Removed local_filename {f.local_filename}']
+ f.local_filename = None
+ modified_mets = True
+ ret.append(ret_entry)
if modified_mets:
workspace.save_mets()
if 'pageId' in output_field:
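
The new `--undo-download` flag only clears the cached `local_filename` FLocat and leaves any remote `url` untouched. A rough Python-API equivalent of what the flag does per matching file (a sketch only; it assumes a workspace with a `mets.xml` in the current directory and an `OCR-D-IMG` file group):

```python
from ocrd import Resolver

workspace = Resolver().workspace_from_url('mets.xml')
for f in workspace.mets.find_files(fileGrp='OCR-D-IMG'):
    if f.local_filename:
        print(f"Removed local_filename {f.local_filename}")
        f.local_filename = None  # drops the FLocat[@LOCTYPE="OTHER"][@OTHERLOCTYPE="FILE"] entry
workspace.save_mets()
```
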
diff --git a/ocrd/ocrd/mets_server.py b/ocrd/ocrd/mets_server.py
index 8b54371e5..44dec51c6 100644
--- a/ocrd/ocrd/mets_server.py
+++ b/ocrd/ocrd/mets_server.py
@@ -5,6 +5,7 @@
from os import environ, _exit
from io import BytesIO
from typing import Any, Dict, Optional, Union, List, Tuple
+from pathlib import Path
from urllib.parse import urlparse
from fastapi import FastAPI, Request, File, Form, Response
@@ -31,13 +32,13 @@ class OcrdFileModel(BaseModel):
file_grp : str = Field()
file_id : str = Field()
mimetype : str = Field()
- page_id : Union[str, None] = Field()
- url : Union[str, None] = Field()
- local_filename : Union[str, None] = Field()
+ page_id : Optional[str] = Field()
+ url : Optional[str] = Field()
+ local_filename : Optional[str] = Field()
@staticmethod
- def create(file_grp : str, file_id : str, page_id : Union[str, None], url : str, local_filename : str, mimetype : str):
- return OcrdFileModel(file_grp=file_grp, file_id=file_id, page_id=page_id, mimetype=mimetype, url=url, local_filename=local_filename)
+ def create(file_grp : str, file_id : str, page_id : Optional[str], url : Optional[str], local_filename : Optional[Union[str, Path]], mimetype : str):
+ return OcrdFileModel(file_grp=file_grp, file_id=file_id, page_id=page_id, mimetype=mimetype, url=url, local_filename=str(local_filename) if local_filename is not None else None)
class OcrdAgentModel(BaseModel):
name : str = Field()
@@ -57,9 +58,16 @@ class OcrdFileListModel(BaseModel):
@staticmethod
def create(files : List[OcrdFile]):
- return OcrdFileListModel(
- files=[OcrdFileModel.create(file_grp=f.fileGrp, file_id=f.ID, mimetype=f.mimetype, page_id=f.pageId, url=f.url, local_filename=f.local_filename) for f in files]
- )
+ ret = OcrdFileListModel(
+ files=[OcrdFileModel.create(
+ file_grp=f.fileGrp,
+ file_id=f.ID,
+ mimetype=f.mimetype,
+ page_id=f.pageId,
+ url=f.url,
+ local_filename=f.local_filename
+ ) for f in files])
+ return ret
class OcrdFileGroupListModel(BaseModel):
file_groups : List[str] = Field()
@@ -144,17 +152,14 @@ def file_groups(self):
@deprecated_alias(pageId="page_id")
@deprecated_alias(ID="file_id")
def add_file(self, file_grp, content=None, file_id=None, url=None, local_filename=None, mimetype=None, page_id=None, **kwargs):
- self.session.request(
- 'POST',
- f'{self.url}/file',
- data=OcrdFileModel.create(
- file_id=file_id,
- file_grp=file_grp,
- page_id=page_id,
- mimetype=mimetype,
- url=url,
- local_filename=local_filename).dict(),
- )
+ data = OcrdFileModel.create(
+ file_id=file_id,
+ file_grp=file_grp,
+ page_id=page_id,
+ mimetype=mimetype,
+ url=url,
+ local_filename=local_filename)
+ r = self.session.request('POST', f'{self.url}/file', data=data.dict())
return ClientSideOcrdFile(
None,
ID=file_id,
@@ -208,10 +213,10 @@ async def exception_handler_invalid_regex(request: Request, exc: re.error):
@app.get("/file", response_model=OcrdFileListModel)
async def find_files(
- file_grp : Union[str, None] = None,
- file_id : Union[str, None] = None,
- page_id : Union[str, None] = None,
- mimetype : Union[str, None] = None,
+ file_grp : Optional[str] = None,
+ file_id : Optional[str] = None,
+ page_id : Optional[str] = None,
+ mimetype : Optional[str] = None,
):
"""
Find files in the mets
@@ -227,10 +232,10 @@ def save():
async def add_file(
file_grp : str = Form(),
file_id : str = Form(),
- page_id : Union[str, None] = Form(),
+ page_id : Optional[str] = Form(),
mimetype : str = Form(),
- url : Union[str, None] = Form(),
- local_filename : Union[str, None] = Form(),
+ url : Optional[str] = Form(None),
+ local_filename : Optional[str] = Form(None),
):
"""
Add a file
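
The form fields and `OcrdFileModel` now treat `url`, `page_id` and `local_filename` as optional, and `local_filename` may be a `Path` that is coerced to `str` for the wire format. A minimal sketch (values are illustrative):

```python
from pathlib import Path
from ocrd.mets_server import OcrdFileModel

# url and page_id may be None; local_filename may be a Path
model = OcrdFileModel.create(
    file_grp='OCR-D-IMG',
    file_id='FILE_0001_IMAGE',
    page_id=None,
    url=None,
    local_filename=Path('OCR-D-IMG/FILE_0001_IMAGE.tif'),
    mimetype='image/tiff',
)
print(model.dict()['local_filename'])  # 'OCR-D-IMG/FILE_0001_IMAGE.tif'
```
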
diff --git a/ocrd/ocrd/resolver.py b/ocrd/ocrd/resolver.py
index 04e7d2683..649a934e5 100644
--- a/ocrd/ocrd/resolver.py
+++ b/ocrd/ocrd/resolver.py
@@ -27,24 +27,29 @@ class Resolver():
def download_to_directory(self, directory, url, basename=None, if_exists='skip', subdir=None, retries=None, timeout=None):
"""
- Download a file to a directory.
+ Download a URL ``url`` to a local file in ``directory``.
- Early Shortcut: If `url` is a local file and that file is already in the directory, keep it there.
+ If ``url`` looks like a file path, check whether that file exists.
+ If it does exist and is within ``directory`` already, return early.
+ If it does exist but is outside of ``directory``, copy it.
+ If ``url`` does not appear to be a file path, try downloading via HTTP, retrying ``retries`` times with timeout ``timeout`` between calls.
- If `basename` is not given but subdir is, assume user knows what she's doing and
- use last URL segment as the basename.
+ If ``basename`` is not given but ``subdir`` is, set ``basename`` to the last path segment of ``url``.
- If `basename` is not given and no subdir is given, use the alnum characters in the URL as the basename.
+ If the target file already exists within ``directory``, behavior depends on ``if_exists``:
+ - ``skip`` (default): do nothing and return early
+ - ``overwrite``: overwrite the existing file
+ - ``raise``: raise a ``FileExistsError``
Args:
directory (string): Directory to download files to
url (string): URL to download from
Keyword Args:
- basename (string, None): basename part of the filename on disk.
+ basename (string, None): basename part of the filename on disk. Defaults to last path segment of ``url`` if unset.
if_exists (string, "skip"): What to do if target file already exists.
One of ``skip`` (default), ``overwrite`` or ``raise``
- subdir (string, None): Subdirectory to create within the directory. Think ``mets:fileGrp``.
+ subdir (string, None): Subdirectory to create within the directory. Think ``mets:fileGrp[@USE]``.
retries (int, None): Number of retries to attempt on network failure.
timeout (tuple, None): Timeout in seconds for establishing a connection and reading next chunk of data.
@@ -55,56 +60,57 @@ def download_to_directory(self, directory, url, basename=None, if_exists='skip',
log.debug("directory=|%s| url=|%s| basename=|%s| if_exists=|%s| subdir=|%s|", directory, url, basename, if_exists, subdir)
if not url:
- raise Exception("'url' must be a string")
+ raise ValueError(f"'url' must be a non-empty string, not '{url}'") # actually Path also ok
if not directory:
- raise Exception("'directory' must be a string") # actually Path would also work
+ raise ValueError(f"'directory' must be a non-empty string, not '{url}'") # actually Path would also work
+ url = str(url)
directory = Path(directory)
directory.mkdir(parents=True, exist_ok=True)
- directory = str(directory.resolve())
subdir_path = Path(subdir if subdir else '')
basename_path = Path(basename if basename else nth_url_segment(url))
- ret = str(Path(subdir_path, basename_path))
+ ret = Path(subdir_path, basename_path)
dst_path = Path(directory, ret)
- # log.info("\n\tdst_path='%s \n\turl=%s", dst_path, url)
- # print('url=%s', url)
- # print('directory=%s', directory)
- # print('subdir_path=%s', subdir_path)
- # print('basename_path=%s', basename_path)
- # print('ret=%s', ret)
- # print('dst_path=%s', dst_path)
+ # log.info("\n\tdst_path='%s \n\turl=%s", dst_path, url)
+ # print(f'>>> url={url}')
+ # print(f'>>> directory={directory}')
+ # print(f'>>> subdir_path={subdir_path}')
+ # print(f'>>> basename_path={basename_path}')
+ # print(f'>>> dst_path={dst_path}')
+ # print(f'>>> ret={ret}')
src_path = None
if is_local_filename(url):
try:
- # XXX this raises FNFE in Python 3.5 if src_path doesn't exist but not 3.6+
src_path = Path(get_local_filename(url)).resolve()
except FileNotFoundError as e:
log.error("Failed to resolve URL locally: %s --> '%s' which does not exist" % (url, src_path))
raise e
if not src_path.exists():
- raise FileNotFoundError("File path passed as 'url' to download_to_directory does not exist: %s" % url)
+ raise FileNotFoundError(f"File path passed as 'url' to download_to_directory does not exist: '{url}")
if src_path == dst_path:
log.debug("Stop early, src_path and dst_path are the same: '%s' (url: '%s')" % (src_path, url))
- return ret
+ return str(ret)
# Respect 'if_exists' arg
if dst_path.exists():
if if_exists == 'skip':
- return ret
+ return str(ret)
if if_exists == 'raise':
- raise FileExistsError("File already exists and if_exists == 'raise': %s" % (dst_path))
+ raise FileExistsError(f"File already exists and if_exists == 'raise': {dst_path}")
# Create dst_path parent dir
dst_path.parent.mkdir(parents=True, exist_ok=True)
# Copy files or download remote assets
if src_path:
+ # src_path set, so it is a file source, we can copy directly
log.debug("Copying file '%s' to '%s'", src_path, dst_path)
dst_path.write_bytes(src_path.read_bytes())
else:
+ # src_path not set, it's an http URL, try to download
log.debug("Downloading URL '%s' to '%s'", url, dst_path)
if not retries and config.is_set('OCRD_DOWNLOAD_RETRIES'):
retries = config.OCRD_DOWNLOAD_RETRIES
@@ -141,7 +147,7 @@ def download_to_directory(self, directory, url, basename=None, if_exists='skip',
contents = handle_oai_response(response)
dst_path.write_bytes(contents)
- return ret
+ return str(ret)
def workspace_from_url(
self,
@@ -206,7 +212,6 @@ def workspace_from_url(
log.debug("workspace_from_url\nmets_basename='%s'\nmets_url='%s'\nsrc_baseurl='%s'\ndst_dir='%s'",
mets_basename, mets_url, src_baseurl, dst_dir)
-
self.download_to_directory(dst_dir, mets_url, basename=mets_basename, if_exists='overwrite' if clobber_mets else 'skip')
workspace = Workspace(self, dst_dir, mets_basename=mets_basename, baseurl=src_baseurl, mets_server_url=mets_server_url)
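
To make the documented `if_exists` behaviour concrete, a hedged usage sketch (the source file and target directory are illustrative and must exist):

```python
from ocrd import Resolver

resolver = Resolver()

# First call copies the local source into the directory and returns the
# path relative to `directory` (a str, even though Paths are used internally)
rel = resolver.download_to_directory('/tmp/ws', '/data/page_0001.tif', subdir='OCR-D-IMG')
assert rel == 'OCR-D-IMG/page_0001.tif'

# Default if_exists='skip': the existing target is kept and returned early
rel = resolver.download_to_directory('/tmp/ws', '/data/page_0001.tif', subdir='OCR-D-IMG')

# if_exists='raise' turns an existing target into an error
try:
    resolver.download_to_directory('/tmp/ws', '/data/page_0001.tif', subdir='OCR-D-IMG', if_exists='raise')
except FileExistsError as err:
    print(err)
```
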
diff --git a/ocrd/ocrd/workspace.py b/ocrd/ocrd/workspace.py
index dba97d952..bdbc4e7b5 100644
--- a/ocrd/ocrd/workspace.py
+++ b/ocrd/ocrd/workspace.py
@@ -123,18 +123,21 @@ def merge(self, other_workspace, copy_files=True, overwrite=False, **kwargs):
"""
def after_add_cb(f):
"""callback to run on merged OcrdFile instances in the destination"""
+ if not f.local_filename:
+ # OcrdFile has no local_filename, so nothing to be copied
+ return
if not copy_files:
fpath_src = Path(other_workspace.directory).resolve()
fpath_dst = Path(self.directory).resolve()
dstprefix = fpath_src.relative_to(fpath_dst) # raises ValueError if not a subpath
- if is_local_filename(f.url):
- f.url = str(Path(dstprefix, f.url))
+ f.local_filename = dstprefix / f.local_filename
return
- fpath_src = Path(other_workspace.directory, f.url)
- fpath_dest = Path(self.directory, f.url)
+ fpath_src = Path(other_workspace.directory, f.local_filename)
+ fpath_dest = Path(self.directory, f.local_filename)
if fpath_src.exists():
if fpath_dest.exists() and not overwrite:
- raise Exception("Copying %s to %s would overwrite the latter" % (fpath_src, fpath_dest))
+ raise FileExistsError("Copying %s to %s would overwrite the latter" % (fpath_src, fpath_dest))
if not fpath_dest.parent.is_dir():
makedirs(str(fpath_dest.parent))
with open(str(fpath_src), 'rb') as fstream_in, open(str(fpath_dest), 'wb') as fstream_out:
@@ -173,26 +176,36 @@ def download_file(self, f, _recursion_count=0):
Download a :py:class:`ocrd_models.ocrd_file.OcrdFile` to the workspace.
"""
log = getLogger('ocrd.workspace.download_file')
- log.debug('download_file %s [_recursion_count=%s]' % (f, _recursion_count))
with pushd_popd(self.directory):
- try:
- # If the f.url is already a file path, and is within self.directory, do nothing
- url_path = Path(f.url).absolute()
- if not (url_path.exists() and url_path.relative_to(str(Path(self.directory).resolve()))):
- raise Exception("Not already downloaded, moving on")
- except Exception as e:
+ if f.local_filename:
+ file_path = Path(f.local_filename).absolute()
+ if file_path.exists():
+ try:
+ file_path.relative_to(Path(self.directory).resolve()) # raises ValueError if not relative
+ # If the f.local_filename exists and is within self.directory, nothing to do
+ log.info(f"'local_filename' {f.local_filename} already within {self.directory}, nothing to do")
+ except ValueError:
+ # f.local_filename exists, but not within self.directory, copy it
+ log.info("Copying 'local_filename' %s to workspace directory %s" % (f.local_filename, self.directory))
+ f.local_filename = self.resolver.download_to_directory(self.directory, f.local_filename, subdir=f.fileGrp)
+ return f
+ if f.url:
+ log.info("OcrdFile has 'local_filename' but it doesn't resolve, try to download from set 'url' %s", f.url)
+ elif self.baseurl:
+ log.info("OcrdFile has 'local_filename' but it doesn't resolve and no 'url', concatenate 'baseurl' %s and 'local_filename' %s",
+ self.baseurl, f.local_filename)
+ f.url = '%s/%s' % (self.baseurl, f.local_filename)
+ else:
+ raise FileNotFoundError(f"'local_filename' {f.local_filename} points to non-existing file,"
+ "and no 'url' to download and no 'baseurl' set on workspace, nothing we can do.")
+ if f.url:
+ # If f.url is set, download the file to the workspace
basename = '%s%s' % (f.ID, MIME_TO_EXT.get(f.mimetype, '')) if f.ID else f.basename
- try:
- f.url = self.resolver.download_to_directory(self.directory, f.url, subdir=f.fileGrp, basename=basename)
- except FileNotFoundError as e:
- if not self.baseurl:
- raise Exception("No baseurl defined by workspace. Cannot retrieve '%s'" % f.url)
- if _recursion_count >= 1:
- raise FileNotFoundError("Already tried prepending baseurl '%s'. Cannot retrieve '%s'" % (self.baseurl, f.url))
- log.debug("First run of resolver.download_to_directory(%s) failed, try prepending baseurl '%s': %s", f.url, self.baseurl, e)
- f.url = '%s/%s' % (self.baseurl, f.url)
- f.url = self.download_file(f, _recursion_count + 1).local_filename
- f.local_filename = f.url
+ f.local_filename = self.resolver.download_to_directory(self.directory, f.url, subdir=f.fileGrp, basename=basename)
+ else:
+ # If neither f.local_filename nor f.url is set, fail
+ raise ValueError("OcrdFile {f} has neither 'url' nor 'local_filename', so cannot be downloaded")
return f
def remove_file(self, file_id, force=False, keep_file=False, page_recursive=False, page_same_group=False):
@@ -212,7 +225,7 @@ def remove_file(self, file_id, force=False, keep_file=False, page_recursive=Fals
"""
log = getLogger('ocrd.workspace.remove_file')
log.debug('Deleting mets:file %s', file_id)
- if not force and self.overwrite_mode:
+ if self.overwrite_mode:
force = True
if isinstance(file_id, OcrdFile):
file_id = file_id.ID
@@ -228,7 +241,7 @@ def remove_file(self, file_id, force=False, keep_file=False, page_recursive=Fals
with pushd_popd(self.directory):
ocrd_page = parse(self.download_file(ocrd_file).local_filename, silence=True)
for img_url in ocrd_page.get_AllAlternativeImagePaths():
- img_kwargs = {'url': img_url}
+ img_kwargs = {'local_filename': img_url}
if page_same_group:
img_kwargs['fileGrp'] = ocrd_file.fileGrp
for img_file in self.mets.find_files(**img_kwargs):
@@ -304,34 +317,34 @@ def rename_file_group(self, old, new):
log = getLogger('ocrd.workspace.rename_file_group')
if old not in self.mets.file_groups:
- raise ValueError("No such fileGrp: %s" % old)
+ raise ValueError(f"No such fileGrp: {old}")
if new in self.mets.file_groups:
- raise ValueError("fileGrp already exists %s" % new)
+ raise ValueError(f"fileGrp already exists {new}")
with pushd_popd(self.directory):
# create workspace dir ``new``
log.info("mkdir %s" % new)
if not Path(new).is_dir():
Path(new).mkdir()
- url_replacements = {}
+ local_filename_replacements = {}
log.info("Moving files")
for mets_file in self.mets.find_files(fileGrp=old, local_only=True):
- new_url = old_url = mets_file.url
+ new_local_filename = old_local_filename = str(mets_file.local_filename)
# Directory part
- new_url = sub(r'^%s/' % old, r'%s/' % new, new_url)
+ new_local_filename = sub(r'^%s/' % old, r'%s/' % new, new_local_filename)
# File part
- new_url = sub(r'/%s' % old, r'/%s' % new, new_url)
- url_replacements[mets_file.url] = new_url
+ new_local_filename = sub(r'/%s' % old, r'/%s' % new, new_local_filename)
+ local_filename_replacements[str(mets_file.local_filename)] = new_local_filename
# move file from ``old`` to ``new``
- move(mets_file.url, new_url)
+ mets_file.local_filename.rename(new_local_filename)
# change the url of ``mets:file``
- mets_file.url = new_url
+ mets_file.local_filename = new_local_filename
# change the file ID and update structMap
new_id = sub(r'^%s' % old, r'%s' % new, mets_file.ID)
try:
next(self.mets.find_files(ID=new_id))
- log.warning("ID %s already exists, not changing ID while renaming %s -> %s" % (new_id, old_url, new_url))
+ log.warning("ID %s already exists, not changing ID while renaming %s -> %s" % (new_id, old_local_filename, new_local_filename))
except StopIteration:
mets_file.ID = new_id
# change file paths in PAGE-XML imageFilename and filename attributes
@@ -339,17 +352,17 @@ def rename_file_group(self, old, new):
log.info("Renaming file references in PAGE-XML %s" % page_file)
pcgts = page_from_file(page_file)
changed = False
- for old_url, new_url in url_replacements.items():
- if pcgts.get_Page().imageFilename == old_url:
+ for old_local_filename, new_local_filename in local_filename_replacements.items():
+ if pcgts.get_Page().imageFilename == old_local_filename:
changed = True
- log.info("Rename pc:Page/@imageFilename: %s -> %s" % (old_url, new_url))
- pcgts.get_Page().imageFilename = new_url
+ log.info("Rename pc:Page/@imageFilename: %s -> %s" % (old_local_filename, new_local_filename))
+ pcgts.get_Page().imageFilename = new_local_filename
for ai in pcgts.get_Page().get_AllAlternativeImages():
- for old_url, new_url in url_replacements.items():
- if ai.filename == old_url:
+ for old_local_filename, new_local_filename in local_filename_replacements.items():
+ if ai.filename == old_local_filename:
changed = True
- log.info("Rename pc:Page/../AlternativeImage: %s -> %s" % (old_url, new_url))
- ai.filename = new_url
+ log.info("Rename pc:Page/../AlternativeImage: %s -> %s" % (old_local_filename, new_local_filename))
+ ai.filename = new_local_filename
if changed:
log.info("PAGE-XML changed, writing %s" % (page_file.local_filename))
with open(page_file.local_filename, 'w', encoding='utf-8') as f:
@@ -392,11 +405,9 @@ def add_file(self, file_grp, content=None, **kwargs):
with pushd_popd(self.directory):
if kwargs.get('local_filename'):
# If the local filename has folder components, create those folders
- local_filename_dir = kwargs['local_filename'].rsplit('/', 1)[0]
- if local_filename_dir != kwargs['local_filename'] and not Path(local_filename_dir).is_dir():
+ local_filename_dir = str(kwargs['local_filename']).rsplit('/', 1)[0]
+ if local_filename_dir != str(kwargs['local_filename']) and not Path(local_filename_dir).is_dir():
makedirs(local_filename_dir)
- if 'url' not in kwargs:
- kwargs['url'] = kwargs['local_filename']
# print(kwargs)
kwargs["pageId"] = kwargs.pop("page_id")
@@ -441,15 +452,17 @@ def resolve_image_exif(self, image_url):
"""
if not image_url:
# avoid "finding" just any file
- raise Exception("Cannot resolve empty image path")
+ raise ValueError(f"'image_url' must be a non-empty string, not '{image_url}' ({type(image_url)})")
try:
- f = next(self.mets.find_files(url=image_url))
- image_filename = self.download_file(f).local_filename
- ocrd_exif = exif_from_filename(image_filename)
+ f = next(self.mets.find_files(local_filename=str(image_url)))
+ return exif_from_filename(f.local_filename)
except StopIteration:
- with download_temporary_file(image_url) as f:
- ocrd_exif = exif_from_filename(f.name)
- return ocrd_exif
+ try:
+ f = next(self.mets.find_files(url=str(image_url)))
+ return exif_from_filename(self.download_file(f).local_filename)
+ except StopIteration:
+ with download_temporary_file(image_url) as f:
+ return exif_from_filename(f.name)
@deprecated(version='1.0.0', reason="Use workspace.image_from_page and workspace.image_from_segment")
def resolve_image_as_pil(self, image_url, coords=None):
@@ -474,11 +487,15 @@ def _resolve_image_as_pil(self, image_url, coords=None):
log = getLogger('ocrd.workspace._resolve_image_as_pil')
with pushd_popd(self.directory):
try:
- f = next(self.mets.find_files(url=image_url))
- pil_image = Image.open(self.download_file(f).local_filename)
+ f = next(self.mets.find_files(local_filename=str(image_url)))
+ pil_image = Image.open(f.local_filename)
except StopIteration:
- with download_temporary_file(image_url) as f:
- pil_image = Image.open(f.name)
+ try:
+ f = next(self.mets.find_files(url=str(image_url)))
+ pil_image = Image.open(self.download_file(f).local_filename)
+ except StopIteration:
+ with download_temporary_file(image_url) as f:
+ pil_image = Image.open(f.name)
pil_image.load() # alloc and give up the FD
# Pillow does not properly support higher color depths
@@ -1039,7 +1056,7 @@ def save_image_file(self, image,
The (absolute) path of the created file.
"""
log = getLogger('ocrd.workspace.save_image_file')
- if not force and self.overwrite_mode:
+ if self.overwrite_mode:
force = True
image_bytes = io.BytesIO()
image.save(image_bytes, format=MIME_TO_PIL[mimetype])
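
The rewritten `download_file` resolves in a fixed order: an existing `local_filename` inside the workspace wins, a `local_filename` outside the workspace is copied in, then `url` is downloaded, then `baseurl` + `local_filename` is tried. A sketch of the caller's view (assumes a workspace with a `mets.xml` and an `OCR-D-IMG` file group):

```python
from ocrd import Resolver

workspace = Resolver().workspace_from_url('mets.xml')
f = next(workspace.mets.find_files(fileGrp='OCR-D-IMG'))

# 1. local_filename exists inside the workspace  -> nothing to do
# 2. local_filename exists outside the workspace -> copied into the workspace
# 3. local_filename missing, url set             -> downloaded from url
# 4. neither resolves, baseurl set               -> baseurl/local_filename is tried
# 5. nothing usable                              -> FileNotFoundError / ValueError
f = workspace.download_file(f)
print(f.local_filename)  # now a path below the workspace directory
```
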
diff --git a/ocrd/ocrd/workspace_bagger.py b/ocrd/ocrd/workspace_bagger.py
index 447f33d13..4c6129d89 100644
--- a/ocrd/ocrd/workspace_bagger.py
+++ b/ocrd/ocrd/workspace_bagger.py
@@ -14,6 +14,7 @@
from ocrd_utils import (
pushd_popd,
getLogger,
+ MIME_TO_EXT,
is_local_filename,
unzip_file_to_dir,
@@ -57,31 +58,28 @@ def _log_or_raise(self, msg):
def _bag_mets_files(self, workspace, bagdir, ocrd_mets, processes):
mets = workspace.mets
- changed_urls = {}
+ changed_local_filenames = {}
log = getLogger('ocrd.workspace_bagger')
# TODO allow filtering by fileGrp@USE and such
with pushd_popd(workspace.directory):
- # URLs of the files before changing
+ # local_filenames of the files before changing
for f in mets.find_files():
- log.info("Resolving %s", f.url)
- if is_local_filename(f.url):
- # nothing to do then
- pass
- elif not f.url.startswith('http'):
- self._log_or_raise("Not an http URL: %s" % f.url)
- continue
- log.info("Resolved %s", f.url)
-
- file_grp_dir = join(bagdir, 'data', f.fileGrp)
- if not isdir(file_grp_dir):
- makedirs(file_grp_dir)
-
- _basename = "%s%s" % (f.ID, f.extension)
- _relpath = join(f.fileGrp, _basename)
- self.resolver.download_to_directory(file_grp_dir, f.url, basename=_basename)
- changed_urls[f.url] = _relpath
- f.url = _relpath
+ log.info("Handling OcrdFile %s", f)
+
+ file_grp_dir = Path(bagdir, 'data', f.fileGrp)
+ if not file_grp_dir.is_dir():
+ file_grp_dir.mkdir()
+
+ if f.local_filename:
+ _relpath = join(f.fileGrp, f.basename)
+ self.resolver.download_to_directory(file_grp_dir, f.local_filename, basename=f.basename)
+ changed_local_filenames[str(f.local_filename)] = _relpath
+ else:
+ _relpath = join(f.fileGrp, f"{f.ID}{MIME_TO_EXT[f.mimetype]}")
+ self.resolver.download_to_directory(file_grp_dir, f.url, basename=f"{f.ID}{MIME_TO_EXT[f.mimetype]}")
+ changed_local_filenames[f.url] = _relpath
+ f.local_filename = _relpath
# save mets.xml
with open(join(bagdir, 'data', ocrd_mets), 'wb') as f:
@@ -95,21 +93,19 @@ def _bag_mets_files(self, workspace, bagdir, ocrd_mets, processes):
for page_file in bag_workspace.mets.find_files(mimetype=MIMETYPE_PAGE):
pcgts = page_from_file(page_file)
changed = False
- # page_doc.set(imageFileName
- # for old, new in changed_urls:
- for old, new in changed_urls.items():
+ for old, new in changed_local_filenames.items():
if pcgts.get_Page().imageFilename == old:
pcgts.get_Page().imageFilename = new
changed = True
# TODO replace AlternativeImage, recursively...
if changed:
- with open(page_file.url, 'w') as out:
+ with open(page_file.local_filename, 'w') as out:
out.write(to_xml(pcgts))
# log.info("Replace %s -> %s in %s" % (old, new, page_file))
- chdir(bagdir)
- total_bytes, total_files = make_manifests('data', processes, algorithms=['sha512'])
- log.info("New vs. old: %s" % changed_urls)
+ with pushd_popd(bagdir):
+ total_bytes, total_files = make_manifests('data', processes, algorithms=['sha512'])
+ log.info("New vs. old: %s" % changed_local_filenames)
return total_bytes, total_files
def _set_bag_info(self, bag, total_bytes, total_files, ocrd_identifier, ocrd_base_version_checksum, ocrd_mets='mets.xml'):
diff --git a/ocrd_modelfactory/ocrd_modelfactory/__init__.py b/ocrd_modelfactory/ocrd_modelfactory/__init__.py
index 453d46e89..e345ee061 100644
--- a/ocrd_modelfactory/ocrd_modelfactory/__init__.py
+++ b/ocrd_modelfactory/ocrd_modelfactory/__init__.py
@@ -67,7 +67,7 @@ def page_from_image(input_file, with_tree=False):
imageWidth=exif.width,
imageHeight=exif.height,
# XXX brittle
- imageFilename=input_file.url if input_file.url is not None else input_file.local_filename
+ imageFilename=str(input_file.local_filename) if input_file.local_filename else input_file.url
),
pcGtsId=input_file.ID
)
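
With this change `page_from_image` derives `imageFilename` from the cached `local_filename` and only falls back to `url`. A sketch mirroring the updated tests (the TIFF path is illustrative and must exist on disk, since the image is opened to read its dimensions):

```python
from ocrd_models import OcrdMets
from ocrd_modelfactory import page_from_image

mets = OcrdMets.empty_mets()
f = mets.add_file('OCR-D-IMG', ID='INPUT_0017', mimetype='image/tiff',
                  pageId='PHYS_0017',
                  local_filename='OCR-D-IMG/INPUT_0017.tif')
pcgts = page_from_image(f)  # opens the image to determine width/height
assert pcgts.get_Page().imageFilename == 'OCR-D-IMG/INPUT_0017.tif'
```
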
diff --git a/ocrd_models/ocrd_models/ocrd_file.py b/ocrd_models/ocrd_models/ocrd_file.py
index a28b97db8..e8205a33d 100644
--- a/ocrd_models/ocrd_models/ocrd_file.py
+++ b/ocrd_models/ocrd_models/ocrd_file.py
@@ -2,8 +2,9 @@
API to ``mets:file``
"""
from os.path import splitext, basename
+from pathlib import Path
-from ocrd_utils import is_local_filename, get_local_filename, MIME_TO_EXT, EXT_TO_MIME
+from ocrd_utils import deprecation_warning
from .ocrd_xml_base import ET
from .constants import NAMESPACES as NS, TAG_METS_FLOCAT, TAG_METS_FILE
@@ -13,11 +14,7 @@ class OcrdFile():
Represents a single ``mets:file/mets:FLocat`` (METS file entry).
"""
- # @staticmethod
- # def create(mimetype, ID, url, local_filename):
- # el_fileGrp.SubElement('file')
-
- def __init__(self, el, mimetype=None, pageId=None, loctype='OTHER', local_filename=None, mets=None, url=None, ID=None):
+ def __init__(self, el, mimetype=None, pageId=None, local_filename=None, mets=None, url=None, ID=None, loctype=None):
"""
Args:
el (LxmlElement): etree Element of the ``mets:file`` this represents. Create new if not provided
@@ -25,28 +22,27 @@ def __init__(self, el, mimetype=None, pageId=None, loctype='OTHER', local_filena
mets (OcrdMets): Containing :py:class:`ocrd_models.ocrd_mets.OcrdMets`.
mimetype (string): ``@MIMETYPE`` of this ``mets:file``
pageId (string): ``@ID`` of the physical ``mets:structMap`` entry corresponding to this ``mets:file``
- loctype (string): ``@LOCTYPE`` of this ``mets:file``
local_filename (string): Local filename
- url (string): ``@xlink:href`` of this ``mets:file``
+ url (string): original ``@xlink:href`` of this ``mets:file``
+ local_filename (Path): ``@xlink:href`` pointing to the locally cached version of the file in the workspace
ID (string): ``@ID`` of this ``mets:file``
+ loctype (string): DEPRECATED do not use
"""
if el is None:
raise ValueError("Must provide mets:file element this OcrdFile represents")
+ if loctype:
+ deprecation_warning("'loctype' is not supported in OcrdFile anymore, use 'url' or 'local_filename'")
self._el = el
self.mets = mets
self.ID = ID
self.mimetype = mimetype
- self.local_filename = local_filename
- self.loctype = loctype
self.pageId = pageId
+ if local_filename:
+ self.local_filename = Path(local_filename)
if url:
self.url = url
- if not(local_filename):
- if self.url and is_local_filename(self.url):
- self.local_filename = get_local_filename(self.url)
-
def __str__(self):
"""
String representation of this ``mets:file``.
@@ -57,7 +53,7 @@ def __str__(self):
# ])
# return 'OcrdFile[' + '\n\t' + props + '\n\t]'
props = ', '.join([
- '='.join([k, getattr(self, k) if getattr(self, k) else '---'])
+ '='.join([k, str(getattr(self, k)) if getattr(self, k) else '---'])
for k in ['ID', 'mimetype', 'url', 'local_filename']
])
try:
@@ -67,34 +63,35 @@ def __str__(self):
return ' ' % (fileGrp, props)
def __eq__(self, other):
- return self.ID == other.ID # and \
- # self.url == other.url and \
+ return self.ID == other.ID \
+ and self.url == other.url \
+ and self.local_filename == other.local_filename
# EXT_TO_MIME[MIME_TO_EXT[self.mimetype]] == EXT_TO_MIME[MIME_TO_EXT[other.mimetype]] and \
# self.fileGrp == other.fileGrp
@property
def basename(self):
"""
- Get the ``os.path.basename`` of the local file, if any.
+ Get the ``.name`` of the local file
"""
- return basename(self.local_filename if self.local_filename else self.url)
+ if not self.local_filename:
+ return
+ return self.local_filename.name
@property
def extension(self):
- _basename, ext = splitext(self.basename)
- if _basename.endswith('.tar'):
- ext = ".tar" + ext
- return ext
+ if not self.local_filename:
+ return
+ return ''.join(self.local_filename.suffixes)
@property
def basename_without_extension(self):
"""
Get the ``os.path.basename`` of the local file, if any, with extension removed.
"""
- ret = self.basename.rsplit('.', 1)[0]
- if ret.endswith('.tar'):
- ret = ret[0:len(ret)-4]
- return ret
+ if not self.local_filename:
+ return
+ return self.local_filename.name[:-len(self.extension)] if self.extension else self.local_filename.name
@property
def ID(self):
@@ -139,46 +136,11 @@ def pageId(self, pageId):
self.mets.set_physical_page_for_file(pageId, self)
@property
- def loctype(self):
+ def loctypes(self):
"""
- Get the ``@LOCTYPE`` of the ``mets:file``.
+ Get the ``@LOCTYPE``s of the ``mets:file``.
"""
- el_FLocat = self._el.find('mets:FLocat', NS)
- return '' if el_FLocat is None else el_FLocat.get('LOCTYPE')
-
- @loctype.setter
- def loctype(self, loctype):
- """
- Set the ``@LOCTYPE`` of the ``mets:file`` to :py:attr:`loctype`.
- """
- if loctype is None:
- return
- loctype = loctype.upper()
- el_FLocat = self._el.find('mets:FLocat', NS)
- if el_FLocat is None:
- el_FLocat = ET.SubElement(self._el, TAG_METS_FLOCAT)
- el_FLocat.set('LOCTYPE', loctype)
- if loctype == 'OTHER':
- self.otherloctype = 'FILE'
- else:
- self.otherloctype = None
-
- @property
- def otherloctype(self):
- el_FLocat = self._el.find('mets:FLocat', NS)
- return '' if el_FLocat is None else el_FLocat.get('OTHERLOCTYPE')
-
- @otherloctype.setter
- def otherloctype(self, otherloctype):
- el_FLocat = self._el.find('mets:FLocat', NS)
- if el_FLocat is None:
- el_FLocat = ET.SubElement(self._el, TAG_METS_FLOCAT)
- if not otherloctype:
- if 'OTHERLOCTYPE' in el_FLocat.attrib:
- del el_FLocat.attrib['OTHERLOCTYPE']
- else:
- el_FLocat.set('LOCTYPE', 'OTHER')
- el_FLocat.set('OTHERLOCTYPE', otherloctype)
+ return [x.get('LOCTYPE') for x in self._el.findall('mets:FLocat', NS)]
@property
def mimetype(self):
@@ -209,9 +171,9 @@ def fileGrp(self):
@property
def url(self):
"""
- Get the ``@xlink:href`` of this ``mets:file``.
+ Get the remote/original URL ``@xlink:href`` of this ``mets:file``.
"""
- el_FLocat = self._el.find(TAG_METS_FLOCAT)
+ el_FLocat = self._el.find('mets:FLocat[@LOCTYPE="URL"]', NS)
if el_FLocat is not None:
return el_FLocat.get("{%s}href" % NS["xlink"])
return ''
@@ -219,14 +181,42 @@ def url(self):
@url.setter
def url(self, url):
"""
- Set the ``@xlink:href`` of this ``mets:file`` to :py:attr:`url`.
+ Set the remote/original URL ``@xlink:href`` of this ``mets:file`` to :py:attr:`url`.
"""
+ el_FLocat = self._el.find('mets:FLocat[@LOCTYPE="URL"]', NS)
if url is None:
+ if el_FLocat is not None:
+ self._el.remove(el_FLocat)
return
- el_FLocat = self._el.find('mets:FLocat', NS)
if el_FLocat is None:
el_FLocat = ET.SubElement(self._el, TAG_METS_FLOCAT)
el_FLocat.set("{%s}href" % NS["xlink"], url)
+ el_FLocat.set("LOCTYPE", "URL")
+
+ @property
+ def local_filename(self):
+ """
+ Get the local/cached ``@xlink:href`` of this ``mets:file``.
+ """
+ el_FLocat = self._el.find('mets:FLocat[@LOCTYPE="OTHER"][@OTHERLOCTYPE="FILE"]', NS)
+ if el_FLocat is not None:
+ return Path(el_FLocat.get("{%s}href" % NS["xlink"]))
+
+ @local_filename.setter
+ def local_filename(self, fname):
+ """
+ Set the local/cached ``@xlink:href`` of this ``mets:file`` to :py:attr:`local_filename`.
+ """
+ el_FLocat = self._el.find('mets:FLocat[@LOCTYPE="OTHER"][@OTHERLOCTYPE="FILE"]', NS)
+ if not fname:
+ if el_FLocat is not None:
+ self._el.remove(el_FLocat)
+ return
+ if el_FLocat is None:
+ el_FLocat = ET.SubElement(self._el, TAG_METS_FLOCAT)
+ el_FLocat.set("{%s}href" % NS["xlink"], str(fname))
+ el_FLocat.set("LOCTYPE", "OTHER")
+ el_FLocat.set("OTHERLOCTYPE", "FILE")
class ClientSideOcrdFile:
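
`OcrdFile.url` and `OcrdFile.local_filename` now map to two distinct `mets:FLocat` elements (`@LOCTYPE="URL"` vs. `@LOCTYPE="OTHER"`/`@OTHERLOCTYPE="FILE"`). A short sketch of the resulting API (values illustrative):

```python
from ocrd_models import OcrdMets

mets = OcrdMets.empty_mets()
f = mets.add_file('OCR-D-IMG', ID='FILE_0001', mimetype='image/tiff',
                  url='https://example.org/FILE_0001.tif',
                  local_filename='OCR-D-IMG/FILE_0001.tif')

print(f.url)             # 'https://example.org/FILE_0001.tif'  (FLocat[@LOCTYPE="URL"])
print(f.local_filename)  # Path('OCR-D-IMG/FILE_0001.tif')      (FLocat[@LOCTYPE="OTHER"][@OTHERLOCTYPE="FILE"])
print(f.loctypes)        # one entry per mets:FLocat, i.e. 'URL' and 'OTHER'

f.local_filename = None  # removes the OTHER/FILE FLocat again, f.url stays
```
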
diff --git a/ocrd_models/ocrd_models/ocrd_mets.py b/ocrd_models/ocrd_models/ocrd_mets.py
index ba2125465..92bf019d9 100644
--- a/ocrd_models/ocrd_models/ocrd_mets.py
+++ b/ocrd_models/ocrd_models/ocrd_mets.py
@@ -8,8 +8,8 @@
from copy import deepcopy
from ocrd_utils import (
- is_local_filename,
getLogger,
+ deprecation_warning,
generate_range,
VERSION,
REGEX_PREFIX,
@@ -242,7 +242,7 @@ def find_all_files(self, *args, **kwargs):
return list(self.find_files(*args, **kwargs))
# pylint: disable=multiple-statements
- def find_files(self, ID=None, fileGrp=None, pageId=None, mimetype=None, url=None, local_only=False):
+ def find_files(self, ID=None, fileGrp=None, pageId=None, mimetype=None, url=None, local_filename=None, local_only=False):
"""
Search ``mets:file`` entries in this METS document and yield results.
The :py:attr:`ID`, :py:attr:`pageId`, :py:attr:`fileGrp`,
@@ -259,7 +259,8 @@ def find_files(self, ID=None, fileGrp=None, pageId=None, mimetype=None, url=None
ID (string) : ``@ID`` of the ``mets:file``
fileGrp (string) : ``@USE`` of the ``mets:fileGrp`` to list files of
pageId (string) : ``@ID`` of the corresponding physical ``mets:structMap`` entry (physical page)
- url (string) : ``@xlink:href`` (URL or path) of ``mets:Flocat`` of ``mets:file``
+ url (string) : ``@xlink:href`` remote/original URL of ``mets:Flocat`` of ``mets:file``
+ local_filename (string) : ``@xlink:href`` local/cached filename of ``mets:Flocat`` of ``mets:file``
mimetype (string) : ``@MIMETYPE`` of ``mets:file``
local (boolean) : Whether to restrict results to local files in the filesystem
Yields:
@@ -331,7 +332,7 @@ def find_files(self, ID=None, fileGrp=None, pageId=None, mimetype=None, url=None
if not mimetype.fullmatch(cand.get('MIMETYPE') or ''): continue
if url:
- cand_locat = cand.find('mets:FLocat', namespaces=NS)
+ cand_locat = cand.find('mets:FLocat[@LOCTYPE="URL"]', namespaces=NS)
if cand_locat is None:
continue
cand_url = cand_locat.get('{%s}href' % NS['xlink'])
@@ -340,14 +341,23 @@ def find_files(self, ID=None, fileGrp=None, pageId=None, mimetype=None, url=None
else:
if not url.fullmatch(cand_url): continue
- # Note: why we instantiate a class only to find out that the local_only is set afterwards
- # Checking local_only and url before instantiation should be better?
- f = OcrdFile(cand, mets=self, loctype=cand.get('LOCTYPE'))
+ if local_filename:
+ cand_locat = cand.find('mets:FLocat[@LOCTYPE="OTHER"][@OTHERLOCTYPE="FILE"]', namespaces=NS)
+ if cand_locat is None:
+ continue
+ cand_local_filename = cand_locat.get('{%s}href' % NS['xlink'])
+ if isinstance(local_filename, str):
+ if cand_local_filename != local_filename: continue
+ else:
+ if not local_filename.fullmatch(cand_local_filename): continue
- # If only local resources should be returned and f is not a file path: skip the file
- if local_only and not is_local_filename(f.url):
- continue
- yield f
+ if local_only:
+ # deprecation_warning("'local_only' is deprecated, use 'local_filename=\"//.+\"' instead")
+ is_local = cand.find('mets:FLocat[@LOCTYPE="OTHER"][@OTHERLOCTYPE="FILE"][@xlink:href]', namespaces=NS)
+ if is_local is None:
+ continue
+
+ yield OcrdFile(cand, mets=self)
def add_file_group(self, fileGrp):
"""
@@ -356,7 +366,7 @@ def add_file_group(self, fileGrp):
fileGrp (string): ``@USE`` of the new ``mets:fileGrp``.
"""
if ',' in fileGrp:
- raise Exception('fileGrp must not contain commas')
+ raise ValueError('fileGrp must not contain commas')
el_fileSec = self._tree.getroot().find('mets:fileSec', NS)
if el_fileSec is None:
el_fileSec = ET.SubElement(self._tree.getroot(), TAG_METS_FILESEC)
@@ -364,11 +374,11 @@ def add_file_group(self, fileGrp):
if el_fileGrp is None:
el_fileGrp = ET.SubElement(el_fileSec, TAG_METS_FILEGRP)
el_fileGrp.set('USE', fileGrp)
-
+
if self._cache_flag:
# Assign an empty dictionary that will hold the files of the added fileGrp
self._file_cache[fileGrp] = {}
-
+
return el_fileGrp
def rename_file_group(self, old, new):
@@ -750,6 +760,7 @@ def merge(self, other_mets, force=False, fileGrp_mapping=None, fileId_mapping=No
fileGrp_mapping.get(f_src.fileGrp, f_src.fileGrp),
mimetype=f_src.mimetype,
url=f_src.url,
+ local_filename=f_src.local_filename,
ID=fileId_mapping.get(f_src.ID, f_src.ID),
pageId=pageId_mapping.get(f_src.pageId, f_src.pageId),
force=force)
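
`find_files` now matches the two FLocat flavours separately: `url` matches the remote location, the new `local_filename` filter matches the cached file, and `local_only` means "has a local FLocat at all". A sketch along the lines of the updated tests:

```python
from ocrd_models import OcrdMets

mets = OcrdMets.empty_mets()
mets.add_file('OCR-D-IMG', ID='FILE_0005', mimetype='image/tiff',
              url='https://example.org/FILE_0005_IMAGE.tif',
              local_filename='OCR-D-IMG/FILE_0005_IMAGE.tif')

# matches FLocat[@LOCTYPE="OTHER"][@OTHERLOCTYPE="FILE"]
assert len(mets.find_all_files(local_filename='OCR-D-IMG/FILE_0005_IMAGE.tif')) == 1
# matches FLocat[@LOCTYPE="URL"]
assert len(mets.find_all_files(url='https://example.org/FILE_0005_IMAGE.tif')) == 1
# local_only: keep only files that have a local FLocat
assert len(mets.find_all_files(local_only=True)) == 1
```
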
diff --git a/ocrd_utils/ocrd_utils/__init__.py b/ocrd_utils/ocrd_utils/__init__.py
index 1f17f1e7b..33352693b 100644
--- a/ocrd_utils/ocrd_utils/__init__.py
+++ b/ocrd_utils/ocrd_utils/__init__.py
@@ -99,6 +99,8 @@
Exports of :py:mod:`ocrd_utils.logging`
* :py:func:`deprecated_alias`
+* :py:func:`rename_kwargs`
+* :py:func:`deprecation_warning`
Decorator to mark a kwarg as deprecated
"""
@@ -118,7 +120,9 @@
)
from .deprecate import (
- deprecated_alias)
+ deprecated_alias,
+ rename_kwargs,
+ deprecation_warning)
from .image import (
adjust_canvas_to_rotation,
diff --git a/ocrd_utils/ocrd_utils/deprecate.py b/ocrd_utils/ocrd_utils/deprecate.py
index 6688fde5f..d17efeb58 100644
--- a/ocrd_utils/ocrd_utils/deprecate.py
+++ b/ocrd_utils/ocrd_utils/deprecate.py
@@ -1,10 +1,8 @@
import functools
import warnings
-"""
-https://stackoverflow.com/questions/49802412/how-to-implement-deprecation-in-python-with-argument-alias
-by user2357112 supports Monica
-"""
+def deprecation_warning(msg, stacklevel=2):
+ warnings.warn(msg, DeprecationWarning, stacklevel)
def deprecated_alias(**aliases):
"""
@@ -19,6 +17,10 @@ def wrapper(*args, **kwargs):
return deco
def rename_kwargs(func_name, kwargs, aliases):
+ """
+ https://stackoverflow.com/questions/49802412/how-to-implement-deprecation-in-python-with-argument-alias
+ by user2357112 supports Monica
+ """
for alias, new in aliases.items():
if alias in kwargs:
if new in kwargs:
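
For reference, the two helpers exported here work together: `deprecated_alias` renames old kwargs (via `rename_kwargs`) and warns, while `deprecation_warning` is the bare warning used e.g. for the removed `loctype` kwarg. A minimal sketch:

```python
from ocrd_utils import deprecated_alias, deprecation_warning

@deprecated_alias(pageId='page_id')
def add_thing(page_id=None):
    """Still accepts the old `pageId` kwarg, but emits a DeprecationWarning."""
    return page_id

assert add_thing(pageId='PHYS_0001') == 'PHYS_0001'  # warns, then renames the kwarg

deprecation_warning("'loctype' is not supported in OcrdFile anymore")
```
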
diff --git a/ocrd_utils/ocrd_utils/str.py b/ocrd_utils/ocrd_utils/str.py
index 7211699f0..3c044e35b 100644
--- a/ocrd_utils/ocrd_utils/str.py
+++ b/ocrd_utils/ocrd_utils/str.py
@@ -5,6 +5,7 @@
import re
import json
from .constants import REGEX_FILE_ID
+from .deprecate import deprecation_warning
__all__ = [
'assert_file_grp_cardinality',
@@ -119,7 +120,7 @@ def get_local_filename(url, start=None):
start (string): Base path to remove from filename. Raise an exception if not a prefix of url
"""
if url.startswith('https://') or url.startswith('http:'):
- raise Exception("Can't determine local filename of http(s) URL")
+ raise ValueError("Can't determine local filename of http(s) URL")
if url.startswith('file://'):
url = url[len('file://'):]
# Goobi/Kitodo produces those, they are always absolute
@@ -127,7 +128,7 @@ def get_local_filename(url, start=None):
url = url[len('file:'):]
if start:
if not url.startswith(start):
- raise Exception("Cannot remove prefix %s from url %s" % (start, url))
+ raise ValueError("Cannot remove prefix %s from url %s" % (start, url))
if not start.endswith('/'):
start += '/'
url = url[len(start):]
@@ -137,6 +138,7 @@ def is_local_filename(url):
"""
Whether a url is a local filename.
"""
+ # deprecation_warning("Deprecated so we spot inconsistent URL/file handling")
return url.startswith('file://') or not('://' in url)
def is_string(val):
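
The stricter exceptions do not change the happy path of these helpers; a few illustrative calls:

```python
from ocrd_utils import get_local_filename, is_local_filename

assert is_local_filename('file:///data/ws/mets.xml')
assert is_local_filename('OCR-D-IMG/FILE_0001.tif')
assert not is_local_filename('https://example.org/mets.xml')

assert get_local_filename('file:///data/ws/mets.xml') == '/data/ws/mets.xml'
assert get_local_filename('/data/ws/mets.xml', start='/data/ws') == 'mets.xml'

try:
    get_local_filename('https://example.org/mets.xml')
except ValueError as err:
    print(err)  # "Can't determine local filename of http(s) URL"
```
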
diff --git a/ocrd_validators/ocrd_validators/workspace_validator.py b/ocrd_validators/ocrd_validators/workspace_validator.py
index 5a408b2f9..22a34c723 100644
--- a/ocrd_validators/ocrd_validators/workspace_validator.py
+++ b/ocrd_validators/ocrd_validators/workspace_validator.py
@@ -185,16 +185,16 @@ def _validate_imagefilename(self):
"""
self.log.debug('_validate_imagefilename')
for f in self.mets.find_files(mimetype=MIMETYPE_PAGE):
- if not is_local_filename(f.url) and not self.download:
- self.log.warning("Won't download remote PAGE XML '%s'", f.url)
+ if not f.local_filename and not self.download:
+ self.log.warning("Not available locally and 'download' is not set: %s", f)
continue
self.workspace.download_file(f)
page = page_from_file(f).get_Page()
imageFilename = page.imageFilename
if not self.mets.find_files(url=imageFilename):
- self.report.add_error("PAGE-XML %s : imageFilename '%s' not found in METS" % (f.url, imageFilename))
+ self.report.add_error("PAGE-XML %s : imageFilename '%s' not found in METS" % (f.local_filename, imageFilename))
if is_local_filename(imageFilename) and not Path(imageFilename).exists():
- self.report.add_warning("PAGE-XML %s : imageFilename '%s' points to non-existent local file" % (f.url, imageFilename))
+ self.report.add_warning("PAGE-XML %s : imageFilename '%s' points to non-existent local file" % (f.local_filename, imageFilename))
def _validate_dimension(self):
"""
@@ -202,8 +202,8 @@ def _validate_dimension(self):
"""
self.log.info('_validate_dimension')
for f in self.mets.find_files(mimetype=MIMETYPE_PAGE):
- if not is_local_filename(f.url) and not self.download:
- self.log.warning("Won't download remote PAGE XML '%s'", f.url)
+ if not f.local_filename and not self.download:
+ self.log.warning("Not available locally and 'download' is not set: %s", f)
continue
self.workspace.download_file(f)
page = page_from_file(f).get_Page()
@@ -221,16 +221,16 @@ def _validate_multipage(self):
"""
self.log.debug('_validate_multipage')
for f in self.mets.find_files(mimetype='//image/.*'):
- if not is_local_filename(f.url) and not self.download:
- self.log.warning("Won't download remote image '%s'", f.url)
+ if not f.local_filename and not self.download:
+ self.log.warning("Not available locally and 'download' is not set: %s", f)
continue
self.workspace.download_file(f)
try:
- exif = self.workspace.resolve_image_exif(f.url)
+ exif = self.workspace.resolve_image_exif(f.local_filename)
if exif.n_frames > 1:
self.report.add_error("Image %s: More than 1 frame: %s" % (f.ID, exif.n_frames))
except FileNotFoundError:
- self.report.add_error("Image %s: Could not retrieve %s" % (f.ID, f.url))
+ self.report.add_error("Image %s: Could not retrieve %s (local_filename=%s, url=%s)" % (f.ID, f.local_filename, f.url))
return
def _validate_pixel_density(self):
@@ -241,11 +241,11 @@ def _validate_pixel_density(self):
"""
self.log.debug('_validate_pixel_density')
for f in self.mets.find_files(mimetype='//image/.*'):
- if not is_local_filename(f.url) and not self.download:
- self.log.warning("Won't download remote image '%s'", f.url)
+ if not f.local_filename and not self.download:
+ self.log.warning("Not available locally and 'download' is not set: %s", f)
continue
self.workspace.download_file(f)
- exif = self.workspace.resolve_image_exif(f.url)
+ exif = self.workspace.resolve_image_exif(f.local_filename)
for k in ['xResolution', 'yResolution']:
v = exif.__dict__.get(k)
if v is None or v <= 72:
@@ -288,10 +288,10 @@ def _validate_mets_files(self):
for f in self.mets.find_files():
if f._el.get('GROUPID'): # pylint: disable=protected-access
self.report.add_notice("File '%s' has GROUPID attribute - document might need an update" % f.ID)
- if not f.url:
- self.report.add_error("File '%s' has no mets:Flocat/@xlink:href" % f.ID)
+ if not (f.url or f.local_filename):
+ self.report.add_error("File '%s' has neither mets:Flocat[@LOCTYPE='URL']/@xlink:href nor mets:FLocat[@LOCTYPE='OTHER'][@OTHERLOCTYPE='FILE']/xlink:href" % f.ID)
continue
- if 'url' not in self.skip and ':/' in f.url:
+ if f.url and 'url' not in self.skip:
if re.match(r'^file:/[^/]', f.url):
self.report.add_error("File '%s' has an invalid (Java-specific) file URL '%s'" % (f.ID, f.url))
scheme = f.url[0:f.url.index(':')]
@@ -304,8 +304,8 @@ def _validate_page(self):
"""
self.log.debug('_validate_page')
for f in self.mets.find_files(mimetype=MIMETYPE_PAGE):
- if not is_local_filename(f.url) and not self.download:
- self.log.warning("Won't download remote PAGE XML '%s'", f.url)
+ if not f.local_filename and not self.download:
+ self.log.warning("Not available locally and 'download' is not set: %s", f)
continue
self.workspace.download_file(f)
page_report = PageValidator.validate(ocrd_file=f,
@@ -323,8 +323,8 @@ def _validate_page_xsd(self):
"""
self.log.debug('_validate_page_xsd')
for f in self.mets.find_files(mimetype=MIMETYPE_PAGE):
- if not is_local_filename(f.url) and not self.download:
- self.log.warning("Won't download remote PAGE XML '%s'", f.url)
+ if not f.local_filename and not self.download:
+ self.log.warning("Not available locally and 'download' is not set: %s", f)
continue
self.workspace.download_file(f)
for err in XsdPageValidator.validate(Path(f.local_filename)).errors:
diff --git a/tests/cli/test_bashlib.py b/tests/cli/test_bashlib.py
index 3d178dcdd..0e61e093e 100644
--- a/tests/cli/test_bashlib.py
+++ b/tests/cli/test_bashlib.py
@@ -88,7 +88,7 @@ def test_input_files(self):
with copy_of_directory(assets.path_to('kant_aufklaerung_1784/data')) as wsdir:
with pushd_popd(wsdir):
_, out, err = self.invoke_cli(bashlib_cli, ['input-files', '-I', 'OCR-D-IMG'])
- assert ("[url]='OCR-D-IMG/INPUT_0017.tif' [ID]='INPUT_0017' [mimetype]='image/tiff' "
+ assert ("[url]='' [local_filename]='OCR-D-IMG/INPUT_0017.tif' [ID]='INPUT_0017' [mimetype]='image/tiff' "
"[pageId]='PHYS_0017' [outputFileId]='OUTPUT_PHYS_0017'") in out
def test_bashlib_defs(self):
@@ -147,7 +147,7 @@ def test_bashlib_cp_processor(self):
cd "${ocrd__argv[working_dir]}"
mets=$(basename ${ocrd__argv[mets_file]})
for ((n=0; n<${#ocrd__files[*]}; n++)); do
- in_fpath=($(ocrd__input_file $n url))
+ in_fpath=($(ocrd__input_file $n local_filename))
in_id=($(ocrd__input_file $n ID))
in_mimetype=($(ocrd__input_file $n mimetype))
in_pageId=($(ocrd__input_file $n pageId))
diff --git a/tests/cli/test_workspace.py b/tests/cli/test_workspace.py
index d00c47259..088432a02 100644
--- a/tests/cli/test_workspace.py
+++ b/tests/cli/test_workspace.py
@@ -384,10 +384,11 @@ def test_copy_vs_clone(self):
def test_find_all_files_multiple_physical_pages_for_fileids(self):
with copy_of_directory(assets.path_to('SBB0000F29300010000/data')) as tempdir:
- result = self.runner.invoke(workspace_cli, ['-d', tempdir, 'find', '--page-id', 'PHYS_0005,PHYS_0005', '-k', 'url'])
+ result = self.runner.invoke(workspace_cli, ['-d', tempdir, 'find', '--page-id', 'PHYS_0005,PHYS_0005', '-k', 'local_filename'])
+ print(result.stdout)
self.assertEqual(result.stdout, 'OCR-D-IMG/FILE_0005_IMAGE.tif\n')
self.assertEqual(result.exit_code, 0)
- result = self.runner.invoke(workspace_cli, ['-d', tempdir, 'find', '--page-id', 'PHYS_0005,PHYS_0001', '-k', 'url'])
+ result = self.runner.invoke(workspace_cli, ['-d', tempdir, 'find', '--page-id', 'PHYS_0005,PHYS_0001', '-k', 'local_filename'])
self.assertEqual(len(result.stdout.split('\n')), 19)
def test_mets_basename(self):
diff --git a/tests/model/test_ocrd_file.py b/tests/model/test_ocrd_file.py
index cdaaffcfa..98b9fa424 100644
--- a/tests/model/test_ocrd_file.py
+++ b/tests/model/test_ocrd_file.py
@@ -26,18 +26,9 @@ def test_ocrd_file_without_filegrp():
assert "set fileGrp" in str(val_err.value)
-def test_set_loctype():
- f = create_ocrd_file_with_defaults()
- assert f.loctype == 'OTHER'
- assert f.otherloctype == 'FILE'
- f.otherloctype = 'foo'
- assert f.otherloctype == 'foo'
- f.loctype = 'URN'
- assert f.loctype == 'URN'
- assert f.otherloctype == None
- f.otherloctype = 'foo'
- assert f.loctype, 'OTHER'
-
+def test_get_loctypes():
+ f = create_ocrd_file_with_defaults(local_filename='foo', url='bar')
+ assert f.loctypes == ['OTHER', 'URL']
def test_set_url():
f = create_ocrd_file_with_defaults()
@@ -50,8 +41,7 @@ def test_set_url():
def test_constructor_url():
f = create_ocrd_file_with_defaults(url="foo")
assert f.url == 'foo'
- assert f.local_filename == 'foo'
-
+ assert f.local_filename == None
def test_set_id_none():
f = create_ocrd_file_with_defaults()
@@ -67,8 +57,11 @@ def test_basename():
def test_basename_from_url():
+ """
+ Changed behavior, basename no longer derived from f.url
+ """
f = create_ocrd_file_with_defaults(url="http://foo.bar/quux")
- assert f.basename == 'quux'
+ assert f.basename == None
@pytest.mark.parametrize("local_filename,extension",
@@ -120,7 +113,6 @@ def test_ocrd_file_equality():
f5 = mets.add_file('TEMP', ID='TEMP_1', mimetype='image/tiff')
assert f3 == f5
-
def test_fptr_changed_for_change_id():
mets = OcrdMets.empty_mets()
f1 = mets.add_file('FOO', ID='FOO_1', mimetype='image/tiff', pageId='p0001')
diff --git a/tests/model/test_ocrd_mets.py b/tests/model/test_ocrd_mets.py
index ad72a6ebe..17ab0a8ed 100644
--- a/tests/model/test_ocrd_mets.py
+++ b/tests/model/test_ocrd_mets.py
@@ -85,7 +85,8 @@ def test_find_all_files(sbb_sample_01):
assert len(sbb_sample_01.find_all_files(mimetype='image/tiff')) == 13, '13 image/tiff'
assert len(sbb_sample_01.find_all_files(mimetype='//application/.*')) == 22, '22 application/.*'
assert len(sbb_sample_01.find_all_files(mimetype=MIMETYPE_PAGE)) == 20, '20 ' + MIMETYPE_PAGE
- assert len(sbb_sample_01.find_all_files(url='OCR-D-IMG/FILE_0005_IMAGE.tif')) == 1, '1 xlink:href="OCR-D-IMG/FILE_0005_IMAGE.tif"'
+ assert len(sbb_sample_01.find_all_files(local_filename='OCR-D-IMG/FILE_0005_IMAGE.tif')) == 1, '1 FILE xlink:href="OCR-D-IMG/FILE_0005_IMAGE.tif"'
+ assert len(sbb_sample_01.find_all_files(url='https://github.com/OCR-D/assets/raw/master/data/SBB0000F29300010000/00000001_DESKEW.tif')) == 1, '1 URL xlink:href="https://github.com/OCR-D/assets/raw/master/data/SBB0000F29300010000/00000001_DESKEW.tif"'
assert len(sbb_sample_01.find_all_files(pageId='PHYS_0001..PHYS_0005')) == 35, '35 files for page "PHYS_0001..PHYS_0005"'
assert len(sbb_sample_01.find_all_files(pageId='//PHYS_000(1|2)')) == 34, '34 files in PHYS_001 and PHYS_0002'
assert len(sbb_sample_01.find_all_files(pageId='//PHYS_0001,//PHYS_0005')) == 18, '18 files in PHYS_001 and PHYS_0005 (two regexes)'
diff --git a/tests/model/test_ocrd_page.py b/tests/model/test_ocrd_page.py
index 5e073336e..82964e349 100644
--- a/tests/model/test_ocrd_page.py
+++ b/tests/model/test_ocrd_page.py
@@ -428,7 +428,7 @@ def test_serialize_no_empty_readingorder():
"""
https://github.com/OCR-D/core/issues/602
"""
- pcgts = page_from_image(create_ocrd_file_with_defaults(url=assets.path_to('kant_aufklaerung_1784/data/OCR-D-IMG/INPUT_0017.tif')))
+ pcgts = page_from_image(create_ocrd_file_with_defaults(local_filename=assets.path_to('kant_aufklaerung_1784/data/OCR-D-IMG/INPUT_0017.tif')))
pcgts.get_Page().set_ReadingOrder(ReadingOrderType())
assert pcgts.get_Page().get_ReadingOrder()
pcgts = parseString(to_xml(pcgts, skip_declaration=True))
@@ -439,7 +439,7 @@ def test_hashable():
"""
https://github.com/OCR-D/ocrd_segment/issues/45
"""
- pcgts = page_from_image(create_ocrd_file_with_defaults(url=assets.path_to('kant_aufklaerung_1784/data/OCR-D-IMG/INPUT_0017.tif')))
+ pcgts = page_from_image(create_ocrd_file_with_defaults(local_filename=assets.path_to('kant_aufklaerung_1784/data/OCR-D-IMG/INPUT_0017.tif')))
page = pcgts.get_Page()
testset = set()
testset.add(pcgts)
diff --git a/tests/processor/test_ocrd_dummy.py b/tests/processor/test_ocrd_dummy.py
index c98a6d481..1f4ecc055 100644
--- a/tests/processor/test_ocrd_dummy.py
+++ b/tests/processor/test_ocrd_dummy.py
@@ -34,10 +34,10 @@ def test_copies_ok(self):
output_files = workspace.mets.find_all_files(fileGrp='OUTPUT')
output_files.sort(key=lambda x: x.url)
print([str(s) for s in output_files])
- self.assertEqual(output_files[0].url, 'OUTPUT/OUTPUT_PHYS_0001.tif')
- self.assertEqual(output_files[1].url, 'OUTPUT/OUTPUT_PHYS_0001.xml')
+ assert output_files[0].local_filename == Path('OUTPUT/OUTPUT_PHYS_0001.tif')
+ assert output_files[1].local_filename == Path('OUTPUT/OUTPUT_PHYS_0001.xml')
self.assertEqual(page_from_file(output_files[1]).pcGtsId, output_files[1].ID)
- self.assertEqual(page_from_file(output_files[1]).get_Page().imageFilename, output_files[0].url)
+ assert page_from_file(output_files[1]).get_Page().imageFilename == str(output_files[0].local_filename)
self.assertEqual(len(output_files), 6)
self.assertEqual(len(workspace.mets.find_all_files(ID='//OUTPUT.*')), 6)
self.assertEqual(len(workspace.mets.find_all_files(ID='//OUTPUT.*_PAGE')), 3)
diff --git a/tests/test_decorators.py b/tests/test_decorators.py
index cd8085f8a..4f4f3ecc0 100644
--- a/tests/test_decorators.py
+++ b/tests/test_decorators.py
@@ -210,4 +210,4 @@ def test_parameter_override_wo_param(self):
if __name__ == '__main__':
- main()
+ main(__name__)
diff --git a/tests/test_mets_server.py b/tests/test_mets_server.py
index 80add438a..7013d5576 100644
--- a/tests/test_mets_server.py
+++ b/tests/test_mets_server.py
@@ -3,6 +3,7 @@
from itertools import repeat
from multiprocessing import Process, Pool, set_start_method
+# necessary for macos
set_start_method("fork")
from shutil import rmtree, copytree
from os import remove
@@ -43,7 +44,14 @@ def _start_mets_server(*args, **kwargs):
def add_file_server(x):
mets_server_url, i = x
workspace_server = Workspace(resolver=Resolver(), directory=WORKSPACE_DIR, mets_server_url=mets_server_url)
- workspace_server.add_file(local_filename=f'foo{i}', mimetype=MIMETYPE_PAGE, page_id=f'page{1}', file_grp='FOO', file_id=f'FOO_page{i}_foo{i}')
+ workspace_server.add_file(
+ local_filename=f'local_filename{i}',
+ mimetype=MIMETYPE_PAGE,
+ page_id=f'page{i}',
+ file_grp='FOO',
+ file_id=f'FOO_page{i}_foo{i}',
+ # url=f'url{i}'
+ )
def add_agent_server(x):
mets_server_url, i = x
@@ -108,7 +116,7 @@ def test_mets_server_str(start_mets_server):
mets_server_url, workspace_server = start_mets_server
workspace_server = Workspace(Resolver(), WORKSPACE_DIR, mets_server_url=mets_server_url)
f = next(workspace_server.find_files())
- assert str(f) == ''
+ assert str(f) == ''
a = workspace_server.mets.agents[0]
assert str(a) == ''
assert str(workspace_server.mets) == '' % ('http+unix://%2Ftmp%2Focrd-mets-server.sock' if mets_server_url == TRANSPORTS[0] else TRANSPORTS[1])
diff --git a/tests/test_resolver.py b/tests/test_resolver.py
index 770221a5f..bf9f4bb72 100644
--- a/tests/test_resolver.py
+++ b/tests/test_resolver.py
@@ -174,16 +174,17 @@ def test_workspace_from_url0():
# assert
assert '%s.tif' % f.ID == 'FILE_0001_IMAGE.tif'
- assert f.local_filename == 'OCR-D-IMG/FILE_0001_IMAGE.tif'
+ assert f.local_filename == Path('OCR-D-IMG/FILE_0001_IMAGE.tif')
def test_resolve_image0():
workspace = Resolver().workspace_from_url(METS_HEROLD)
input_files = workspace.mets.find_all_files(fileGrp='OCR-D-IMG')
f = input_files[0]
- img_pil1 = workspace._resolve_image_as_pil(f.url)
+ print(f)
+ img_pil1 = workspace._resolve_image_as_pil(f.local_filename)
assert img_pil1.size == (2875, 3749)
- img_pil2 = workspace._resolve_image_as_pil(f.url, [[0, 0], [1, 1]])
+ img_pil2 = workspace._resolve_image_as_pil(f.local_filename, [[0, 0], [1, 1]])
assert img_pil2.size == (1, 1)
@@ -237,12 +238,12 @@ def test_workspace_from_nothing_noclobber(tmp_path):
@pytest.mark.parametrize("url,basename,exc_msg",
- [(None, None, "'url' must be a string"),
- (None, 'foo', "'directory' must be a string")]
+ [(None, None, "'url' must be a non-empty string"),
+ (None, 'foo', "'directory' must be a non-empty string")]
)
def test_download_to_directory_with_badargs(url, basename, exc_msg):
- with pytest.raises(Exception) as exc:
+ with pytest.raises(ValueError) as exc:
Resolver().download_to_directory(url, basename)
# assert exception message contained
diff --git a/tests/test_workspace.py b/tests/test_workspace.py
index 338f0681d..149d7211f 100644
--- a/tests/test_workspace.py
+++ b/tests/test_workspace.py
@@ -59,7 +59,7 @@ def _fixture_plain_workspace(tmp_path):
chdir(prev_dir)
def test_workspace_add_file(plain_workspace):
- fpath = str(plain_workspace.directory / 'ID1.tif')
+ fpath = plain_workspace.directory / 'ID1.tif'
# act
plain_workspace.add_file(
@@ -75,40 +75,37 @@ def test_workspace_add_file(plain_workspace):
# assert
assert f.ID == 'ID1'
assert f.mimetype == 'image/tiff'
- assert f.url == fpath
+ assert not f.url
assert f.local_filename == fpath
- assert exists(fpath)
+ assert f.local_filename.exists()
def test_workspace_add_file_overwrite(plain_workspace):
- fpath = str(plain_workspace.directory / 'ID1.tif')
+ fpath = plain_workspace.directory / 'ID1.tif'
# act
- plain_workspace.add_file('GRP', ID='ID1', mimetype='image/tiff', content='CONTENT', pageId='phys1', local_filename=fpath)
+ plain_workspace.add_file('GRP', file_id='ID1', mimetype='image/tiff', content='CONTENT', page_id='phys1', local_filename=fpath)
with pytest.raises(FileExistsError) as fn_exc:
- plain_workspace.add_file('GRP', ID='ID1', mimetype='image/tiff', content='CONTENT', pageId=None, local_filename=fpath)
- assert str(fn_exc.value) == "File with ID='ID1' already exists"
+ plain_workspace.add_file('GRP', file_id='ID1', mimetype='image/tiff', content='CONTENT', page_id=None, local_filename=fpath)
+ assert str(fn_exc.value) == "File with file_id='ID1' already exists"
with pytest.raises(FileExistsError) as fn_exc:
- plain_workspace.add_file('GRP', ID='ID1', mimetype='image/tiff', content='CONTENT', pageId='phys2', local_filename=fpath, force=True)
+ plain_workspace.add_file('GRP', file_id='ID1', mimetype='image/tiff', content='CONTENT', page_id='phys2', local_filename=fpath, force=True)
assert 'cannot mitigate' in str(fn_exc.value)
- plain_workspace.add_file('GRP', ID='ID1', mimetype='image/tiff', content='CONTENT2', pageId='phys1', local_filename=fpath, force=True)
+ plain_workspace.add_file('GRP', file_id='ID1', mimetype='image/tiff', content='CONTENT2', page_id='phys1', local_filename=fpath, force=True)
f = plain_workspace.mets.find_all_files()[0]
assert f.ID == 'ID1'
assert f.mimetype == 'image/tiff'
- assert f.url == fpath
+ assert not f.url
assert f.local_filename == fpath
assert f.pageId == 'phys1'
- assert exists(fpath)
+ assert fpath.exists()
def test_workspace_add_file_basename_no_content(plain_workspace):
plain_workspace.add_file('GRP', file_id='ID1', mimetype='image/tiff', page_id=None)
f = next(plain_workspace.mets.find_files())
-
- # assert
- assert f.url == None
-
+ assert f.url == ''
def test_workspace_add_file_binary_content(plain_workspace):
fpath = join(plain_workspace.directory, 'subdir', 'ID1.tif')
@@ -160,19 +157,14 @@ def test_workspace_backup(plain_workspace):
def _url_to_file(the_path):
dummy_mets = OcrdMets.empty_mets()
dummy_url = abspath(the_path)
- return dummy_mets.add_file('DEPRECATED', ID=Path(dummy_url).name, url=dummy_url)
+ return dummy_mets.add_file('TESTGRP', ID=Path(dummy_url).name, url=dummy_url)
def test_download_very_self_file(plain_workspace):
-
- # arrange with some dummy stuff
the_file = _url_to_file(abspath(__file__))
-
- # act
fn = plain_workspace.download_file(the_file)
-
- # assert
- assert fn, join('DEPRECATED', basename(__file__))
+ assert fn, join('TESTGRP', basename(__file__))
+ assert fn == the_file
def test_download_url_without_baseurl_raises_exception(tmp_path):
@@ -187,7 +179,7 @@ def test_download_url_without_baseurl_raises_exception(tmp_path):
ws1.download_file(the_file)
# assert exception message contents
- assert "Already tried prepending baseurl '%s'" % str(tmp_path) in str(exc.value)
+ assert "File path passed as 'url' to download_to_directory does not exist:" in str(exc.value)
def test_download_url_with_baseurl(tmp_path):
@@ -208,7 +200,8 @@ def test_download_url_with_baseurl(tmp_path):
f = Path(ws1.download_file(the_file).local_filename)
# assert
- assert str(f).endswith(join('OCR-D-IMG', '%s.tif' % SAMPLE_FILE_ID))
+ assert f.name == f'{SAMPLE_FILE_ID}.tif'
+ assert f.parent.name == 'TESTGRP'
assert Path(ws1.directory, f).exists()
@@ -306,8 +299,8 @@ def test_remove_file_remote(plain_workspace):
def test_rename_file_group(tmp_path):
# arrange
- copytree(assets.path_to('kant_aufklaerung_1784-page-region-line-word_glyph/data'), str(tmp_path))
- workspace = Workspace(Resolver(), directory=str(tmp_path))
+ copytree(assets.path_to('kant_aufklaerung_1784-page-region-line-word_glyph/data'), tmp_path)
+ workspace = Workspace(Resolver(), directory=tmp_path)
# before act
# TODO clear semantics
@@ -316,7 +309,7 @@ def test_rename_file_group(tmp_path):
# which can be achieved with pushd_popd functionalities
ocrd_file = next(workspace.mets.find_files(ID='OCR-D-GT-SEG-WORD_0001'))
relative_name = ocrd_file.local_filename
- ocrd_file.local_filename = join(tmp_path, relative_name)
+ ocrd_file.local_filename = tmp_path / relative_name
pcgts_before = page_from_file(ocrd_file)
# before assert
assert pcgts_before.get_Page().imageFilename == 'OCR-D-IMG/INPUT_0017.tif'
@@ -324,7 +317,7 @@ def test_rename_file_group(tmp_path):
# act
workspace.rename_file_group('OCR-D-IMG', 'FOOBAR')
next_ocrd_file = next(workspace.mets.find_files(ID='OCR-D-GT-SEG-WORD_0001'))
- next_ocrd_file.local_filename = join(tmp_path, relative_name)
+ next_ocrd_file.local_filename = tmp_path / relative_name
pcgts_after = page_from_file(next_ocrd_file)
# assert
@@ -380,7 +373,7 @@ def _fixture_kant_complex(tmp_path):
yield Workspace(Resolver, directory=tmp_path)
-def test_remove_file_page_recursive(kant_complex_workspace):
+def test_remove_file_page_recursive0(kant_complex_workspace):
assert len(kant_complex_workspace.mets.find_all_files()) == 119
kant_complex_workspace.remove_file('OCR-D-OCR-OCRO-fraktur-SEG-LINE-tesseract-ocropy-DEWARP_0001', page_recursive=True, page_same_group=False, keep_file=True)
assert len(kant_complex_workspace.mets.find_all_files()) == 83
@@ -408,15 +401,15 @@ def test_download_to_directory_from_workspace_download_file(plain_workspace):
f1 = plain_workspace.add_file('IMG', file_id='page1_img', mimetype='image/tiff', local_filename='test.tif', content='', page_id=None)
f2 = plain_workspace.add_file('GT', file_id='page1_gt', mimetype='text/xml', local_filename='test.xml', content='', page_id=None)
- assert f1.url == 'test.tif'
- assert f2.url == 'test.xml'
+ assert not f1.url
+ assert not f2.url
# these should be no-ops
plain_workspace.download_file(f1)
plain_workspace.download_file(f2)
- assert f1.url == 'test.tif'
- assert f2.url == 'test.xml'
+ assert f1.local_filename == Path('test.tif')
+ assert f2.local_filename == Path('test.xml')
def test_save_image_file_invalid_mimetype_raises_exception(plain_workspace):
@@ -510,8 +503,8 @@ def _fixture_workspace_sample_features(tmp_path):
def test_image_feature_selectoro(workspace_sample_features):
# arrange
- with open(join(str(workspace_sample_features.directory), 'image_features.page.xml'), 'r') as f:
- pcgts = parseString(f.read().encode('utf8'))
+ with open(Path(workspace_sample_features.directory) / 'image_features.page.xml', 'r', encoding='utf-8') as f:
+ pcgts = parseString(f.read().encode('utf-8'))
# richest feature set is not last:
_, info, _ = workspace_sample_features.image_from_page(pcgts.get_Page(), page_id='page1', feature_selector='dewarped')
@@ -594,13 +587,13 @@ def test_deskewing(plain_workspace):
def test_downsample_16bit_image(plain_workspace):
# arrange image
- img_path = join(plain_workspace.directory, '16bit.tif')
- with gzip_open(join(dirname(__file__), 'data/OCR-D-IMG_APBB_Mitteilungen_62.0002.tif.gz'), 'rb') as gzip_in:
+ img_path = Path(plain_workspace.directory, '16bit.tif')
+ with gzip_open(Path(__file__).parent / 'data/OCR-D-IMG_APBB_Mitteilungen_62.0002.tif.gz', 'rb') as gzip_in:
with open(img_path, 'wb') as tif_out:
tif_out.write(gzip_in.read())
# act
- plain_workspace.add_file('IMG', file_id='foo', url=img_path, mimetype='image/tiff', page_id=None)
+ plain_workspace.add_file('IMG', file_id='foo', local_filename=img_path, mimetype='image/tiff', page_id=None)
# assert
pil_before = Image.open(img_path)
@@ -620,7 +613,7 @@ def test_mets_permissions(plain_workspace):
assert filemode(stat(mets_path).st_mode) == '-rwxrwxrwx'
-def test_merge(tmp_path):
+def test_merge0(tmp_path):
# arrange
dst_path1 = tmp_path / 'kant_aufklaerung'
@@ -655,16 +648,16 @@ def test_merge_no_copy_files(tmp_path):
ws1 = Resolver().workspace_from_nothing(directory=dst_path1)
ws2 = Resolver().workspace_from_nothing(directory=dst_path2)
- ws2.add_file('GRP2', pageId='p01', mimetype='text/plain', ID='f1', local_filename='GRP2/f1', content='ws2')
+ ws2.add_file('GRP2', page_id='p01', mimetype='text/plain', file_id='f1', local_filename='GRP2/f1', content='ws2')
ws1.merge(ws2, copy_files=False, fileId_mapping={'f1': 'f1_copy_files'})
- assert next(ws1.mets.find_files(ID='f1_copy_files')).url == 'ws2/GRP2/f1'
+ assert next(ws1.mets.find_files(ID='f1_copy_files')).local_filename == Path('ws2/GRP2/f1')
with pytest.raises(FileExistsError):
ws1.merge(ws2, copy_files=True, fileId_mapping={'f1': 'f1_copy_files'})
ws1.merge(ws2, copy_files=True, fileId_mapping={'f1': 'f1_copy_files'}, force=True)
- assert next(ws1.mets.find_files(ID='f1_copy_files')).url == 'GRP2/f1'
+ assert next(ws1.mets.find_files(ID='f1_copy_files')).local_filename == Path('GRP2/f1')
def test_merge_overwrite(tmp_path):
# arrange
@@ -677,8 +670,8 @@ def test_merge_overwrite(tmp_path):
ws2 = Resolver().workspace_from_nothing(directory=dst_path2)
with pytest.raises(Exception) as exc:
- ws1.add_file('X', pageId='X', mimetype='X', ID='id123', local_filename='X/X', content='ws1')
- ws2.add_file('X', pageId='X', mimetype='X', ID='id456', local_filename='X/X', content='ws2')
+ ws1.add_file('X', page_id='X', mimetype='X', file_id='id123', local_filename='X/X', content='ws1')
+ ws2.add_file('X', page_id='X', mimetype='X', file_id='id456', local_filename='X/X', content='ws2')
ws1.merge(ws2)
assert "would overwrite" == str(exc.value)
diff --git a/tests/validator/test_workspace_backup.py b/tests/validator/test_workspace_backup.py
index 03d5aaa34..0d0bc141c 100644
--- a/tests/validator/test_workspace_backup.py
+++ b/tests/validator/test_workspace_backup.py
@@ -46,4 +46,4 @@ def test_backup_mgr(self):
self.mgr.restore(first, choose_first=False)
if __name__ == '__main__':
- main()
+ main(__name__)
diff --git a/tests/validator/test_workspace_bagger.py b/tests/validator/test_workspace_bagger.py
index fc3b03503..837819fdf 100644
--- a/tests/validator/test_workspace_bagger.py
+++ b/tests/validator/test_workspace_bagger.py
@@ -60,7 +60,7 @@ def test_bag_full(self):
f = self.workspace.mets.find_all_files(ID='INPUT_0017')[0]
f.url = 'bad-scheme://foo'
f.local_filename = None
- with self.assertRaisesRegex(Exception, "Not an http URL"):
+ with self.assertRaisesRegex(Exception, "No connection adapters were found for 'bad-scheme://foo'"):
self.bagger.bag(self.workspace, 'kant_aufklaerung_1784', skip_zip=False)
self.bagger.strict = False
@@ -183,4 +183,4 @@ def test_recreate_checksums_zipped(self):
if __name__ == '__main__':
- main()
+ main(__name__)
diff --git a/tests/validator/test_workspace_validator.py b/tests/validator/test_workspace_validator.py
index 9fa353f5e..39c03bd84 100644
--- a/tests/validator/test_workspace_validator.py
+++ b/tests/validator/test_workspace_validator.py
@@ -138,15 +138,18 @@ def test_validate_weird_urls(self):
workspace.save_mets()
report = WorkspaceValidator.validate(self.resolver, join(tempdir, 'mets.xml'), skip=['pixel_density'])
assert not report.is_valid
- assert len(report.errors) == 2
+ assert len(report.errors) == 1
assert "invalid (Java-specific) file URL" in report.errors[0]
+ assert len(report.warnings) == 1
+ assert "non-HTTP" in report.warnings[0]
+ @pytest.mark.skip("Not usable as such anymore, because we properly distinguish .url and .local_filename now. Requires a server to test")
def test_validate_pixel_no_download(self):
imgpath = assets.path_to('kant_aufklaerung_1784-binarized/data/OCR-D-IMG-BIN/BIN_0020.png')
with TemporaryDirectory() as tempdir:
workspace = self.resolver.workspace_from_nothing(directory=tempdir)
workspace.mets.unique_identifier = 'foobar'
- workspace.mets.add_file('OCR-D-GT-BIN', ID='file1', mimetype='image/png', pageId='page1', url=imgpath)
+ workspace.mets.add_file('OCR-D-GT-BIN', ID='file1', mimetype='image/png', pageId='page1', local_filename=imgpath)
workspace.save_mets()
report = WorkspaceValidator.validate(self.resolver, join(tempdir, 'mets.xml'), skip=[], download=False)
self.assertEqual(len(report.errors), 0)
@@ -158,7 +161,7 @@ def test_validate_pixel_density_too_low(self):
with TemporaryDirectory() as tempdir:
workspace = self.resolver.workspace_from_nothing(directory=tempdir)
workspace.mets.unique_identifier = 'foobar'
- workspace.mets.add_file('OCR-D-GT-BIN', ID='file1', mimetype='image/png', pageId='page1', url=imgpath)
+ workspace.mets.add_file('OCR-D-GT-BIN', ID='file1', mimetype='image/png', pageId='page1', local_filename=imgpath)
workspace.save_mets()
report = WorkspaceValidator.validate(self.resolver, join(tempdir, 'mets.xml'), skip=[], download=True)
self.assertEqual(len(report.notices), 2)