diff --git a/Makefile b/Makefile index 6df7d5fd0..e5c9ae202 100644 --- a/Makefile +++ b/Makefile @@ -196,7 +196,8 @@ assets: repo/assets .PHONY: test # Run all unit tests test: assets - $(PYTHON) -m pytest --continue-on-collection-errors --durations=10\ + $(PYTHON) \ + -m pytest $(PYTEST_ARGS) --durations=10\ --ignore=$(TESTDIR)/test_logging.py \ --ignore=$(TESTDIR)/test_logging_conf.py \ --ignore-glob="$(TESTDIR)/**/*bench*.py" \ diff --git a/ocrd/ocrd/cli/bashlib.py b/ocrd/ocrd/cli/bashlib.py index 27f96ca1b..b387c4b5a 100644 --- a/ocrd/ocrd/cli/bashlib.py +++ b/ocrd/ocrd/cli/bashlib.py @@ -104,7 +104,7 @@ def bashlib_input_files(**kwargs): working_dir = kwargs.pop('working_dir') if is_local_filename(mets) and not isfile(get_local_filename(mets)): msg = "File does not exist: %s" % mets - raise Exception(msg) + raise FileNotFoundError(msg) resolver = Resolver() workspace = resolver.workspace_from_url(mets, working_dir) processor = Processor(workspace, @@ -113,11 +113,11 @@ def bashlib_input_files(**kwargs): input_file_grp=kwargs['input_file_grp'], output_file_grp=kwargs['output_file_grp']) for input_files in processor.zip_input_files(mimetype=None, on_error='abort'): - for field in ['url', 'ID', 'mimetype', 'pageId']: + for field in ['url', 'local_filename', 'ID', 'mimetype', 'pageId']: # make this bash-friendly (show initialization for associative array) if len(input_files) > 1: # single quotes allow us to preserve the list value inside the alist - print("[%s]='%s'" % (field, ' '.join(getattr(res, field) for res in input_files)), end=' ') + print("[%s]='%s'" % (field, ' '.join(str(getattr(res, field)) for res in input_files)), end=' ') else: - print("[%s]='%s'" % (field, getattr(input_files[0], field)), end=' ') + print("[%s]='%s'" % (field, str(getattr(input_files[0], field))), end=' ') print("[outputFileId]='%s'" % make_file_id(input_files[0], kwargs['output_file_grp'])) diff --git a/ocrd/ocrd/cli/workspace.py b/ocrd/ocrd/cli/workspace.py index 3585a2826..eb894bf7b 100644 --- a/ocrd/ocrd/cli/workspace.py +++ b/ocrd/ocrd/cli/workspace.py @@ -402,7 +402,7 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, file_grp @workspace_cli.command('find') @mets_find_options @click.option('-k', '--output-field', help="Output field. Repeat for multiple fields, will be joined with tab", - default=['url'], + default=['local_filename'], multiple=True, type=click.Choice([ 'url', @@ -418,9 +418,10 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, file_grp 'local_filename', ])) @click.option('--download', is_flag=True, help="Download found files to workspace and change location in METS file ") +@click.option('--undo-download', is_flag=True, help="Remove all downloaded files from the METS") @click.option('--wait', type=int, default=0, help="Wait this many seconds between download requests") @pass_workspace -def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, download, wait): +def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, download, undo_download, wait): """ Find files. 
@@ -443,13 +444,17 @@ def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, down mimetype=mimetype, page_id=page_id, ): + ret_entry = [f.ID if field == 'pageId' else str(getattr(f, field) or '') for field in output_field] if download and not f.local_filename: workspace.download_file(f) modified_mets = True if wait: time.sleep(wait) - ret.append([f.ID if field == 'pageId' else getattr(f, field) or '' - for field in output_field]) + if undo_download and f.local_filename: + ret_entry = [f'Removed local_filename {f.local_filename}'] + f.local_filename = None + modified_mets = True + ret.append(ret_entry) if modified_mets: workspace.save_mets() if 'pageId' in output_field: diff --git a/ocrd/ocrd/mets_server.py b/ocrd/ocrd/mets_server.py index 8b54371e5..44dec51c6 100644 --- a/ocrd/ocrd/mets_server.py +++ b/ocrd/ocrd/mets_server.py @@ -5,6 +5,7 @@ from os import environ, _exit from io import BytesIO from typing import Any, Dict, Optional, Union, List, Tuple +from pathlib import Path from urllib.parse import urlparse from fastapi import FastAPI, Request, File, Form, Response @@ -31,13 +32,13 @@ class OcrdFileModel(BaseModel): file_grp : str = Field() file_id : str = Field() mimetype : str = Field() - page_id : Union[str, None] = Field() - url : Union[str, None] = Field() - local_filename : Union[str, None] = Field() + page_id : Optional[str] = Field() + url : Optional[str] = Field() + local_filename : Optional[str] = Field() @staticmethod - def create(file_grp : str, file_id : str, page_id : Union[str, None], url : str, local_filename : str, mimetype : str): - return OcrdFileModel(file_grp=file_grp, file_id=file_id, page_id=page_id, mimetype=mimetype, url=url, local_filename=local_filename) + def create(file_grp : str, file_id : str, page_id : Optional[str], url : Optional[str], local_filename : Optional[Union[str, Path]], mimetype : str): + return OcrdFileModel(file_grp=file_grp, file_id=file_id, page_id=page_id, mimetype=mimetype, url=url, local_filename=str(local_filename) if local_filename else None) class OcrdAgentModel(BaseModel): name : str = Field() @@ -57,9 +58,16 @@ class OcrdFileListModel(BaseModel): @staticmethod def create(files : List[OcrdFile]): - return OcrdFileListModel( - files=[OcrdFileModel.create(file_grp=f.fileGrp, file_id=f.ID, mimetype=f.mimetype, page_id=f.pageId, url=f.url, local_filename=f.local_filename) for f in files] - ) + ret = OcrdFileListModel( + files=[OcrdFileModel.create( + file_grp=f.fileGrp, + file_id=f.ID, + mimetype=f.mimetype, + page_id=f.pageId, + url=f.url, + local_filename=f.local_filename + ) for f in files]) + return ret class OcrdFileGroupListModel(BaseModel): file_groups : List[str] = Field() @@ -144,17 +152,14 @@ def file_groups(self): @deprecated_alias(pageId="page_id") @deprecated_alias(ID="file_id") def add_file(self, file_grp, content=None, file_id=None, url=None, local_filename=None, mimetype=None, page_id=None, **kwargs): - self.session.request( - 'POST', - f'{self.url}/file', - data=OcrdFileModel.create( - file_id=file_id, - file_grp=file_grp, - page_id=page_id, - mimetype=mimetype, - url=url, - local_filename=local_filename).dict(), - ) + data = OcrdFileModel.create( + file_id=file_id, + file_grp=file_grp, + page_id=page_id, + mimetype=mimetype, + url=url, + local_filename=local_filename) + r = self.session.request('POST', f'{self.url}/file', data=data.dict()) return ClientSideOcrdFile( None, ID=file_id, @@ -208,10 +213,10 @@ async def exception_handler_invalid_regex(request: Request, exc: re.error): @app.get("/file", 
response_model=OcrdFileListModel) async def find_files( - file_grp : Union[str, None] = None, - file_id : Union[str, None] = None, - page_id : Union[str, None] = None, - mimetype : Union[str, None] = None, + file_grp : Optional[str] = None, + file_id : Optional[str] = None, + page_id : Optional[str] = None, + mimetype : Optional[str] = None, ): """ Find files in the mets @@ -227,10 +232,10 @@ def save(): async def add_file( file_grp : str = Form(), file_id : str = Form(), - page_id : Union[str, None] = Form(), + page_id : Optional[str] = Form(), mimetype : str = Form(), - url : Union[str, None] = Form(), - local_filename : Union[str, None] = Form(), + url : Optional[str] = Form(None), + local_filename : Optional[str] = Form(None), ): """ Add a file diff --git a/ocrd/ocrd/resolver.py b/ocrd/ocrd/resolver.py index 04e7d2683..649a934e5 100644 --- a/ocrd/ocrd/resolver.py +++ b/ocrd/ocrd/resolver.py @@ -27,24 +27,29 @@ class Resolver(): def download_to_directory(self, directory, url, basename=None, if_exists='skip', subdir=None, retries=None, timeout=None): """ - Download a file to a directory. + Download a URL ``url`` to a local file in ``directory``. - Early Shortcut: If `url` is a local file and that file is already in the directory, keep it there. + If ``url`` looks like a file path, check whether that exists. + If it does exist and is within ``directory`` already, return early. + If it does exist but is outside of ``directory``, copy it. + If ``url`` does not appear to be a file path, try downloading via HTTP, retrying ``retries`` times with timeout ``timeout`` between calls. - If `basename` is not given but subdir is, assume user knows what she's doing and - use last URL segment as the basename. + If ``basename`` is not given but ``subdir`` is, set ``basename`` to the last path segment of ``url``. - If `basename` is not given and no subdir is given, use the alnum characters in the URL as the basename. + If the target file already exists within ``directory``, behavior depends on ``if_exists``: + - ``skip`` (default): do nothing and return early + - ``overwrite``: overwrite the existing file + - ``raise``: raise a ``FileExistsError`` Args: directory (string): Directory to download files to url (string): URL to download from Keyword Args: - basename (string, None): basename part of the filename on disk. + basename (string, None): basename part of the filename on disk. Defaults to last path segment of ``url`` if unset. if_exists (string, "skip"): What to do if target file already exists. One of ``skip`` (default), ``overwrite`` or ``raise`` - subdir (string, None): Subdirectory to create within the directory. Think ``mets:fileGrp``. + subdir (string, None): Subdirectory to create within the directory. Think ``mets:fileGrp[@USE]``. retries (int, None): Number of retries to attempt on network failure. timeout (tuple, None): Timeout in seconds for establishing a connection and reading next chunk of data. 
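A minimal usage sketch of the ``Resolver.download_to_directory`` behavior documented above; the directory, URL and fileGrp names here are hypothetical:

    from ocrd import Resolver

    resolver = Resolver()
    # Copies (local path) or downloads (HTTP URL) into 'workspace/OCR-D-IMG' and
    # returns the path relative to the target directory, i.e. 'OCR-D-IMG/INPUT_0017.tif'
    relpath = resolver.download_to_directory(
        'workspace',                           # directory, created if missing
        'https://example.org/INPUT_0017.tif',  # url: remote URL or local file path
        subdir='OCR-D-IMG',                    # think mets:fileGrp[@USE]
        if_exists='skip',                      # or 'overwrite' / 'raise'
    )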
@@ -55,56 +60,57 @@ def download_to_directory(self, directory, url, basename=None, if_exists='skip', log.debug("directory=|%s| url=|%s| basename=|%s| if_exists=|%s| subdir=|%s|", directory, url, basename, if_exists, subdir) if not url: - raise Exception("'url' must be a string") + raise ValueError(f"'url' must be a non-empty string, not '{url}'") # actually Path also ok if not directory: - raise Exception("'directory' must be a string") # actually Path would also work + raise ValueError(f"'directory' must be a non-empty string, not '{directory}'") # actually Path would also work + url = str(url) directory = Path(directory) directory.mkdir(parents=True, exist_ok=True) - directory = str(directory.resolve()) subdir_path = Path(subdir if subdir else '') basename_path = Path(basename if basename else nth_url_segment(url)) - ret = str(Path(subdir_path, basename_path)) + ret = Path(subdir_path, basename_path) dst_path = Path(directory, ret) - # log.info("\n\tdst_path='%s \n\turl=%s", dst_path, url) - # print('url=%s', url) - # print('directory=%s', directory) - # print('subdir_path=%s', subdir_path) - # print('basename_path=%s', basename_path) - # print('ret=%s', ret) - # print('dst_path=%s', dst_path) + # log.info("\n\tdst_path='%s \n\turl=%s", dst_path, url) + # print(f'>>> url={url}') + # print(f'>>> directory={directory}') + # print(f'>>> subdir_path={subdir_path}') + # print(f'>>> basename_path={basename_path}') + # print(f'>>> dst_path={dst_path}') + # print(f'>>> ret={ret}') src_path = None if is_local_filename(url): try: - # XXX this raises FNFE in Python 3.5 if src_path doesn't exist but not 3.6+ src_path = Path(get_local_filename(url)).resolve() except FileNotFoundError as e: log.error("Failed to resolve URL locally: %s --> '%s' which does not exist" % (url, src_path)) raise e if not src_path.exists(): - raise FileNotFoundError("File path passed as 'url' to download_to_directory does not exist: %s" % url) + raise FileNotFoundError(f"File path passed as 'url' to download_to_directory does not exist: '{url}'") if src_path == dst_path: log.debug("Stop early, src_path and dst_path are the same: '%s' (url: '%s')" % (src_path, url)) - return ret + return str(ret) # Respect 'if_exists' arg if dst_path.exists(): if if_exists == 'skip': - return ret + return str(ret) if if_exists == 'raise': - raise FileExistsError("File already exists and if_exists == 'raise': %s" % (dst_path)) + raise FileExistsError(f"File already exists and if_exists == 'raise': {dst_path}") # Create dst_path parent dir dst_path.parent.mkdir(parents=True, exist_ok=True) # Copy files or download remote assets if src_path: + # src_path set, so it is a file source, we can copy directly log.debug("Copying file '%s' to '%s'", src_path, dst_path) dst_path.write_bytes(src_path.read_bytes()) else: + # src_path not set, it's an http URL, try to download log.debug("Downloading URL '%s' to '%s'", url, dst_path) if not retries and config.is_set('OCRD_DOWNLOAD_RETRIES'): retries = config.OCRD_DOWNLOAD_RETRIES @@ -141,7 +147,7 @@ def download_to_directory(self, directory, url, basename=None, if_exists='skip', contents = handle_oai_response(response) dst_path.write_bytes(contents) - return ret + return str(ret) def workspace_from_url( self, @@ -206,7 +212,6 @@ def workspace_from_url( log.debug("workspace_from_url\nmets_basename='%s'\nmets_url='%s'\nsrc_baseurl='%s'\ndst_dir='%s'", mets_basename, mets_url, src_baseurl, dst_dir) - self.download_to_directory(dst_dir, mets_url, basename=mets_basename, if_exists='overwrite' if clobber_mets else 
'skip') workspace = Workspace(self, dst_dir, mets_basename=mets_basename, baseurl=src_baseurl, mets_server_url=mets_server_url) diff --git a/ocrd/ocrd/workspace.py b/ocrd/ocrd/workspace.py index dba97d952..bdbc4e7b5 100644 --- a/ocrd/ocrd/workspace.py +++ b/ocrd/ocrd/workspace.py @@ -123,18 +123,21 @@ def merge(self, other_workspace, copy_files=True, overwrite=False, **kwargs): """ def after_add_cb(f): """callback to run on merged OcrdFile instances in the destination""" + print(f) + if not f.local_filename: + # OcrdFile has no local_filename, so nothing to be copied + return if not copy_files: fpath_src = Path(other_workspace.directory).resolve() fpath_dst = Path(self.directory).resolve() dstprefix = fpath_src.relative_to(fpath_dst) # raises ValueError if not a subpath - if is_local_filename(f.url): - f.url = str(Path(dstprefix, f.url)) + f.local_filename = dstprefix / f.local_filename return - fpath_src = Path(other_workspace.directory, f.url) - fpath_dest = Path(self.directory, f.url) + fpath_src = Path(other_workspace.directory, f.local_filename) + fpath_dest = Path(self.directory, f.local_filename) if fpath_src.exists(): if fpath_dest.exists() and not overwrite: - raise Exception("Copying %s to %s would overwrite the latter" % (fpath_src, fpath_dest)) + raise FileExistsError("Copying %s to %s would overwrite the latter" % (fpath_src, fpath_dest)) if not fpath_dest.parent.is_dir(): makedirs(str(fpath_dest.parent)) with open(str(fpath_src), 'rb') as fstream_in, open(str(fpath_dest), 'wb') as fstream_out: @@ -173,26 +176,36 @@ def download_file(self, f, _recursion_count=0): Download a :py:class:`ocrd_models.ocrd_file.OcrdFile` to the workspace. """ log = getLogger('ocrd.workspace.download_file') - log.debug('download_file %s [_recursion_count=%s]' % (f, _recursion_count)) with pushd_popd(self.directory): - try: - # If the f.url is already a file path, and is within self.directory, do nothing - url_path = Path(f.url).absolute() - if not (url_path.exists() and url_path.relative_to(str(Path(self.directory).resolve()))): - raise Exception("Not already downloaded, moving on") - except Exception as e: + print(f) + if f.local_filename: + file_path = Path(f.local_filename).absolute() + if file_path.exists(): + try: + file_path.relative_to(Path(self.directory).resolve()) # raises ValueError if not relative + # If the f.local_filename exists and is within self.directory, nothing to do + log.info(f"'local_filename' {f.local_filename} already within {self.directory}, nothing to do") + except ValueError: + # f.local_filename exists, but not within self.directory, copy it + log.info("Copying 'local_filename' %s to workspace directory %s" % (f.local_filename, self.directory)) + f.local_filename = self.resolver.download_to_directory(self.directory, f.local_filename, subdir=f.fileGrp) + return f + if f.url: + log.info("OcrdFile has 'local_filename' but it doesn't resolve, try to download from set 'url' %s", f.url) + elif self.baseurl: + log.info("OcrdFile has 'local_filename' but it doesn't resolve and no 'url', concatenate 'baseurl' %s and 'local_filename' %s", + self.baseurl, f.local_filename) + f.url = '%s/%s' % (self.baseurl, f.local_filename) + else: + raise FileNotFoundError(f"'local_filename' {f.local_filename} points to non-existing file," + "and no 'url' to download and no 'baseurl' set on workspace, nothing we can do.") + if f.url: + # If f.url is set, download the file to the workspace basename = '%s%s' % (f.ID, MIME_TO_EXT.get(f.mimetype, '')) if f.ID else f.basename - try: - f.url = 
self.resolver.download_to_directory(self.directory, f.url, subdir=f.fileGrp, basename=basename) - except FileNotFoundError as e: - if not self.baseurl: - raise Exception("No baseurl defined by workspace. Cannot retrieve '%s'" % f.url) - if _recursion_count >= 1: - raise FileNotFoundError("Already tried prepending baseurl '%s'. Cannot retrieve '%s'" % (self.baseurl, f.url)) - log.debug("First run of resolver.download_to_directory(%s) failed, try prepending baseurl '%s': %s", f.url, self.baseurl, e) - f.url = '%s/%s' % (self.baseurl, f.url) - f.url = self.download_file(f, _recursion_count + 1).local_filename - f.local_filename = f.url + f.local_filename = self.resolver.download_to_directory(self.directory, f.url, subdir=f.fileGrp, basename=basename) + else: + # If neither f.local_filename nor f.url is set, fail + raise ValueError(f"OcrdFile {f} has neither 'url' nor 'local_filename', so cannot be downloaded") return f def remove_file(self, file_id, force=False, keep_file=False, page_recursive=False, page_same_group=False): @@ -212,7 +225,7 @@ def remove_file(self, file_id, force=False, keep_file=False, page_recursive=Fals """ log = getLogger('ocrd.workspace.remove_file') log.debug('Deleting mets:file %s', file_id) - if not force and self.overwrite_mode: + if self.overwrite_mode: force = True if isinstance(file_id, OcrdFile): file_id = file_id.ID @@ -228,7 +241,7 @@ def remove_file(self, file_id, force=False, keep_file=False, page_recursive=Fals with pushd_popd(self.directory): ocrd_page = parse(self.download_file(ocrd_file).local_filename, silence=True) for img_url in ocrd_page.get_AllAlternativeImagePaths(): - img_kwargs = {'url': img_url} + img_kwargs = {'local_filename': img_url} if page_same_group: img_kwargs['fileGrp'] = ocrd_file.fileGrp for img_file in self.mets.find_files(**img_kwargs): @@ -304,34 +317,34 @@ def rename_file_group(self, old, new): log = getLogger('ocrd.workspace.rename_file_group') if old not in self.mets.file_groups: - raise ValueError("No such fileGrp: %s" % old) + raise ValueError(f"No such fileGrp: {old}") if new in self.mets.file_groups: - raise ValueError("fileGrp already exists %s" % new) + raise ValueError(f"fileGrp already exists {new}") with pushd_popd(self.directory): # create workspace dir ``new`` log.info("mkdir %s" % new) if not Path(new).is_dir(): Path(new).mkdir() - url_replacements = {} + local_filename_replacements = {} log.info("Moving files") for mets_file in self.mets.find_files(fileGrp=old, local_only=True): - new_url = old_url = mets_file.url + new_local_filename = old_local_filename = str(mets_file.local_filename) # Directory part - new_url = sub(r'^%s/' % old, r'%s/' % new, new_url) + new_local_filename = sub(r'^%s/' % old, r'%s/' % new, new_local_filename) # File part - new_url = sub(r'/%s' % old, r'/%s' % new, new_url) - url_replacements[mets_file.url] = new_url + new_local_filename = sub(r'/%s' % old, r'/%s' % new, new_local_filename) + local_filename_replacements[str(mets_file.local_filename)] = new_local_filename # move file from ``old`` to ``new`` - move(mets_file.url, new_url) + mets_file.local_filename.rename(new_local_filename) # change the url of ``mets:file`` - mets_file.url = new_url + mets_file.local_filename = new_local_filename # change the file ID and update structMap # change the file ID and update structMap new_id = sub(r'^%s' % old, r'%s' % new, mets_file.ID) try: next(self.mets.find_files(ID=new_id)) - log.warning("ID %s already exists, not changing ID while renaming %s -> %s" % (new_id, old_url, new_url)) + log.warning("ID 
%s already exists, not changing ID while renaming %s -> %s" % (new_id, old_local_filename, new_local_filename)) except StopIteration: mets_file.ID = new_id # change file paths in PAGE-XML imageFilename and filename attributes @@ -339,17 +352,17 @@ def rename_file_group(self, old, new): log.info("Renaming file references in PAGE-XML %s" % page_file) pcgts = page_from_file(page_file) changed = False - for old_url, new_url in url_replacements.items(): - if pcgts.get_Page().imageFilename == old_url: + for old_local_filename, new_local_filename in local_filename_replacements.items(): + if pcgts.get_Page().imageFilename == old_local_filename: changed = True - log.info("Rename pc:Page/@imageFilename: %s -> %s" % (old_url, new_url)) - pcgts.get_Page().imageFilename = new_url + log.info("Rename pc:Page/@imageFilename: %s -> %s" % (old_local_filename, new_local_filename)) + pcgts.get_Page().imageFilename = new_local_filename for ai in pcgts.get_Page().get_AllAlternativeImages(): - for old_url, new_url in url_replacements.items(): - if ai.filename == old_url: + for old_local_filename, new_local_filename in local_filename_replacements.items(): + if ai.filename == old_local_filename: changed = True - log.info("Rename pc:Page/../AlternativeImage: %s -> %s" % (old_url, new_url)) - ai.filename = new_url + log.info("Rename pc:Page/../AlternativeImage: %s -> %s" % (old_local_filename, new_local_filename)) + ai.filename = new_local_filename if changed: log.info("PAGE-XML changed, writing %s" % (page_file.local_filename)) with open(page_file.local_filename, 'w', encoding='utf-8') as f: @@ -392,11 +405,9 @@ def add_file(self, file_grp, content=None, **kwargs): with pushd_popd(self.directory): if kwargs.get('local_filename'): # If the local filename has folder components, create those folders - local_filename_dir = kwargs['local_filename'].rsplit('/', 1)[0] - if local_filename_dir != kwargs['local_filename'] and not Path(local_filename_dir).is_dir(): + local_filename_dir = str(kwargs['local_filename']).rsplit('/', 1)[0] + if local_filename_dir != str(kwargs['local_filename']) and not Path(local_filename_dir).is_dir(): makedirs(local_filename_dir) - if 'url' not in kwargs: - kwargs['url'] = kwargs['local_filename'] # print(kwargs) kwargs["pageId"] = kwargs.pop("page_id") @@ -441,15 +452,17 @@ def resolve_image_exif(self, image_url): """ if not image_url: # avoid "finding" just any file - raise Exception("Cannot resolve empty image path") + raise ValueError(f"'image_url' must be a non-empty string, not '{image_url}' ({type(image_url)})") try: - f = next(self.mets.find_files(url=image_url)) - image_filename = self.download_file(f).local_filename - ocrd_exif = exif_from_filename(image_filename) + f = next(self.mets.find_files(local_filename=str(image_url))) + return exif_from_filename(f.local_filename) except StopIteration: - with download_temporary_file(image_url) as f: - ocrd_exif = exif_from_filename(f.name) - return ocrd_exif + try: + f = next(self.mets.find_files(url=str(image_url))) + return exif_from_filename(self.download_file(f).local_filename) + except StopIteration: + with download_temporary_file(image_url) as f: + return exif_from_filename(f.name) @deprecated(version='1.0.0', reason="Use workspace.image_from_page and workspace.image_from_segment") def resolve_image_as_pil(self, image_url, coords=None): @@ -474,11 +487,15 @@ def _resolve_image_as_pil(self, image_url, coords=None): log = getLogger('ocrd.workspace._resolve_image_as_pil') with pushd_popd(self.directory): try: - f = 
next(self.mets.find_files(url=image_url)) - pil_image = Image.open(self.download_file(f).local_filename) + f = next(self.mets.find_files(local_filename=str(image_url))) + pil_image = Image.open(f.local_filename) except StopIteration: - with download_temporary_file(image_url) as f: - pil_image = Image.open(f.name) + try: + f = next(self.mets.find_files(url=str(image_url))) + pil_image = Image.open(self.download_file(f).local_filename) + except StopIteration: + with download_temporary_file(image_url) as f: + pil_image = Image.open(f.name) pil_image.load() # alloc and give up the FD # Pillow does not properly support higher color depths @@ -1039,7 +1056,7 @@ def save_image_file(self, image, The (absolute) path of the created file. """ log = getLogger('ocrd.workspace.save_image_file') - if not force and self.overwrite_mode: + if self.overwrite_mode: force = True image_bytes = io.BytesIO() image.save(image_bytes, format=MIME_TO_PIL[mimetype]) diff --git a/ocrd/ocrd/workspace_bagger.py b/ocrd/ocrd/workspace_bagger.py index 447f33d13..4c6129d89 100644 --- a/ocrd/ocrd/workspace_bagger.py +++ b/ocrd/ocrd/workspace_bagger.py @@ -14,6 +14,7 @@ from ocrd_utils import ( pushd_popd, getLogger, + MIME_TO_EXT, is_local_filename, unzip_file_to_dir, @@ -57,31 +58,28 @@ def _log_or_raise(self, msg): def _bag_mets_files(self, workspace, bagdir, ocrd_mets, processes): mets = workspace.mets - changed_urls = {} + changed_local_filenames = {} log = getLogger('ocrd.workspace_bagger') # TODO allow filtering by fileGrp@USE and such with pushd_popd(workspace.directory): - # URLs of the files before changing + # local_filenames of the files before changing for f in mets.find_files(): - log.info("Resolving %s", f.url) - if is_local_filename(f.url): - # nothing to do then - pass - elif not f.url.startswith('http'): - self._log_or_raise("Not an http URL: %s" % f.url) - continue - log.info("Resolved %s", f.url) - - file_grp_dir = join(bagdir, 'data', f.fileGrp) - if not isdir(file_grp_dir): - makedirs(file_grp_dir) - - _basename = "%s%s" % (f.ID, f.extension) - _relpath = join(f.fileGrp, _basename) - self.resolver.download_to_directory(file_grp_dir, f.url, basename=_basename) - changed_urls[f.url] = _relpath - f.url = _relpath + log.info("Handling OcrdFile %s", f) + + file_grp_dir = Path(bagdir, 'data', f.fileGrp) + if not file_grp_dir.is_dir(): + file_grp_dir.mkdir() + + if f.local_filename: + _relpath = join(f.fileGrp, f.basename) + self.resolver.download_to_directory(file_grp_dir, f.local_filename, basename=f.basename) + changed_local_filenames[str(f.local_filename)] = _relpath + else: + _relpath = join(f.fileGrp, f"{f.ID}{MIME_TO_EXT[f.mimetype]}") + self.resolver.download_to_directory(file_grp_dir, f.url, basename=f.basename) + changed_local_filenames[f.url] = _relpath + f.local_filename = _relpath # save mets.xml with open(join(bagdir, 'data', ocrd_mets), 'wb') as f: @@ -95,21 +93,19 @@ def _bag_mets_files(self, workspace, bagdir, ocrd_mets, processes): for page_file in bag_workspace.mets.find_files(mimetype=MIMETYPE_PAGE): pcgts = page_from_file(page_file) changed = False - # page_doc.set(imageFileName - # for old, new in changed_urls: - for old, new in changed_urls.items(): + for old, new in changed_local_filenames.items(): if pcgts.get_Page().imageFilename == old: pcgts.get_Page().imageFilename = new changed = True # TODO replace AlternativeImage, recursively... 
if changed: - with open(page_file.url, 'w') as out: + with open(page_file.local_filename, 'w') as out: out.write(to_xml(pcgts)) # log.info("Replace %s -> %s in %s" % (old, new, page_file)) - chdir(bagdir) - total_bytes, total_files = make_manifests('data', processes, algorithms=['sha512']) - log.info("New vs. old: %s" % changed_urls) + with pushd_popd(bagdir): + total_bytes, total_files = make_manifests('data', processes, algorithms=['sha512']) + log.info("New vs. old: %s" % changed_local_filenames) return total_bytes, total_files def _set_bag_info(self, bag, total_bytes, total_files, ocrd_identifier, ocrd_base_version_checksum, ocrd_mets='mets.xml'): diff --git a/ocrd_modelfactory/ocrd_modelfactory/__init__.py b/ocrd_modelfactory/ocrd_modelfactory/__init__.py index 453d46e89..e345ee061 100644 --- a/ocrd_modelfactory/ocrd_modelfactory/__init__.py +++ b/ocrd_modelfactory/ocrd_modelfactory/__init__.py @@ -67,7 +67,7 @@ def page_from_image(input_file, with_tree=False): imageWidth=exif.width, imageHeight=exif.height, # XXX brittle - imageFilename=input_file.url if input_file.url is not None else input_file.local_filename + imageFilename=str(input_file.local_filename) if input_file.local_filename else input_file.url ), pcGtsId=input_file.ID ) diff --git a/ocrd_models/ocrd_models/ocrd_file.py b/ocrd_models/ocrd_models/ocrd_file.py index a28b97db8..e8205a33d 100644 --- a/ocrd_models/ocrd_models/ocrd_file.py +++ b/ocrd_models/ocrd_models/ocrd_file.py @@ -2,8 +2,9 @@ API to ``mets:file`` """ from os.path import splitext, basename +from pathlib import Path -from ocrd_utils import is_local_filename, get_local_filename, MIME_TO_EXT, EXT_TO_MIME +from ocrd_utils import deprecation_warning from .ocrd_xml_base import ET from .constants import NAMESPACES as NS, TAG_METS_FLOCAT, TAG_METS_FILE @@ -13,11 +14,7 @@ class OcrdFile(): Represents a single ``mets:file/mets:FLocat`` (METS file entry). """ - # @staticmethod - # def create(mimetype, ID, url, local_filename): - # el_fileGrp.SubElement('file') - - def __init__(self, el, mimetype=None, pageId=None, loctype='OTHER', local_filename=None, mets=None, url=None, ID=None): + def __init__(self, el, mimetype=None, pageId=None, local_filename=None, mets=None, url=None, ID=None, loctype=None): """ Args: el (LxmlElement): etree Element of the ``mets:file`` this represents. Create new if not provided @@ -25,28 +22,27 @@ def __init__(self, el, mimetype=None, pageId=None, loctype='OTHER', local_filena mets (OcrdMets): Containing :py:class:`ocrd_models.ocrd_mets.OcrdMets`. 
mimetype (string): ``@MIMETYPE`` of this ``mets:file`` pageId (string): ``@ID`` of the physical ``mets:structMap`` entry corresponding to this ``mets:file`` - loctype (string): ``@LOCTYPE`` of this ``mets:file`` local_filename (string): Local filename - url (string): ``@xlink:href`` of this ``mets:file`` + url (string): original ``@xlink:href`` of this ``mets:file`` + local_filename (Path): ``@xlink:href`` pointing to the locally cached version of the file in the workspace ID (string): ``@ID`` of this ``mets:file`` + loctype (string): DEPRECATED do not use """ if el is None: raise ValueError("Must provide mets:file element this OcrdFile represents") + if loctype: + deprecation_warning("'loctype' is not supported in OcrdFile anymore, use 'url' or 'local_filename'") self._el = el self.mets = mets self.ID = ID self.mimetype = mimetype - self.local_filename = local_filename - self.loctype = loctype self.pageId = pageId + if local_filename: + self.local_filename = Path(local_filename) if url: self.url = url - if not(local_filename): - if self.url and is_local_filename(self.url): - self.local_filename = get_local_filename(self.url) - def __str__(self): """ String representation of this ``mets:file``. @@ -57,7 +53,7 @@ def __str__(self): # ]) # return 'OcrdFile[' + '\n\t' + props + '\n\t]' props = ', '.join([ - '='.join([k, getattr(self, k) if getattr(self, k) else '---']) + '='.join([k, str(getattr(self, k)) if getattr(self, k) else '---']) for k in ['ID', 'mimetype', 'url', 'local_filename'] ]) try: @@ -67,34 +63,35 @@ def __str__(self): return ' ' % (fileGrp, props) def __eq__(self, other): - return self.ID == other.ID # and \ - # self.url == other.url and \ + return self.ID == other.ID \ + and self.url == other.url \ + and self.local_filename == other.local_filename # EXT_TO_MIME[MIME_TO_EXT[self.mimetype]] == EXT_TO_MIME[MIME_TO_EXT[other.mimetype]] and \ # self.fileGrp == other.fileGrp @property def basename(self): """ - Get the ``os.path.basename`` of the local file, if any. + Get the ``.name`` of the local file """ - return basename(self.local_filename if self.local_filename else self.url) + if not self.local_filename: + return + return self.local_filename.name @property def extension(self): - _basename, ext = splitext(self.basename) - if _basename.endswith('.tar'): - ext = ".tar" + ext - return ext + if not self.local_filename: + return + return ''.join(self.local_filename.suffixes) @property def basename_without_extension(self): """ Get the ``os.path.basename`` of the local file, if any, with extension removed. """ - ret = self.basename.rsplit('.', 1)[0] - if ret.endswith('.tar'): - ret = ret[0:len(ret)-4] - return ret + if not self.local_filename: + return + return self.local_filename.name[:-len(self.extension)] @property def ID(self): @@ -139,46 +136,11 @@ def pageId(self, pageId): self.mets.set_physical_page_for_file(pageId, self) @property - def loctype(self): + def loctypes(self): """ - Get the ``@LOCTYPE`` of the ``mets:file``. + Get the ``@LOCTYPE``s of the ``mets:file``. """ - el_FLocat = self._el.find('mets:FLocat', NS) - return '' if el_FLocat is None else el_FLocat.get('LOCTYPE') - - @loctype.setter - def loctype(self, loctype): - """ - Set the ``@LOCTYPE`` of the ``mets:file`` to :py:attr:`loctype`. 
- """ - if loctype is None: - return - loctype = loctype.upper() - el_FLocat = self._el.find('mets:FLocat', NS) - if el_FLocat is None: - el_FLocat = ET.SubElement(self._el, TAG_METS_FLOCAT) - el_FLocat.set('LOCTYPE', loctype) - if loctype == 'OTHER': - self.otherloctype = 'FILE' - else: - self.otherloctype = None - - @property - def otherloctype(self): - el_FLocat = self._el.find('mets:FLocat', NS) - return '' if el_FLocat is None else el_FLocat.get('OTHERLOCTYPE') - - @otherloctype.setter - def otherloctype(self, otherloctype): - el_FLocat = self._el.find('mets:FLocat', NS) - if el_FLocat is None: - el_FLocat = ET.SubElement(self._el, TAG_METS_FLOCAT) - if not otherloctype: - if 'OTHERLOCTYPE' in el_FLocat.attrib: - del el_FLocat.attrib['OTHERLOCTYPE'] - else: - el_FLocat.set('LOCTYPE', 'OTHER') - el_FLocat.set('OTHERLOCTYPE', otherloctype) + return [x.get('LOCTYPE') for x in self._el.findall('mets:FLocat', NS)] @property def mimetype(self): @@ -209,9 +171,9 @@ def fileGrp(self): @property def url(self): """ - Get the ``@xlink:href`` of this ``mets:file``. + Get the remote/original URL ``@xlink:href`` of this ``mets:file``. """ - el_FLocat = self._el.find(TAG_METS_FLOCAT) + el_FLocat = self._el.find('mets:FLocat[@LOCTYPE="URL"]', NS) if el_FLocat is not None: return el_FLocat.get("{%s}href" % NS["xlink"]) return '' @@ -219,14 +181,42 @@ def url(self): @url.setter def url(self, url): """ - Set the ``@xlink:href`` of this ``mets:file`` to :py:attr:`url`. + Set the remote/original URL ``@xlink:href`` of this ``mets:file`` to :py:attr:`url`. """ + el_FLocat = self._el.find('mets:FLocat[@LOCTYPE="URL"]', NS) if url is None: + if el_FLocat: + self._el.remove(el_FLocat) return - el_FLocat = self._el.find('mets:FLocat', NS) if el_FLocat is None: el_FLocat = ET.SubElement(self._el, TAG_METS_FLOCAT) el_FLocat.set("{%s}href" % NS["xlink"], url) + el_FLocat.set("LOCTYPE", "URL") + + @property + def local_filename(self): + """ + Get the local/cached ``@xlink:href`` of this ``mets:file``. + """ + el_FLocat = self._el.find('mets:FLocat[@LOCTYPE="OTHER"][@OTHERLOCTYPE="FILE"]', NS) + if el_FLocat is not None: + return Path(el_FLocat.get("{%s}href" % NS["xlink"])) + + @local_filename.setter + def local_filename(self, fname): + """ + Set the local/cached ``@xlink:href`` of this ``mets:file`` to :py:attr:`local_filename`. 
+ """ + el_FLocat = self._el.find('mets:FLocat[@LOCTYPE="OTHER"][@OTHERLOCTYPE="FILE"]', NS) + if not fname: + if el_FLocat is not None: + self._el.remove(el_FLocat) + return + if el_FLocat is None: + el_FLocat = ET.SubElement(self._el, TAG_METS_FLOCAT) + el_FLocat.set("{%s}href" % NS["xlink"], str(fname)) + el_FLocat.set("LOCTYPE", "OTHER") + el_FLocat.set("OTHERLOCTYPE", "FILE") class ClientSideOcrdFile: diff --git a/ocrd_models/ocrd_models/ocrd_mets.py b/ocrd_models/ocrd_models/ocrd_mets.py index ba2125465..92bf019d9 100644 --- a/ocrd_models/ocrd_models/ocrd_mets.py +++ b/ocrd_models/ocrd_models/ocrd_mets.py @@ -8,8 +8,8 @@ from copy import deepcopy from ocrd_utils import ( - is_local_filename, getLogger, + deprecation_warning, generate_range, VERSION, REGEX_PREFIX, @@ -242,7 +242,7 @@ def find_all_files(self, *args, **kwargs): return list(self.find_files(*args, **kwargs)) # pylint: disable=multiple-statements - def find_files(self, ID=None, fileGrp=None, pageId=None, mimetype=None, url=None, local_only=False): + def find_files(self, ID=None, fileGrp=None, pageId=None, mimetype=None, url=None, local_filename=None, local_only=False): """ Search ``mets:file`` entries in this METS document and yield results. The :py:attr:`ID`, :py:attr:`pageId`, :py:attr:`fileGrp`, @@ -259,7 +259,8 @@ def find_files(self, ID=None, fileGrp=None, pageId=None, mimetype=None, url=None ID (string) : ``@ID`` of the ``mets:file`` fileGrp (string) : ``@USE`` of the ``mets:fileGrp`` to list files of pageId (string) : ``@ID`` of the corresponding physical ``mets:structMap`` entry (physical page) - url (string) : ``@xlink:href`` (URL or path) of ``mets:Flocat`` of ``mets:file`` + url (string) : ``@xlink:href`` remote/original URL of ``mets:Flocat`` of ``mets:file`` + local_filename (string) : ``@xlink:href`` local/cached filename of ``mets:Flocat`` of ``mets:file`` mimetype (string) : ``@MIMETYPE`` of ``mets:file`` local (boolean) : Whether to restrict results to local files in the filesystem Yields: @@ -331,7 +332,7 @@ def find_files(self, ID=None, fileGrp=None, pageId=None, mimetype=None, url=None if not mimetype.fullmatch(cand.get('MIMETYPE') or ''): continue if url: - cand_locat = cand.find('mets:FLocat', namespaces=NS) + cand_locat = cand.find('mets:FLocat[@LOCTYPE="URL"]', namespaces=NS) if cand_locat is None: continue cand_url = cand_locat.get('{%s}href' % NS['xlink']) @@ -340,14 +341,23 @@ def find_files(self, ID=None, fileGrp=None, pageId=None, mimetype=None, url=None else: if not url.fullmatch(cand_url): continue - # Note: why we instantiate a class only to find out that the local_only is set afterwards - # Checking local_only and url before instantiation should be better? 
- f = OcrdFile(cand, mets=self, loctype=cand.get('LOCTYPE')) + if local_filename: + cand_locat = cand.find('mets:FLocat[@LOCTYPE="OTHER"][@OTHERLOCTYPE="FILE"]', namespaces=NS) + if cand_locat is None: + continue + cand_local_filename = cand_locat.get('{%s}href' % NS['xlink']) + if isinstance(local_filename, str): + if cand_local_filename != local_filename: continue + else: + if not local_filename.fullmatch(cand_local_filename): continue - # If only local resources should be returned and f is not a file path: skip the file - if local_only and not is_local_filename(f.url): - continue - yield f + if local_only: + # deprecation_warning("'local_only' is deprecated, use 'local_filename=\"//.+\"' instead") + is_local = cand.find('mets:FLocat[@LOCTYPE="OTHER"][@OTHERLOCTYPE="FILE"][@xlink:href]', namespaces=NS) + if is_local is None: + continue + + yield OcrdFile(cand, mets=self) def add_file_group(self, fileGrp): """ @@ -356,7 +366,7 @@ def add_file_group(self, fileGrp): fileGrp (string): ``@USE`` of the new ``mets:fileGrp``. """ if ',' in fileGrp: - raise Exception('fileGrp must not contain commas') + raise ValueError('fileGrp must not contain commas') el_fileSec = self._tree.getroot().find('mets:fileSec', NS) if el_fileSec is None: el_fileSec = ET.SubElement(self._tree.getroot(), TAG_METS_FILESEC) @@ -364,11 +374,11 @@ def add_file_group(self, fileGrp): if el_fileGrp is None: el_fileGrp = ET.SubElement(el_fileSec, TAG_METS_FILEGRP) el_fileGrp.set('USE', fileGrp) - + if self._cache_flag: # Assign an empty dictionary that will hold the files of the added fileGrp self._file_cache[fileGrp] = {} - + return el_fileGrp def rename_file_group(self, old, new): @@ -750,6 +760,7 @@ def merge(self, other_mets, force=False, fileGrp_mapping=None, fileId_mapping=No fileGrp_mapping.get(f_src.fileGrp, f_src.fileGrp), mimetype=f_src.mimetype, url=f_src.url, + local_filename=f_src.local_filename, ID=fileId_mapping.get(f_src.ID, f_src.ID), pageId=pageId_mapping.get(f_src.pageId, f_src.pageId), force=force) diff --git a/ocrd_utils/ocrd_utils/__init__.py b/ocrd_utils/ocrd_utils/__init__.py index 1f17f1e7b..33352693b 100644 --- a/ocrd_utils/ocrd_utils/__init__.py +++ b/ocrd_utils/ocrd_utils/__init__.py @@ -99,6 +99,8 @@ Exports of :py:mod:`ocrd_utils.logging` * :py:func:`deprecated_alias` +* :py:func:`rename_kwargs` +* :py:func:`deprecation_warning` Decorator to mark a kwarg as deprecated """ @@ -118,7 +120,9 @@ ) from .deprecate import ( - deprecated_alias) + deprecated_alias, + rename_kwargs, + deprecation_warning) from .image import ( adjust_canvas_to_rotation, diff --git a/ocrd_utils/ocrd_utils/deprecate.py b/ocrd_utils/ocrd_utils/deprecate.py index 6688fde5f..d17efeb58 100644 --- a/ocrd_utils/ocrd_utils/deprecate.py +++ b/ocrd_utils/ocrd_utils/deprecate.py @@ -1,10 +1,8 @@ import functools import warnings -""" -https://stackoverflow.com/questions/49802412/how-to-implement-deprecation-in-python-with-argument-alias -by user2357112 supports Monica -""" +def deprecation_warning(msg, stacklevel=2): + warnings.warn(msg, DeprecationWarning, stacklevel) def deprecated_alias(**aliases): """ @@ -19,6 +17,10 @@ def wrapper(*args, **kwargs): return deco def rename_kwargs(func_name, kwargs, aliases): + """ + https://stackoverflow.com/questions/49802412/how-to-implement-deprecation-in-python-with-argument-alias + by user2357112 supports Monica + """ for alias, new in aliases.items(): if alias in kwargs: if new in kwargs: diff --git a/ocrd_utils/ocrd_utils/str.py b/ocrd_utils/ocrd_utils/str.py index 7211699f0..3c044e35b 100644 
--- a/ocrd_utils/ocrd_utils/str.py +++ b/ocrd_utils/ocrd_utils/str.py @@ -5,6 +5,7 @@ import re import json from .constants import REGEX_FILE_ID +from .deprecate import deprecation_warning __all__ = [ 'assert_file_grp_cardinality', @@ -119,7 +120,7 @@ def get_local_filename(url, start=None): start (string): Base path to remove from filename. Raise an exception if not a prefix of url """ if url.startswith('https://') or url.startswith('http:'): - raise Exception("Can't determine local filename of http(s) URL") + raise ValueError("Can't determine local filename of http(s) URL") if url.startswith('file://'): url = url[len('file://'):] # Goobi/Kitodo produces those, they are always absolute @@ -127,7 +128,7 @@ def get_local_filename(url, start=None): url = url[len('file:'):] if start: if not url.startswith(start): - raise Exception("Cannot remove prefix %s from url %s" % (start, url)) + raise ValueError("Cannot remove prefix %s from url %s" % (start, url)) if not start.endswith('/'): start += '/' url = url[len(start):] @@ -137,6 +138,7 @@ def is_local_filename(url): """ Whether a url is a local filename. """ + # deprecation_warning("Deprecated so we spot inconsistent URL/file handling") return url.startswith('file://') or not('://' in url) def is_string(val): diff --git a/ocrd_validators/ocrd_validators/workspace_validator.py b/ocrd_validators/ocrd_validators/workspace_validator.py index 5a408b2f9..22a34c723 100644 --- a/ocrd_validators/ocrd_validators/workspace_validator.py +++ b/ocrd_validators/ocrd_validators/workspace_validator.py @@ -185,16 +185,16 @@ def _validate_imagefilename(self): """ self.log.debug('_validate_imagefilename') for f in self.mets.find_files(mimetype=MIMETYPE_PAGE): - if not is_local_filename(f.url) and not self.download: - self.log.warning("Won't download remote PAGE XML '%s'", f.url) + if not f.local_filename and not self.download: + self.log.warning("Not available locally and 'download' is not set: %s", f) continue self.workspace.download_file(f) page = page_from_file(f).get_Page() imageFilename = page.imageFilename if not self.mets.find_files(url=imageFilename): - self.report.add_error("PAGE-XML %s : imageFilename '%s' not found in METS" % (f.url, imageFilename)) + self.report.add_error("PAGE-XML %s : imageFilename '%s' not found in METS" % (f.local_filename, imageFilename)) if is_local_filename(imageFilename) and not Path(imageFilename).exists(): - self.report.add_warning("PAGE-XML %s : imageFilename '%s' points to non-existent local file" % (f.url, imageFilename)) + self.report.add_warning("PAGE-XML %s : imageFilename '%s' points to non-existent local file" % (f.local_filename, imageFilename)) def _validate_dimension(self): """ @@ -202,8 +202,8 @@ def _validate_dimension(self): """ self.log.info('_validate_dimension') for f in self.mets.find_files(mimetype=MIMETYPE_PAGE): - if not is_local_filename(f.url) and not self.download: - self.log.warning("Won't download remote PAGE XML '%s'", f.url) + if not f.local_filename and not self.download: + self.log.warning("Not available locally and 'download' is not set: %s", f) continue self.workspace.download_file(f) page = page_from_file(f).get_Page() @@ -221,16 +221,16 @@ def _validate_multipage(self): """ self.log.debug('_validate_multipage') for f in self.mets.find_files(mimetype='//image/.*'): - if not is_local_filename(f.url) and not self.download: - self.log.warning("Won't download remote image '%s'", f.url) + if not f.local_filename and not self.download: + self.log.warning("Not available locally and 'download' is not 
set: %s", f) continue self.workspace.download_file(f) try: - exif = self.workspace.resolve_image_exif(f.url) + exif = self.workspace.resolve_image_exif(f.local_filename) if exif.n_frames > 1: self.report.add_error("Image %s: More than 1 frame: %s" % (f.ID, exif.n_frames)) except FileNotFoundError: - self.report.add_error("Image %s: Could not retrieve %s" % (f.ID, f.url)) + self.report.add_error("Image %s: Could not retrieve %s (local_filename=%s, url=%s)" % (f.ID, f.local_filename, f.url)) return def _validate_pixel_density(self): @@ -241,11 +241,11 @@ def _validate_pixel_density(self): """ self.log.debug('_validate_pixel_density') for f in self.mets.find_files(mimetype='//image/.*'): - if not is_local_filename(f.url) and not self.download: - self.log.warning("Won't download remote image '%s'", f.url) + if not f.local_filename and not self.download: + self.log.warning("Not available locally and 'download' is not set: %s", f) continue self.workspace.download_file(f) - exif = self.workspace.resolve_image_exif(f.url) + exif = self.workspace.resolve_image_exif(f.local_filename) for k in ['xResolution', 'yResolution']: v = exif.__dict__.get(k) if v is None or v <= 72: @@ -288,10 +288,10 @@ def _validate_mets_files(self): for f in self.mets.find_files(): if f._el.get('GROUPID'): # pylint: disable=protected-access self.report.add_notice("File '%s' has GROUPID attribute - document might need an update" % f.ID) - if not f.url: - self.report.add_error("File '%s' has no mets:Flocat/@xlink:href" % f.ID) + if not (f.url or f.local_filename): + self.report.add_error("File '%s' has neither mets:Flocat[@LOCTYPE='URL']/@xlink:href nor mets:FLocat[@LOCTYPE='OTHER'][@OTHERLOCTYPE='FILE']/xlink:href" % f.ID) continue - if 'url' not in self.skip and ':/' in f.url: + if f.url and 'url' not in self.skip: if re.match(r'^file:/[^/]', f.url): self.report.add_error("File '%s' has an invalid (Java-specific) file URL '%s'" % (f.ID, f.url)) scheme = f.url[0:f.url.index(':')] @@ -304,8 +304,8 @@ def _validate_page(self): """ self.log.debug('_validate_page') for f in self.mets.find_files(mimetype=MIMETYPE_PAGE): - if not is_local_filename(f.url) and not self.download: - self.log.warning("Won't download remote PAGE XML '%s'", f.url) + if not f.local_filename and not self.download: + self.log.warning("Not available locally and 'download' is not set: %s", f) continue self.workspace.download_file(f) page_report = PageValidator.validate(ocrd_file=f, @@ -323,8 +323,8 @@ def _validate_page_xsd(self): """ self.log.debug('_validate_page_xsd') for f in self.mets.find_files(mimetype=MIMETYPE_PAGE): - if not is_local_filename(f.url) and not self.download: - self.log.warning("Won't download remote PAGE XML '%s'", f.url) + if not f.local_filename and not self.download: + self.log.warning("Not available locally and 'download' is not set: %s", f) continue self.workspace.download_file(f) for err in XsdPageValidator.validate(Path(f.local_filename)).errors: diff --git a/tests/cli/test_bashlib.py b/tests/cli/test_bashlib.py index 3d178dcdd..0e61e093e 100644 --- a/tests/cli/test_bashlib.py +++ b/tests/cli/test_bashlib.py @@ -88,7 +88,7 @@ def test_input_files(self): with copy_of_directory(assets.path_to('kant_aufklaerung_1784/data')) as wsdir: with pushd_popd(wsdir): _, out, err = self.invoke_cli(bashlib_cli, ['input-files', '-I', 'OCR-D-IMG']) - assert ("[url]='OCR-D-IMG/INPUT_0017.tif' [ID]='INPUT_0017' [mimetype]='image/tiff' " + assert ("[url]='' [local_filename]='OCR-D-IMG/INPUT_0017.tif' [ID]='INPUT_0017' [mimetype]='image/tiff' " 
"[pageId]='PHYS_0017' [outputFileId]='OUTPUT_PHYS_0017'") in out def test_bashlib_defs(self): @@ -147,7 +147,7 @@ def test_bashlib_cp_processor(self): cd "${ocrd__argv[working_dir]}" mets=$(basename ${ocrd__argv[mets_file]}) for ((n=0; n<${#ocrd__files[*]}; n++)); do - in_fpath=($(ocrd__input_file $n url)) + in_fpath=($(ocrd__input_file $n local_filename)) in_id=($(ocrd__input_file $n ID)) in_mimetype=($(ocrd__input_file $n mimetype)) in_pageId=($(ocrd__input_file $n pageId)) diff --git a/tests/cli/test_workspace.py b/tests/cli/test_workspace.py index d00c47259..088432a02 100644 --- a/tests/cli/test_workspace.py +++ b/tests/cli/test_workspace.py @@ -384,10 +384,11 @@ def test_copy_vs_clone(self): def test_find_all_files_multiple_physical_pages_for_fileids(self): with copy_of_directory(assets.path_to('SBB0000F29300010000/data')) as tempdir: - result = self.runner.invoke(workspace_cli, ['-d', tempdir, 'find', '--page-id', 'PHYS_0005,PHYS_0005', '-k', 'url']) + result = self.runner.invoke(workspace_cli, ['-d', tempdir, 'find', '--page-id', 'PHYS_0005,PHYS_0005', '-k', 'local_filename']) + print(result.stdout) self.assertEqual(result.stdout, 'OCR-D-IMG/FILE_0005_IMAGE.tif\n') self.assertEqual(result.exit_code, 0) - result = self.runner.invoke(workspace_cli, ['-d', tempdir, 'find', '--page-id', 'PHYS_0005,PHYS_0001', '-k', 'url']) + result = self.runner.invoke(workspace_cli, ['-d', tempdir, 'find', '--page-id', 'PHYS_0005,PHYS_0001', '-k', 'local_filename']) self.assertEqual(len(result.stdout.split('\n')), 19) def test_mets_basename(self): diff --git a/tests/model/test_ocrd_file.py b/tests/model/test_ocrd_file.py index cdaaffcfa..98b9fa424 100644 --- a/tests/model/test_ocrd_file.py +++ b/tests/model/test_ocrd_file.py @@ -26,18 +26,9 @@ def test_ocrd_file_without_filegrp(): assert "set fileGrp" in str(val_err.value) -def test_set_loctype(): - f = create_ocrd_file_with_defaults() - assert f.loctype == 'OTHER' - assert f.otherloctype == 'FILE' - f.otherloctype = 'foo' - assert f.otherloctype == 'foo' - f.loctype = 'URN' - assert f.loctype == 'URN' - assert f.otherloctype == None - f.otherloctype = 'foo' - assert f.loctype, 'OTHER' - +def test_get_loctypes(): + f = create_ocrd_file_with_defaults(local_filename='foo', url='bar') + assert f.loctypes == ['OTHER', 'URL'] def test_set_url(): f = create_ocrd_file_with_defaults() @@ -50,8 +41,7 @@ def test_set_url(): def test_constructor_url(): f = create_ocrd_file_with_defaults(url="foo") assert f.url == 'foo' - assert f.local_filename == 'foo' - + assert f.local_filename == None def test_set_id_none(): f = create_ocrd_file_with_defaults() @@ -67,8 +57,11 @@ def test_basename(): def test_basename_from_url(): + """ + Changed behavior, basename no longer derived from f.url + """ f = create_ocrd_file_with_defaults(url="http://foo.bar/quux") - assert f.basename == 'quux' + assert f.basename == None @pytest.mark.parametrize("local_filename,extension", @@ -120,7 +113,6 @@ def test_ocrd_file_equality(): f5 = mets.add_file('TEMP', ID='TEMP_1', mimetype='image/tiff') assert f3 == f5 - def test_fptr_changed_for_change_id(): mets = OcrdMets.empty_mets() f1 = mets.add_file('FOO', ID='FOO_1', mimetype='image/tiff', pageId='p0001') diff --git a/tests/model/test_ocrd_mets.py b/tests/model/test_ocrd_mets.py index ad72a6ebe..17ab0a8ed 100644 --- a/tests/model/test_ocrd_mets.py +++ b/tests/model/test_ocrd_mets.py @@ -85,7 +85,8 @@ def test_find_all_files(sbb_sample_01): assert len(sbb_sample_01.find_all_files(mimetype='image/tiff')) == 13, '13 image/tiff' assert 
len(sbb_sample_01.find_all_files(mimetype='//application/.*')) == 22, '22 application/.*' assert len(sbb_sample_01.find_all_files(mimetype=MIMETYPE_PAGE)) == 20, '20 ' + MIMETYPE_PAGE - assert len(sbb_sample_01.find_all_files(url='OCR-D-IMG/FILE_0005_IMAGE.tif')) == 1, '1 xlink:href="OCR-D-IMG/FILE_0005_IMAGE.tif"' + assert len(sbb_sample_01.find_all_files(local_filename='OCR-D-IMG/FILE_0005_IMAGE.tif')) == 1, '1 FILE xlink:href="OCR-D-IMG/FILE_0005_IMAGE.tif"' + assert len(sbb_sample_01.find_all_files(url='https://github.com/OCR-D/assets/raw/master/data/SBB0000F29300010000/00000001_DESKEW.tif')) == 1, '1 URL xlink:href="https://github.com/OCR-D/assets/raw/master/data/SBB0000F29300010000/00000001_DESKEW.tif"' assert len(sbb_sample_01.find_all_files(pageId='PHYS_0001..PHYS_0005')) == 35, '35 files for page "PHYS_0001..PHYS_0005"' assert len(sbb_sample_01.find_all_files(pageId='//PHYS_000(1|2)')) == 34, '34 files in PHYS_001 and PHYS_0002' assert len(sbb_sample_01.find_all_files(pageId='//PHYS_0001,//PHYS_0005')) == 18, '18 files in PHYS_001 and PHYS_0005 (two regexes)' diff --git a/tests/model/test_ocrd_page.py b/tests/model/test_ocrd_page.py index 5e073336e..82964e349 100644 --- a/tests/model/test_ocrd_page.py +++ b/tests/model/test_ocrd_page.py @@ -428,7 +428,7 @@ def test_serialize_no_empty_readingorder(): """ https://github.com/OCR-D/core/issues/602 """ - pcgts = page_from_image(create_ocrd_file_with_defaults(url=assets.path_to('kant_aufklaerung_1784/data/OCR-D-IMG/INPUT_0017.tif'))) + pcgts = page_from_image(create_ocrd_file_with_defaults(local_filename=assets.path_to('kant_aufklaerung_1784/data/OCR-D-IMG/INPUT_0017.tif'))) pcgts.get_Page().set_ReadingOrder(ReadingOrderType()) assert pcgts.get_Page().get_ReadingOrder() pcgts = parseString(to_xml(pcgts, skip_declaration=True)) @@ -439,7 +439,7 @@ def test_hashable(): """ https://github.com/OCR-D/ocrd_segment/issues/45 """ - pcgts = page_from_image(create_ocrd_file_with_defaults(url=assets.path_to('kant_aufklaerung_1784/data/OCR-D-IMG/INPUT_0017.tif'))) + pcgts = page_from_image(create_ocrd_file_with_defaults(local_filename=assets.path_to('kant_aufklaerung_1784/data/OCR-D-IMG/INPUT_0017.tif'))) page = pcgts.get_Page() testset = set() testset.add(pcgts) diff --git a/tests/processor/test_ocrd_dummy.py b/tests/processor/test_ocrd_dummy.py index c98a6d481..1f4ecc055 100644 --- a/tests/processor/test_ocrd_dummy.py +++ b/tests/processor/test_ocrd_dummy.py @@ -34,10 +34,10 @@ def test_copies_ok(self): output_files = workspace.mets.find_all_files(fileGrp='OUTPUT') output_files.sort(key=lambda x: x.url) print([str(s) for s in output_files]) - self.assertEqual(output_files[0].url, 'OUTPUT/OUTPUT_PHYS_0001.tif') - self.assertEqual(output_files[1].url, 'OUTPUT/OUTPUT_PHYS_0001.xml') + assert output_files[0].local_filename == Path('OUTPUT/OUTPUT_PHYS_0001.tif') + assert output_files[1].local_filename == Path('OUTPUT/OUTPUT_PHYS_0001.xml') self.assertEqual(page_from_file(output_files[1]).pcGtsId, output_files[1].ID) - self.assertEqual(page_from_file(output_files[1]).get_Page().imageFilename, output_files[0].url) + assert page_from_file(output_files[1]).get_Page().imageFilename == str(output_files[0].local_filename) self.assertEqual(len(output_files), 6) self.assertEqual(len(workspace.mets.find_all_files(ID='//OUTPUT.*')), 6) self.assertEqual(len(workspace.mets.find_all_files(ID='//OUTPUT.*_PAGE')), 3) diff --git a/tests/test_decorators.py b/tests/test_decorators.py index cd8085f8a..4f4f3ecc0 100644 --- a/tests/test_decorators.py +++ 
b/tests/test_decorators.py
@@ -210,4 +210,4 @@ def test_parameter_override_wo_param(self):
 
 
 if __name__ == '__main__':
-    main()
+    main(__name__)
diff --git a/tests/test_mets_server.py b/tests/test_mets_server.py
index 80add438a..7013d5576 100644
--- a/tests/test_mets_server.py
+++ b/tests/test_mets_server.py
@@ -3,6 +3,7 @@
 from itertools import repeat
 from multiprocessing import Process, Pool, set_start_method
+# necessary for macos
 set_start_method("fork")
 from shutil import rmtree, copytree
 from os import remove
@@ -43,7 +44,14 @@ def _start_mets_server(*args, **kwargs):
 
 def add_file_server(x):
     mets_server_url, i = x
     workspace_server = Workspace(resolver=Resolver(), directory=WORKSPACE_DIR, mets_server_url=mets_server_url)
-    workspace_server.add_file(local_filename=f'foo{i}', mimetype=MIMETYPE_PAGE, page_id=f'page{1}', file_grp='FOO', file_id=f'FOO_page{i}_foo{i}')
+    workspace_server.add_file(
+        local_filename=f'local_filename{i}',
+        mimetype=MIMETYPE_PAGE,
+        page_id=f'page{i}',
+        file_grp='FOO',
+        file_id=f'FOO_page{i}_foo{i}',
+        # url=f'url{i}'
+    )
 
 def add_agent_server(x):
     mets_server_url, i = x
@@ -108,7 +116,7 @@ def test_mets_server_str(start_mets_server):
     mets_server_url, workspace_server = start_mets_server
     workspace_server = Workspace(Resolver(), WORKSPACE_DIR, mets_server_url=mets_server_url)
     f = next(workspace_server.find_files())
-    assert str(f) == ''
+    assert str(f) == ''
     a = workspace_server.mets.agents[0]
     assert str(a) == ''
     assert str(workspace_server.mets) == '' % ('http+unix://%2Ftmp%2Focrd-mets-server.sock' if mets_server_url == TRANSPORTS[0] else TRANSPORTS[1])
diff --git a/tests/test_resolver.py b/tests/test_resolver.py
index 770221a5f..bf9f4bb72 100644
--- a/tests/test_resolver.py
+++ b/tests/test_resolver.py
@@ -174,16 +174,17 @@ def test_workspace_from_url0():
 
     # assert
     assert '%s.tif' % f.ID == 'FILE_0001_IMAGE.tif'
-    assert f.local_filename == 'OCR-D-IMG/FILE_0001_IMAGE.tif'
+    assert f.local_filename == Path('OCR-D-IMG/FILE_0001_IMAGE.tif')
 
 
 def test_resolve_image0():
     workspace = Resolver().workspace_from_url(METS_HEROLD)
     input_files = workspace.mets.find_all_files(fileGrp='OCR-D-IMG')
     f = input_files[0]
-    img_pil1 = workspace._resolve_image_as_pil(f.url)
+    print(f)
+    img_pil1 = workspace._resolve_image_as_pil(f.local_filename)
     assert img_pil1.size == (2875, 3749)
-    img_pil2 = workspace._resolve_image_as_pil(f.url, [[0, 0], [1, 1]])
+    img_pil2 = workspace._resolve_image_as_pil(f.local_filename, [[0, 0], [1, 1]])
     assert img_pil2.size == (1, 1)
@@ -237,12 +238,12 @@ def test_workspace_from_nothing_noclobber(tmp_path):
 
 
 @pytest.mark.parametrize("url,basename,exc_msg",
-    [(None, None, "'url' must be a string"),
-     (None, 'foo', "'directory' must be a string")]
+    [(None, None, "'url' must be a non-empty string"),
+     (None, 'foo', "'directory' must be a non-empty string")]
 )
 def test_download_to_directory_with_badargs(url, basename, exc_msg):
-    with pytest.raises(Exception) as exc:
+    with pytest.raises(ValueError) as exc:
         Resolver().download_to_directory(url, basename)
 
     # assert exception message contained
diff --git a/tests/test_workspace.py b/tests/test_workspace.py
index 338f0681d..149d7211f 100644
--- a/tests/test_workspace.py
+++ b/tests/test_workspace.py
@@ -59,7 +59,7 @@ def _fixture_plain_workspace(tmp_path):
     chdir(prev_dir)
 
 def test_workspace_add_file(plain_workspace):
-    fpath = str(plain_workspace.directory / 'ID1.tif')
+    fpath = plain_workspace.directory / 'ID1.tif'
 
     # act
     plain_workspace.add_file(
@@ -75,40 +75,37 @@ def test_workspace_add_file(plain_workspace):
 
     # assert
     assert f.ID == 'ID1'
     assert f.mimetype == 'image/tiff'
-    assert f.url == fpath
+    assert not f.url
    assert f.local_filename == fpath
-    assert exists(fpath)
+    assert f.local_filename.exists()
 
 
 def test_workspace_add_file_overwrite(plain_workspace):
-    fpath = str(plain_workspace.directory / 'ID1.tif')
+    fpath = plain_workspace.directory / 'ID1.tif'
 
     # act
-    plain_workspace.add_file('GRP', ID='ID1', mimetype='image/tiff', content='CONTENT', pageId='phys1', local_filename=fpath)
+    plain_workspace.add_file('GRP', file_id='ID1', mimetype='image/tiff', content='CONTENT', page_id='phys1', local_filename=fpath)
 
     with pytest.raises(FileExistsError) as fn_exc:
-        plain_workspace.add_file('GRP', ID='ID1', mimetype='image/tiff', content='CONTENT', pageId=None, local_filename=fpath)
-    assert str(fn_exc.value) == "File with ID='ID1' already exists"
+        plain_workspace.add_file('GRP', file_id='ID1', mimetype='image/tiff', content='CONTENT', page_id=None, local_filename=fpath)
+    assert str(fn_exc.value) == "File with file_id='ID1' already exists"
 
     with pytest.raises(FileExistsError) as fn_exc:
-        plain_workspace.add_file('GRP', ID='ID1', mimetype='image/tiff', content='CONTENT', pageId='phys2', local_filename=fpath, force=True)
+        plain_workspace.add_file('GRP', file_id='ID1', mimetype='image/tiff', content='CONTENT', page_id='phys2', local_filename=fpath, force=True)
     assert 'cannot mitigate' in str(fn_exc.value)
 
-    plain_workspace.add_file('GRP', ID='ID1', mimetype='image/tiff', content='CONTENT2', pageId='phys1', local_filename=fpath, force=True)
+    plain_workspace.add_file('GRP', file_id='ID1', mimetype='image/tiff', content='CONTENT2', page_id='phys1', local_filename=fpath, force=True)
     f = plain_workspace.mets.find_all_files()[0]
     assert f.ID == 'ID1'
     assert f.mimetype == 'image/tiff'
-    assert f.url == fpath
+    assert not f.url
     assert f.local_filename == fpath
     assert f.pageId == 'phys1'
-    assert exists(fpath)
+    assert fpath.exists()
 
 def test_workspace_add_file_basename_no_content(plain_workspace):
     plain_workspace.add_file('GRP', file_id='ID1', mimetype='image/tiff', page_id=None)
     f = next(plain_workspace.mets.find_files())
-
-    # assert
-    assert f.url == None
-
+    assert f.url == ''
 
 def test_workspace_add_file_binary_content(plain_workspace):
     fpath = join(plain_workspace.directory, 'subdir', 'ID1.tif')
@@ -160,19 +157,14 @@ def test_workspace_backup(plain_workspace):
 
 def _url_to_file(the_path):
     dummy_mets = OcrdMets.empty_mets()
     dummy_url = abspath(the_path)
-    return dummy_mets.add_file('DEPRECATED', ID=Path(dummy_url).name, url=dummy_url)
+    return dummy_mets.add_file('TESTGRP', ID=Path(dummy_url).name, url=dummy_url)
 
 def test_download_very_self_file(plain_workspace):
-
-    # arrange with some dummy stuff
     the_file = _url_to_file(abspath(__file__))
-
-    # act
     fn = plain_workspace.download_file(the_file)
-
-    # assert
-    assert fn, join('DEPRECATED', basename(__file__))
+    assert fn, join('TESTGRP', basename(__file__))
+    assert fn == the_file
 
 
 def test_download_url_without_baseurl_raises_exception(tmp_path):
@@ -187,7 +179,7 @@ def test_download_url_without_baseurl_raises_exception(tmp_path):
         ws1.download_file(the_file)
 
     # assert exception message contents
-    assert "Already tried prepending baseurl '%s'" % str(tmp_path) in str(exc.value)
+    assert "File path passed as 'url' to download_to_directory does not exist:" in str(exc.value)
 
 
 def test_download_url_with_baseurl(tmp_path):
@@ -208,7 +200,8 @@ def test_download_url_with_baseurl(tmp_path):
     f = Path(ws1.download_file(the_file).local_filename)
 
     # assert
-    assert str(f).endswith(join('OCR-D-IMG', '%s.tif' % SAMPLE_FILE_ID))
+    assert f.name == f'{SAMPLE_FILE_ID}.tif'
+    assert f.parent.name == 'TESTGRP'
     assert Path(ws1.directory, f).exists()
@@ -306,8 +299,8 @@ def test_remove_file_remote(plain_workspace):
 
 def test_rename_file_group(tmp_path):
     # arrange
-    copytree(assets.path_to('kant_aufklaerung_1784-page-region-line-word_glyph/data'), str(tmp_path))
-    workspace = Workspace(Resolver(), directory=str(tmp_path))
+    copytree(assets.path_to('kant_aufklaerung_1784-page-region-line-word_glyph/data'), tmp_path)
+    workspace = Workspace(Resolver(), directory=tmp_path)
 
     # before act
     # TODO clear semantics
@@ -316,7 +309,7 @@ def test_rename_file_group(tmp_path):
     # which can be achieved with pushd_popd functionalities
     ocrd_file = next(workspace.mets.find_files(ID='OCR-D-GT-SEG-WORD_0001'))
     relative_name = ocrd_file.local_filename
-    ocrd_file.local_filename = join(tmp_path, relative_name)
+    ocrd_file.local_filename = tmp_path / relative_name
     pcgts_before = page_from_file(ocrd_file)
 
     # before assert
     assert pcgts_before.get_Page().imageFilename == 'OCR-D-IMG/INPUT_0017.tif'
@@ -324,7 +317,7 @@ def test_rename_file_group(tmp_path):
     # act
     workspace.rename_file_group('OCR-D-IMG', 'FOOBAR')
     next_ocrd_file = next(workspace.mets.find_files(ID='OCR-D-GT-SEG-WORD_0001'))
-    next_ocrd_file.local_filename = join(tmp_path, relative_name)
+    next_ocrd_file.local_filename = tmp_path / relative_name
     pcgts_after = page_from_file(next_ocrd_file)
 
     # assert
@@ -380,7 +373,7 @@ def _fixture_kant_complex(tmp_path):
     yield Workspace(Resolver, directory=tmp_path)
 
 
-def test_remove_file_page_recursive(kant_complex_workspace):
+def test_remove_file_page_recursive0(kant_complex_workspace):
     assert len(kant_complex_workspace.mets.find_all_files()) == 119
     kant_complex_workspace.remove_file('OCR-D-OCR-OCRO-fraktur-SEG-LINE-tesseract-ocropy-DEWARP_0001', page_recursive=True, page_same_group=False, keep_file=True)
     assert len(kant_complex_workspace.mets.find_all_files()) == 83
@@ -408,15 +401,15 @@ def test_download_to_directory_from_workspace_download_file(plain_workspace):
     f1 = plain_workspace.add_file('IMG', file_id='page1_img', mimetype='image/tiff', local_filename='test.tif', content='', page_id=None)
     f2 = plain_workspace.add_file('GT', file_id='page1_gt', mimetype='text/xml', local_filename='test.xml', content='', page_id=None)
 
-    assert f1.url == 'test.tif'
-    assert f2.url == 'test.xml'
+    assert not f1.url
+    assert not f2.url
 
     # these should be no-ops
     plain_workspace.download_file(f1)
     plain_workspace.download_file(f2)
 
-    assert f1.url == 'test.tif'
-    assert f2.url == 'test.xml'
+    assert f1.local_filename == Path('test.tif')
+    assert f2.local_filename == Path('test.xml')
 
 
 def test_save_image_file_invalid_mimetype_raises_exception(plain_workspace):
@@ -510,8 +503,8 @@ def _fixture_workspace_sample_features(tmp_path):
 
 def test_image_feature_selectoro(workspace_sample_features):
     # arrange
-    with open(join(str(workspace_sample_features.directory), 'image_features.page.xml'), 'r') as f:
-        pcgts = parseString(f.read().encode('utf8'))
+    with open(Path(workspace_sample_features.directory) / 'image_features.page.xml', 'r', encoding='utf-8') as f:
+        pcgts = parseString(f.read().encode('utf-8'))
 
     # richest feature set is not last:
     _, info, _ = workspace_sample_features.image_from_page(pcgts.get_Page(), page_id='page1', feature_selector='dewarped')
@@ -594,13 +587,13 @@ def test_deskewing(plain_workspace):
 
 def test_downsample_16bit_image(plain_workspace):
     # arrange image
-    img_path = join(plain_workspace.directory, '16bit.tif')
-    with gzip_open(join(dirname(__file__), 'data/OCR-D-IMG_APBB_Mitteilungen_62.0002.tif.gz'), 'rb') as gzip_in:
+    img_path = Path(plain_workspace.directory, '16bit.tif')
+    with gzip_open(Path(__file__).parent / 'data/OCR-D-IMG_APBB_Mitteilungen_62.0002.tif.gz', 'rb') as gzip_in:
        with open(img_path, 'wb') as tif_out:
            tif_out.write(gzip_in.read())
 
     # act
-    plain_workspace.add_file('IMG', file_id='foo', url=img_path, mimetype='image/tiff', page_id=None)
+    plain_workspace.add_file('IMG', file_id='foo', local_filename=img_path, mimetype='image/tiff', page_id=None)
 
     # assert
     pil_before = Image.open(img_path)
@@ -620,7 +613,7 @@ def test_mets_permissions(plain_workspace):
     assert filemode(stat(mets_path).st_mode) == '-rwxrwxrwx'
 
 
-def test_merge(tmp_path):
+def test_merge0(tmp_path):
     # arrange
     dst_path1 = tmp_path / 'kant_aufklaerung'
@@ -655,16 +648,16 @@ def test_merge_no_copy_files(tmp_path):
     ws1 = Resolver().workspace_from_nothing(directory=dst_path1)
     ws2 = Resolver().workspace_from_nothing(directory=dst_path2)
 
-    ws2.add_file('GRP2', pageId='p01', mimetype='text/plain', ID='f1', local_filename='GRP2/f1', content='ws2')
+    ws2.add_file('GRP2', page_id='p01', mimetype='text/plain', file_id='f1', local_filename='GRP2/f1', content='ws2')
 
     ws1.merge(ws2, copy_files=False, fileId_mapping={'f1': 'f1_copy_files'})
 
-    assert next(ws1.mets.find_files(ID='f1_copy_files')).url == 'ws2/GRP2/f1'
+    assert next(ws1.mets.find_files(ID='f1_copy_files')).local_filename == Path('ws2/GRP2/f1')
 
     with pytest.raises(FileExistsError):
         ws1.merge(ws2, copy_files=True, fileId_mapping={'f1': 'f1_copy_files'})
 
     ws1.merge(ws2, copy_files=True, fileId_mapping={'f1': 'f1_copy_files'}, force=True)
-    assert next(ws1.mets.find_files(ID='f1_copy_files')).url == 'GRP2/f1'
+    assert next(ws1.mets.find_files(ID='f1_copy_files')).local_filename == Path('GRP2/f1')
 
 def test_merge_overwrite(tmp_path):
     # arrange
@@ -677,8 +670,8 @@ def test_merge_overwrite(tmp_path):
     ws2 = Resolver().workspace_from_nothing(directory=dst_path2)
 
     with pytest.raises(Exception) as exc:
-        ws1.add_file('X', pageId='X', mimetype='X', ID='id123', local_filename='X/X', content='ws1')
-        ws2.add_file('X', pageId='X', mimetype='X', ID='id456', local_filename='X/X', content='ws2')
+        ws1.add_file('X', page_id='X', mimetype='X', file_id='id123', local_filename='X/X', content='ws1')
+        ws2.add_file('X', page_id='X', mimetype='X', file_id='id456', local_filename='X/X', content='ws2')
         ws1.merge(ws2)
         assert "would overwrite" == str(exc.value)
diff --git a/tests/validator/test_workspace_backup.py b/tests/validator/test_workspace_backup.py
index 03d5aaa34..0d0bc141c 100644
--- a/tests/validator/test_workspace_backup.py
+++ b/tests/validator/test_workspace_backup.py
@@ -46,4 +46,4 @@ def test_backup_mgr(self):
         self.mgr.restore(first, choose_first=False)
 
 if __name__ == '__main__':
-    main()
+    main(__name__)
diff --git a/tests/validator/test_workspace_bagger.py b/tests/validator/test_workspace_bagger.py
index fc3b03503..837819fdf 100644
--- a/tests/validator/test_workspace_bagger.py
+++ b/tests/validator/test_workspace_bagger.py
@@ -60,7 +60,7 @@ def test_bag_full(self):
         f = self.workspace.mets.find_all_files(ID='INPUT_0017')[0]
         f.url = 'bad-scheme://foo'
         f.local_filename = None
-        with self.assertRaisesRegex(Exception, "Not an http URL"):
+        with self.assertRaisesRegex(Exception, "No connection adapters were found for 'bad-scheme://foo'"):
             self.bagger.bag(self.workspace, 'kant_aufklaerung_1784', skip_zip=False)
         self.bagger.strict = False
@@ -183,4 +183,4 @@ def test_recreate_checksums_zipped(self):
 
 if __name__ == '__main__':
-    main()
+    main(__name__)
diff --git a/tests/validator/test_workspace_validator.py b/tests/validator/test_workspace_validator.py
index 9fa353f5e..39c03bd84 100644
--- a/tests/validator/test_workspace_validator.py
+++ b/tests/validator/test_workspace_validator.py
@@ -138,15 +138,18 @@ def test_validate_weird_urls(self):
             workspace.save_mets()
             report = WorkspaceValidator.validate(self.resolver, join(tempdir, 'mets.xml'), skip=['pixel_density'])
             assert not report.is_valid
-            assert len(report.errors) == 2
+            assert len(report.errors) == 1
             assert "invalid (Java-specific) file URL" in report.errors[0]
+            assert len(report.warnings) == 1
+            assert "non-HTTP" in report.warnings[0]
+
+    @pytest.mark.skip("Not usable as such anymore, because we properly distinguish .url and .local_filename now. Requires a server to test")
     def test_validate_pixel_no_download(self):
         imgpath = assets.path_to('kant_aufklaerung_1784-binarized/data/OCR-D-IMG-BIN/BIN_0020.png')
         with TemporaryDirectory() as tempdir:
             workspace = self.resolver.workspace_from_nothing(directory=tempdir)
             workspace.mets.unique_identifier = 'foobar'
-            workspace.mets.add_file('OCR-D-GT-BIN', ID='file1', mimetype='image/png', pageId='page1', url=imgpath)
+            workspace.mets.add_file('OCR-D-GT-BIN', ID='file1', mimetype='image/png', pageId='page1', local_filename=imgpath)
             workspace.save_mets()
             report = WorkspaceValidator.validate(self.resolver, join(tempdir, 'mets.xml'), skip=[], download=False)
             self.assertEqual(len(report.errors), 0)
@@ -158,7 +161,7 @@ def test_validate_pixel_density_too_low(self):
         with TemporaryDirectory() as tempdir:
             workspace = self.resolver.workspace_from_nothing(directory=tempdir)
             workspace.mets.unique_identifier = 'foobar'
-            workspace.mets.add_file('OCR-D-GT-BIN', ID='file1', mimetype='image/png', pageId='page1', url=imgpath)
+            workspace.mets.add_file('OCR-D-GT-BIN', ID='file1', mimetype='image/png', pageId='page1', local_filename=imgpath)
             workspace.save_mets()
             report = WorkspaceValidator.validate(self.resolver, join(tempdir, 'mets.xml'), skip=[], download=True)
             self.assertEqual(len(report.notices), 2)
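
Usage sketch (illustrative only, not part of the patch): the test changes above rely on the revised Workspace.add_file keyword spelling (file_id/page_id instead of the deprecated ID/pageId) and on keeping OcrdFile.url (remote location) separate from OcrdFile.local_filename (a Path for local files). Assuming a scratch directory of your own choosing, exercising that API looks roughly like this:

    from pathlib import Path
    from ocrd import Resolver

    # create an empty workspace in a scratch directory (path is hypothetical)
    workspace = Resolver().workspace_from_nothing(directory='/tmp/demo-ws')

    # add a file with inline content; note the file_id/page_id keyword names
    f = workspace.add_file(
        'OCR-D-IMG',
        file_id='ID1',
        mimetype='image/tiff',
        page_id='phys1',
        local_filename=Path(workspace.directory) / 'ID1.tif',
        content='CONTENT',
    )

    # a locally created file has no remote URL; its location lives in local_filename
    assert not f.url
    assert Path(f.local_filename).name == 'ID1.tif'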