diff --git a/src/ocrd/cli/workspace.py b/src/ocrd/cli/workspace.py
index d77e981dd..e848bd494 100644
--- a/src/ocrd/cli/workspace.py
+++ b/src/ocrd/cli/workspace.py
@@ -23,6 +23,7 @@ from ocrd_utils import getLogger, initLogging, pushd_popd, EXT_TO_MIME, safe_filename, parse_json_string_or_file, partition_list, DEFAULT_METS_BASENAME
 from ocrd.decorators import mets_find_options
 from . import command_with_replaced_help
+from ocrd_models.constants import METS_PAGE_DIV_ATTRIBUTE
 
 
 class WorkspaceCtx():
@@ -419,21 +420,22 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, local_fi
 @workspace_cli.command('find')
 @mets_find_options
 @click.option('-k', '--output-field', help="Output field. Repeat for multiple fields, will be joined with tab",
-        default=['local_filename'],
-        multiple=True,
-        type=click.Choice([
-            'url',
-            'mimetype',
-            'page_id',
-            'pageId',
-            'file_id',
-            'ID',
-            'file_grp',
-            'fileGrp',
-            'basename',
-            'basename_without_extension',
-            'local_filename',
-        ]))
+              default=['local_filename'],
+              show_default=True,
+              multiple=True,
+              type=click.Choice([
+                  'url',
+                  'mimetype',
+                  'page_id',
+                  'pageId',
+                  'file_id',
+                  'ID',
+                  'file_grp',
+                  'fileGrp',
+                  'basename',
+                  'basename_without_extension',
+                  'local_filename',
+              ]))
 @click.option('--download', is_flag=True, help="Download found files to workspace and change location in METS file ")
 @click.option('--undo-download', is_flag=True, help="Remove all downloaded files from the METS")
 @click.option('--wait', type=int, default=0, help="Wait this many seconds between download requests")
@@ -596,31 +598,60 @@ def list_groups(ctx):
 # ----------------------------------------------------------------------
 
 @workspace_cli.command('list-page')
+@click.option('-k', '--output-field', help="Output field. Repeat for multiple fields, will be joined with tab",
+              default=['ID'],
+              show_default=True,
+              multiple=True,
+              type=click.Choice(METS_PAGE_DIV_ATTRIBUTE.names()))
 @click.option('-f', '--output-format', help="Output format", type=click.Choice(['one-per-line', 'comma-separated', 'json']), default='one-per-line')
 @click.option('-D', '--chunk-number', help="Partition the return value into n roughly equally sized chunks", default=1, type=int)
 @click.option('-C', '--chunk-index', help="Output the nth chunk of results, -1 for all of them.", default=None, type=int)
 @click.option('-r', '--page-id-range', help="Restrict the pages to those matching the provided range, based on the @ID attribute. Separate start/end with ..")
 @click.option('-R', '--numeric-range', help="Restrict the pages to those in the range, in numerical document order. Separate start/end with ..")
 @pass_workspace
-def list_pages(ctx, output_format, chunk_number, chunk_index, page_id_range, numeric_range):
+def list_pages(ctx, output_field, output_format, chunk_number, chunk_index, page_id_range, numeric_range):
     """
     List physical page IDs
+
+    (If the ``--page-id-range`` value starts with ``//``, then its remainder
+    will be interpreted as a regular expression.)
     """
     workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename)
     find_kwargs = {}
-    if page_id_range:
+    if page_id_range and 'ID' in output_field:
         find_kwargs['pageId'] = page_id_range
-    ids = sorted({x.pageId for x in workspace.mets.find_files(**find_kwargs) if x.pageId})
+    page_ids = sorted({x.pageId for x in workspace.mets.find_files(**find_kwargs) if x.pageId})
+    ret = []
+
+    if output_field == ['ID']:
+        ret = [[x] for x in page_ids]
+    else:
+        for i, page_div in enumerate(workspace.mets.get_physical_pages(for_pageIds=','.join(page_ids), return_divs=True)):
+            ret.append([])
+            for k in output_field:
+                ret[i].append(page_div.get(k, 'None'))
+
     if numeric_range:
         start, end = map(int, numeric_range.split('..'))
-        ids = ids[start-1:end]
-    chunks = partition_list(ids, chunk_number, chunk_index)
+        ret = ret[start-1:end]
+
+    chunks = partition_list(ret, chunk_number, chunk_index)
+    lines = []
     if output_format == 'one-per-line':
-        print("\n".join(["\n".join(chunk) for chunk in chunks]))
+        for chunk in chunks:
+            line_strs = []
+            for entry in chunk:
+                line_strs.append("\t".join(entry))
+            lines.append('\n'.join(line_strs))
    elif output_format == 'comma-separated':
-        print("\n".join([",".join(chunk) for chunk in chunks]))
+        for chunk in chunks:
+            line_strs = []
+            for entry in chunk:
+                line_strs.append("\t".join(entry))
+            lines.append(','.join(line_strs))
     elif output_format == 'json':
-        print(dumps(chunks))
+        lines.append(dumps(chunks))
+    print('\n'.join(lines))
 
 # ----------------------------------------------------------------------
 # ocrd workspace get-id
@@ -657,18 +688,30 @@ def set_id(ctx, id):   # pylint: disable=redefined-builtin
     workspace.save_mets()
 
 @workspace_cli.command('update-page')
-@click.option('--order', help="@ORDER attribute for this mets:div", metavar='ORDER')
-@click.option('--orderlabel', help="@ORDERLABEL attribute for this mets:div", metavar='ORDERLABEL')
-@click.option('--contentids', help="@CONTENTIDS attribute for this mets:div", metavar='ORDERLABEL')
+@click.option('--set', 'attr_value_pairs', help=f"set mets:div ATTR to VALUE. possible keys: {METS_PAGE_DIV_ATTRIBUTE.names()}", metavar="ATTR VALUE", nargs=2, multiple=True)
+@click.option('--order', help="[DEPRECATED - use --set ATTR VALUE]", metavar='ORDER')
+@click.option('--orderlabel', help="[DEPRECATED - use --set ATTR VALUE]", metavar='ORDERLABEL')
+@click.option('--contentids', help="[DEPRECATED - use --set ATTR VALUE]", metavar='CONTENTIDS')
 @click.argument('PAGE_ID')
 @pass_workspace
-def update_page(ctx, order, orderlabel, contentids, page_id):
+def update_page(ctx, attr_value_pairs, order, orderlabel, contentids, page_id):
     """
-    Update the @ORDER, @ORDERLABEL o @CONTENTIDS attributes of the mets:div with @ID=PAGE_ID
+    Update the @ID, @ORDER, @ORDERLABEL, @LABEL or @CONTENTIDS attributes of the mets:div with @ID=PAGE_ID
     """
-    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)
-    workspace.mets.update_physical_page_attributes(page_id, order=order, orderlabel=orderlabel, contentids=contentids)
-    workspace.save_mets()
+    update_kwargs = {k: v for k, v in attr_value_pairs}
+    if order:
+        update_kwargs['ORDER'] = order
+    if orderlabel:
+        update_kwargs['ORDERLABEL'] = orderlabel
+    if contentids:
+        update_kwargs['CONTENTIDS'] = contentids
+    try:
+        workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)
+        workspace.mets.update_physical_page_attributes(page_id, **update_kwargs)
+        workspace.save_mets()
+    except Exception as err:
+        print(f"Error: {err}")
+        sys.exit(1)
 
 # ----------------------------------------------------------------------
 # ocrd workspace merge
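Each --set ATTR VALUE pair is passed straight through to OcrdMets.update_physical_page_attributes, so the CLI and API routes below should be equivalent (a hedged sketch, not part of the patch; the workspace path is assumed):

    # ocrd workspace update-page --set ORDER 5 --set ORDERLABEL "page 5" PHYS_0005
    # is expected to behave like:
    from ocrd import Resolver, Workspace

    workspace = Workspace(Resolver(), directory='/path/to/ws')
    workspace.mets.update_physical_page_attributes('PHYS_0005', ORDER='5', ORDERLABEL='page 5')
    workspace.save_mets()
    # keys outside METS_PAGE_DIV_ATTRIBUTE raise ValueError, which update_page reports before exiting 1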
""" +from enum import Enum, auto from ocrd_utils import resource_string __all__ = [ @@ -26,6 +27,7 @@ 'TAG_PAGE_TEXTLINE', 'TAG_PAGE_TEXTEQUIV', 'TAG_PAGE_TEXTREGION', + 'METS_PAGE_DIV_ATTRIBUTE', ] @@ -70,3 +72,14 @@ 'LineDrawing', 'Map', 'Maths', 'Music', 'Noise', 'Separator', 'Table', 'Text', 'Unknown' ] + +class METS_PAGE_DIV_ATTRIBUTE(Enum): + ID = auto() + ORDER = auto() + ORDERLABEL = auto() + LABEL = auto() + CONTENTIDS = auto() + + @classmethod + def names(cls): + return [x.name for x in cls] diff --git a/src/ocrd_models/ocrd_mets.py b/src/ocrd_models/ocrd_mets.py index 19ccfb80e..c18bedc4f 100644 --- a/src/ocrd_models/ocrd_mets.py +++ b/src/ocrd_models/ocrd_mets.py @@ -3,11 +3,7 @@ """ from datetime import datetime import re -import typing -from typing_extensions import Optional -from lxml import etree as ET # type: ignore -from copy import deepcopy -from warnings import warn +from typing import Dict, Optional from ocrd_utils import ( getLogger, @@ -33,9 +29,10 @@ IDENTIFIER_PRIORITY, TAG_MODS_IDENTIFIER, METS_XML_EMPTY, + METS_PAGE_DIV_ATTRIBUTE ) -from .ocrd_xml_base import OcrdXmlDocument +from .ocrd_xml_base import OcrdXmlDocument, ET # type: ignore from .ocrd_file import OcrdFile from .ocrd_agent import OcrdAgent @@ -45,6 +42,23 @@ class OcrdMets(OcrdXmlDocument): """ API to a single METS file """ + _cache_flag : bool + # Cache for the pages (mets:div) + # The dictionary's Key: 'div.ID' + # The dictionary's Value: a 'div' object at some memory location + _page_cache : Dict[METS_PAGE_DIV_ATTRIBUTE, Dict[str, ET.Element]] + # Cache for the files (mets:file) - two nested dictionaries + # The outer dictionary's Key: 'fileGrp.USE' + # The outer dictionary's Value: Inner dictionary + # The inner dictionary's Key: 'file.ID' + # The inner dictionary's Value: a 'file' object at some memory location + _file_cache : Dict[str, Dict[str, ET.Element]] + # Cache for the file pointers (mets:fptr) - two nested dictionaries + # The outer dictionary's Key: 'div.ID' + # The outer dictionary's Value: Inner dictionary + # The inner dictionary's Key: 'fptr.FILEID' + # The inner dictionary's Value: a 'fptr' object at some memory location + _fptr_cache : Dict[str, Dict[str, ET.Element]] @staticmethod def empty_mets(now=None, cache_flag=False): @@ -71,15 +85,18 @@ def __init__(self, **kwargs): 'enabled' if config.OCRD_METS_CACHING else 'disabled', config.raw_value('OCRD_METS_CACHING')) self._cache_flag = config.OCRD_METS_CACHING + # If cache is enabled if self._cache_flag: - self.refresh_caches() + self._initialize_caches() + self._refresh_caches() def __str__(self): """ String representation """ - return 'OcrdMets[cached=%s,fileGrps=%s,files=%s]' % (self._cache_flag, self.file_groups, list(self.find_files())) + return 'OcrdMets[cached=%s,fileGrps=%s,files=%s]' % ( + self._cache_flag, self.file_groups, list(self.find_files())) def _fill_caches(self): """ @@ -103,7 +120,7 @@ def _fill_caches(self): for el_file in el_fileGrp: file_id = el_file.get('ID') - self._file_cache[fileGrp_use].update({file_id : el_file}) + self._file_cache[fileGrp_use].update({file_id: el_file}) # log.info("File added to the cache: %s" % file_id) # Fill with pages @@ -116,7 +133,8 @@ def _fill_caches(self): div_id = el_div.get('ID') log.debug("DIV_ID: %s" % el_div.get('ID')) - self._page_cache[div_id] = el_div + for attr in METS_PAGE_DIV_ATTRIBUTE: + self._page_cache[attr][str(el_div.get(attr.name))] = el_div # Assign an empty dictionary that will hold the fptr of the added page (div) self._fptr_cache[div_id] = {} @@ -124,46 
diff --git a/src/ocrd_models/ocrd_mets.py b/src/ocrd_models/ocrd_mets.py
index 19ccfb80e..c18bedc4f 100644
--- a/src/ocrd_models/ocrd_mets.py
+++ b/src/ocrd_models/ocrd_mets.py
@@ -3,11 +3,7 @@
 """
 from datetime import datetime
 import re
-import typing
-from typing_extensions import Optional
-from lxml import etree as ET # type: ignore
-from copy import deepcopy
-from warnings import warn
+from typing import Dict, Optional
 
 from ocrd_utils import (
     getLogger,
@@ -33,9 +29,10 @@
     IDENTIFIER_PRIORITY,
     TAG_MODS_IDENTIFIER,
     METS_XML_EMPTY,
+    METS_PAGE_DIV_ATTRIBUTE
 )
 
-from .ocrd_xml_base import OcrdXmlDocument
+from .ocrd_xml_base import OcrdXmlDocument, ET  # type: ignore
 from .ocrd_file import OcrdFile
 from .ocrd_agent import OcrdAgent
 
@@ -45,6 +42,23 @@ class OcrdMets(OcrdXmlDocument):
     """
     API to a single METS file
     """
+    _cache_flag : bool
+    # Cache for the pages (mets:div) - two nested dictionaries
+    # The outer dictionary's Key: a METS_PAGE_DIV_ATTRIBUTE (@ID, @ORDER, ...)
+    # The outer dictionary's Value: Inner dictionary
+    # The inner dictionary's Key: the value of that attribute
+    # The inner dictionary's Value: a 'div' object at some memory location
+    _page_cache : Dict[METS_PAGE_DIV_ATTRIBUTE, Dict[str, ET.Element]]
+    # Cache for the files (mets:file) - two nested dictionaries
+    # The outer dictionary's Key: 'fileGrp.USE'
+    # The outer dictionary's Value: Inner dictionary
+    # The inner dictionary's Key: 'file.ID'
+    # The inner dictionary's Value: a 'file' object at some memory location
+    _file_cache : Dict[str, Dict[str, ET.Element]]
+    # Cache for the file pointers (mets:fptr) - two nested dictionaries
+    # The outer dictionary's Key: 'div.ID'
+    # The outer dictionary's Value: Inner dictionary
+    # The inner dictionary's Key: 'fptr.FILEID'
+    # The inner dictionary's Value: a 'fptr' object at some memory location
+    _fptr_cache : Dict[str, Dict[str, ET.Element]]
 
     @staticmethod
     def empty_mets(now=None, cache_flag=False):
@@ -71,15 +85,18 @@ def __init__(self, **kwargs):
                  'enabled' if config.OCRD_METS_CACHING else 'disabled', config.raw_value('OCRD_METS_CACHING'))
         self._cache_flag = config.OCRD_METS_CACHING
 
+        # If cache is enabled
         if self._cache_flag:
-            self.refresh_caches()
+            self._initialize_caches()
+            self._refresh_caches()
 
     def __str__(self):
         """
         String representation
         """
-        return 'OcrdMets[cached=%s,fileGrps=%s,files=%s]' % (self._cache_flag, self.file_groups, list(self.find_files()))
+        return 'OcrdMets[cached=%s,fileGrps=%s,files=%s]' % (
+            self._cache_flag, self.file_groups, list(self.find_files()))
 
     def _fill_caches(self):
         """
@@ -103,7 +120,7 @@ def _fill_caches(self):
 
             for el_file in el_fileGrp:
                 file_id = el_file.get('ID')
-                self._file_cache[fileGrp_use].update({file_id : el_file})
+                self._file_cache[fileGrp_use].update({file_id: el_file})
                 # log.info("File added to the cache: %s" % file_id)
 
         # Fill with pages
@@ -116,7 +133,8 @@
             div_id = el_div.get('ID')
             log.debug("DIV_ID: %s" % el_div.get('ID'))
 
-            self._page_cache[div_id] = el_div
+            for attr in METS_PAGE_DIV_ATTRIBUTE:
+                self._page_cache[attr][str(el_div.get(attr.name))] = el_div
 
             # Assign an empty dictionary that will hold the fptr of the added page (div)
             self._fptr_cache[div_id] = {}
@@ -124,46 +142,26 @@ def _fill_caches(self):
             # log.info("Page_id added to the cache: %s" % div_id)
 
             for el_fptr in el_div:
-                self._fptr_cache[div_id].update({el_fptr.get('FILEID') : el_fptr})
+                self._fptr_cache[div_id].update({el_fptr.get('FILEID'): el_fptr})
                 # log.info("Fptr added to the cache: %s" % el_fptr.get('FILEID'))
 
-        # log.info("Len of page_cache: %s" % len(self._page_cache))
+        # log.info("Len of page_cache: %s" % len(self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID]))
         # log.info("Len of fptr_cache: %s" % len(self._fptr_cache))
 
-    def _clear_caches(self):
-        """
-        Deallocates the caches
-        """
-
+    def _initialize_caches(self):
         self._file_cache = {}
-        self._page_cache = {}
+        # NOTE we can only guarantee uniqueness for @ID and @ORDER
+        self._page_cache = {k : {} for k in METS_PAGE_DIV_ATTRIBUTE}
         self._fptr_cache = {}
 
-    def refresh_caches(self):
+    def _refresh_caches(self):
         if self._cache_flag:
-            # Cache for the files (mets:file) - two nested dictionaries
-            # The outer dictionary's Key: 'fileGrp.USE'
-            # The outer dictionary's Value: Inner dictionary
-            # The inner dictionary's Key: 'file.ID'
-            # The inner dictionary's Value: a 'file' object at some memory location
-            self._file_cache = {}
-
-            # Cache for the pages (mets:div)
-            # The dictionary's Key: 'div.ID'
-            # The dictionary's Value: a 'div' object at some memory location
-            self._page_cache = {}
-
-            # Cache for the file pointers (mets:fptr) - two nested dictionaries
-            # The outer dictionary's Key: 'div.ID'
-            # The outer dictionary's Value: Inner dictionary
-            # The inner dictionary's Key: 'fptr.FILEID'
-            # The inner dictionary's Value: a 'fptr' object at some memory location
-            self._fptr_cache = {}
-
+            self._initialize_caches()
+
             # Note, if the empty_mets() function is used to instantiate OcrdMets
             # Then the cache is empty even after this operation
             self._fill_caches()
-
+
     @property
     def unique_identifier(self):
         """
@@ -174,7 +172,7 @@ def unique_identifier(self):
             found = self._tree.getroot().find('.//mods:identifier[@type="%s"]' % t, NS)
             if found is not None:
                 return found.text
-
+
     @unique_identifier.setter
     def unique_identifier(self, purl):
         """
@@ -214,7 +212,6 @@ def add_agent(self, *args, **kwargs):
             el_agent_last.addnext(el_agent)
         except StopIteration:
             el_metsHdr.insert(0, el_agent)
-        # print(ET.tostring(el_metsHdr))
         return OcrdAgent(el_agent, *args, **kwargs)
 
     @property
@@ -225,7 +222,7 @@ def file_groups(self):
 
         # WARNING: Actually we cannot return strings in place of elements!
         if self._cache_flag:
-            return list(self._file_cache.keys()) 
+            return list(self._file_cache.keys())
 
         return [el.get('USE') for el in self._tree.getroot().findall('.//mets:fileGrp', NS)]
 
@@ -276,25 +273,13 @@ def find_files(
         """
         pageId_list = []
         if pageId:
-            pageId_patterns = []
-            for pageId_token in re.split(r',', pageId):
-                if pageId_token.startswith(REGEX_PREFIX):
-                    pageId_patterns.append(re.compile(pageId_token[REGEX_PREFIX_LEN:]))
-                elif '..' in pageId_token:
-                    pageId_patterns += generate_range(*pageId_token.split('..', 1))
+            # returns divs instead of strings of ids
+            physical_pages = self.get_physical_pages(for_pageIds=pageId, return_divs=True)
+            for div in physical_pages:
+                if self._cache_flag:
+                    pageId_list += self._fptr_cache[div.get('ID')]
                 else:
-                    pageId_patterns += [pageId_token]
-            if self._cache_flag:
-                for page_id in self._page_cache.keys():
-                    if page_id in pageId_patterns or \
-                       any([isinstance(p, typing.Pattern) and p.fullmatch(page_id) for p in pageId_patterns]):
-                        pageId_list += self._fptr_cache[page_id]
-            else:
-                for page in self._tree.getroot().xpath(
-                        '//mets:div[@TYPE="page"]', namespaces=NS):
-                    if page.get('ID') in pageId_patterns or \
-                       any([isinstance(p, typing.Pattern) and p.fullmatch(page.get('ID')) for p in pageId_patterns]):
-                        pageId_list += [fptr.get('FILEID') for fptr in page.findall('mets:fptr', NS)]
+                    pageId_list += [fptr.get('FILEID') for fptr in div.findall('mets:fptr', NS)]
 
         if ID and ID.startswith(REGEX_PREFIX):
             ID = re.compile(ID[REGEX_PREFIX_LEN:])
@@ -304,19 +289,20 @@
             mimetype = re.compile(mimetype[REGEX_PREFIX_LEN:])
         if url and url.startswith(REGEX_PREFIX):
             url = re.compile(url[REGEX_PREFIX_LEN:])
-
+
         candidates = []
         if self._cache_flag:
             if fileGrp:
                 if isinstance(fileGrp, str):
                     candidates += self._file_cache.get(fileGrp, {}).values()
                 else:
-                    candidates = [x for fileGrp_needle, el_file_list in self._file_cache.items() if fileGrp.match(fileGrp_needle) for x in el_file_list.values()]
+                    candidates = [x for fileGrp_needle, el_file_list in self._file_cache.items() if
+                                  fileGrp.match(fileGrp_needle) for x in el_file_list.values()]
             else:
                 candidates = [el_file for id_to_file in self._file_cache.values() for el_file in id_to_file.values()]
         else:
             candidates = self._tree.getroot().xpath('//mets:file', namespaces=NS)
-
+
         for cand in candidates:
             if ID:
                 if isinstance(ID, str):
@@ -405,7 +391,7 @@ def rename_file_group(self, old, new):
         if el_fileGrp is None:
             raise FileNotFoundError("No such fileGrp '%s'" % old)
         el_fileGrp.set('USE', new)
-
+
         if self._cache_flag:
             self._file_cache[new] = self._file_cache.pop(old)
 
@@ -432,7 +418,7 @@ def remove_file_group(self, USE, recursive=False, force=False):
             el_fileGrp = el_fileSec.find('mets:fileGrp[@USE="%s"]' % USE, NS)
         else:
             el_fileGrp = USE
-        if el_fileGrp is None: # pylint: disable=len-as-condition
+        if el_fileGrp is None:  # pylint: disable=len-as-condition
             msg = "No such fileGrp: %s" % USE
             if force:
                 log.warning(msg)
@@ -460,7 +446,8 @@ def remove_file_group(self, USE, recursive=False, force=False):
 
         el_fileGrp.getparent().remove(el_fileGrp)
 
-    def add_file(self, fileGrp, mimetype=None, url=None, ID=None, pageId=None, force=False, local_filename=None, ignore=False, **kwargs):
+    def add_file(self, fileGrp, mimetype=None, url=None, ID=None, pageId=None, force=False, local_filename=None,
+                 ignore=False, **kwargs):
         """
         Instantiate and add a new :py:class:`ocrd_models.ocrd_file.OcrdFile`.
 
         Arguments:
@@ -489,16 +476,19 @@ def add_file(self, fileGrp, mimetype=None, url=None, ID=None, pageId=None, force
             mets_file = next(self.find_files(ID=ID, fileGrp=fileGrp), None)
             if mets_file:
                 if mets_file.fileGrp == fileGrp and \
-                   mets_file.pageId == pageId and \
-                   mets_file.mimetype == mimetype:
+                        mets_file.pageId == pageId and \
+                        mets_file.mimetype == mimetype:
                     if not force:
-                        raise FileExistsError(f"A file with ID=={ID} already exists {mets_file} and neither force nor ignore are set")
+                        raise FileExistsError(
+                            f"A file with ID=={ID} already exists {mets_file} and neither force nor ignore are set")
                     self.remove_file(ID=ID, fileGrp=fileGrp)
                 else:
-                    raise FileExistsError(f"A file with ID=={ID} already exists {mets_file} but unrelated - cannot mitigate")
+                    raise FileExistsError(
+                        f"A file with ID=={ID} already exists {mets_file} but unrelated - cannot mitigate")
 
         # To get rid of Python's FutureWarning - checking if v is not None
-        kwargs = {k: v for k, v in locals().items() if k in ['url', 'ID', 'mimetype', 'pageId', 'local_filename'] and v is not None}
+        kwargs = {k: v for k, v in locals().items() if
+                  k in ['url', 'ID', 'mimetype', 'pageId', 'local_filename'] and v is not None}
         # This separation is needed to reuse the same el_mets_file element in the caching if block
         el_mets_file = ET.SubElement(el_fileGrp, TAG_METS_FILE)
         # The caching of the physical page is done in the OcrdFile constructor
@@ -521,7 +511,7 @@ def remove_file(self, *args, **kwargs):
             if len(files) > 1:
                 return files
             else:
-                return files[0] # for backwards-compatibility
+                return files[0]  # for backwards-compatibility
         if any(1 for kwarg in kwargs
                if isinstance(kwarg, str) and kwarg.startswith(REGEX_PREFIX)):
             # allow empty results if filter criteria involve a regex
@@ -571,8 +561,9 @@ def remove_one_file(self, ID, fileGrp=None):
                 page_div.getparent().remove(page_div)
                 # Delete the empty pages from caches as well
                 if self._cache_flag:
-                    del self._page_cache[page_div.get('ID')]
-                    del self._fptr_cache[page_div.get('ID')]
+                    for attr in METS_PAGE_DIV_ATTRIBUTE:
+                        if attr.name in page_div.attrib:
+                            del self._page_cache[attr][page_div.attrib[attr.name]]
 
         # Delete the file reference from the cache
         if self._cache_flag:
@@ -591,33 +582,139 @@ def physical_pages(self):
         List all page IDs (the ``@ID`` of each physical ``mets:structMap`` ``mets:div``)
         """
         if self._cache_flag:
-            return list(self._page_cache.keys())
-
+            return list(self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID].keys())
+
         return [str(x) for x in self._tree.getroot().xpath(
             'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/@ID',
             namespaces=NS)]
 
-    def get_physical_pages(self, for_fileIds=None):
+    def get_physical_pages(self, for_fileIds : Optional[str] = None, for_pageIds : Optional[str] = None, return_divs : bool = False):
         """
         List all page IDs (the ``@ID`` of each physical ``mets:structMap`` ``mets:div``),
-        optionally for a subset of ``mets:file`` ``@ID`` :py:attr:`for_fileIds`.
+        optionally for a subset of ``mets:file`` ``@ID`` :py:attr:`for_fileIds`,
+        or for a subset selector expression (comma-separated, range, and/or regex) :py:attr:`for_pageIds`.
+        If return_divs is set, returns div memory objects instead of strings of ids
         """
-        if for_fileIds is None:
+        if for_fileIds is None and for_pageIds is None:
             return self.physical_pages
+        # log = getLogger('ocrd.models.ocrd_mets.get_physical_pages')
+        if for_pageIds is not None:
+            ret = []
+            page_attr_patterns = []
+            page_attr_patterns_raw = re.split(r',', for_pageIds)
+            for pageId_token in page_attr_patterns_raw:
+                if pageId_token.startswith(REGEX_PREFIX):
+                    page_attr_patterns.append((None, re.compile(pageId_token[REGEX_PREFIX_LEN:])))
+                elif '..' in pageId_token:
+                    val_range = generate_range(*pageId_token.split('..', 1))
+                    page_attr_patterns.append(val_range)
+                else:
+                    page_attr_patterns.append(pageId_token)
+            if not page_attr_patterns:
+                return []
+            range_patterns_first_last = [(x[0], x[-1]) if isinstance(x, list) else None for x in page_attr_patterns]
+            page_attr_patterns_copy = list(page_attr_patterns)
+            if self._cache_flag:
+                for pat in page_attr_patterns:
+                    try:
+                        attr : METS_PAGE_DIV_ATTRIBUTE
+                        if isinstance(pat, str):
+                            attr = next(a for a in list(METS_PAGE_DIV_ATTRIBUTE) if pat in self._page_cache[a])
+                            cache_keys = [pat]
+                        elif isinstance(pat, list):
+                            attr = next(a for a in list(METS_PAGE_DIV_ATTRIBUTE) if any(x in self._page_cache[a] for x in pat))
+                            cache_keys = [v for v in pat if v in self._page_cache[attr]]
+                            for k in cache_keys:
+                                pat.remove(k)
+                        elif isinstance(pat, tuple):
+                            _, re_pat = pat
+                            attr = next(a for a in list(METS_PAGE_DIV_ATTRIBUTE) for v in self._page_cache[a] if re_pat.fullmatch(v))
+                            cache_keys = [v for v in self._page_cache[attr] if re_pat.fullmatch(v)]
+                        else:
+                            raise ValueError
+                        if return_divs:
+                            ret += [self._page_cache[attr][v] for v in cache_keys]
+                        else:
+                            ret += [self._page_cache[attr][v].get('ID') for v in cache_keys]
+                    except StopIteration:
+                        raise ValueError(f"{pat} matches none of the keys of any of the _page_caches.")
+            else:
+                page_attr_patterns_matched = []
+                for page in self._tree.getroot().xpath(
+                        'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]',
+                        namespaces=NS):
+                    patterns_exhausted = []
+                    for pat_idx, pat in enumerate(page_attr_patterns):
+                        try:
+                            if isinstance(pat, str):
+                                attr = next(a for a in list(METS_PAGE_DIV_ATTRIBUTE) if pat == page.get(a.name))
+                                ret.append(page if return_divs else page.get('ID'))
+                                patterns_exhausted.append(pat)
+                            elif isinstance(pat, list):
+                                if not isinstance(pat[0], METS_PAGE_DIV_ATTRIBUTE):
+                                    pat.insert(0, next(a for a in list(METS_PAGE_DIV_ATTRIBUTE) if any(x == page.get(a.name) for x in pat)))
+                                attr_val = page.get(pat[0].name)
+                                if attr_val in pat:
+                                    pat.remove(attr_val)
+                                    ret.append(page if return_divs else page.get('ID'))
+                                if len(pat) == 1:
+                                    patterns_exhausted.append(pat)
+                            elif isinstance(pat, tuple):
+                                attr, re_pat = pat
+                                if not attr:
+                                    attr = next(a for a in list(METS_PAGE_DIV_ATTRIBUTE) if re_pat.fullmatch(page.get(a.name) or ''))
+                                    page_attr_patterns[pat_idx] = (attr, re_pat)
+                                if re_pat.fullmatch(page.get(attr.name) or ''):
+                                    ret.append(page if return_divs else page.get('ID'))
+                            else:
+                                raise ValueError
+                            page_attr_patterns_matched.append(pat)
+                        except StopIteration:
+                            continue
+                    for p in patterns_exhausted:
+                        page_attr_patterns.remove(p)
+                unmatched = [x for x in page_attr_patterns_copy if x not in page_attr_patterns_matched]
+                if unmatched:
+                    raise ValueError(f"Patterns {unmatched} match none of the pages")
+
+            ranges_without_start_match = []
+            ranges_without_last_match = []
+            for idx, pat in enumerate(page_attr_patterns_copy):
+                if isinstance(pat, list):
+                    start, last = range_patterns_first_last[idx]
+                    if start in pat:
+                        ranges_without_start_match.append(page_attr_patterns_raw[idx])
+                    # if last in pat:
+                    #     ranges_without_last_match.append(page_attr_patterns_raw[idx])
+            if ranges_without_start_match:
+                raise ValueError(f"Start of range patterns {ranges_without_start_match} not matched - invalid range")
+            # if ranges_without_last_match:
+            #     raise ValueError(f"End of range patterns {ranges_without_last_match} not matched - invalid range")
+            return ret
+
+        assert for_fileIds  # at this point we know for_fileIds is set, assert to convince pyright
         ret = [None] * len(for_fileIds)
-
         if self._cache_flag:
             for pageId in self._fptr_cache.keys():
                 for fptr in self._fptr_cache[pageId].keys():
                     if fptr in for_fileIds:
-                        ret[for_fileIds.index(fptr)] = pageId
+                        index = for_fileIds.index(fptr)
+                        if return_divs:
+                            ret[index] = self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID][pageId]
+                        else:
+                            ret[index] = pageId
         else:
-            for page in self._tree.getroot().xpath(
-                'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]',
-                namespaces=NS):
-                for fptr in page.findall('mets:fptr', NS):
-                    if fptr.get('FILEID') in for_fileIds:
-                        ret[for_fileIds.index(fptr.get('FILEID'))] = page.get('ID')
+            for page in self._tree.getroot().xpath(
+                    'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]',
+                    namespaces=NS):
+                for fptr in page.findall('mets:fptr', NS):
+                    if fptr.get('FILEID') in for_fileIds:
+                        index = for_fileIds.index(fptr.get('FILEID'))
+                        if return_divs:
+                            ret[index] = page
+                        else:
+                            ret[index] = page.get('ID')
         return ret
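The selector grammar accepted by for_pageIds (and, through find_files(pageId=...), by every consumer) combines comma-separated literals, .. ranges and // regexes over any of the five div attributes. Hedged examples, mirroring the test fixtures further below (an OcrdMets ``mets`` whose pages PHYS_0001..PHYS_0005 carry @ORDER 1..5 and @ORDERLABEL "page 1".."page 5"):

    mets.get_physical_pages(for_pageIds='PHYS_0001..PHYS_0003')  # range over @ID
    mets.get_physical_pages(for_pageIds='1..3')                  # range over @ORDER
    mets.get_physical_pages(for_pageIds='page 1..page 2,5')      # @ORDERLABEL range plus an @ORDER literal
    mets.get_physical_pages(for_pageIds='//PHYS_000(1|2)')       # regex, tried against every attribute
    divs = mets.get_physical_pages(for_pageIds='1..3', return_divs=True)  # mets:div elements, not @ID strings
    # a pattern that matches no page raises ValueError, as does a range whose start matches nothing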
 
     def set_physical_page_for_file(self, pageId, ocrd_file, order=None, orderlabel=None):
@@ -658,14 +755,14 @@ def set_physical_page_for_file(self, pageId, ocrd_file, order=None, orderlabel=N
         if el_seqdiv is None:
             el_seqdiv = ET.SubElement(el_structmap, TAG_METS_DIV)
             el_seqdiv.set('TYPE', 'physSequence')
-
+
         el_pagediv = None
         if self._cache_flag:
-            if pageId in self._page_cache:
-                el_pagediv = self._page_cache[pageId]
+            if pageId in self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID]:
+                el_pagediv = self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID][pageId]
         else:
             el_pagediv = el_seqdiv.find('mets:div[@ID="%s"]' % pageId, NS)
-
+
         if el_pagediv is None:
             el_pagediv = ET.SubElement(el_seqdiv, TAG_METS_DIV)
             el_pagediv.set('TYPE', 'page')
@@ -676,33 +773,33 @@
                 el_pagediv.set('ORDERLABEL', orderlabel)
             if self._cache_flag:
                 # Create a new entry in the page cache
-                self._page_cache[pageId] = el_pagediv
+                self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID][pageId] = el_pagediv
                 # Create a new entry in the fptr cache and
                 # assign an empty dictionary to hold the fileids
                 self._fptr_cache[pageId] = {}
-
+
         el_fptr = ET.SubElement(el_pagediv, TAG_METS_FPTR)
         el_fptr.set('FILEID', ocrd_file.ID)
 
         if self._cache_flag:
             # Assign the ocrd fileID to the pageId in the cache
-            self._fptr_cache[el_pagediv.get('ID')].update({ocrd_file.ID : el_fptr})
+            self._fptr_cache[el_pagediv.get('ID')].update({ocrd_file.ID: el_fptr})
 
     def update_physical_page_attributes(self, page_id, **kwargs):
-        mets_div = None
-        if self._cache_flag:
-            if page_id in self._page_cache.keys():
-                mets_div = [self._page_cache[page_id]]
-        else:
-            mets_div = self._tree.getroot().xpath(
-                'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"][@ID="%s"]' % page_id,
-                namespaces=NS)
-        if mets_div:
-            for attr_name, attr_value in kwargs.items():
-                if attr_value:
-                    mets_div[0].set(attr_name.upper(), attr_value)
-        else:
-            warn("Could not find mets:div[@ID={page_id}]")
+        invalid_keys = list(k for k in kwargs.keys() if k not in METS_PAGE_DIV_ATTRIBUTE.names())
+        if invalid_keys:
+            raise ValueError(f"Invalid attribute {invalid_keys}. Allowed values: {METS_PAGE_DIV_ATTRIBUTE.names()}")
+
+        page_div = self.get_physical_pages(for_pageIds=page_id, return_divs=True)
+        if not page_div:
+            raise ValueError(f"Could not find mets:div[@ID=={page_id}]")
+        page_div = page_div[0]
+
+        for k, v in kwargs.items():
+            if not v:
+                page_div.attrib.pop(k)
+            else:
+                page_div.attrib[k] = v
 
     def get_physical_page_for_file(self, ocrd_file):
         """
@@ -713,7 +810,7 @@ def get_physical_page_for_file(self, ocrd_file):
         if self._cache_flag:
             for pageId in self._fptr_cache.keys():
                 if ocrd_file.ID in self._fptr_cache[pageId].keys():
-                    ret.append(self._page_cache[pageId].get('ID'))
+                    ret.append(self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID][pageId].get('ID'))
         else:
             ret = self._tree.getroot().xpath(
                 '/mets:mets/mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"][./mets:fptr[@FILEID="%s"]]/@ID' %
@@ -729,16 +826,19 @@ def remove_physical_page(self, ID):
         """
         mets_div = None
         if self._cache_flag:
-            if ID in self._page_cache.keys():
-                mets_div = [self._page_cache[ID]]
+            if ID in self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID].keys():
+                mets_div = [self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID][ID]]
         else:
             mets_div = self._tree.getroot().xpath(
                 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"][@ID="%s"]' % ID,
                 namespaces=NS)
         if mets_div:
+            mets_div_attrib = {**mets_div[0].attrib}
             mets_div[0].getparent().remove(mets_div[0])
             if self._cache_flag:
-                del self._page_cache[ID]
+                for attr in METS_PAGE_DIV_ATTRIBUTE:
+                    if attr.name in mets_div_attrib:
+                        del self._page_cache[attr][mets_div_attrib[attr.name]]
                 del self._fptr_cache[ID]
 
     def remove_physical_page_fptr(self, fileId):
@@ -757,10 +857,11 @@ def remove_physical_page_fptr(self, fileId):
         if self._cache_flag:
             for page_id in self._fptr_cache.keys():
                 if fileId in self._fptr_cache[page_id].keys():
-                    mets_fptrs.append(self._fptr_cache[page_id][fileId]) 
+                    mets_fptrs.append(self._fptr_cache[page_id][fileId])
         else:
             mets_fptrs = self._tree.getroot().xpath(
-                'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/mets:fptr[@FILEID="%s"]' % fileId, namespaces=NS)
+                'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/mets:fptr[@FILEID="%s"]' % fileId,
+                namespaces=NS)
         ret = []
         for mets_fptr in mets_fptrs:
             mets_div = mets_fptr.getparent()
@@ -770,7 +871,20 @@ def remove_physical_page_fptr(self, fileId):
             mets_div.remove(mets_fptr)
         return ret
 
-    def merge(self, other_mets, force=False, fileGrp_mapping=None, fileId_mapping=None, pageId_mapping=None, after_add_cb=None, **kwargs):
+    @property
+    def physical_pages_labels(self):
+        """
+        Map all page IDs (the ``@ID`` of each physical ``mets:structMap`` ``mets:div``) to their
+        ``@ORDER``, ``@ORDERLABEL`` and ``@LABEL`` attributes, if any.
+        """
+        divs = self._tree.getroot().xpath(
+            'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]',
+            namespaces=NS)
+        return {div.get('ID'): (div.get('ORDER', None), div.get('ORDERLABEL', None), div.get('LABEL', None))
+                for div in divs}
+
+    def merge(self, other_mets, force=False, fileGrp_mapping=None, fileId_mapping=None, pageId_mapping=None,
+              after_add_cb=None, **kwargs):
         """
         Add all files from other_mets.
 
         Accepts the same kwargs as :py:func:`find_files`
@@ -789,14 +903,15 @@ def merge(self, other_mets, force=False, fileGrp_mapping=None, fileId_mapping=No
             pageId_mapping = {}
         for f_src in other_mets.find_files(**kwargs):
             f_dest = self.add_file(
-                    fileGrp_mapping.get(f_src.fileGrp, f_src.fileGrp),
-                    mimetype=f_src.mimetype,
-                    url=f_src.url,
-                    local_filename=f_src.local_filename,
-                    ID=fileId_mapping.get(f_src.ID, f_src.ID),
-                    pageId=pageId_mapping.get(f_src.pageId, f_src.pageId),
-                    force=force)
+                fileGrp_mapping.get(f_src.fileGrp, f_src.fileGrp),
+                mimetype=f_src.mimetype,
+                url=f_src.url,
+                local_filename=f_src.local_filename,
+                ID=fileId_mapping.get(f_src.ID, f_src.ID),
+                pageId=pageId_mapping.get(f_src.pageId, f_src.pageId),
+                force=force)
             # FIXME: merge metsHdr, amdSec, dmdSec as well
             # FIXME: merge structMap logical and structLink as well
             if after_add_cb:
                 after_add_cb(f_dest)
+
diff --git a/src/ocrd_utils/str.py b/src/ocrd_utils/str.py
index 406fcc975..6a9986ae7 100644
--- a/src/ocrd_utils/str.py
+++ b/src/ocrd_utils/str.py
@@ -196,7 +196,7 @@ def safe_filename(url):
     # print('safe filename: %s -> %s' % (url, ret))
     return ret
 
-def generate_range(start, end):
+def generate_range(start : str, end : str) -> List[str]:
     """
     Generate a list of strings by incrementing the number part of ``start`` until including ``end``.
     """
@@ -205,6 +205,8 @@ def generate_range(start, end):
         start_num, end_num = re.findall(r'\d+', start)[-1], re.findall(r'\d+', end)[-1]
     except IndexError:
         raise ValueError("Range '%s..%s': could not find numeric part" % (start, end))
+    if start[:-len(start_num)] != end[:-len(end_num)]:
+        raise ValueError(f"Range '{start}..{end}' differ in their non-numeric part: '{start[:-len(start_num)]}' != '{end[:-len(end_num)]}'")
     if start_num == end_num:
         warn("Range '%s..%s': evaluates to the same number")
     for i in range(int(start_num), int(end_num) + 1):
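With the new guard, generate_range refuses ranges whose endpoints disagree outside their numeric part; examples mirroring tests/test_utils.py:

    from ocrd_utils import generate_range

    generate_range('PHYS_0001', 'PHYS_0003')  # ['PHYS_0001', 'PHYS_0002', 'PHYS_0003']
    generate_range('1', '3')                  # ['1', '2', '3']
    generate_range('1', 'PHYS_0003')          # ValueError: differ in their non-numeric part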
["PHYS_0024"], ["PHYS_0025"], ["PHYS_0026"], ["PHYS_0027"], ["PHYS_0028"], ["PHYS_0029"]]]' assert _call(['-f', 'comma-separated', '-R', '5..5']) == 'PHYS_0005' assert _call(['-f', 'comma-separated', '-R', '6..8']) == 'PHYS_0006,PHYS_0008,PHYS_0009' + assert _call(['-f', 'comma-separated', '-r', '1..5']) == 'PHYS_0001,PHYS_0002,PHYS_0003,PHYS_0004,PHYS_0005' + assert _call(['-f', 'comma-separated', '-r', '2..3']) == 'PHYS_0002,PHYS_0003' + assert _call(['-f', 'comma-separated', '-r', 'page 2..page 3']) == 'PHYS_0002,PHYS_0003' assert _call(['-f', 'comma-separated', '-r', 'PHYS_0006..PHYS_0009']) == 'PHYS_0006,PHYS_0008,PHYS_0009' assert _call(['-f', 'comma-separated', '-r', 'PHYS_0001..PHYS_0010', '-D', '3']) == 'PHYS_0001,PHYS_0002,PHYS_0003\nPHYS_0004,PHYS_0005,PHYS_0006\nPHYS_0008,PHYS_0009,PHYS_0010' assert _call(['-f', 'comma-separated', '-r', 'PHYS_0001..PHYS_0010', '-D', '3', '-C', '2']) == 'PHYS_0008,PHYS_0009,PHYS_0010' from json import loads - assert loads(_call(['-f', 'json', '-r', 'PHYS_0001..PHYS_0010', '-D', '3', '-C', '2'])) == [['PHYS_0008', 'PHYS_0009', 'PHYS_0010']] + assert loads(_call(['-f', 'json', '-r', 'PHYS_0001..PHYS_0010', '-D', '3', '-C', '2'])) == [[['PHYS_0008'], ['PHYS_0009'], ['PHYS_0010']]] + assert loads(_call(['-f', 'json', '-r', 'PHYS_0001..PHYS_0010', '-k', 'ID', '-k', 'ORDERLABEL', '-D', '3', '-C', '2'])) == \ + [[['PHYS_0008', 'page 7'], ['PHYS_0009', 'page 8'], ['PHYS_0010', 'page 9']]] if __name__ == '__main__': main(__file__) diff --git a/tests/data/list-page-workspace/mets.xml b/tests/data/list-page-workspace/mets.xml index 2786bfe07..5f5d0e306 100644 --- a/tests/data/list-page-workspace/mets.xml +++ b/tests/data/list-page-workspace/mets.xml @@ -99,89 +99,199 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + + - + + - + + - + + - + + - + + - + + - + + - + + - + + - + + - + + - + + - + + - + + - + + - + + - + + - + + - + + - + + - + + - + + - + + - + + - + + - + + diff --git a/tests/model/test_ocrd_mets.py b/tests/model/test_ocrd_mets.py index 64ea1eccf..9634886e1 100644 --- a/tests/model/test_ocrd_mets.py +++ b/tests/model/test_ocrd_mets.py @@ -5,8 +5,8 @@ from os.path import join from os import environ from contextlib import contextmanager +import re import shutil -from logging import StreamHandler import lxml from tests.base import ( @@ -16,9 +16,6 @@ ) from ocrd_utils import ( - initLogging, - disableLogging, - getLogger, VERSION, MIMETYPE_PAGE ) @@ -76,24 +73,43 @@ def test_file_groups(sbb_sample_01): def test_find_all_files(sbb_sample_01): - assert len(sbb_sample_01.find_all_files()) == 35, '35 files total' - assert len(sbb_sample_01.find_all_files(fileGrp='OCR-D-IMG')) == 3, '3 files in "OCR-D-IMG"' - assert len(sbb_sample_01.find_all_files(include_fileGrp='OCR-D-IMG')) == 3, '3 files in "OCR-D-IMG"' - assert len(sbb_sample_01.find_all_files(fileGrp='//OCR-D-I.*')) == 13, '13 files in "//OCR-D-I.*"' - assert len(sbb_sample_01.find_all_files(fileGrp='//OCR-D-I.*', exclude_fileGrp=['OCR-D-IMG'])) == 10, '10 files in "//OCR-D-I.*" sans OCR-D-IMG' - assert len(sbb_sample_01.find_all_files(ID="FILE_0001_IMAGE")) == 1, '1 files with ID "FILE_0001_IMAGE"' - assert len(sbb_sample_01.find_all_files(ID="//FILE_0005_.*")) == 1, '1 files with ID "//FILE_0005_.*"' - assert len(sbb_sample_01.find_all_files(pageId='PHYS_0001')) == 17, '17 files for page "PHYS_0001"' - assert len(sbb_sample_01.find_all_files(pageId='PHYS_0001-NOTEXIST')) 
-    assert len(sbb_sample_01.find_all_files(pageId='PHYS_0001-NOTEXIST')) == 0, '0 pages for "PHYS_0001-NOTEXIST"'
-    assert len(sbb_sample_01.find_all_files(mimetype='image/tiff')) == 13, '13 image/tiff'
-    assert len(sbb_sample_01.find_all_files(mimetype='//application/.*')) == 22, '22 application/.*'
-    assert len(sbb_sample_01.find_all_files(mimetype=MIMETYPE_PAGE)) == 20, '20 ' + MIMETYPE_PAGE
-    assert len(sbb_sample_01.find_all_files(local_filename='OCR-D-IMG/FILE_0005_IMAGE.tif')) == 1, '1 FILE xlink:href="OCR-D-IMG/FILE_0005_IMAGE.tif"'
-    assert len(sbb_sample_01.find_all_files(url='https://github.com/OCR-D/assets/raw/master/data/SBB0000F29300010000/00000001_DESKEW.tif')) == 1, '1 URL xlink:href="https://github.com/OCR-D/assets/raw/master/data/SBB0000F29300010000/00000001_DESKEW.tif"'
-    assert len(sbb_sample_01.find_all_files(pageId='PHYS_0001..PHYS_0005')) == 35, '35 files for page "PHYS_0001..PHYS_0005"'
-    assert len(sbb_sample_01.find_all_files(pageId='//PHYS_000(1|2)')) == 34, '34 files in PHYS_001 and PHYS_0002'
-    assert len(sbb_sample_01.find_all_files(pageId='//PHYS_0001,//PHYS_0005')) == 18, '18 files in PHYS_001 and PHYS_0005 (two regexes)'
-    assert len(sbb_sample_01.find_all_files(pageId='//PHYS_0005,PHYS_0001..PHYS_0002')) == 35, '35 files in //PHYS_0005,PHYS_0001..PHYS_0002'
+    mets = sbb_sample_01
+    assert len(mets.find_all_files()) == 35, '35 files total'
+    assert len(mets.find_all_files(fileGrp='OCR-D-IMG')) == 3, '3 files in "OCR-D-IMG"'
+    assert len(mets.find_all_files(include_fileGrp='OCR-D-IMG')) == 3, '3 files in "OCR-D-IMG"'
+    assert len(mets.find_all_files(fileGrp='//OCR-D-I.*')) == 13, '13 files in "//OCR-D-I.*"'
+    assert len(mets.find_all_files(fileGrp='//OCR-D-I.*', exclude_fileGrp=['OCR-D-IMG'])) == 10, '10 files in "//OCR-D-I.*" sans OCR-D-IMG'
+    assert len(mets.find_all_files(ID="FILE_0001_IMAGE")) == 1, '1 files with ID "FILE_0001_IMAGE"'
+    assert len(mets.find_all_files(ID="//FILE_0005_.*")) == 1, '1 files with ID "//FILE_0005_.*"'
+    assert len(mets.find_all_files(pageId='PHYS_0001')) == 17, '17 files for page "PHYS_0001"'
+    assert len(mets.find_all_files(mimetype='image/tiff')) == 13, '13 image/tiff'
+    assert len(mets.find_all_files(mimetype='//application/.*')) == 22, '22 application/.*'
+    assert len(mets.find_all_files(mimetype=MIMETYPE_PAGE)) == 20, '20 ' + MIMETYPE_PAGE
+    assert len(mets.find_all_files(local_filename='OCR-D-IMG/FILE_0005_IMAGE.tif')) == 1, '1 FILE xlink:href="OCR-D-IMG/FILE_0005_IMAGE.tif"'
+    assert len(mets.find_all_files(url='https://github.com/OCR-D/assets/raw/master/data/SBB0000F29300010000/00000001_DESKEW.tif')) == 1, '1 URL xlink:href="https://github.com/OCR-D/assets/raw/master/data/SBB0000F29300010000/00000001_DESKEW.tif"'
+    assert len(mets.find_all_files(pageId='PHYS_0001..PHYS_0005')) == 35, '35 files for page "PHYS_0001..PHYS_0005"'
+    assert len(mets.find_all_files(pageId='//PHYS_000(1|2)')) == 34, '34 files in PHYS_001 and PHYS_0002'
+    assert len(mets.find_all_files(pageId='//PHYS_0001,//PHYS_0005')) == 18, '18 files in PHYS_001 and PHYS_0005 (two regexes)'
+    assert len(mets.find_all_files(pageId='//PHYS_0005,PHYS_0001..PHYS_0002')) == 35, '35 files in //PHYS_0005,PHYS_0001..PHYS_0002'
+    assert len(mets.find_all_files(pageId='1..10')) == 35, '35 files in @ORDER range 1..10'
+    assert len(mets.find_all_files(pageId='1..5')) == 35, '35 files in @ORDER range 1..5'
+    assert len(mets.find_all_files(pageId='PHYS_0001,PHYS_0002,PHYS_0005')) == 35, '35 in PHYS_0001,PHYS_0002,PHYS_0005'
+    assert len(mets.find_all_files(pageId='PHYS_0001..PHYS_0002,PHYS_0005')) == 35, '35 in PHYS_0001,PHYS_0002,PHYS_0005'
+    assert len(mets.find_all_files(pageId='page 1..page 2,5')) == 35, '35 in PHYS_0001,PHYS_0002,PHYS_0005'
+    assert len(mets.find_all_files(pageId='PHYS_0005,1..2')) == 35, '35 in PHYS_0001,PHYS_0002,PHYS_0005'
+    with pytest.raises(ValueError, match='differ in their non-numeric part'):
+        len(mets.find_all_files(pageId='1..PHYS_0002'))
+    with pytest.raises(ValueError, match=re.compile('match(es)? none')):
+        mets.find_all_files(pageId='PHYS_0006..PHYS_0029')
+    with pytest.raises(ValueError, match=re.compile('match(es)? none')):
+        mets.find_all_files(pageId='PHYS_0001-NOTEXIST')
+    with pytest.raises(ValueError, match=re.compile('match(es)? none')):
+        mets.find_all_files(pageId='1..5,PHYS_0006..PHYS_0029')
+    with pytest.raises(ValueError, match=re.compile('match(es)? none')):
+        mets.find_all_files(pageId='//PHYS000.*')
+    with pytest.raises(ValueError, match=re.compile('Start of range pattern')):
+        mets.find_all_files(pageId='PHYS_0000..PHYS_0004')
 
 def test_find_all_files_local_only(sbb_sample_01):
     assert len(sbb_sample_01.find_all_files(pageId='PHYS_0001',
@@ -392,7 +408,7 @@ def test_update_physical_page_attributes(sbb_directory_ocrd_mets):
     assert len(m.physical_pages) == 1
     assert b'ORDER' not in m.to_xml()
     assert b'ORDERLABEL' not in m.to_xml()
-    m.update_physical_page_attributes('new page', order='foo', orderlabel='bar')
+    m.update_physical_page_attributes('new page', ORDER='foo', ORDERLABEL='bar')
     assert b'ORDER' in m.to_xml()
     assert b'ORDERLABEL' in m.to_xml()
diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py
index c6d560ba1..be4cc56ed 100644
--- a/tests/processor/test_processor.py
+++ b/tests/processor/test_processor.py
@@ -204,7 +204,7 @@ class ZipTestProcessor(Processor): pass
         ws = self.resolver.workspace_from_nothing(directory=tempdir)
         ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar1', page_id=None)
         ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, file_id='foobar2', page_id='phys_0001')
-        for page_id in [None, 'phys_0001,phys_0002']:
+        for page_id in [None, 'phys_0001']:
             with self.subTest(page_id=page_id):
                 proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id)
                 assert [(one, two.ID) for one, two in proc.zip_input_files(require_first=False)] == [(None, 'foobar2')]
diff --git a/tests/test_mets_server.py b/tests/test_mets_server.py
index f61de4baf..3487afd16 100644
--- a/tests/test_mets_server.py
+++ b/tests/test_mets_server.py
@@ -1,6 +1,7 @@
-from collections.abc import Generator
+import re
 from typing import Iterable, Tuple
 from pytest import fixture, raises
+import pytest
 
 from tests.base import assets
 from itertools import repeat
@@ -187,22 +188,44 @@ def test_mets_server_socket_stop(start_mets_server):
 
 def test_find_all_files(start_mets_server : Tuple[str, Workspace]):
     _, workspace_server = start_mets_server
-    assert len(workspace_server.mets.find_all_files()) == 35, '35 files total'
-    assert len(workspace_server.mets.find_all_files(fileGrp='OCR-D-IMG')) == 3, '3 files in "OCR-D-IMG"'
-    assert len(workspace_server.mets.find_all_files(fileGrp='//OCR-D-I.*')) == 13, '13 files in "//OCR-D-I.*"'
-    assert len(workspace_server.mets.find_all_files(ID="FILE_0001_IMAGE")) == 1, '1 files with ID "FILE_0001_IMAGE"'
-    assert len(workspace_server.mets.find_all_files(ID="//FILE_0005_.*")) == 1, '1 files with ID "//FILE_0005_.*"'
-    assert len(workspace_server.mets.find_all_files(pageId='PHYS_0001')) == 17, '17 files for page "PHYS_0001"'
-    assert len(workspace_server.mets.find_all_files(pageId='PHYS_0001-NOTEXIST')) == 0, '0 pages for "PHYS_0001-NOTEXIST"'
-    assert len(workspace_server.mets.find_all_files(mimetype='image/tiff')) == 13, '13 image/tiff'
-    assert len(workspace_server.mets.find_all_files(mimetype='//application/.*')) == 22, '22 application/.*'
-    assert len(workspace_server.mets.find_all_files(mimetype=MIMETYPE_PAGE)) == 20, '20 ' + MIMETYPE_PAGE
-    assert len(workspace_server.mets.find_all_files(local_filename='OCR-D-IMG/FILE_0005_IMAGE.tif')) == 1, '1 FILE xlink:href="OCR-D-IMG/FILE_0005_IMAGE.tif"'
-    assert len(workspace_server.mets.find_all_files(url='https://github.com/OCR-D/assets/raw/master/data/SBB0000F29300010000/00000001_DESKEW.tif')) == 1, '1 URL xlink:href="https://github.com/OCR-D/assets/raw/master/data/SBB0000F29300010000/00000001_DESKEW.tif"'
-    assert len(workspace_server.mets.find_all_files(pageId='PHYS_0001..PHYS_0005')) == 35, '35 files for page "PHYS_0001..PHYS_0005"'
-    assert len(workspace_server.mets.find_all_files(pageId='//PHYS_000(1|2)')) == 34, '34 files in PHYS_001 and PHYS_0002'
-    assert len(workspace_server.mets.find_all_files(pageId='//PHYS_0001,//PHYS_0005')) == 18, '18 files in PHYS_001 and PHYS_0005 (two regexes)'
-    assert len(workspace_server.mets.find_all_files(pageId='//PHYS_0005,PHYS_0001..PHYS_0002')) == 35, '35 files in //PHYS_0005,PHYS_0001..PHYS_0002'
+    mets = workspace_server.mets
+    assert len(mets.find_all_files()) == 35, '35 files total'
+    assert len(mets.find_all_files(fileGrp='OCR-D-IMG')) == 3, '3 files in "OCR-D-IMG"'
+    # TODO https://github.com/OCR-D/core/issues/1185
+    # assert len(mets.find_all_files(include_fileGrp='OCR-D-IMG')) == 3, '3 files in "OCR-D-IMG"'
+    assert len(mets.find_all_files(fileGrp='//OCR-D-I.*')) == 13, '13 files in "//OCR-D-I.*"'
+    # TODO https://github.com/OCR-D/core/issues/1185
+    # assert len(mets.find_all_files(fileGrp='//OCR-D-I.*', exclude_fileGrp=['OCR-D-IMG'])) == 10, '10 files in "//OCR-D-I.*" sans OCR-D-IMG'
+    assert len(mets.find_all_files(ID="FILE_0001_IMAGE")) == 1, '1 files with ID "FILE_0001_IMAGE"'
+    assert len(mets.find_all_files(ID="//FILE_0005_.*")) == 1, '1 files with ID "//FILE_0005_.*"'
+    assert len(mets.find_all_files(pageId='PHYS_0001')) == 17, '17 files for page "PHYS_0001"'
+    assert len(mets.find_all_files(mimetype='image/tiff')) == 13, '13 image/tiff'
+    assert len(mets.find_all_files(mimetype='//application/.*')) == 22, '22 application/.*'
+    assert len(mets.find_all_files(mimetype=MIMETYPE_PAGE)) == 20, '20 ' + MIMETYPE_PAGE
+    assert len(mets.find_all_files(local_filename='OCR-D-IMG/FILE_0005_IMAGE.tif')) == 1, '1 FILE xlink:href="OCR-D-IMG/FILE_0005_IMAGE.tif"'
+    assert len(mets.find_all_files(url='https://github.com/OCR-D/assets/raw/master/data/SBB0000F29300010000/00000001_DESKEW.tif')) == 1, '1 URL xlink:href="https://github.com/OCR-D/assets/raw/master/data/SBB0000F29300010000/00000001_DESKEW.tif"'
+    assert len(mets.find_all_files(pageId='PHYS_0001..PHYS_0005')) == 35, '35 files for page "PHYS_0001..PHYS_0005"'
+    assert len(mets.find_all_files(pageId='//PHYS_000(1|2)')) == 34, '34 files in PHYS_001 and PHYS_0002'
+    assert len(mets.find_all_files(pageId='//PHYS_0001,//PHYS_0005')) == 18, '18 files in PHYS_001 and PHYS_0005 (two regexes)'
+    assert len(mets.find_all_files(pageId='//PHYS_0005,PHYS_0001..PHYS_0002')) == 35, '35 files in //PHYS_0005,PHYS_0001..PHYS_0002'
+    assert len(mets.find_all_files(pageId='1..10')) == 35, '35 files in @ORDER range 1..10'
+    assert len(mets.find_all_files(pageId='1..5')) == 35, '35 files in @ORDER range 1..5'
+    assert len(mets.find_all_files(pageId='PHYS_0001,PHYS_0002,PHYS_0005')) == 35, '35 in PHYS_0001,PHYS_0002,PHYS_0005'
+    assert len(mets.find_all_files(pageId='PHYS_0001..PHYS_0002,PHYS_0005')) == 35, '35 in PHYS_0001,PHYS_0002,PHYS_0005'
+    assert len(mets.find_all_files(pageId='page 1..page 2,5')) == 35, '35 in PHYS_0001,PHYS_0002,PHYS_0005'
+    assert len(mets.find_all_files(pageId='PHYS_0005,1..2')) == 35, '35 in PHYS_0001,PHYS_0002,PHYS_0005'
+    # TODO https://github.com/OCR-D/core/issues/1185
+    # with pytest.raises(ValueError, match='differ in their non-numeric part'):
+    #     len(mets.find_all_files(pageId='1..PHYS_0002'))
+    # with pytest.raises(ValueError, match=re.compile('match(es)? none')):
+    #     mets.find_all_files(pageId='PHYS_0006..PHYS_0029')
+    # with pytest.raises(ValueError, match=re.compile('match(es)? none')):
+    #     mets.find_all_files(pageId='PHYS_0001-NOTEXIST')
+    # with pytest.raises(ValueError, match=re.compile('match(es)? none')):
+    #     mets.find_all_files(pageId='1..5,PHYS_0006..PHYS_0029')
+    # with pytest.raises(ValueError, match=re.compile('match(es)? none')):
+    #     mets.find_all_files(pageId='//PHYS000.*')
 
 def test_reload(start_mets_server : Tuple[str, Workspace]):
     _, workspace_server = start_mets_server
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 8fe3fd373..89ff6d90f 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -296,9 +296,15 @@ def test_make_file_id_744():
 def test_generate_range():
     assert generate_range('PHYS_0001', 'PHYS_0005') == ['PHYS_0001', 'PHYS_0002', 'PHYS_0003', 'PHYS_0004', 'PHYS_0005']
     with raises(ValueError, match='could not find numeric part'):
-        generate_range('NONUMBER', 'ALSO_NONUMBER')
+        assert generate_range('NONUMBER', 'ALSO_NONUMBER')
+    with raises(ValueError, match='differ in their non-numeric part'):
+        generate_range('PHYS_0001_123', 'PHYS_0010_123')
+    with raises(ValueError, match='differ in their non-numeric part'):
+        assert generate_range('1', 'PHYS_0005') == 0
+    with raises(ValueError, match='differ in their non-numeric part'):
+        assert generate_range('1', 'page 5') == 0
     with warns(UserWarning, match='same number'):
-        generate_range('PHYS_0001_123', 'PHYS_0010_123') == 'PHYS_0001_123'
+        assert generate_range('PHYS_0001_123', 'PHYS_0001_123') == ['PHYS_0001_123']
 
 def test_safe_filename():
     assert safe_filename('Hello world,!') == 'Hello_world_'
diff --git a/tests/validator/test_workspace_validator.py b/tests/validator/test_workspace_validator.py
index 39c03bd84..bc516d5a5 100644
--- a/tests/validator/test_workspace_validator.py
+++ b/tests/validator/test_workspace_validator.py
@@ -41,20 +41,20 @@ def test_check_file_grp_basic(self):
 
     def test_check_file_grp_page_id_str(self):
         workspace = self.resolver.workspace_from_url(assets.url_of('SBB0000F29300010000/data/mets.xml'))
-        report = WorkspaceValidator.check_file_grp(workspace, 'OCR-D-IMG', 'OCR-D-IMG-BIN', page_id='PHYS_0003,PHYS_0001')
+        report = WorkspaceValidator.check_file_grp(workspace, 'OCR-D-IMG', 'OCR-D-IMG-BIN', page_id='PHYS_0001')
         self.assertFalse(report.is_valid)
         self.assertEqual(len(report.errors), 1)
         self.assertEqual(report.errors[0], "Output fileGrp[@USE='OCR-D-IMG-BIN'] already contains output for page PHYS_0001")
     def test_check_file_grp_page_id_list(self):
         workspace = self.resolver.workspace_from_url(assets.url_of('SBB0000F29300010000/data/mets.xml'))
-        report = WorkspaceValidator.check_file_grp(workspace, 'OCR-D-IMG', 'OCR-D-IMG-BIN', page_id=['PHYS_0003','PHYS_0001'])
+        report = WorkspaceValidator.check_file_grp(workspace, 'OCR-D-IMG', 'OCR-D-IMG-BIN', page_id=['PHYS_0001'])
         self.assertFalse(report.is_valid)
         self.assertEqual(len(report.errors), 1)
 
     def test_check_file_grp_page_id_valid(self):
         workspace = self.resolver.workspace_from_url(assets.url_of('SBB0000F29300010000/data/mets.xml'))
-        report = WorkspaceValidator.check_file_grp(workspace, 'OCR-D-IMG', 'OCR-D-IMG-BIN', page_id='PHYS_0004')
+        report = WorkspaceValidator.check_file_grp(workspace, 'OCR-D-IMG', 'OCR-D-IMG-BIN', page_id='PHYS_0005')
         self.assertTrue(report.is_valid)
 
     def test_simple(self):