diff --git a/src/ocrd/cli/workspace.py b/src/ocrd/cli/workspace.py
index d77e981dd..e848bd494 100644
--- a/src/ocrd/cli/workspace.py
+++ b/src/ocrd/cli/workspace.py
@@ -23,6 +23,7 @@
from ocrd_utils import getLogger, initLogging, pushd_popd, EXT_TO_MIME, safe_filename, parse_json_string_or_file, partition_list, DEFAULT_METS_BASENAME
from ocrd.decorators import mets_find_options
from . import command_with_replaced_help
+from ocrd_models.constants import METS_PAGE_DIV_ATTRIBUTE
class WorkspaceCtx():
@@ -419,21 +420,22 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, local_fi
@workspace_cli.command('find')
@mets_find_options
@click.option('-k', '--output-field', help="Output field. Repeat for multiple fields, will be joined with tab",
- default=['local_filename'],
- multiple=True,
- type=click.Choice([
- 'url',
- 'mimetype',
- 'page_id',
- 'pageId',
- 'file_id',
- 'ID',
- 'file_grp',
- 'fileGrp',
- 'basename',
- 'basename_without_extension',
- 'local_filename',
- ]))
+ default=['local_filename'],
+ show_default=True,
+ multiple=True,
+ type=click.Choice([
+ 'url',
+ 'mimetype',
+ 'page_id',
+ 'pageId',
+ 'file_id',
+ 'ID',
+ 'file_grp',
+ 'fileGrp',
+ 'basename',
+ 'basename_without_extension',
+ 'local_filename',
+ ]))
@click.option('--download', is_flag=True, help="Download found files to workspace and change location in METS file ")
@click.option('--undo-download', is_flag=True, help="Remove all downloaded files from the METS")
@click.option('--wait', type=int, default=0, help="Wait this many seconds between download requests")
@@ -596,31 +598,60 @@ def list_groups(ctx):
# ----------------------------------------------------------------------
@workspace_cli.command('list-page')
+@click.option('-k', '--output-field', help="Output field. Repeat for multiple fields, will be joined with tab",
+ default=['ID'],
+ show_default=True,
+ multiple=True,
+ type=click.Choice(METS_PAGE_DIV_ATTRIBUTE.names()))
@click.option('-f', '--output-format', help="Output format", type=click.Choice(['one-per-line', 'comma-separated', 'json']), default='one-per-line')
@click.option('-D', '--chunk-number', help="Partition the return value into n roughly equally sized chunks", default=1, type=int)
@click.option('-C', '--chunk-index', help="Output the nth chunk of results, -1 for all of them.", default=None, type=int)
@click.option('-r', '--page-id-range', help="Restrict the pages to those matching the provided range, based on the @ID attribute. Separate start/end with ..")
@click.option('-R', '--numeric-range', help="Restrict the pages to those in the range, in numerical document order. Separate start/end with ..")
@pass_workspace
-def list_pages(ctx, output_format, chunk_number, chunk_index, page_id_range, numeric_range):
+def list_pages(ctx, output_field, output_format, chunk_number, chunk_index, page_id_range, numeric_range):
"""
List physical page IDs
+
+    (If the ``--page-id-range`` filter starts with ``//``, then its remainder
+    will be interpreted as a regular expression.)
"""
workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename)
find_kwargs = {}
- if page_id_range:
+ if page_id_range and 'ID' in output_field:
find_kwargs['pageId'] = page_id_range
- ids = sorted({x.pageId for x in workspace.mets.find_files(**find_kwargs) if x.pageId})
+ page_ids = sorted({x.pageId for x in workspace.mets.find_files(**find_kwargs) if x.pageId})
+ ret = []
+
+    if output_field == ('ID',):  # click passes multiple=True options as a tuple
+ ret = [[x] for x in page_ids]
+ else:
+ for i, page_div in enumerate(workspace.mets.get_physical_pages(for_pageIds=','.join(page_ids), return_divs=True)):
+ ret.append([])
+ for k in output_field:
+ ret[i].append(page_div.get(k, 'None'))
+
if numeric_range:
start, end = map(int, numeric_range.split('..'))
- ids = ids[start-1:end]
- chunks = partition_list(ids, chunk_number, chunk_index)
+ ret = ret[start-1:end]
+
+ chunks = partition_list(ret, chunk_number, chunk_index)
+ lines = []
if output_format == 'one-per-line':
- print("\n".join(["\n".join(chunk) for chunk in chunks]))
+ for chunk in chunks:
+ line_strs = []
+ for entry in chunk:
+ line_strs.append("\t".join(entry))
+ lines.append('\n'.join(line_strs))
elif output_format == 'comma-separated':
- print("\n".join([",".join(chunk) for chunk in chunks]))
+ for chunk in chunks:
+ line_strs = []
+ for entry in chunk:
+ line_strs.append("\t".join(entry))
+ lines.append(','.join(line_strs))
elif output_format == 'json':
- print(dumps(chunks))
+ lines.append(dumps(chunks))
+ print('\n'.join(lines))
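A minimal usage sketch of the new multi-field output (not part of the patch; assumes a workspace in the current directory whose page divs carry @ORDERLABEL, as in the updated test fixture below):

    from click.testing import CliRunner
    from ocrd.cli.workspace import workspace_cli

    runner = CliRunner()
    # one line per page; repeated -k fields are joined with a tab
    result = runner.invoke(workspace_cli, ['list-page', '-k', 'ID', '-k', 'ORDERLABEL'])
    print(result.output)  # e.g. "PHYS_0008\tpage 7" on each line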
# ----------------------------------------------------------------------
# ocrd workspace get-id
@@ -657,18 +688,30 @@ def set_id(ctx, id): # pylint: disable=redefined-builtin
workspace.save_mets()
@workspace_cli.command('update-page')
-@click.option('--order', help="@ORDER attribute for this mets:div", metavar='ORDER')
-@click.option('--orderlabel', help="@ORDERLABEL attribute for this mets:div", metavar='ORDERLABEL')
-@click.option('--contentids', help="@CONTENTIDS attribute for this mets:div", metavar='ORDERLABEL')
+@click.option('--set', 'attr_value_pairs', help=f"Set mets:div ATTR to VALUE. Possible keys: {METS_PAGE_DIV_ATTRIBUTE.names()}", metavar="ATTR VALUE", nargs=2, multiple=True)
+@click.option('--order', help="DEPRECATED - use --set ORDER <value>", metavar='ORDER')
+@click.option('--orderlabel', help="DEPRECATED - use --set ORDERLABEL <value>", metavar='ORDERLABEL')
+@click.option('--contentids', help="DEPRECATED - use --set CONTENTIDS <value>", metavar='CONTENTIDS')
@click.argument('PAGE_ID')
@pass_workspace
-def update_page(ctx, order, orderlabel, contentids, page_id):
+def update_page(ctx, attr_value_pairs, order, orderlabel, contentids, page_id):
"""
- Update the @ORDER, @ORDERLABEL o @CONTENTIDS attributes of the mets:div with @ID=PAGE_ID
+ Update the @ID, @ORDER, @ORDERLABEL, @LABEL or @CONTENTIDS attributes of the mets:div with @ID=PAGE_ID
"""
- workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)
- workspace.mets.update_physical_page_attributes(page_id, order=order, orderlabel=orderlabel, contentids=contentids)
- workspace.save_mets()
+ update_kwargs = {k: v for k, v in attr_value_pairs}
+ if order:
+ update_kwargs['ORDER'] = order
+ if orderlabel:
+ update_kwargs['ORDERLABEL'] = orderlabel
+ if contentids:
+ update_kwargs['CONTENTIDS'] = contentids
+ try:
+ workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)
+ workspace.mets.update_physical_page_attributes(page_id, **update_kwargs)
+ workspace.save_mets()
+ except Exception as err:
+ print(f"Error: {err}")
+ sys.exit(1)
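A short sketch of the generic attribute update (not part of the patch; the page ID and values are hypothetical). The --set pairs arrive as keyword arguments to update_physical_page_attributes, as the test change below confirms:

    from ocrd import Resolver, Workspace

    # equivalent to: ocrd workspace update-page --set ORDER 5 --set ORDERLABEL "page 5" PHYS_0005
    workspace = Workspace(Resolver(), directory='.')
    workspace.mets.update_physical_page_attributes('PHYS_0005', ORDER='5', ORDERLABEL='page 5')
    workspace.save_mets()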
# ----------------------------------------------------------------------
# ocrd workspace merge
diff --git a/src/ocrd_models/constants.py b/src/ocrd_models/constants.py
index 01068b7f3..db6e51e3a 100644
--- a/src/ocrd_models/constants.py
+++ b/src/ocrd_models/constants.py
@@ -1,6 +1,7 @@
"""
Constants for ocrd_models.
"""
+from enum import Enum, auto
from ocrd_utils import resource_string
__all__ = [
@@ -26,6 +27,7 @@
'TAG_PAGE_TEXTLINE',
'TAG_PAGE_TEXTEQUIV',
'TAG_PAGE_TEXTREGION',
+ 'METS_PAGE_DIV_ATTRIBUTE',
]
@@ -70,3 +72,14 @@
'LineDrawing', 'Map', 'Maths', 'Music', 'Noise',
'Separator', 'Table', 'Text', 'Unknown'
]
+
+class METS_PAGE_DIV_ATTRIBUTE(Enum):
+ ID = auto()
+ ORDER = auto()
+ ORDERLABEL = auto()
+ LABEL = auto()
+ CONTENTIDS = auto()
+
+ @classmethod
+ def names(cls):
+ return [x.name for x in cls]
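The enum is the single source of truth for the allowed attribute names; a quick sketch of how the CLI and validation code use it:

    from ocrd_models.constants import METS_PAGE_DIV_ATTRIBUTE

    METS_PAGE_DIV_ATTRIBUTE.names()
    # ['ID', 'ORDER', 'ORDERLABEL', 'LABEL', 'CONTENTIDS']
    METS_PAGE_DIV_ATTRIBUTE['ORDERLABEL']  # lookup by name; raises KeyError for unknown attributes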
diff --git a/src/ocrd_models/ocrd_mets.py b/src/ocrd_models/ocrd_mets.py
index 19ccfb80e..c18bedc4f 100644
--- a/src/ocrd_models/ocrd_mets.py
+++ b/src/ocrd_models/ocrd_mets.py
@@ -3,11 +3,7 @@
"""
from datetime import datetime
import re
-import typing
-from typing_extensions import Optional
-from lxml import etree as ET # type: ignore
-from copy import deepcopy
-from warnings import warn
+from typing import Dict, Optional
from ocrd_utils import (
getLogger,
@@ -33,9 +29,10 @@
IDENTIFIER_PRIORITY,
TAG_MODS_IDENTIFIER,
METS_XML_EMPTY,
+ METS_PAGE_DIV_ATTRIBUTE
)
-from .ocrd_xml_base import OcrdXmlDocument
+from .ocrd_xml_base import OcrdXmlDocument, ET # type: ignore
from .ocrd_file import OcrdFile
from .ocrd_agent import OcrdAgent
@@ -45,6 +42,23 @@ class OcrdMets(OcrdXmlDocument):
"""
API to a single METS file
"""
+    _cache_flag : bool
+    # Cache for the pages (mets:div) - two nested dictionaries
+    # The outer dictionary's Key: a METS_PAGE_DIV_ATTRIBUTE (ID, ORDER, ...)
+    # The outer dictionary's Value: Inner dictionary
+    # The inner dictionary's Key: the value of that attribute on the div
+    # The inner dictionary's Value: the corresponding 'div' element
+    _page_cache : Dict[METS_PAGE_DIV_ATTRIBUTE, Dict[str, ET.Element]]
+    # Cache for the files (mets:file) - two nested dictionaries
+    # The outer dictionary's Key: 'fileGrp.USE'
+    # The outer dictionary's Value: Inner dictionary
+    # The inner dictionary's Key: 'file.ID'
+    # The inner dictionary's Value: the corresponding 'file' element
+    _file_cache : Dict[str, Dict[str, ET.Element]]
+    # Cache for the file pointers (mets:fptr) - two nested dictionaries
+    # The outer dictionary's Key: 'div.ID'
+    # The outer dictionary's Value: Inner dictionary
+    # The inner dictionary's Key: 'fptr.FILEID'
+    # The inner dictionary's Value: the corresponding 'fptr' element
+    _fptr_cache : Dict[str, Dict[str, ET.Element]]
@staticmethod
def empty_mets(now=None, cache_flag=False):
@@ -71,15 +85,18 @@ def __init__(self, **kwargs):
'enabled' if config.OCRD_METS_CACHING else 'disabled', config.raw_value('OCRD_METS_CACHING'))
self._cache_flag = config.OCRD_METS_CACHING
+
# If cache is enabled
if self._cache_flag:
- self.refresh_caches()
+ self._initialize_caches()
+ self._refresh_caches()
def __str__(self):
"""
String representation
"""
- return 'OcrdMets[cached=%s,fileGrps=%s,files=%s]' % (self._cache_flag, self.file_groups, list(self.find_files()))
+ return 'OcrdMets[cached=%s,fileGrps=%s,files=%s]' % (
+ self._cache_flag, self.file_groups, list(self.find_files()))
def _fill_caches(self):
"""
@@ -103,7 +120,7 @@ def _fill_caches(self):
for el_file in el_fileGrp:
file_id = el_file.get('ID')
- self._file_cache[fileGrp_use].update({file_id : el_file})
+ self._file_cache[fileGrp_use].update({file_id: el_file})
# log.info("File added to the cache: %s" % file_id)
# Fill with pages
@@ -116,7 +133,8 @@ def _fill_caches(self):
div_id = el_div.get('ID')
log.debug("DIV_ID: %s" % el_div.get('ID'))
- self._page_cache[div_id] = el_div
+ for attr in METS_PAGE_DIV_ATTRIBUTE:
+ self._page_cache[attr][str(el_div.get(attr.name))] = el_div
# Assign an empty dictionary that will hold the fptr of the added page (div)
self._fptr_cache[div_id] = {}
@@ -124,46 +142,26 @@ def _fill_caches(self):
# log.info("Page_id added to the cache: %s" % div_id)
for el_fptr in el_div:
- self._fptr_cache[div_id].update({el_fptr.get('FILEID') : el_fptr})
+ self._fptr_cache[div_id].update({el_fptr.get('FILEID'): el_fptr})
# log.info("Fptr added to the cache: %s" % el_fptr.get('FILEID'))
- # log.info("Len of page_cache: %s" % len(self._page_cache))
+ # log.info("Len of page_cache: %s" % len(self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID]))
# log.info("Len of fptr_cache: %s" % len(self._fptr_cache))
- def _clear_caches(self):
- """
- Deallocates the caches
- """
-
+ def _initialize_caches(self):
self._file_cache = {}
- self._page_cache = {}
+ # NOTE we can only guarantee uniqueness for @ID and @ORDER
+ self._page_cache = {k : {} for k in METS_PAGE_DIV_ATTRIBUTE}
self._fptr_cache = {}
-
- def refresh_caches(self):
+
+ def _refresh_caches(self):
if self._cache_flag:
- # Cache for the files (mets:file) - two nested dictionaries
- # The outer dictionary's Key: 'fileGrp.USE'
- # The outer dictionary's Value: Inner dictionary
- # The inner dictionary's Key: 'file.ID'
- # The inner dictionary's Value: a 'file' object at some memory location
- self._file_cache = {}
-
- # Cache for the pages (mets:div)
- # The dictionary's Key: 'div.ID'
- # The dictionary's Value: a 'div' object at some memory location
- self._page_cache = {}
-
- # Cache for the file pointers (mets:fptr) - two nested dictionaries
- # The outer dictionary's Key: 'div.ID'
- # The outer dictionary's Value: Inner dictionary
- # The inner dictionary's Key: 'fptr.FILEID'
- # The inner dictionary's Value: a 'fptr' object at some memory location
- self._fptr_cache = {}
-
+ self._initialize_caches()
+
# Note, if the empty_mets() function is used to instantiate OcrdMets
# Then the cache is empty even after this operation
self._fill_caches()
-
+
@property
def unique_identifier(self):
"""
@@ -174,7 +172,7 @@ def unique_identifier(self):
found = self._tree.getroot().find('.//mods:identifier[@type="%s"]' % t, NS)
if found is not None:
return found.text
-
+
@unique_identifier.setter
def unique_identifier(self, purl):
"""
@@ -214,7 +212,6 @@ def add_agent(self, *args, **kwargs):
el_agent_last.addnext(el_agent)
except StopIteration:
el_metsHdr.insert(0, el_agent)
- # print(ET.tostring(el_metsHdr))
return OcrdAgent(el_agent, *args, **kwargs)
@property
@@ -225,7 +222,7 @@ def file_groups(self):
# WARNING: Actually we cannot return strings in place of elements!
if self._cache_flag:
- return list(self._file_cache.keys())
+ return list(self._file_cache.keys())
return [el.get('USE') for el in self._tree.getroot().findall('.//mets:fileGrp', NS)]
@@ -276,25 +273,13 @@ def find_files(
"""
pageId_list = []
if pageId:
- pageId_patterns = []
- for pageId_token in re.split(r',', pageId):
- if pageId_token.startswith(REGEX_PREFIX):
- pageId_patterns.append(re.compile(pageId_token[REGEX_PREFIX_LEN:]))
- elif '..' in pageId_token:
- pageId_patterns += generate_range(*pageId_token.split('..', 1))
+ # returns divs instead of strings of ids
+ physical_pages = self.get_physical_pages(for_pageIds=pageId, return_divs=True)
+ for div in physical_pages:
+ if self._cache_flag:
+ pageId_list += self._fptr_cache[div.get('ID')]
else:
- pageId_patterns += [pageId_token]
- if self._cache_flag:
- for page_id in self._page_cache.keys():
- if page_id in pageId_patterns or \
- any([isinstance(p, typing.Pattern) and p.fullmatch(page_id) for p in pageId_patterns]):
- pageId_list += self._fptr_cache[page_id]
- else:
- for page in self._tree.getroot().xpath(
- '//mets:div[@TYPE="page"]', namespaces=NS):
- if page.get('ID') in pageId_patterns or \
- any([isinstance(p, typing.Pattern) and p.fullmatch(page.get('ID')) for p in pageId_patterns]):
- pageId_list += [fptr.get('FILEID') for fptr in page.findall('mets:fptr', NS)]
+ pageId_list += [fptr.get('FILEID') for fptr in div.findall('mets:fptr', NS)]
if ID and ID.startswith(REGEX_PREFIX):
ID = re.compile(ID[REGEX_PREFIX_LEN:])
@@ -304,19 +289,20 @@ def find_files(
mimetype = re.compile(mimetype[REGEX_PREFIX_LEN:])
if url and url.startswith(REGEX_PREFIX):
url = re.compile(url[REGEX_PREFIX_LEN:])
-
+
candidates = []
if self._cache_flag:
if fileGrp:
if isinstance(fileGrp, str):
candidates += self._file_cache.get(fileGrp, {}).values()
else:
- candidates = [x for fileGrp_needle, el_file_list in self._file_cache.items() if fileGrp.match(fileGrp_needle) for x in el_file_list.values()]
+ candidates = [x for fileGrp_needle, el_file_list in self._file_cache.items() if
+ fileGrp.match(fileGrp_needle) for x in el_file_list.values()]
else:
candidates = [el_file for id_to_file in self._file_cache.values() for el_file in id_to_file.values()]
else:
candidates = self._tree.getroot().xpath('//mets:file', namespaces=NS)
-
+
for cand in candidates:
if ID:
if isinstance(ID, str):
@@ -405,7 +391,7 @@ def rename_file_group(self, old, new):
if el_fileGrp is None:
raise FileNotFoundError("No such fileGrp '%s'" % old)
el_fileGrp.set('USE', new)
-
+
if self._cache_flag:
self._file_cache[new] = self._file_cache.pop(old)
@@ -432,7 +418,7 @@ def remove_file_group(self, USE, recursive=False, force=False):
el_fileGrp = el_fileSec.find('mets:fileGrp[@USE="%s"]' % USE, NS)
else:
el_fileGrp = USE
- if el_fileGrp is None: # pylint: disable=len-as-condition
+        if el_fileGrp is None:
msg = "No such fileGrp: %s" % USE
if force:
log.warning(msg)
@@ -460,7 +446,8 @@ def remove_file_group(self, USE, recursive=False, force=False):
el_fileGrp.getparent().remove(el_fileGrp)
- def add_file(self, fileGrp, mimetype=None, url=None, ID=None, pageId=None, force=False, local_filename=None, ignore=False, **kwargs):
+ def add_file(self, fileGrp, mimetype=None, url=None, ID=None, pageId=None, force=False, local_filename=None,
+ ignore=False, **kwargs):
"""
Instantiate and add a new :py:class:`ocrd_models.ocrd_file.OcrdFile`.
Arguments:
@@ -489,16 +476,19 @@ def add_file(self, fileGrp, mimetype=None, url=None, ID=None, pageId=None, force
mets_file = next(self.find_files(ID=ID, fileGrp=fileGrp), None)
if mets_file:
if mets_file.fileGrp == fileGrp and \
- mets_file.pageId == pageId and \
- mets_file.mimetype == mimetype:
+ mets_file.pageId == pageId and \
+ mets_file.mimetype == mimetype:
if not force:
- raise FileExistsError(f"A file with ID=={ID} already exists {mets_file} and neither force nor ignore are set")
+ raise FileExistsError(
+ f"A file with ID=={ID} already exists {mets_file} and neither force nor ignore are set")
self.remove_file(ID=ID, fileGrp=fileGrp)
else:
- raise FileExistsError(f"A file with ID=={ID} already exists {mets_file} but unrelated - cannot mitigate")
+ raise FileExistsError(
+ f"A file with ID=={ID} already exists {mets_file} but unrelated - cannot mitigate")
# To get rid of Python's FutureWarning - checking if v is not None
- kwargs = {k: v for k, v in locals().items() if k in ['url', 'ID', 'mimetype', 'pageId', 'local_filename'] and v is not None}
+ kwargs = {k: v for k, v in locals().items() if
+ k in ['url', 'ID', 'mimetype', 'pageId', 'local_filename'] and v is not None}
# This separation is needed to reuse the same el_mets_file element in the caching if block
el_mets_file = ET.SubElement(el_fileGrp, TAG_METS_FILE)
# The caching of the physical page is done in the OcrdFile constructor
@@ -521,7 +511,7 @@ def remove_file(self, *args, **kwargs):
if len(files) > 1:
return files
else:
- return files[0] # for backwards-compatibility
+ return files[0] # for backwards-compatibility
if any(1 for kwarg in kwargs
if isinstance(kwarg, str) and kwarg.startswith(REGEX_PREFIX)):
# allow empty results if filter criteria involve a regex
@@ -571,8 +561,9 @@ def remove_one_file(self, ID, fileGrp=None):
page_div.getparent().remove(page_div)
# Delete the empty pages from caches as well
if self._cache_flag:
- del self._page_cache[page_div.get('ID')]
- del self._fptr_cache[page_div.get('ID')]
+ for attr in METS_PAGE_DIV_ATTRIBUTE:
+ if attr.name in page_div.attrib:
+ del self._page_cache[attr][page_div.attrib[attr.name]]
# Delete the file reference from the cache
if self._cache_flag:
@@ -591,33 +582,139 @@ def physical_pages(self):
List all page IDs (the ``@ID`` of each physical ``mets:structMap`` ``mets:div``)
"""
if self._cache_flag:
- return list(self._page_cache.keys())
-
+ return list(self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID].keys())
+
return [str(x) for x in self._tree.getroot().xpath(
'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/@ID',
namespaces=NS)]
- def get_physical_pages(self, for_fileIds=None):
+ def get_physical_pages(self, for_fileIds : Optional[str] = None, for_pageIds : Optional[str] = None, return_divs : bool = False):
"""
List all page IDs (the ``@ID`` of each physical ``mets:structMap`` ``mets:div``),
- optionally for a subset of ``mets:file`` ``@ID`` :py:attr:`for_fileIds`.
+ optionally for a subset of ``mets:file`` ``@ID`` :py:attr:`for_fileIds`,
+ or for a subset selector expression (comma-separated, range, and/or regex) :py:attr:`for_pageIds`.
+        If ``return_divs`` is set, returns the ``mets:div`` elements themselves instead of their ``@ID`` strings.
"""
- if for_fileIds is None:
+ if for_fileIds is None and for_pageIds is None:
return self.physical_pages
+ # log = getLogger('ocrd.models.ocrd_mets.get_physical_pages')
+ if for_pageIds is not None:
+ ret = []
+ page_attr_patterns = []
+ page_attr_patterns_raw = re.split(r',', for_pageIds)
+ for pageId_token in page_attr_patterns_raw:
+ if pageId_token.startswith(REGEX_PREFIX):
+ page_attr_patterns.append((None, re.compile(pageId_token[REGEX_PREFIX_LEN:])))
+ elif '..' in pageId_token:
+ val_range = generate_range(*pageId_token.split('..', 1))
+ page_attr_patterns.append(val_range)
+ else:
+ page_attr_patterns.append(pageId_token)
+ if not page_attr_patterns:
+ return []
+ range_patterns_first_last = [(x[0], x[-1]) if isinstance(x, list) else None for x in page_attr_patterns]
+ page_attr_patterns_copy = list(page_attr_patterns)
+ if self._cache_flag:
+ for pat in page_attr_patterns:
+ try:
+ attr : METS_PAGE_DIV_ATTRIBUTE
+ if isinstance(pat, str):
+ attr = next(a for a in list(METS_PAGE_DIV_ATTRIBUTE) if pat in self._page_cache[a])
+ cache_keys = [pat]
+ elif isinstance(pat, list):
+ attr = next(a for a in list(METS_PAGE_DIV_ATTRIBUTE) if any(x in self._page_cache[a] for x in pat))
+ cache_keys = [v for v in pat if v in self._page_cache[attr]]
+ for k in cache_keys:
+ pat.remove(k)
+ elif isinstance(pat, tuple):
+ _, re_pat = pat
+ attr = next(a for a in list(METS_PAGE_DIV_ATTRIBUTE) for v in self._page_cache[a] if re_pat.fullmatch(v))
+ cache_keys = [v for v in self._page_cache[attr] if re_pat.fullmatch(v)]
+ else:
+ raise ValueError
+ if return_divs:
+ ret += [self._page_cache[attr][v] for v in cache_keys]
+ else:
+ ret += [self._page_cache[attr][v].get('ID') for v in cache_keys]
+ except StopIteration:
+ raise ValueError(f"{pat} matches none of the keys of any of the _page_caches.")
+ else:
+ page_attr_patterns_matched = []
+ for page in self._tree.getroot().xpath(
+ 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]',
+ namespaces=NS):
+ patterns_exhausted = []
+ for pat_idx, pat in enumerate(page_attr_patterns):
+ try:
+ if isinstance(pat, str):
+ attr = next(a for a in list(METS_PAGE_DIV_ATTRIBUTE) if pat == page.get(a.name))
+ ret.append(page if return_divs else page.get('ID'))
+ patterns_exhausted.append(pat)
+ elif isinstance(pat, list):
+ if not isinstance(pat[0], METS_PAGE_DIV_ATTRIBUTE):
+ pat.insert(0, next(a for a in list(METS_PAGE_DIV_ATTRIBUTE) if any(x == page.get(a.name) for x in pat)))
+ attr_val = page.get(pat[0].name)
+ if attr_val in pat:
+ pat.remove(attr_val)
+ ret.append(page if return_divs else page.get('ID'))
+ if len(pat) == 1:
+ patterns_exhausted.append(pat)
+ elif isinstance(pat, tuple):
+ attr, re_pat = pat
+ if not attr:
+ attr = next(a for a in list(METS_PAGE_DIV_ATTRIBUTE) if re_pat.fullmatch(page.get(a.name) or ''))
+ page_attr_patterns[pat_idx] = (attr, re_pat)
+ if re_pat.fullmatch(page.get(attr.name) or ''):
+ ret.append(page if return_divs else page.get('ID'))
+ else:
+ raise ValueError
+ page_attr_patterns_matched.append(pat)
+ except StopIteration:
+ continue
+ for p in patterns_exhausted:
+ page_attr_patterns.remove(p)
+ unmatched = [x for x in page_attr_patterns_copy if x not in page_attr_patterns_matched]
+ if unmatched:
+ raise ValueError(f"Patterns {unmatched} match none of the pages")
+
+ ranges_without_start_match = []
+ ranges_without_last_match = []
+ for idx, pat in enumerate(page_attr_patterns_copy):
+ if isinstance(pat, list):
+ start, last = range_patterns_first_last[idx]
+                if start in pat:
+                    ranges_without_start_match.append(page_attr_patterns_raw[idx])
+ # if last in pat:
+ # ranges_without_last_match.append(page_attr_patterns_raw[idx])
+ if ranges_without_start_match:
+ raise ValueError(f"Start of range patterns {ranges_without_start_match} not matched - invalid range")
+ # if ranges_without_last_match:
+ # raise ValueError(f"End of range patterns {ranges_without_last_match} not matched - invalid range")
+ return ret
+
+ assert for_fileIds # at this point we know for_fileIds is set, assert to convince pyright
ret = [None] * len(for_fileIds)
-
if self._cache_flag:
for pageId in self._fptr_cache.keys():
for fptr in self._fptr_cache[pageId].keys():
if fptr in for_fileIds:
- ret[for_fileIds.index(fptr)] = pageId
+ index = for_fileIds.index(fptr)
+ if return_divs:
+ ret[index] = self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID][pageId]
+ else:
+ ret[index] = pageId
else:
- for page in self._tree.getroot().xpath(
- 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]',
- namespaces=NS):
- for fptr in page.findall('mets:fptr', NS):
- if fptr.get('FILEID') in for_fileIds:
- ret[for_fileIds.index(fptr.get('FILEID'))] = page.get('ID')
+ for page in self._tree.getroot().xpath(
+ 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]',
+ namespaces=NS):
+ for fptr in page.findall('mets:fptr', NS):
+ if fptr.get('FILEID') in for_fileIds:
+ index = for_fileIds.index(fptr.get('FILEID'))
+ if return_divs:
+ ret[index] = page
+ else:
+ ret[index] = page.get('ID')
return ret
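Given an OcrdMets instance ``mets``, the selector grammar accepted by ``for_pageIds`` combines comma-separated tokens, ``..`` ranges and ``//`` regexes over any of the page attributes; a sketch with values taken from the test suite:

    mets.get_physical_pages(for_pageIds='PHYS_0001..PHYS_0005')  # @ID range
    mets.get_physical_pages(for_pageIds='//PHYS_000(1|2)')       # regex over any attribute
    mets.get_physical_pages(for_pageIds='1..5')                  # @ORDER range
    mets.get_physical_pages(for_pageIds='page 1..page 2,5',      # @ORDERLABEL range plus an @ORDER token
                            return_divs=True)                    # mets:div elements instead of @ID strings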
def set_physical_page_for_file(self, pageId, ocrd_file, order=None, orderlabel=None):
@@ -658,14 +755,14 @@ def set_physical_page_for_file(self, pageId, ocrd_file, order=None, orderlabel=N
if el_seqdiv is None:
el_seqdiv = ET.SubElement(el_structmap, TAG_METS_DIV)
el_seqdiv.set('TYPE', 'physSequence')
-
+
el_pagediv = None
if self._cache_flag:
-            if pageId in self._page_cache:
-                el_pagediv = self._page_cache[pageId]
+            if pageId in self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID]:
+                el_pagediv = self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID][pageId]
else:
el_pagediv = el_seqdiv.find('mets:div[@ID="%s"]' % pageId, NS)
-
+
if el_pagediv is None:
el_pagediv = ET.SubElement(el_seqdiv, TAG_METS_DIV)
el_pagediv.set('TYPE', 'page')
@@ -676,33 +773,33 @@ def set_physical_page_for_file(self, pageId, ocrd_file, order=None, orderlabel=N
el_pagediv.set('ORDERLABEL', orderlabel)
if self._cache_flag:
# Create a new entry in the page cache
- self._page_cache[pageId] = el_pagediv
+ self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID][pageId] = el_pagediv
# Create a new entry in the fptr cache and
# assign an empty dictionary to hold the fileids
self._fptr_cache[pageId] = {}
-
+
el_fptr = ET.SubElement(el_pagediv, TAG_METS_FPTR)
el_fptr.set('FILEID', ocrd_file.ID)
if self._cache_flag:
# Assign the ocrd fileID to the pageId in the cache
- self._fptr_cache[el_pagediv.get('ID')].update({ocrd_file.ID : el_fptr})
+ self._fptr_cache[el_pagediv.get('ID')].update({ocrd_file.ID: el_fptr})
def update_physical_page_attributes(self, page_id, **kwargs):
- mets_div = None
- if self._cache_flag:
- if page_id in self._page_cache.keys():
- mets_div = [self._page_cache[page_id]]
- else:
- mets_div = self._tree.getroot().xpath(
- 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"][@ID="%s"]' % page_id,
- namespaces=NS)
- if mets_div:
- for attr_name, attr_value in kwargs.items():
- if attr_value:
- mets_div[0].set(attr_name.upper(), attr_value)
- else:
- warn("Could not find mets:div[@ID={page_id}]")
+ invalid_keys = list(k for k in kwargs.keys() if k not in METS_PAGE_DIV_ATTRIBUTE.names())
+ if invalid_keys:
+ raise ValueError(f"Invalid attribute {invalid_keys}. Allowed values: {METS_PAGE_DIV_ATTRIBUTE.names()}")
+
+ page_div = self.get_physical_pages(for_pageIds=page_id, return_divs=True)
+ if not page_div:
+ raise ValueError(f"Could not find mets:div[@ID=={page_id}]")
+ page_div = page_div[0]
+
+ for k, v in kwargs.items():
+            if not v:
+                page_div.attrib.pop(k, None)
+ else:
+ page_div.attrib[k] = v
def get_physical_page_for_file(self, ocrd_file):
"""
@@ -713,7 +810,7 @@ def get_physical_page_for_file(self, ocrd_file):
if self._cache_flag:
for pageId in self._fptr_cache.keys():
if ocrd_file.ID in self._fptr_cache[pageId].keys():
- ret.append(self._page_cache[pageId].get('ID'))
+ ret.append(self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID][pageId].get('ID'))
else:
ret = self._tree.getroot().xpath(
'/mets:mets/mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"][./mets:fptr[@FILEID="%s"]]/@ID' %
@@ -729,16 +826,19 @@ def remove_physical_page(self, ID):
"""
mets_div = None
if self._cache_flag:
- if ID in self._page_cache.keys():
- mets_div = [self._page_cache[ID]]
+ if ID in self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID].keys():
+ mets_div = [self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID][ID]]
else:
mets_div = self._tree.getroot().xpath(
'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"][@ID="%s"]' % ID,
namespaces=NS)
if mets_div:
+            mets_div_attrib = {**mets_div[0].attrib}
mets_div[0].getparent().remove(mets_div[0])
if self._cache_flag:
- del self._page_cache[ID]
+ for attr in METS_PAGE_DIV_ATTRIBUTE:
+ if attr.name in mets_div_attrib:
+ del self._page_cache[attr][mets_div_attrib[attr.name]]
del self._fptr_cache[ID]
def remove_physical_page_fptr(self, fileId):
@@ -757,10 +857,11 @@ def remove_physical_page_fptr(self, fileId):
if self._cache_flag:
for page_id in self._fptr_cache.keys():
if fileId in self._fptr_cache[page_id].keys():
- mets_fptrs.append(self._fptr_cache[page_id][fileId])
+ mets_fptrs.append(self._fptr_cache[page_id][fileId])
else:
mets_fptrs = self._tree.getroot().xpath(
- 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/mets:fptr[@FILEID="%s"]' % fileId, namespaces=NS)
+ 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/mets:fptr[@FILEID="%s"]' % fileId,
+ namespaces=NS)
ret = []
for mets_fptr in mets_fptrs:
mets_div = mets_fptr.getparent()
@@ -770,7 +871,20 @@ def remove_physical_page_fptr(self, fileId):
mets_div.remove(mets_fptr)
return ret
- def merge(self, other_mets, force=False, fileGrp_mapping=None, fileId_mapping=None, pageId_mapping=None, after_add_cb=None, **kwargs):
+ @property
+ def physical_pages_labels(self):
+ """
+ Map all page IDs (the ``@ID`` of each physical ``mets:structMap`` ``mets:div``) to their
+ ``@ORDER``, ``@ORDERLABEL`` and ``@LABEL`` attributes, if any.
+ """
+ divs = self._tree.getroot().xpath(
+ 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]',
+ namespaces=NS)
+ return {div.get('ID'): (div.get('ORDER', None), div.get('ORDERLABEL', None), div.get('LABEL', None))
+ for div in divs}
+
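For illustration (hypothetical values; None where an attribute is absent on the div):

    mets.physical_pages_labels
    # e.g. {'PHYS_0001': ('1', 'page 1', None), 'PHYS_0002': ('2', 'page 2', None), ...}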
+ def merge(self, other_mets, force=False, fileGrp_mapping=None, fileId_mapping=None, pageId_mapping=None,
+ after_add_cb=None, **kwargs):
"""
Add all files from other_mets.
Accepts the same kwargs as :py:func:`find_files`
@@ -789,14 +903,15 @@ def merge(self, other_mets, force=False, fileGrp_mapping=None, fileId_mapping=No
pageId_mapping = {}
for f_src in other_mets.find_files(**kwargs):
f_dest = self.add_file(
- fileGrp_mapping.get(f_src.fileGrp, f_src.fileGrp),
- mimetype=f_src.mimetype,
- url=f_src.url,
- local_filename=f_src.local_filename,
- ID=fileId_mapping.get(f_src.ID, f_src.ID),
- pageId=pageId_mapping.get(f_src.pageId, f_src.pageId),
- force=force)
+ fileGrp_mapping.get(f_src.fileGrp, f_src.fileGrp),
+ mimetype=f_src.mimetype,
+ url=f_src.url,
+ local_filename=f_src.local_filename,
+ ID=fileId_mapping.get(f_src.ID, f_src.ID),
+ pageId=pageId_mapping.get(f_src.pageId, f_src.pageId),
+ force=force)
# FIXME: merge metsHdr, amdSec, dmdSec as well
# FIXME: merge structMap logical and structLink as well
if after_add_cb:
after_add_cb(f_dest)
+
diff --git a/src/ocrd_utils/str.py b/src/ocrd_utils/str.py
index 406fcc975..6a9986ae7 100644
--- a/src/ocrd_utils/str.py
+++ b/src/ocrd_utils/str.py
@@ -196,7 +196,7 @@ def safe_filename(url):
# print('safe filename: %s -> %s' % (url, ret))
return ret
-def generate_range(start, end):
+def generate_range(start : str, end : str) -> List[str]:
"""
Generate a list of strings by incrementing the number part of ``start`` until including ``end``.
"""
@@ -205,6 +205,8 @@ def generate_range(start, end):
start_num, end_num = re.findall(r'\d+', start)[-1], re.findall(r'\d+', end)[-1]
except IndexError:
raise ValueError("Range '%s..%s': could not find numeric part" % (start, end))
+    if start[:-len(start_num)] != end[:-len(end_num)]:
+        raise ValueError(f"Range '{start}..{end}': start and end differ in their non-numeric part: '{start[:-len(start_num)]}' != '{end[:-len(end_num)]}'")
if start_num == end_num:
warn("Range '%s..%s': evaluates to the same number")
for i in range(int(start_num), int(end_num) + 1):
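The stricter validation can be exercised directly; a sketch (the first call is from the tests, the second now raises):

    from ocrd_utils.str import generate_range

    generate_range('PHYS_0001', 'PHYS_0005')
    # ['PHYS_0001', 'PHYS_0002', 'PHYS_0003', 'PHYS_0004', 'PHYS_0005']
    generate_range('1', 'PHYS_0005')  # ValueError: start and end differ in their non-numeric part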
diff --git a/tests/cli/test_workspace.py b/tests/cli/test_workspace.py
index cf7eef3d0..831acbae5 100644
--- a/tests/cli/test_workspace.py
+++ b/tests/cli/test_workspace.py
@@ -560,14 +560,19 @@ def _call(args):
with pushd_popd(Path(__file__).parent.parent / 'data/list-page-workspace'):
assert _call([]) == 'PHYS_0001\nPHYS_0002\nPHYS_0003\nPHYS_0004\nPHYS_0005\nPHYS_0006\nPHYS_0008\nPHYS_0009\nPHYS_0010\nPHYS_0011\nPHYS_0012\nPHYS_0013\nPHYS_0014\nPHYS_0015\nPHYS_0016\nPHYS_0017\nPHYS_0018\nPHYS_0019\nPHYS_0020\nPHYS_0022\nPHYS_0023\nPHYS_0024\nPHYS_0025\nPHYS_0026\nPHYS_0027\nPHYS_0028\nPHYS_0029'
assert _call(['-f', 'comma-separated']) == 'PHYS_0001,PHYS_0002,PHYS_0003,PHYS_0004,PHYS_0005,PHYS_0006,PHYS_0008,PHYS_0009,PHYS_0010,PHYS_0011,PHYS_0012,PHYS_0013,PHYS_0014,PHYS_0015,PHYS_0016,PHYS_0017,PHYS_0018,PHYS_0019,PHYS_0020,PHYS_0022,PHYS_0023,PHYS_0024,PHYS_0025,PHYS_0026,PHYS_0027,PHYS_0028,PHYS_0029'
- assert _call(['-f', 'json']) == '[["PHYS_0001", "PHYS_0002", "PHYS_0003", "PHYS_0004", "PHYS_0005", "PHYS_0006", "PHYS_0008", "PHYS_0009", "PHYS_0010", "PHYS_0011", "PHYS_0012", "PHYS_0013", "PHYS_0014", "PHYS_0015", "PHYS_0016", "PHYS_0017", "PHYS_0018", "PHYS_0019", "PHYS_0020", "PHYS_0022", "PHYS_0023", "PHYS_0024", "PHYS_0025", "PHYS_0026", "PHYS_0027", "PHYS_0028", "PHYS_0029"]]'
+ assert _call(['-f', 'json']) == '[[["PHYS_0001"], ["PHYS_0002"], ["PHYS_0003"], ["PHYS_0004"], ["PHYS_0005"], ["PHYS_0006"], ["PHYS_0008"], ["PHYS_0009"], ["PHYS_0010"], ["PHYS_0011"], ["PHYS_0012"], ["PHYS_0013"], ["PHYS_0014"], ["PHYS_0015"], ["PHYS_0016"], ["PHYS_0017"], ["PHYS_0018"], ["PHYS_0019"], ["PHYS_0020"], ["PHYS_0022"], ["PHYS_0023"], ["PHYS_0024"], ["PHYS_0025"], ["PHYS_0026"], ["PHYS_0027"], ["PHYS_0028"], ["PHYS_0029"]]]'
assert _call(['-f', 'comma-separated', '-R', '5..5']) == 'PHYS_0005'
assert _call(['-f', 'comma-separated', '-R', '6..8']) == 'PHYS_0006,PHYS_0008,PHYS_0009'
+ assert _call(['-f', 'comma-separated', '-r', '1..5']) == 'PHYS_0001,PHYS_0002,PHYS_0003,PHYS_0004,PHYS_0005'
+ assert _call(['-f', 'comma-separated', '-r', '2..3']) == 'PHYS_0002,PHYS_0003'
+ assert _call(['-f', 'comma-separated', '-r', 'page 2..page 3']) == 'PHYS_0002,PHYS_0003'
assert _call(['-f', 'comma-separated', '-r', 'PHYS_0006..PHYS_0009']) == 'PHYS_0006,PHYS_0008,PHYS_0009'
assert _call(['-f', 'comma-separated', '-r', 'PHYS_0001..PHYS_0010', '-D', '3']) == 'PHYS_0001,PHYS_0002,PHYS_0003\nPHYS_0004,PHYS_0005,PHYS_0006\nPHYS_0008,PHYS_0009,PHYS_0010'
assert _call(['-f', 'comma-separated', '-r', 'PHYS_0001..PHYS_0010', '-D', '3', '-C', '2']) == 'PHYS_0008,PHYS_0009,PHYS_0010'
from json import loads
- assert loads(_call(['-f', 'json', '-r', 'PHYS_0001..PHYS_0010', '-D', '3', '-C', '2'])) == [['PHYS_0008', 'PHYS_0009', 'PHYS_0010']]
+ assert loads(_call(['-f', 'json', '-r', 'PHYS_0001..PHYS_0010', '-D', '3', '-C', '2'])) == [[['PHYS_0008'], ['PHYS_0009'], ['PHYS_0010']]]
+ assert loads(_call(['-f', 'json', '-r', 'PHYS_0001..PHYS_0010', '-k', 'ID', '-k', 'ORDERLABEL', '-D', '3', '-C', '2'])) == \
+ [[['PHYS_0008', 'page 7'], ['PHYS_0009', 'page 8'], ['PHYS_0010', 'page 9']]]
if __name__ == '__main__':
main(__file__)
diff --git a/tests/data/list-page-workspace/mets.xml b/tests/data/list-page-workspace/mets.xml
index 2786bfe07..5f5d0e306 100644
--- a/tests/data/list-page-workspace/mets.xml
+++ b/tests/data/list-page-workspace/mets.xml
@@ -99,89 +99,199 @@
 [hunk body not recoverable in this excerpt: the modified lines add @ORDER and @ORDERLABEL attributes (e.g. ORDERLABEL="page 2") to the physical page mets:div elements of the test fixture, which the new attribute-based range selectors and list-page tests rely on]
diff --git a/tests/model/test_ocrd_mets.py b/tests/model/test_ocrd_mets.py
index 64ea1eccf..9634886e1 100644
--- a/tests/model/test_ocrd_mets.py
+++ b/tests/model/test_ocrd_mets.py
@@ -5,8 +5,8 @@
from os.path import join
from os import environ
from contextlib import contextmanager
+import re
import shutil
-from logging import StreamHandler
import lxml
from tests.base import (
@@ -16,9 +16,6 @@
)
from ocrd_utils import (
- initLogging,
- disableLogging,
- getLogger,
VERSION,
MIMETYPE_PAGE
)
@@ -76,24 +73,43 @@ def test_file_groups(sbb_sample_01):
def test_find_all_files(sbb_sample_01):
- assert len(sbb_sample_01.find_all_files()) == 35, '35 files total'
- assert len(sbb_sample_01.find_all_files(fileGrp='OCR-D-IMG')) == 3, '3 files in "OCR-D-IMG"'
- assert len(sbb_sample_01.find_all_files(include_fileGrp='OCR-D-IMG')) == 3, '3 files in "OCR-D-IMG"'
- assert len(sbb_sample_01.find_all_files(fileGrp='//OCR-D-I.*')) == 13, '13 files in "//OCR-D-I.*"'
- assert len(sbb_sample_01.find_all_files(fileGrp='//OCR-D-I.*', exclude_fileGrp=['OCR-D-IMG'])) == 10, '10 files in "//OCR-D-I.*" sans OCR-D-IMG'
- assert len(sbb_sample_01.find_all_files(ID="FILE_0001_IMAGE")) == 1, '1 files with ID "FILE_0001_IMAGE"'
- assert len(sbb_sample_01.find_all_files(ID="//FILE_0005_.*")) == 1, '1 files with ID "//FILE_0005_.*"'
- assert len(sbb_sample_01.find_all_files(pageId='PHYS_0001')) == 17, '17 files for page "PHYS_0001"'
- assert len(sbb_sample_01.find_all_files(pageId='PHYS_0001-NOTEXIST')) == 0, '0 pages for "PHYS_0001-NOTEXIST"'
- assert len(sbb_sample_01.find_all_files(mimetype='image/tiff')) == 13, '13 image/tiff'
- assert len(sbb_sample_01.find_all_files(mimetype='//application/.*')) == 22, '22 application/.*'
- assert len(sbb_sample_01.find_all_files(mimetype=MIMETYPE_PAGE)) == 20, '20 ' + MIMETYPE_PAGE
- assert len(sbb_sample_01.find_all_files(local_filename='OCR-D-IMG/FILE_0005_IMAGE.tif')) == 1, '1 FILE xlink:href="OCR-D-IMG/FILE_0005_IMAGE.tif"'
- assert len(sbb_sample_01.find_all_files(url='https://github.com/OCR-D/assets/raw/master/data/SBB0000F29300010000/00000001_DESKEW.tif')) == 1, '1 URL xlink:href="https://github.com/OCR-D/assets/raw/master/data/SBB0000F29300010000/00000001_DESKEW.tif"'
- assert len(sbb_sample_01.find_all_files(pageId='PHYS_0001..PHYS_0005')) == 35, '35 files for page "PHYS_0001..PHYS_0005"'
- assert len(sbb_sample_01.find_all_files(pageId='//PHYS_000(1|2)')) == 34, '34 files in PHYS_001 and PHYS_0002'
- assert len(sbb_sample_01.find_all_files(pageId='//PHYS_0001,//PHYS_0005')) == 18, '18 files in PHYS_001 and PHYS_0005 (two regexes)'
- assert len(sbb_sample_01.find_all_files(pageId='//PHYS_0005,PHYS_0001..PHYS_0002')) == 35, '35 files in //PHYS_0005,PHYS_0001..PHYS_0002'
+ mets = sbb_sample_01
+ assert len(mets.find_all_files()) == 35, '35 files total'
+ assert len(mets.find_all_files(fileGrp='OCR-D-IMG')) == 3, '3 files in "OCR-D-IMG"'
+ assert len(mets.find_all_files(include_fileGrp='OCR-D-IMG')) == 3, '3 files in "OCR-D-IMG"'
+ assert len(mets.find_all_files(fileGrp='//OCR-D-I.*')) == 13, '13 files in "//OCR-D-I.*"'
+ assert len(mets.find_all_files(fileGrp='//OCR-D-I.*', exclude_fileGrp=['OCR-D-IMG'])) == 10, '10 files in "//OCR-D-I.*" sans OCR-D-IMG'
+ assert len(mets.find_all_files(ID="FILE_0001_IMAGE")) == 1, '1 files with ID "FILE_0001_IMAGE"'
+ assert len(mets.find_all_files(ID="//FILE_0005_.*")) == 1, '1 files with ID "//FILE_0005_.*"'
+ assert len(mets.find_all_files(pageId='PHYS_0001')) == 17, '17 files for page "PHYS_0001"'
+ assert len(mets.find_all_files(mimetype='image/tiff')) == 13, '13 image/tiff'
+ assert len(mets.find_all_files(mimetype='//application/.*')) == 22, '22 application/.*'
+ assert len(mets.find_all_files(mimetype=MIMETYPE_PAGE)) == 20, '20 ' + MIMETYPE_PAGE
+ assert len(mets.find_all_files(local_filename='OCR-D-IMG/FILE_0005_IMAGE.tif')) == 1, '1 FILE xlink:href="OCR-D-IMG/FILE_0005_IMAGE.tif"'
+ assert len(mets.find_all_files(url='https://github.com/OCR-D/assets/raw/master/data/SBB0000F29300010000/00000001_DESKEW.tif')) == 1, '1 URL xlink:href="https://github.com/OCR-D/assets/raw/master/data/SBB0000F29300010000/00000001_DESKEW.tif"'
+ assert len(mets.find_all_files(pageId='PHYS_0001..PHYS_0005')) == 35, '35 files for page "PHYS_0001..PHYS_0005"'
+ assert len(mets.find_all_files(pageId='//PHYS_000(1|2)')) == 34, '34 files in PHYS_001 and PHYS_0002'
+ assert len(mets.find_all_files(pageId='//PHYS_0001,//PHYS_0005')) == 18, '18 files in PHYS_001 and PHYS_0005 (two regexes)'
+ assert len(mets.find_all_files(pageId='//PHYS_0005,PHYS_0001..PHYS_0002')) == 35, '35 files in //PHYS_0005,PHYS_0001..PHYS_0002'
+ assert len(mets.find_all_files(pageId='1..10')) == 35, '35 files in @ORDER range 1..10'
+    assert len(mets.find_all_files(pageId='1..5')) == 35, '35 files in @ORDER range 1..5'
+ assert len(mets.find_all_files(pageId='PHYS_0001,PHYS_0002,PHYS_0005')) == 35, '35 in PHYS_0001,PHYS_0002,PHYS_0005'
+ assert len(mets.find_all_files(pageId='PHYS_0001..PHYS_0002,PHYS_0005')) == 35, '35 in PHYS_0001,PHYS_0002,PHYS_0005'
+ assert len(mets.find_all_files(pageId='page 1..page 2,5')) == 35, '35 in PHYS_0001,PHYS_0002,PHYS_0005'
+ assert len(mets.find_all_files(pageId='PHYS_0005,1..2')) == 35, '35 in PHYS_0001,PHYS_0002,PHYS_0005'
+    with pytest.raises(ValueError, match='differ in their non-numeric part'):
+        mets.find_all_files(pageId='1..PHYS_0002')
+    with pytest.raises(ValueError, match=re.compile('match(es)? none')):
+        mets.find_all_files(pageId='PHYS_0006..PHYS_0029')
+    with pytest.raises(ValueError, match=re.compile('match(es)? none')):
+        mets.find_all_files(pageId='PHYS_0001-NOTEXIST')
+    with pytest.raises(ValueError, match=re.compile('match(es)? none')):
+        mets.find_all_files(pageId='1..5,PHYS_0006..PHYS_0029')
+    with pytest.raises(ValueError, match=re.compile('match(es)? none')):
+        mets.find_all_files(pageId='//PHYS000.*')
+    with pytest.raises(ValueError, match='Start of range pattern'):
+        mets.find_all_files(pageId='PHYS_0000..PHYS_0004')
def test_find_all_files_local_only(sbb_sample_01):
assert len(sbb_sample_01.find_all_files(pageId='PHYS_0001',
@@ -392,7 +408,7 @@ def test_update_physical_page_attributes(sbb_directory_ocrd_mets):
assert len(m.physical_pages) == 1
assert b'ORDER' not in m.to_xml()
assert b'ORDERLABEL' not in m.to_xml()
- m.update_physical_page_attributes('new page', order='foo', orderlabel='bar')
+ m.update_physical_page_attributes('new page', ORDER='foo', ORDERLABEL='bar')
assert b'ORDER' in m.to_xml()
assert b'ORDERLABEL' in m.to_xml()
diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py
index c6d560ba1..be4cc56ed 100644
--- a/tests/processor/test_processor.py
+++ b/tests/processor/test_processor.py
@@ -204,7 +204,7 @@ class ZipTestProcessor(Processor): pass
ws = self.resolver.workspace_from_nothing(directory=tempdir)
ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar1', page_id=None)
ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, file_id='foobar2', page_id='phys_0001')
- for page_id in [None, 'phys_0001,phys_0002']:
+ for page_id in [None, 'phys_0001']:
with self.subTest(page_id=page_id):
proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id)
assert [(one, two.ID) for one, two in proc.zip_input_files(require_first=False)] == [(None, 'foobar2')]
diff --git a/tests/test_mets_server.py b/tests/test_mets_server.py
index f61de4baf..3487afd16 100644
--- a/tests/test_mets_server.py
+++ b/tests/test_mets_server.py
@@ -1,6 +1,7 @@
-from collections.abc import Generator
+import re
from typing import Iterable, Tuple
from pytest import fixture, raises
+import pytest
from tests.base import assets
from itertools import repeat
@@ -187,22 +188,44 @@ def test_mets_server_socket_stop(start_mets_server):
def test_find_all_files(start_mets_server : Tuple[str, Workspace]):
_, workspace_server = start_mets_server
- assert len(workspace_server.mets.find_all_files()) == 35, '35 files total'
- assert len(workspace_server.mets.find_all_files(fileGrp='OCR-D-IMG')) == 3, '3 files in "OCR-D-IMG"'
- assert len(workspace_server.mets.find_all_files(fileGrp='//OCR-D-I.*')) == 13, '13 files in "//OCR-D-I.*"'
- assert len(workspace_server.mets.find_all_files(ID="FILE_0001_IMAGE")) == 1, '1 files with ID "FILE_0001_IMAGE"'
- assert len(workspace_server.mets.find_all_files(ID="//FILE_0005_.*")) == 1, '1 files with ID "//FILE_0005_.*"'
- assert len(workspace_server.mets.find_all_files(pageId='PHYS_0001')) == 17, '17 files for page "PHYS_0001"'
- assert len(workspace_server.mets.find_all_files(pageId='PHYS_0001-NOTEXIST')) == 0, '0 pages for "PHYS_0001-NOTEXIST"'
- assert len(workspace_server.mets.find_all_files(mimetype='image/tiff')) == 13, '13 image/tiff'
- assert len(workspace_server.mets.find_all_files(mimetype='//application/.*')) == 22, '22 application/.*'
- assert len(workspace_server.mets.find_all_files(mimetype=MIMETYPE_PAGE)) == 20, '20 ' + MIMETYPE_PAGE
- assert len(workspace_server.mets.find_all_files(local_filename='OCR-D-IMG/FILE_0005_IMAGE.tif')) == 1, '1 FILE xlink:href="OCR-D-IMG/FILE_0005_IMAGE.tif"'
- assert len(workspace_server.mets.find_all_files(url='https://github.com/OCR-D/assets/raw/master/data/SBB0000F29300010000/00000001_DESKEW.tif')) == 1, '1 URL xlink:href="https://github.com/OCR-D/assets/raw/master/data/SBB0000F29300010000/00000001_DESKEW.tif"'
- assert len(workspace_server.mets.find_all_files(pageId='PHYS_0001..PHYS_0005')) == 35, '35 files for page "PHYS_0001..PHYS_0005"'
- assert len(workspace_server.mets.find_all_files(pageId='//PHYS_000(1|2)')) == 34, '34 files in PHYS_001 and PHYS_0002'
- assert len(workspace_server.mets.find_all_files(pageId='//PHYS_0001,//PHYS_0005')) == 18, '18 files in PHYS_001 and PHYS_0005 (two regexes)'
- assert len(workspace_server.mets.find_all_files(pageId='//PHYS_0005,PHYS_0001..PHYS_0002')) == 35, '35 files in //PHYS_0005,PHYS_0001..PHYS_0002'
+ mets = workspace_server.mets
+ assert len(mets.find_all_files()) == 35, '35 files total'
+ assert len(mets.find_all_files(fileGrp='OCR-D-IMG')) == 3, '3 files in "OCR-D-IMG"'
+ # TODO https://github.com/OCR-D/core/issues/1185
+ # assert len(mets.find_all_files(include_fileGrp='OCR-D-IMG')) == 3, '3 files in "OCR-D-IMG"'
+ assert len(mets.find_all_files(fileGrp='//OCR-D-I.*')) == 13, '13 files in "//OCR-D-I.*"'
+ # TODO https://github.com/OCR-D/core/issues/1185
+ # assert len(mets.find_all_files(fileGrp='//OCR-D-I.*', exclude_fileGrp=['OCR-D-IMG'])) == 10, '10 files in "//OCR-D-I.*" sans OCR-D-IMG'
+ assert len(mets.find_all_files(ID="FILE_0001_IMAGE")) == 1, '1 files with ID "FILE_0001_IMAGE"'
+ assert len(mets.find_all_files(ID="//FILE_0005_.*")) == 1, '1 files with ID "//FILE_0005_.*"'
+ assert len(mets.find_all_files(pageId='PHYS_0001')) == 17, '17 files for page "PHYS_0001"'
+ assert len(mets.find_all_files(mimetype='image/tiff')) == 13, '13 image/tiff'
+ assert len(mets.find_all_files(mimetype='//application/.*')) == 22, '22 application/.*'
+ assert len(mets.find_all_files(mimetype=MIMETYPE_PAGE)) == 20, '20 ' + MIMETYPE_PAGE
+ assert len(mets.find_all_files(local_filename='OCR-D-IMG/FILE_0005_IMAGE.tif')) == 1, '1 FILE xlink:href="OCR-D-IMG/FILE_0005_IMAGE.tif"'
+ assert len(mets.find_all_files(url='https://github.com/OCR-D/assets/raw/master/data/SBB0000F29300010000/00000001_DESKEW.tif')) == 1, '1 URL xlink:href="https://github.com/OCR-D/assets/raw/master/data/SBB0000F29300010000/00000001_DESKEW.tif"'
+ assert len(mets.find_all_files(pageId='PHYS_0001..PHYS_0005')) == 35, '35 files for page "PHYS_0001..PHYS_0005"'
+ assert len(mets.find_all_files(pageId='//PHYS_000(1|2)')) == 34, '34 files in PHYS_001 and PHYS_0002'
+ assert len(mets.find_all_files(pageId='//PHYS_0001,//PHYS_0005')) == 18, '18 files in PHYS_001 and PHYS_0005 (two regexes)'
+ assert len(mets.find_all_files(pageId='//PHYS_0005,PHYS_0001..PHYS_0002')) == 35, '35 files in //PHYS_0005,PHYS_0001..PHYS_0002'
+ assert len(mets.find_all_files(pageId='1..10')) == 35, '35 files in @ORDER range 1..10'
+    assert len(mets.find_all_files(pageId='1..5')) == 35, '35 files in @ORDER range 1..5'
+ assert len(mets.find_all_files(pageId='PHYS_0001,PHYS_0002,PHYS_0005')) == 35, '35 in PHYS_0001,PHYS_0002,PHYS_0005'
+ assert len(mets.find_all_files(pageId='PHYS_0001..PHYS_0002,PHYS_0005')) == 35, '35 in PHYS_0001,PHYS_0002,PHYS_0005'
+ assert len(mets.find_all_files(pageId='page 1..page 2,5')) == 35, '35 in PHYS_0001,PHYS_0002,PHYS_0005'
+ assert len(mets.find_all_files(pageId='PHYS_0005,1..2')) == 35, '35 in PHYS_0001,PHYS_0002,PHYS_0005'
+ # TODO https://github.com/OCR-D/core/issues/1185
+    # with pytest.raises(ValueError, match='differ in their non-numeric part'):
+    #     mets.find_all_files(pageId='1..PHYS_0002')
+    # with pytest.raises(ValueError, match=re.compile('match(es)? none')):
+    #     mets.find_all_files(pageId='PHYS_0006..PHYS_0029')
+    # with pytest.raises(ValueError, match=re.compile('match(es)? none')):
+    #     mets.find_all_files(pageId='PHYS_0001-NOTEXIST')
+    # with pytest.raises(ValueError, match=re.compile('match(es)? none')):
+    #     mets.find_all_files(pageId='1..5,PHYS_0006..PHYS_0029')
+    # with pytest.raises(ValueError, match=re.compile('match(es)? none')):
+    #     mets.find_all_files(pageId='//PHYS000.*')
def test_reload(start_mets_server : Tuple[str, Workspace]):
_, workspace_server = start_mets_server
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 8fe3fd373..89ff6d90f 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -296,9 +296,15 @@ def test_make_file_id_744():
def test_generate_range():
assert generate_range('PHYS_0001', 'PHYS_0005') == ['PHYS_0001', 'PHYS_0002', 'PHYS_0003', 'PHYS_0004', 'PHYS_0005']
with raises(ValueError, match='could not find numeric part'):
         generate_range('NONUMBER', 'ALSO_NONUMBER')
+    with raises(ValueError, match='differ in their non-numeric part'):
+        generate_range('PHYS_0001_123', 'PHYS_0010_123')
+    with raises(ValueError, match='differ in their non-numeric part'):
+        generate_range('1', 'PHYS_0005')
+    with raises(ValueError, match='differ in their non-numeric part'):
+        generate_range('1', 'page 5')
with warns(UserWarning, match='same number'):
- generate_range('PHYS_0001_123', 'PHYS_0010_123') == 'PHYS_0001_123'
+ assert generate_range('PHYS_0001_123', 'PHYS_0001_123') == ['PHYS_0001_123']
def test_safe_filename():
assert safe_filename('Hello world,!') == 'Hello_world_'
diff --git a/tests/validator/test_workspace_validator.py b/tests/validator/test_workspace_validator.py
index 39c03bd84..bc516d5a5 100644
--- a/tests/validator/test_workspace_validator.py
+++ b/tests/validator/test_workspace_validator.py
@@ -41,20 +41,20 @@ def test_check_file_grp_basic(self):
def test_check_file_grp_page_id_str(self):
workspace = self.resolver.workspace_from_url(assets.url_of('SBB0000F29300010000/data/mets.xml'))
- report = WorkspaceValidator.check_file_grp(workspace, 'OCR-D-IMG', 'OCR-D-IMG-BIN', page_id='PHYS_0003,PHYS_0001')
+ report = WorkspaceValidator.check_file_grp(workspace, 'OCR-D-IMG', 'OCR-D-IMG-BIN', page_id='PHYS_0001')
self.assertFalse(report.is_valid)
self.assertEqual(len(report.errors), 1)
self.assertEqual(report.errors[0], "Output fileGrp[@USE='OCR-D-IMG-BIN'] already contains output for page PHYS_0001")
def test_check_file_grp_page_id_list(self):
workspace = self.resolver.workspace_from_url(assets.url_of('SBB0000F29300010000/data/mets.xml'))
- report = WorkspaceValidator.check_file_grp(workspace, 'OCR-D-IMG', 'OCR-D-IMG-BIN', page_id=['PHYS_0003','PHYS_0001'])
+ report = WorkspaceValidator.check_file_grp(workspace, 'OCR-D-IMG', 'OCR-D-IMG-BIN', page_id=['PHYS_0001'])
self.assertFalse(report.is_valid)
self.assertEqual(len(report.errors), 1)
def test_check_file_grp_page_id_valid(self):
workspace = self.resolver.workspace_from_url(assets.url_of('SBB0000F29300010000/data/mets.xml'))
- report = WorkspaceValidator.check_file_grp(workspace, 'OCR-D-IMG', 'OCR-D-IMG-BIN', page_id='PHYS_0004')
+ report = WorkspaceValidator.check_file_grp(workspace, 'OCR-D-IMG', 'OCR-D-IMG-BIN', page_id='PHYS_0005')
self.assertTrue(report.is_valid)
def test_simple(self):