Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ocrd_mets: add get_physical_pages(for_pageIds=...) #1063

Merged
merged 25 commits into from
Feb 12, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
1e3e702
ocrd_mets: add get_physical_pages(for_pageIds=...)
bertsky Jun 26, 2023
07a9fe0
ocrd workspace list-page: --page-id option
bertsky Jun 26, 2023
25854c5
ocrd_mets: expose property physical_pages_labels
bertsky Jun 28, 2023
ccb51ce
ocrd workspace list-page: add --output-field, delegating to page labels
bertsky Jun 28, 2023
e181758
get phys pages returns strs or divs
MehmedGIT Jul 4, 2023
26b64c9
merge master and adapt to page-range output changes
kba Jan 15, 2024
073d9b0
update list-page-workspace with @ORDER
kba Jan 15, 2024
e91cf50
add typing info for caches in OcrdMets
kba Jan 15, 2024
c642d04
more complete test workspace for page labelling/partitioning
kba Jan 15, 2024
9dea95f
replace update-page with a cleaner solution based on get_physical_pages
kba Jan 15, 2024
cfd1c91
OcrdMets: extend the _page_cache to include all METS_PAGE_DIV_ATTRIBUTEs
kba Jan 16, 2024
ee8fb69
implement generic page attribute ranges
kba Jan 16, 2024
1427c07
utils.generate_range: raise a ValueError if non-numeric parts differ
kba Jan 17, 2024
c36360d
fix tests
kba Jan 17, 2024
3a60c1f
revert accidental commit to ocrd_utils/pyproject.toml
kba Jan 17, 2024
643d1ef
Merge branch 'master' into ocrd-mets-get-pages-for-pageids
kba Jan 30, 2024
517814b
get_physical_pages: return early if no patterns
kba Jan 30, 2024
1225912
OcrdMets.find_all_files: fix page attr loop
kba Feb 6, 2024
4a25d1e
OcrdMets.get_physical_pages should return IDs if not return_divs
kba Feb 8, 2024
466c61d
OcrdMets.get_physical_pages: Cache the attribute in the non-cached re…
kba Feb 8, 2024
9f84067
OcrdMets.get_physical_pages: raise ValueError if a pattern matches no…
kba Feb 8, 2024
2647831
OcrdMets.get_physical_pages: iterate over pages, then patterns in non…
kba Feb 8, 2024
28a1f18
adapt tests to stricter page pattern matching
kba Feb 8, 2024
c6cfe03
OcrdMets.get_physical_pages: raise ValueError if range start not matched
kba Feb 9, 2024
8e06532
Merge branch 'master' into ocrd-mets-get-pages-for-pageids
kba Feb 12, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/ocrd_models/constants.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
"""
Constants for ocrd_models.
"""
from pkg_resources import resource_string
from enum import Enum, auto
from ocrd_utils import resource_string

__all__ = [
'IDENTIFIER_PRIORITY',
Expand Down
85 changes: 48 additions & 37 deletions src/ocrd_models/ocrd_mets.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,6 @@ def add_agent(self, *args, **kwargs):
el_agent_last.addnext(el_agent)
except StopIteration:
el_metsHdr.insert(0, el_agent)
# print(ET.tostring(el_metsHdr))
return OcrdAgent(el_agent, *args, **kwargs)

@property
Expand Down Expand Up @@ -608,48 +607,60 @@ def get_physical_pages(self, for_fileIds : Optional[str] = None, for_pageIds : O
if pageId_token.startswith(REGEX_PREFIX):
page_attr_patterns.append(re.compile(pageId_token[REGEX_PREFIX_LEN:]))
elif '..' in pageId_token:
page_attr_patterns += generate_range(*pageId_token.split('..', 1))
val_range = generate_range(*pageId_token.split('..', 1))
if val_range:
page_attr_patterns.append(val_range)
else:
page_attr_patterns += [pageId_token]
if page_attr_patterns:
if self._cache_flag:
# determine attr to look for before iterating
try:
attr = next(a for a in METS_PAGE_DIV_ATTRIBUTE if (
any(p in self._page_cache[a] for p in page_attr_patterns) or \
any([isinstance(p, typing.Pattern) and p.fullmatch(attr_val) \
for p in page_attr_patterns \
for attr_val in self._page_cache[a]]
)))
for attr_val in self._page_cache[attr].keys():
if attr_val in page_attr_patterns or \
any([isinstance(p, typing.Pattern) and p.fullmatch(attr_val) for p in page_attr_patterns]):
if return_divs:
ret.append(self._page_cache[attr][attr_val])
else:
ret.append(attr_val)
except StopIteration:
log.debug(f"No pattern matches any keys of any of the _page_caches. patterns: {page_attr_patterns}")
else:
# determine attr during iterating
attr = None
page_attr_patterns.append(pageId_token)
if not page_attr_patterns:
return []
if self._cache_flag:
# determine attr to look for before iterating
kba marked this conversation as resolved.
Show resolved Hide resolved
try:
for pat in page_attr_patterns:
# for attr in list(METS_PAGE_DIV_ATTRIBUTE):
attr : METS_PAGE_DIV_ATTRIBUTE
if isinstance(pat, str):
attr = next(a for a in list(METS_PAGE_DIV_ATTRIBUTE) if pat in self._page_cache[a])
cache_keys = [pat]
elif isinstance(pat, list):
attr = next(a for a in list(METS_PAGE_DIV_ATTRIBUTE) if pat[0] in self._page_cache[a])
cache_keys = [v for v in pat if v in self._page_cache[attr]]
elif isinstance(pat, typing.Pattern):
attr = next(a for a in list(METS_PAGE_DIV_ATTRIBUTE) for v in self._page_cache[a] if pat.fullmatch(v))
cache_keys = [v for v in self._page_cache[attr] if pat.fullmatch(v)]
else:
raise ValueError
if return_divs:
ret += [self._page_cache[attr][v] for v in cache_keys]
else:
ret += cache_keys
kba marked this conversation as resolved.
Show resolved Hide resolved
except StopIteration:
log.debug(f"No pattern matches any keys of any of the _page_caches. patterns: {page_attr_patterns}")
kba marked this conversation as resolved.
Show resolved Hide resolved
else:
while page_attr_patterns:
pat = page_attr_patterns.pop(0)
for page in self._tree.getroot().xpath(
kba marked this conversation as resolved.
Show resolved Hide resolved
'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]',
namespaces=NS):
try:
if not attr:
attr = next(a for a in METS_PAGE_DIV_ATTRIBUTE if \
page.get(a.name) in page_attr_patterns or \
any([isinstance(p, typing.Pattern) and p.fullmatch(page.get(a.name)) for p in page_attr_patterns]))
attr_val = page.get(attr.name)
if attr_val in page_attr_patterns or \
any([isinstance(p, typing.Pattern) and p.fullmatch(attr_val) for p in page_attr_patterns]):
if return_divs:
ret.append(page)
else:
ret.append(attr_val)
if isinstance(pat, str):
attr = next(a for a in list(METS_PAGE_DIV_ATTRIBUTE) if pat == page.get(a.name))
ret.append(page if return_divs else pat)
kba marked this conversation as resolved.
Show resolved Hide resolved
elif isinstance(pat, list):
if not isinstance(pat[0], METS_PAGE_DIV_ATTRIBUTE):
kba marked this conversation as resolved.
Show resolved Hide resolved
pat.insert(0, next(a for a in list(METS_PAGE_DIV_ATTRIBUTE) if pat[0] == page.get(a.name)))
attr_val = page.get(pat[0].name)
if attr_val in pat:
pat.remove(attr_val)
ret.append(page if return_divs else attr_val)
kba marked this conversation as resolved.
Show resolved Hide resolved
elif isinstance(pat, typing.Pattern):
attr = next(a for a in list(METS_PAGE_DIV_ATTRIBUTE) if pat.fullmatch(page.get(a.name) or ''))
kba marked this conversation as resolved.
Show resolved Hide resolved
ret.append(page if return_divs else page.get(attr.name))
kba marked this conversation as resolved.
Show resolved Hide resolved
else:
raise ValueError
except StopIteration:
log.debug(f"No pattern matches any mets:div attributes. patterns: {page_attr_patterns}")
log.debug(f"No mets:div attributes match pattern {pat}")
kba marked this conversation as resolved.
Show resolved Hide resolved
return ret

assert for_fileIds # at this point we know for_fileIds is set, assert to convince pyright
Expand Down
2 changes: 1 addition & 1 deletion src/ocrd_utils/str.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@ def safe_filename(url):
# print('safe filename: %s -> %s' % (url, ret))
return ret

def generate_range(start, end):
def generate_range(start : str, end : str) -> List[str]:
"""
Generate a list of strings by incrementing the numeric part of ``start`` up to and including ``end``.
"""
Expand Down
12 changes: 10 additions & 2 deletions tests/model/test_ocrd_mets.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,19 +80,27 @@ def test_find_all_files(sbb_sample_01):
assert len(sbb_sample_01.find_all_files(ID="FILE_0001_IMAGE")) == 1, '1 files with ID "FILE_0001_IMAGE"'
assert len(sbb_sample_01.find_all_files(ID="//FILE_0005_.*")) == 1, '1 files with ID "//FILE_0005_.*"'
assert len(sbb_sample_01.find_all_files(pageId='PHYS_0001')) == 17, '17 files for page "PHYS_0001"'
assert len(sbb_sample_01.find_all_files(pageId='PHYS_0001-NOTEXIST')) == 0, '0 pages for "PHYS_0001-NOTEXIST"'
# assert len(sbb_sample_01.find_all_files(pageId='PHYS_0001-NOTEXIST')) == 0, '0 pages for "PHYS_0001-NOTEXIST"'
assert len(sbb_sample_01.find_all_files(mimetype='image/tiff')) == 13, '13 image/tiff'
assert len(sbb_sample_01.find_all_files(mimetype='//application/.*')) == 22, '22 application/.*'
assert len(sbb_sample_01.find_all_files(mimetype=MIMETYPE_PAGE)) == 20, '20 ' + MIMETYPE_PAGE
assert len(sbb_sample_01.find_all_files(local_filename='OCR-D-IMG/FILE_0005_IMAGE.tif')) == 1, '1 FILE xlink:href="OCR-D-IMG/FILE_0005_IMAGE.tif"'
assert len(sbb_sample_01.find_all_files(url='https://github.com/OCR-D/assets/raw/master/data/SBB0000F29300010000/00000001_DESKEW.tif')) == 1, '1 URL xlink:href="https://github.com/OCR-D/assets/raw/master/data/SBB0000F29300010000/00000001_DESKEW.tif"'
# assert [str(x.ID) for x in sbb_sample_01.find_all_files()] == [str(x.ID) for x in sbb_sample_01.find_all_files(pageId='PHYS_0001..PHYS_0005')]
#print([x.ID for x in sbb_sample_01.find_all_files(pageId='//PHYS_000(1|2)')])
assert len(sbb_sample_01.find_all_files(pageId='PHYS_0001..PHYS_0005')) == 35, '35 files for page "PHYS_0001..PHYS_0005"'
assert len(sbb_sample_01.find_all_files(pageId='//PHYS_000(1|2)')) == 34, '34 files in PHYS_001 and PHYS_0002'
print([x.ID for x in sbb_sample_01.find_all_files(pageId='//PHYS_0001,//PHYS_0005')])
assert len(sbb_sample_01.find_all_files(pageId='//PHYS_0001,//PHYS_0005')) == 18, '18 files in PHYS_001 and PHYS_0005 (two regexes)'
assert len(sbb_sample_01.find_all_files(pageId='//PHYS_0005,PHYS_0001..PHYS_0002')) == 35, '35 files in //PHYS_0005,PHYS_0001..PHYS_0002'
assert len(sbb_sample_01.find_all_files(pageId='//PHYS_0005,PHYS_0001..PHYS_0002')) == 35, '35 files in //PHYS_0005,PHYS_0001..PHYS_0002'
assert len(sbb_sample_01.find_all_files(pageId='1..10')) == 35, '35 files in @ORDER range 1..10'
with raises(ValueError, match='differ in their non-numeric part'):
assert len(sbb_sample_01.find_all_files(pageId='1..5')) == 35, '35 files in @ORDER range 1..10'
assert len(sbb_sample_01.find_all_files(pageId='PHYS_0001,PHYS_0002,PHYS_0005')) == 35, '35 in PHYS_0001,PHYS_0002,PHYS_0005'
assert len(sbb_sample_01.find_all_files(pageId='PHYS_0001..PHYS_0002,PHYS_0005')) == 35, '35 in PHYS_0001,PHYS_0002,PHYS_0005'
assert len(sbb_sample_01.find_all_files(pageId='page 1..page 2,5')) == 35, '35 in PHYS_0001,PHYS_0002,PHYS_0005'
assert len(sbb_sample_01.find_all_files(pageId='PHYS_0005,1..2')) == 35, '35 in PHYS_0001,PHYS_0002,PHYS_0005'
with pytest.raises(ValueError, match='differ in their non-numeric part'):
len(sbb_sample_01.find_all_files(pageId='1..PHYS_0002'))

def test_find_all_files_local_only(sbb_sample_01):
Expand Down