From e0826e73952c22fba48c1d8b4f3fa0409ad5057e Mon Sep 17 00:00:00 2001 From: M3ssman Date: Thu, 27 Aug 2020 16:14:15 +0200 Subject: [PATCH 1/9] [app][feat] extract mets from oai response --- ocrd/ocrd/resolver.py | 19 ++++++ .../response/mets_kant_aufklaerung_1784.xml | 58 +++++++++++++++++++ .../data/response/oai_get_record_2200909.xml | 34 +++++++++++ tests/test_resolver.py | 22 ++++++- 4 files changed, 132 insertions(+), 1 deletion(-) create mode 100644 tests/data/response/mets_kant_aufklaerung_1784.xml create mode 100644 tests/data/response/oai_get_record_2200909.xml diff --git a/ocrd/ocrd/resolver.py b/ocrd/ocrd/resolver.py index 53bef578b..f567b0537 100644 --- a/ocrd/ocrd/resolver.py +++ b/ocrd/ocrd/resolver.py @@ -2,6 +2,7 @@ from pathlib import Path import requests +import lxml.etree as ET from ocrd.constants import TMP_PREFIX from ocrd_utils import ( @@ -176,3 +177,21 @@ def workspace_from_nothing(self, directory, mets_basename='mets.xml', clobber_me mets_path.write_bytes(mets.to_xml(xmllint=True)) return Workspace(self, directory, mets, mets_basename=mets_basename) + +def handle_response(data): + """ + In case of an OAI-Response, extract METS-Subtree as new root + """ + + try: + xml_root = ET.fromstring(data) + root_tag = xml_root.tag + print(f"[DEBUG] having root tag : {root_tag}") + if str(root_tag).endswith('OAI-PMH'): + mets_root_el = xml_root.find('.//{http://www.loc.gov/METS/}mets') + if mets_root_el is not None: + return ET.ElementTree(mets_root_el).getroot() + except Exception as exc: + log.error(exc) + + return None diff --git a/tests/data/response/mets_kant_aufklaerung_1784.xml b/tests/data/response/mets_kant_aufklaerung_1784.xml new file mode 100644 index 000000000..3ada6a33b --- /dev/null +++ b/tests/data/response/mets_kant_aufklaerung_1784.xml @@ -0,0 +1,58 @@ + + + + + DFG-Koordinierungsprojekt zur Weiterentwicklung von Verfahren der Optical Character Recognition (OCR-D) + OCR-D + + + + + + + http://kant_aufklaerung_1784 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/data/response/oai_get_record_2200909.xml b/tests/data/response/oai_get_record_2200909.xml new file mode 100644 index 000000000..3683b33c1 --- /dev/null +++ b/tests/data/response/oai_get_record_2200909.xml @@ -0,0 +1,34 @@ + + + 2020-08-27T10:34:23Z + http://digital.bibliothek.uni-halle.de/hd/oai/ + + +
+ oai:digital.bibliothek.uni-halle.de/hd:2200909 + 2014-12-16T10:51:35Z + ulbhaldod +
+ + + vls/2.12.1ulbhal-hspedigital.bibliothek.uni-halle.de/hdvdProfessores Academiæ Ienensis. Stvdiosis Salvtem. Hodie ducetur funuss Vinariæ, Illustrissimae ... Coniugi inclyti Principis Eclectoris Iohannis Friderici Ducis Saxoniae ... Bene ualete 23. Feb: Anno salutis. 1554. Epitaphivm ... Ioan. StigeliusStigel, JohannJohannStigel1515-1562asnBeteiligte PersonSibylle <Sachsen, Kurfürstin>1512-1554asnVerstorb.Johann Friedrich <Sachsen, Kurfürst>1503-1554asnBeteiligte Persontext[S.l.]1554[1554]monographicHalle, SaaleUniversitäts- und Landesbibliothek Sachsen-Anhalt2014[Electronic ed.]lat[2] Bl. ; 2°Fragment, das nicht zu zuordnen isturn:nbn:de:gbv:3:3-52565AB 69103 (2)digitized copystudiosis salutem16-07-9815-12-14246545216 + + + +http://digital.bibliothek.uni-halle.de/hd/domainresource/static/graphics/connectors/viewerLogo.gifUniversitäts- und Landesbibliothek Sachsen-Anhalthttp://www.bibliothek.uni-halle.de + + + + + + +http://opac.bibliothek.uni-halle.de/DB=1/CLK?IKT=12&TRM=246545216 +http://digital.bibliothek.uni-halle.de/hd/id/2200909 + + + + + +
+
+
\ No newline at end of file diff --git a/tests/test_resolver.py b/tests/test_resolver.py index cfd828783..70d8260e1 100644 --- a/tests/test_resolver.py +++ b/tests/test_resolver.py @@ -1,10 +1,15 @@ +# -*- coding: utf-8 -*- + from os.path import join as pjoin from pathlib import Path from tempfile import TemporaryDirectory from tests.base import TestCase, assets, main, copy_of_directory -from ocrd.resolver import Resolver +from ocrd.resolver import ( + Resolver, + handle_response +) from ocrd_utils import pushd_popd METS_HEROLD = assets.url_of('SBB0000F29300010000/data/mets.xml') @@ -147,5 +152,20 @@ def test_download_to_directory_subdir(self): self.assertEqual(fn, pjoin('baz', 'mets.xml')) self.assertTrue(Path(dst, fn).exists()) + def test_handle_response_oai(self): + with open('./tests/data/response/oai_get_record_2200909.xml', 'rb') as f: + content = f.read() + result = handle_response(content) + expected_start = b' Date: Thu, 27 Aug 2020 20:50:48 +0200 Subject: [PATCH 2/9] [app][test] introduce mock requests --- ocrd/ocrd/resolver.py | 12 +++++++----- tests/test_resolver.py | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 5 deletions(-) diff --git a/ocrd/ocrd/resolver.py b/ocrd/ocrd/resolver.py index f567b0537..d0a5da026 100644 --- a/ocrd/ocrd/resolver.py +++ b/ocrd/ocrd/resolver.py @@ -99,7 +99,8 @@ def download_to_directory(self, directory, url, basename=None, if_exists='skip', response = requests.get(url) if response.status_code != 200: raise Exception("HTTP request failed: %s (HTTP %d)" % (url, response.status_code)) - dst_path.write_bytes(response.content) + contents = handle_response(response.content) + dst_path.write_bytes(contents) return ret @@ -180,18 +181,19 @@ def workspace_from_nothing(self, directory, mets_basename='mets.xml', clobber_me def handle_response(data): """ - In case of an OAI-Response, extract METS-Subtree as new root + In case of an OAI-Response, extract first METS-Entry-Data """ try: xml_root = ET.fromstring(data) root_tag = xml_root.tag - print(f"[DEBUG] having root tag : {root_tag}") if str(root_tag).endswith('OAI-PMH'): mets_root_el = xml_root.find('.//{http://www.loc.gov/METS/}mets') if mets_root_el is not None: - return ET.ElementTree(mets_root_el).getroot() + new_tree = ET.ElementTree(mets_root_el) + return ET.tostring(new_tree, pretty_print=True, encoding='UTF-8') except Exception as exc: log.error(exc) - return None + return data + diff --git a/tests/test_resolver.py b/tests/test_resolver.py index 70d8260e1..8e9dbc591 100644 --- a/tests/test_resolver.py +++ b/tests/test_resolver.py @@ -1,8 +1,12 @@ # -*- coding: utf-8 -*- +import os, shutil from os.path import join as pjoin from pathlib import Path from tempfile import TemporaryDirectory +from unittest import mock + +import pytest from tests.base import TestCase, assets, main, copy_of_directory @@ -167,5 +171,39 @@ def test_handle_response_mets(self): expected_start = b' Date: Thu, 27 Aug 2020 21:05:07 +0200 Subject: [PATCH 3/9] [app][rfct] replace generic exception --- ocrd/ocrd/resolver.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ocrd/ocrd/resolver.py b/ocrd/ocrd/resolver.py index d0a5da026..c479412f6 100644 --- a/ocrd/ocrd/resolver.py +++ b/ocrd/ocrd/resolver.py @@ -179,6 +179,7 @@ def workspace_from_nothing(self, directory, mets_basename='mets.xml', clobber_me return Workspace(self, directory, mets, mets_basename=mets_basename) + def handle_response(data): """ In case of an OAI-Response, extract first METS-Entry-Data @@ -188,12 +189,12 @@ def handle_response(data): xml_root = ET.fromstring(data) root_tag = xml_root.tag if str(root_tag).endswith('OAI-PMH'): + log.info("detected root.tag '%s'" % root_tag) mets_root_el = xml_root.find('.//{http://www.loc.gov/METS/}mets') if mets_root_el is not None: new_tree = ET.ElementTree(mets_root_el) return ET.tostring(new_tree, pretty_print=True, encoding='UTF-8') - except Exception as exc: + except ET.LxmlError as exc: log.error(exc) return data - From f8b87428c303dc6901f5bce3891736faed560ec4 Mon Sep 17 00:00:00 2001 From: M3ssman Date: Fri, 28 Aug 2020 11:21:56 +0200 Subject: [PATCH 4/9] [app][fix] handle non-xml downloads --- ocrd/ocrd/resolver.py | 54 +++++++++++++++++++-------- tests/test_resolver.py | 83 ++++++++++++++++++++++++++++++++---------- 2 files changed, 101 insertions(+), 36 deletions(-) diff --git a/ocrd/ocrd/resolver.py b/ocrd/ocrd/resolver.py index c479412f6..eee1b7ea9 100644 --- a/ocrd/ocrd/resolver.py +++ b/ocrd/ocrd/resolver.py @@ -99,7 +99,7 @@ def download_to_directory(self, directory, url, basename=None, if_exists='skip', response = requests.get(url) if response.status_code != 200: raise Exception("HTTP request failed: %s (HTTP %d)" % (url, response.status_code)) - contents = handle_response(response.content) + contents = handle_response(response) dst_path.write_bytes(contents) return ret @@ -180,21 +180,43 @@ def workspace_from_nothing(self, directory, mets_basename='mets.xml', clobber_me return Workspace(self, directory, mets, mets_basename=mets_basename) -def handle_response(data): +def handle_response(response): """ - In case of an OAI-Response, extract first METS-Entry-Data + In case of a valid OAI-Response, extract first METS-Entry-Data """ - try: - xml_root = ET.fromstring(data) - root_tag = xml_root.tag - if str(root_tag).endswith('OAI-PMH'): - log.info("detected root.tag '%s'" % root_tag) - mets_root_el = xml_root.find('.//{http://www.loc.gov/METS/}mets') - if mets_root_el is not None: - new_tree = ET.ElementTree(mets_root_el) - return ET.tostring(new_tree, pretty_print=True, encoding='UTF-8') - except ET.LxmlError as exc: - log.error(exc) - - return data + content_type = response.headers['Content-Type'] + if 'xml' in content_type or 'text' in content_type: + content = response.content + + try: + if is_oai_response(content): + return extract_mets(content) + except ET.LxmlError as exc: + log.warning("textual response but no xml: %s (%s)", content, exc) + + return response.content + + +def is_oai_response(data): + xml_root = ET.fromstring(data) + root_tag = xml_root.tag + log.info("response data root.tag: '%s'" % root_tag) + return str(root_tag).endswith('OAI-PMH') + + +def extract_mets(data, preamble=''): + xml_root = ET.fromstring(data) + mets_root_el = xml_root.find('.//{http://www.loc.gov/METS/}mets') + if mets_root_el is not None: + new_tree = ET.ElementTree(mets_root_el) + xml_formatted = ET.tostring(new_tree, + pretty_print=True, + encoding='UTF-8').decode('UTF-8') + formatted_content = '{}\n{}'.format(preamble, xml_formatted) + return formatted_content.encode('UTF-8').replace(b'\n', b'\r\n') + + if 'mets' in xml_root.tag: + return data + + raise Exception("Missing mets-section in %s" % data) diff --git a/tests/test_resolver.py b/tests/test_resolver.py index 8e9dbc591..7b464d018 100644 --- a/tests/test_resolver.py +++ b/tests/test_resolver.py @@ -6,13 +6,16 @@ from tempfile import TemporaryDirectory from unittest import mock +from lxml.etree import ( + LxmlError +) import pytest from tests.base import TestCase, assets, main, copy_of_directory from ocrd.resolver import ( Resolver, - handle_response + extract_mets ) from ocrd_utils import pushd_popd @@ -36,9 +39,11 @@ def test_workspace_from_url_tempdir(self): mets_url='https://raw.githubusercontent.com/OCR-D/assets/master/data/kant_aufklaerung_1784/data/mets.xml') def test_workspace_from_url_download(self): + url_src = 'https://raw.githubusercontent.com/OCR-D/assets/master/data/kant_aufklaerung_1784/data/mets.xml' + #url_src = 'http://digital.bibliothek.uni-halle.de/hd/oai/?verb=GetRecord&metadataPrefix=mets&mode=xml&identifier=9049' with TemporaryDirectory() as dst_dir: self.resolver.workspace_from_url( - 'https://raw.githubusercontent.com/OCR-D/assets/master/data/kant_aufklaerung_1784/data/mets.xml', + url_src, mets_basename='foo.xml', dst_dir=dst_dir, download=True) @@ -156,27 +161,16 @@ def test_download_to_directory_subdir(self): self.assertEqual(fn, pjoin('baz', 'mets.xml')) self.assertTrue(Path(dst, fn).exists()) - def test_handle_response_oai(self): - with open('./tests/data/response/oai_get_record_2200909.xml', 'rb') as f: - content = f.read() - result = handle_response(content) - expected_start = b'\r\n Date: Fri, 28 Aug 2020 12:58:20 +0200 Subject: [PATCH 5/9] [test][fix] python 3.5 compliance --- tests/test_resolver.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_resolver.py b/tests/test_resolver.py index 7b464d018..1c0172151 100644 --- a/tests/test_resolver.py +++ b/tests/test_resolver.py @@ -243,10 +243,9 @@ def test_handle_response_for_invalid_content(mock_log_warning, mock_get, respons # assert behavior mock_get.assert_called_once_with(url) - mock_log_warning.assert_called_once() + assert mock_log_warning.call_count == 1 if __name__ == '__main__': main() - From fce77fed45b877d543f8ccbc55ce960d5a3aff4b Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Fri, 28 Aug 2020 14:28:26 +0200 Subject: [PATCH 6/9] [app][fix] document OAI methods in resolver, exit early if METS --- ocrd/ocrd/resolver.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/ocrd/ocrd/resolver.py b/ocrd/ocrd/resolver.py index eee1b7ea9..912937caf 100644 --- a/ocrd/ocrd/resolver.py +++ b/ocrd/ocrd/resolver.py @@ -14,6 +14,7 @@ ) from ocrd.workspace import Workspace from ocrd_models import OcrdMets +from ocrd_models.constants import NAMESPACES as NS log = getLogger('ocrd.resolver') @@ -199,15 +200,23 @@ def handle_response(response): def is_oai_response(data): + """ + Return True if data is an OAI-PMH request/response + """ xml_root = ET.fromstring(data) - root_tag = xml_root.tag + root_tag = xml_root.tag log.info("response data root.tag: '%s'" % root_tag) return str(root_tag).endswith('OAI-PMH') def extract_mets(data, preamble=''): + """ + Extract METS from an OAI-PMH GetRecord response + """ xml_root = ET.fromstring(data) - mets_root_el = xml_root.find('.//{http://www.loc.gov/METS/}mets') + if 'mets' in xml_root.tag: + return data + mets_root_el = xml_root.find('.//{%s}mets' % NS['mets']) if mets_root_el is not None: new_tree = ET.ElementTree(mets_root_el) xml_formatted = ET.tostring(new_tree, @@ -216,7 +225,4 @@ def extract_mets(data, preamble=''): formatted_content = '{}\n{}'.format(preamble, xml_formatted) return formatted_content.encode('UTF-8').replace(b'\n', b'\r\n') - if 'mets' in xml_root.tag: - return data - raise Exception("Missing mets-section in %s" % data) From 54c915f6ddc028b0e8fa35ef6dc66a09355ad282 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Fri, 28 Aug 2020 14:29:28 +0200 Subject: [PATCH 7/9] [test][rfct] move oai tests into a dedicated test_resolver_oai.py --- tests/test_resolver.py | 100 +------------------------------------ tests/test_resolver_oai.py | 95 +++++++++++++++++++++++++++++++++++ 2 files changed, 97 insertions(+), 98 deletions(-) create mode 100644 tests/test_resolver_oai.py diff --git a/tests/test_resolver.py b/tests/test_resolver.py index 1c0172151..773aae9bb 100644 --- a/tests/test_resolver.py +++ b/tests/test_resolver.py @@ -1,22 +1,12 @@ # -*- coding: utf-8 -*- -import os, shutil from os.path import join as pjoin from pathlib import Path from tempfile import TemporaryDirectory -from unittest import mock - -from lxml.etree import ( - LxmlError -) -import pytest from tests.base import TestCase, assets, main, copy_of_directory -from ocrd.resolver import ( - Resolver, - extract_mets -) +from ocrd.resolver import Resolver from ocrd_utils import pushd_popd METS_HEROLD = assets.url_of('SBB0000F29300010000/data/mets.xml') @@ -161,91 +151,5 @@ def test_download_to_directory_subdir(self): self.assertEqual(fn, pjoin('baz', 'mets.xml')) self.assertTrue(Path(dst, fn).exists()) - - -@pytest.fixture(name="response_dir") -def fixture_response_dir(tmpdir): - src = './tests/data/response/oai_get_record_2200909.xml' - target_file = str(tmpdir.mkdir('responses').join('oai_get_record_2200909.xml')) - shutil.copy(src, target_file) - src2 = './tests/data/response/mets_kant_aufklaerung_1784.xml' - target_file2 = str(tmpdir.join('responses').join('mets_kant_aufklaerung_1784.xml')) - shutil.copy(src2, target_file2) - return os.path.dirname(target_file) - - -@pytest.fixture(name="oai_response_content") -def fixture_oai_2200909_content(response_dir): - data_path = os.path.join(response_dir, 'oai_get_record_2200909.xml') - with open(data_path, 'rb') as f: - return f.read() - - -def test_extract_mets_from_oai_content(oai_response_content): - """Ensure that OAI-prelude gets dropped""" - - result = extract_mets(oai_response_content) - expected_start = b'\r\n\r\n Date: Fri, 28 Aug 2020 14:40:46 +0200 Subject: [PATCH 8/9] [app][rfct] move OAI functions to ocrd_models.utils --- ocrd/ocrd/resolver.py | 50 ++----------------------------- ocrd_models/ocrd_models/utils.py | 51 ++++++++++++++++++++++++++++++++ tests/test_resolver_oai.py | 14 ++++----- 3 files changed, 58 insertions(+), 57 deletions(-) diff --git a/ocrd/ocrd/resolver.py b/ocrd/ocrd/resolver.py index 912937caf..2fcb687d6 100644 --- a/ocrd/ocrd/resolver.py +++ b/ocrd/ocrd/resolver.py @@ -2,7 +2,6 @@ from pathlib import Path import requests -import lxml.etree as ET from ocrd.constants import TMP_PREFIX from ocrd_utils import ( @@ -15,6 +14,7 @@ from ocrd.workspace import Workspace from ocrd_models import OcrdMets from ocrd_models.constants import NAMESPACES as NS +from ocrd_models.utils import handle_oai_response log = getLogger('ocrd.resolver') @@ -100,7 +100,7 @@ def download_to_directory(self, directory, url, basename=None, if_exists='skip', response = requests.get(url) if response.status_code != 200: raise Exception("HTTP request failed: %s (HTTP %d)" % (url, response.status_code)) - contents = handle_response(response) + contents = handle_oai_response(response) dst_path.write_bytes(contents) return ret @@ -180,49 +180,3 @@ def workspace_from_nothing(self, directory, mets_basename='mets.xml', clobber_me return Workspace(self, directory, mets, mets_basename=mets_basename) - -def handle_response(response): - """ - In case of a valid OAI-Response, extract first METS-Entry-Data - """ - - content_type = response.headers['Content-Type'] - if 'xml' in content_type or 'text' in content_type: - content = response.content - - try: - if is_oai_response(content): - return extract_mets(content) - except ET.LxmlError as exc: - log.warning("textual response but no xml: %s (%s)", content, exc) - - return response.content - - -def is_oai_response(data): - """ - Return True if data is an OAI-PMH request/response - """ - xml_root = ET.fromstring(data) - root_tag = xml_root.tag - log.info("response data root.tag: '%s'" % root_tag) - return str(root_tag).endswith('OAI-PMH') - - -def extract_mets(data, preamble=''): - """ - Extract METS from an OAI-PMH GetRecord response - """ - xml_root = ET.fromstring(data) - if 'mets' in xml_root.tag: - return data - mets_root_el = xml_root.find('.//{%s}mets' % NS['mets']) - if mets_root_el is not None: - new_tree = ET.ElementTree(mets_root_el) - xml_formatted = ET.tostring(new_tree, - pretty_print=True, - encoding='UTF-8').decode('UTF-8') - formatted_content = '{}\n{}'.format(preamble, xml_formatted) - return formatted_content.encode('UTF-8').replace(b'\n', b'\r\n') - - raise Exception("Missing mets-section in %s" % data) diff --git a/ocrd_models/ocrd_models/utils.py b/ocrd_models/ocrd_models/utils.py index b62c012cf..ff3e98fb2 100644 --- a/ocrd_models/ocrd_models/utils.py +++ b/ocrd_models/ocrd_models/utils.py @@ -3,10 +3,18 @@ """ from lxml import etree as ET +from ocrd_utils import getLogger +from .constants import NAMESPACES as NS + __all__ = [ 'xmllint_format', + 'handle_oai_response', + 'is_oai_content', + 'extract_mets_from_oai_content' ] +log = getLogger('ocrd_models.utils') + def xmllint_format(xml): """ Pretty-print XML like ``xmllint`` does. @@ -18,3 +26,46 @@ def xmllint_format(xml): document = ET.fromstring(xml, parser) return ('%s\n%s' % ('', ET.tostring(document, pretty_print=True, encoding='UTF-8').decode('utf-8'))).encode('utf-8') + +def handle_oai_response(response): + """ + In case of a valid OAI-Response, extract first METS-Entry-Data + """ + content_type = response.headers['Content-Type'] + if 'xml' in content_type or 'text' in content_type: + content = response.content + try: + if is_oai_content(content): + return extract_mets_from_oai_content(content) + except ET.LxmlError as exc: + log.warning("textual response but no xml: %s (%s)", content, exc) + return response.content + + +def is_oai_content(data): + """ + Return True if data is an OAI-PMH request/response + """ + xml_root = ET.fromstring(data) + root_tag = xml_root.tag + log.info("response data root.tag: '%s'" % root_tag) + return str(root_tag).endswith('OAI-PMH') + + +def extract_mets_from_oai_content(data, preamble=''): + """ + Extract METS from an OAI-PMH GetRecord response + """ + xml_root = ET.fromstring(data) + if 'mets' in xml_root.tag: + return data + mets_root_el = xml_root.find('.//{%s}mets' % NS['mets']) + if mets_root_el is not None: + new_tree = ET.ElementTree(mets_root_el) + xml_formatted = ET.tostring(new_tree, + pretty_print=True, + encoding='UTF-8').decode('UTF-8') + formatted_content = '{}\n{}'.format(preamble, xml_formatted) + return formatted_content.encode('UTF-8').replace(b'\n', b'\r\n') + + raise Exception("Missing mets-section in %s" % data) diff --git a/tests/test_resolver_oai.py b/tests/test_resolver_oai.py index deb4fe4f0..233b9f941 100644 --- a/tests/test_resolver_oai.py +++ b/tests/test_resolver_oai.py @@ -5,7 +5,8 @@ from tests.base import main -from ocrd.resolver import Resolver, extract_mets +from ocrd.resolver import Resolver +from ocrd_models.utils import extract_mets_from_oai_content @fixture(name="response_dir") def fixture_response_dir(tmpdir): @@ -34,21 +35,18 @@ def fixture_xml_kant_content(response_dir): def test_extract_mets_from_oai_content(oai_response_content): """Ensure that OAI-prelude gets dropped""" - result = extract_mets(oai_response_content) + result = extract_mets_from_oai_content(oai_response_content) expected_start = b'\r\n Date: Fri, 28 Aug 2020 14:57:12 +0200 Subject: [PATCH 9/9] [app][doc] Document that ocrd workspace clone supports OAI-PMH GetRecord now --- ocrd/ocrd/cli/workspace.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ocrd/ocrd/cli/workspace.py b/ocrd/ocrd/cli/workspace.py index 8729a3da1..a0d07cb00 100644 --- a/ocrd/ocrd/cli/workspace.py +++ b/ocrd/ocrd/cli/workspace.py @@ -116,6 +116,7 @@ def workspace_clone(ctx, clobber_mets, download, mets_url, workspace_dir): METS_URL can be a URL, an absolute path or a path relative to $PWD. If METS_URL is not provided, use --mets accordingly. + METS_URL can also be an OAI-PMH GetRecord URL wrapping a METS file. """ LOG = getLogger('ocrd.cli.workspace.clone') if workspace_dir: