Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

handle oai response with workspace clone #581

Merged
merged 10 commits into from
Sep 2, 2020
1 change: 1 addition & 0 deletions ocrd/ocrd/cli/workspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ def workspace_clone(ctx, clobber_mets, download, mets_url, workspace_dir):

METS_URL can be a URL, an absolute path or a path relative to $PWD.
If METS_URL is not provided, use --mets accordingly.
METS_URL can also be an OAI-PMH GetRecord URL wrapping a METS file.
"""
LOG = getLogger('ocrd.cli.workspace.clone')
if workspace_dir:
Expand Down
6 changes: 5 additions & 1 deletion ocrd/ocrd/resolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
)
from ocrd.workspace import Workspace
from ocrd_models import OcrdMets
from ocrd_models.constants import NAMESPACES as NS
from ocrd_models.utils import handle_oai_response

log = getLogger('ocrd.resolver')

Expand Down Expand Up @@ -98,7 +100,8 @@ def download_to_directory(self, directory, url, basename=None, if_exists='skip',
response = requests.get(url)
if response.status_code != 200:
raise Exception("HTTP request failed: %s (HTTP %d)" % (url, response.status_code))
dst_path.write_bytes(response.content)
contents = handle_oai_response(response)
dst_path.write_bytes(contents)

return ret

Expand Down Expand Up @@ -176,3 +179,4 @@ def workspace_from_nothing(self, directory, mets_basename='mets.xml', clobber_me
mets_path.write_bytes(mets.to_xml(xmllint=True))

return Workspace(self, directory, mets, mets_basename=mets_basename)

51 changes: 51 additions & 0 deletions ocrd_models/ocrd_models/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,18 @@
"""
from lxml import etree as ET

from ocrd_utils import getLogger
from .constants import NAMESPACES as NS

__all__ = [
'xmllint_format',
'handle_oai_response',
'is_oai_content',
'extract_mets_from_oai_content'
]

log = getLogger('ocrd_models.utils')

def xmllint_format(xml):
"""
Pretty-print XML like ``xmllint`` does.
Expand All @@ -18,3 +26,46 @@ def xmllint_format(xml):
document = ET.fromstring(xml, parser)
return ('%s\n%s' % ('<?xml version="1.0" encoding="UTF-8"?>',
ET.tostring(document, pretty_print=True, encoding='UTF-8').decode('utf-8'))).encode('utf-8')

def handle_oai_response(response):
    """
    Unwrap the METS payload of an OAI-PMH response, if there is one.

    Args:
        response: HTTP response object exposing ``headers`` (a mapping) and
            ``content`` (the raw body), e.g. a ``requests.Response``.

    Returns:
        The result of ``extract_mets_from_oai_content`` when the declared
        content type is textual/XML and the body's root element is an
        OAI-PMH envelope; otherwise ``response.content`` unchanged.

    Raises:
        Exception: propagated from ``extract_mets_from_oai_content`` when an
            OAI-PMH envelope contains no METS element.
    """
    # Servers are not obliged to send a Content-Type header: treat a missing
    # one as "not textual" rather than failing with a KeyError. Media types
    # are case-insensitive, hence the lower().
    content_type = (response.headers.get('Content-Type') or '').lower()
    if 'xml' in content_type or 'text' in content_type:
        content = response.content
        try:
            if is_oai_content(content):
                return extract_mets_from_oai_content(content)
        except ET.LxmlError as exc:
            # Textual body that cannot be parsed as XML: warn, then fall
            # through and hand back the body as it was received.
            log.warning("textual response but no xml: %s (%s)", content, exc)
    return response.content


def is_oai_content(data):
    """
    Tell whether ``data`` is an XML document rooted in an OAI-PMH element.

    The document is parsed, its root tag is logged at INFO level, and the
    check succeeds when that tag ends with ``OAI-PMH``. Parse errors raised
    by lxml are not handled here.
    """
    tag_of_root = ET.fromstring(data).tag
    log.info("response data root.tag: '%s'" % tag_of_root)
    return str(tag_of_root).endswith('OAI-PMH')


def extract_mets_from_oai_content(data, preamble='<?xml version="1.0" encoding="UTF-8"?>'):
    """
    Pull the METS document out of an OAI-PMH GetRecord response.

    ``data`` is returned untouched when its root tag already contains
    ``mets``. Otherwise the first METS element found below the root is
    pretty-printed, prefixed with ``preamble`` and returned as UTF-8 bytes
    with CRLF line endings. An ``Exception`` is raised when no METS element
    can be found.
    """
    root = ET.fromstring(data)
    # Already a bare METS document: nothing to unwrap
    if 'mets' in root.tag:
        return data

    mets_el = root.find('.//{%s}mets' % NS['mets'])
    if mets_el is None:
        raise Exception("Missing mets-section in %s" % data)

    # Re-root the METS element in a tree of its own before serializing
    serialized = ET.tostring(ET.ElementTree(mets_el),
                             pretty_print=True,
                             encoding='UTF-8')
    document = '{}\n{}'.format(preamble, serialized.decode('UTF-8'))
    return document.encode('UTF-8').replace(b'\n', b'\r\n')
58 changes: 58 additions & 0 deletions tests/data/response/mets_kant_aufklaerung_1784.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
<?xml version="1.0" encoding="UTF-8"?>
<mets:mets xmlns:mets="http://www.loc.gov/METS/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xlink="http://www.w3.org/1999/xlink" xsi:schemaLocation="info:lc/xmlns/premis-v2 http://www.loc.gov/standards/premis/v2/premis-v2-0.xsd http://www.loc.gov/mods/v3 http://www.loc.gov/standards/mods/v3/mods-3-6.xsd http://www.loc.gov/METS/ http://www.loc.gov/standards/mets/version17/mets.v1-7.xsd http://www.loc.gov/mix/v10 http://www.loc.gov/standards/mix/mix10/mix10.xsd">
<mets:metsHdr CREATEDATE="2017-11-30T16:18:26">
<mets:agent OTHERTYPE="SOFTWARE" ROLE="CREATOR" TYPE="OTHER">
<mets:name>DFG-Koordinierungsprojekt zur Weiterentwicklung von Verfahren der Optical Character Recognition (OCR-D)</mets:name>
<mets:note>OCR-D</mets:note>
</mets:agent>
</mets:metsHdr>
<mets:dmdSec ID="DMDLOG_0001">
<mets:mdWrap MDTYPE="MODS">
<mets:xmlData>
<mods:mods xmlns:mods="http://www.loc.gov/mods/v3">
<mods:identifier type="purl">http://kant_aufklaerung_1784</mods:identifier>
</mods:mods>
</mets:xmlData>
</mets:mdWrap>
</mets:dmdSec>
<mets:fileSec>
<mets:fileGrp USE="OCR-D-IMG">
<mets:file ID="INPUT_0017" MIMETYPE="image/tiff">
<mets:FLocat LOCTYPE="URL" xlink:href="OCR-D-IMG/INPUT_0017.tif"/>
</mets:file>
<mets:file ID="INPUT_0020" MIMETYPE="image/tiff">
<mets:FLocat LOCTYPE="URL" xlink:href="OCR-D-IMG/INPUT_0020.tif"/>
</mets:file>
</mets:fileGrp>
<mets:fileGrp USE="OCR-D-GT-PAGE">
<mets:file ID="PAGE_0017_PAGE" MIMETYPE="application/vnd.prima.page+xml">
<mets:FLocat LOCTYPE="URL" xlink:href="OCR-D-GT-PAGE/PAGE_0017_PAGE.xml"/>
</mets:file>
<mets:file ID="PAGE_0020_PAGE" MIMETYPE="application/vnd.prima.page+xml">
<mets:FLocat LOCTYPE="URL" xlink:href="OCR-D-GT-PAGE/PAGE_0020_PAGE.xml"/>
</mets:file>
</mets:fileGrp>
<mets:fileGrp USE="OCR-D-GT-ALTO">
<mets:file ID="PAGE_0017_ALTO" MIMETYPE="application/alto+xml">
<mets:FLocat LOCTYPE="URL" xlink:href="OCR-D-GT-ALTO/PAGE_0017_ALTO.xml"/>
</mets:file>
<mets:file ID="PAGE_0020_ALTO" MIMETYPE="application/alto+xml">
<mets:FLocat LOCTYPE="URL" xlink:href="OCR-D-GT-ALTO/PAGE_0020_ALTO.xml"/>
</mets:file>
</mets:fileGrp>
</mets:fileSec>
<mets:structMap TYPE="PHYSICAL">
<mets:div TYPE="physSequence">
<mets:div TYPE="page" ID="PHYS_0017">
<mets:fptr FILEID="INPUT_0017"/>
<mets:fptr FILEID="PAGE_0017_PAGE"/>
<mets:fptr FILEID="PAGE_0017_ALTO"/>
</mets:div>
<mets:div TYPE="page" ID="PHYS_0020">
<mets:fptr FILEID="INPUT_0020"/>
<mets:fptr FILEID="PAGE_0020_PAGE"/>
<mets:fptr FILEID="PAGE_0020_ALTO"/>
</mets:div>
</mets:div>
</mets:structMap>
</mets:mets>
34 changes: 34 additions & 0 deletions tests/data/response/oai_get_record_2200909.xml

Large diffs are not rendered by default.

8 changes: 6 additions & 2 deletions tests/test_resolver.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# -*- coding: utf-8 -*-

from os.path import join as pjoin
from pathlib import Path
from tempfile import TemporaryDirectory
Expand Down Expand Up @@ -27,9 +29,11 @@ def test_workspace_from_url_tempdir(self):
mets_url='https://raw.githubusercontent.com/OCR-D/assets/master/data/kant_aufklaerung_1784/data/mets.xml')

def test_workspace_from_url_download(self):
url_src = 'https://raw.githubusercontent.com/OCR-D/assets/master/data/kant_aufklaerung_1784/data/mets.xml'
#url_src = 'http://digital.bibliothek.uni-halle.de/hd/oai/?verb=GetRecord&metadataPrefix=mets&mode=xml&identifier=9049'
with TemporaryDirectory() as dst_dir:
self.resolver.workspace_from_url(
'https://raw.githubusercontent.com/OCR-D/assets/master/data/kant_aufklaerung_1784/data/mets.xml',
url_src,
mets_basename='foo.xml',
dst_dir=dst_dir,
download=True)
Expand Down Expand Up @@ -148,4 +152,4 @@ def test_download_to_directory_subdir(self):
self.assertTrue(Path(dst, fn).exists())

if __name__ == '__main__':
main()
main(__file__)
91 changes: 91 additions & 0 deletions tests/test_resolver_oai.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
from unittest import mock
from pytest import fixture
from shutil import copy
from os.path import join, dirname

from tests.base import main

from ocrd.resolver import Resolver
from ocrd_models.utils import extract_mets_from_oai_content

@fixture(name="response_dir")
def fixture_response_dir(tmpdir):
    """Copy both canned response files into ``<tmpdir>/responses`` and return that directory."""
    oai_copy = str(tmpdir.mkdir('responses').join('oai_get_record_2200909.xml'))
    copy('./tests/data/response/oai_get_record_2200909.xml', oai_copy)

    mets_copy = str(tmpdir.join('responses').join('mets_kant_aufklaerung_1784.xml'))
    copy('./tests/data/response/mets_kant_aufklaerung_1784.xml', mets_copy)

    return dirname(oai_copy)


@fixture(name="oai_response_content")
def fixture_oai_2200909_content(response_dir):
    """Provide the raw bytes of the canned OAI-PMH GetRecord response."""
    with open(join(response_dir, 'oai_get_record_2200909.xml'), 'rb') as handle:
        payload = handle.read()
    return payload

@fixture(name="plain_xml_response_content")
def fixture_xml_kant_content(response_dir):
    """Provide the raw bytes of the canned plain METS file."""
    with open(join(response_dir, 'mets_kant_aufklaerung_1784.xml'), 'rb') as handle:
        payload = handle.read()
    return payload


def test_extract_mets_from_oai_content(oai_response_content):
    """Ensure that the OAI-PMH envelope gets dropped and the result starts with METS"""

    result = extract_mets_from_oai_content(oai_response_content)
    expected_start = b'<?xml version="1.0" encoding="UTF-8"?>\r\n<mets:mets'
    assert result.startswith(expected_start)
    # Spelling matters here: the envelope is called 'OAI-PMH'. Checking for a
    # misspelt marker such as 'OAI-PHM' can never fail and proves nothing.
    assert b'OAI-PMH' not in result

def test_handle_response_mets(plain_xml_response_content):
    """A plain METS document must still begin with its XML declaration after extraction"""

    extracted = extract_mets_from_oai_content(plain_xml_response_content)
    assert extracted.startswith(b'<?xml version="1.0"')

@mock.patch("requests.get")
def test_handle_common_oai_response(mock_get, response_dir, oai_response_content):
    """Downloading a valid OAI-PMH response issues exactly one GET and returns 'oai'"""

    # arrange: a canned HTTP 200 response carrying the OAI payload
    url = 'http://digital.bibliothek.uni-halle.de/hd/oai/?verb=GetRecord&metadataPrefix=mets&mode=xml&identifier=9049'
    fake_response = mock_get.return_value
    fake_response.status_code = 200
    fake_response.content = oai_response_content
    fake_response.headers = {'Content-Type': 'text/xml'}

    # act
    result = Resolver().download_to_directory(response_dir, url)

    # assert
    mock_get.assert_called_once_with(url)
    assert result == 'oai'


@mock.patch("requests.get")
@mock.patch("ocrd_models.utils.log.warning")
def test_handle_response_for_invalid_content(mock_log_warning, mock_get, response_dir):
    """A textual response that is not XML must produce exactly one warning log entry"""

    # arrange: HTTP 200 with a plain-text body that cannot be parsed as XML
    url = 'http://digital.bibliothek.uni-halle.de/hd/oai/?verb=GetRecord&metadataPrefix=mets&mode=xml&identifier=foo'
    fake_response = mock_get.return_value
    fake_response.status_code = 200
    fake_response.content = b'foo bar'
    fake_response.headers = {'Content-Type': 'text/plain'}

    # act
    Resolver().download_to_directory(response_dir, url)

    # assert behavior
    mock_get.assert_called_once_with(url)
    assert mock_log_warning.call_count == 1

if __name__ == '__main__':
main(__file__)