Merge pull request #31 from metaodi/develop
Release 0.1.0
metaodi authored Oct 3, 2020
2 parents 79f1c22 + ea10527 commit 2ec2782
Showing 22 changed files with 2,778 additions and 35 deletions.
14 changes: 13 additions & 1 deletion CHANGELOG.md
@@ -4,6 +4,17 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/) and this p

## [Unreleased]

## [0.1.0] - 2020-10-04
### Added
- Add `record_schema` parameter
- Add new dependencies to xmltodict and flatten-dict

### Changed
- recordData is now returned as flattened dict (if possible)

### Fixed
- Fix typo in `searchRetrieve` operation name

## [0.0.5] - 2020-06-10
### Changed
- Remove dependencies to convert md to rst
@@ -45,7 +56,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/) and this p
- `Fixed` for any bug fixes.
- `Security` to invite users to upgrade in case of vulnerabilities.

[Unreleased]: https://github.com/metaodi/sruthi/compare/v0.0.5...HEAD
[Unreleased]: https://github.com/metaodi/sruthi/compare/v0.1.0...HEAD
[0.1.0]: https://github.com/metaodi/sruthi/compare/v0.0.5...v0.1.0
[0.0.5]: https://github.com/metaodi/sruthi/compare/v0.0.4...v0.0.5
[0.0.4]: https://github.com/metaodi/sruthi/compare/v0.0.3...v0.0.4
[0.0.3]: https://github.com/metaodi/sruthi/compare/v0.0.2...v0.0.3
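The two headline changes of this release, the new `record_schema` parameter and the flattened `recordData`, can be sketched together as follows (endpoint and schema name are taken from the `examples/isad.py` file added in this PR; the exact output depends on the server):

```python
import sruthi
from pprint import pprint

# request records in a specific schema; each record comes back as a
# flattened dict (when the leaf keys are unique), with any extra
# record data available under the 'extra' key
records = sruthi.searchretrieve(
    'https://suche.staatsarchiv.djiktzh.ch/SRU/',
    query='Zurich',
    record_schema='isad',
)
pprint(records[0])
```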
6 changes: 4 additions & 2 deletions README.md
@@ -8,7 +8,10 @@ Currently only SRU 1.2 is supported.

* [Installation](#installation)
* [Usage](#usage)
* [`searchretrieve` operation](#searchretrieve-operation)
* [`explain` operation](#explain-operation)
* [Schemas](#schemas)
* [Release](#release)

## Installation

@@ -43,7 +46,6 @@ import sruthi

# note: records is an iterator
records = sruthi.searchretrieve('https://suche.staatsarchiv.djiktzh.ch/SRU/', query='Human')
print(records.cql)
print(records.sru_version)
print(records.count)

@@ -54,7 +56,7 @@ for record in records:

The return value of `searchretrieve` is iterable, so you can easily loop over it. Or you can use indices to access elements, e.g. `records[1]` to get the second element, or `records[-1]` to get the last one.

Even [slicing](https://python-reference.readthedocs.io/en/latest/docs/brackets/slicing.html) is supported, so can can do things like only iterate over the first 5 elements using
Even [slicing](https://python-reference.readthedocs.io/en/latest/docs/brackets/slicing.html) is supported, so you can do things like only iterate over the first 5 elements using

```python
for record in records[:5]:
20 changes: 20 additions & 0 deletions examples/isad.py
@@ -0,0 +1,20 @@
import sruthi
from pprint import pprint

# check supported schemas of server
server_url = 'https://suche.staatsarchiv.djiktzh.ch/SRU/'
schema = 'isad'
server = sruthi.explain(server_url)


print(20 * '=')
print('=')
print(f"= Record with schema: {schema}")
print('=')
print(20 * '=')
records = sruthi.searchretrieve(
    server_url,
    query='Zurich',
    record_schema=schema
)
pprint(records[0])
23 changes: 23 additions & 0 deletions examples/schemas.py
@@ -0,0 +1,23 @@
import sruthi
from pprint import pprint

# check supported schemas of server
server = sruthi.explain('http://lx2.loc.gov:210/LCDB?')

print(f"Supported schemas: {', '.join(server.schema.keys())}")


for schema in server.schema.keys():
    print(20 * '=')
    print('=')
    print(f"= Record with schema: {schema}")
    print('=')
    print(20 * '=')
    records = sruthi.searchretrieve(
        'http://lx2.loc.gov:210/LCDB?',
        query="human",
        record_schema=schema
    )
    pprint(records[0])
    print('')
    print('')
14 changes: 14 additions & 0 deletions examples/searchretrieve.py
@@ -0,0 +1,14 @@
import sruthi

records = sruthi.searchretrieve('https://suche.staatsarchiv.djiktzh.ch/SRU/', query='Zurich')
print("SRU version:", records.sru_version)
print("Count:", records.count)
print('')

for record in records:
    # print fields from schema
    print(record['reference'])
    print(record['title'])
    print(record['date'])
    print(record['extra']['link'])  # extra record data is available at the 'extra' key
    print('')
2 changes: 2 additions & 0 deletions requirements.txt
@@ -1,2 +1,4 @@
requests
defusedxml
xmltodict
flatten-dict
2 changes: 1 addition & 1 deletion sruthi/__init__.py
@@ -1,4 +1,4 @@
__version__ = '0.0.5'
__version__ = '0.1.0'
__all__ = ['client', 'errors', 'response', 'sru', 'xmlparse']

from .errors import SruthiError, ServerIncompatibleError, SruError, NoMoreRecordsError # noqa
28 changes: 18 additions & 10 deletions sruthi/client.py
@@ -7,38 +7,44 @@


class Client(object):
    def __init__(self, url=None, maximum_records=10):
    def __init__(self, url=None, maximum_records=10, record_schema=None):
        self.url = url
        self.maximum_records = maximum_records
        self.sru_version = '1.2'
        self.record_schema = record_schema

    def searchretrieve(self, query, start_record=1):
    def searchretrieve(self, query, start_record=1, requests_kwargs=None):
        params = {
            'operation': 'searchretrieve',
            'operation': 'searchRetrieve',
            'version': self.sru_version,
            'query': query,
            'startRecord': start_record,
            'maximumRecords': self.maximum_records,
        }
        data_loader = DataLoader(self.url, params)

        if self.record_schema:
            params['recordSchema'] = self.record_schema

        data_loader = DataLoader(self.url, params, requests_kwargs)
        return response.SearchRetrieveResponse(data_loader)

    def explain(self):
    def explain(self, requests_kwargs=None):
        params = {
            'operation': 'explain',
            'version': self.sru_version,
        }
        data_loader = DataLoader(self.url, params)
        data_loader = DataLoader(self.url, params, requests_kwargs)
        return response.ExplainResponse(data_loader)


class DataLoader(object):
    def __init__(self, url, params):
    def __init__(self, url, params, requests_kwargs=None):
        self.session = requests.Session()
        self.url = url
        self.params = params
        self.response = None
        self.xmlparser = xmlparse.XMLParser()
        self.requests_kwargs = requests_kwargs or {}

    def load(self, **kwargs):
        self.params.update(kwargs)
@@ -50,7 +56,8 @@ def _get_content(self, url, params):
        try:
            res = self.session.get(
                url,
                params=params
                params=params,
                **self.requests_kwargs
            )
            res.raise_for_status()
        except requests.exceptions.HTTPError as e:
@@ -62,10 +69,11 @@ def _get_content(self, url, params):

    def _check_errors(self, xml):
        sru = '{http://www.loc.gov/zing/srw/}'
        diag = '{http://www.loc.gov/zing/srw/diagnostic/}'
        diagnostics = self.xmlparser.find(
            xml,
            f'{sru}diagnostics/{sru}diagnostic'
            f'{sru}diagnostics/{diag}diagnostic'
        )
        if diagnostics:
            error_msg = " ".join([d.find('detail').text for d in diagnostics])
            error_msg = ", ".join([d.text for d in diagnostics])
            raise errors.SruError(error_msg)
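`searchretrieve` and `explain` now also take a `requests_kwargs` dict, which `DataLoader` forwards unchanged to `requests.Session.get`. A minimal sketch, assuming you want to pass standard `requests` options such as `timeout` or custom `headers` (the endpoint is the one used in the examples):

```python
import sruthi

# requests_kwargs is forwarded verbatim to session.get(), so any keyword
# accepted by requests' get() (timeout, headers, auth, verify, ...) works here
records = sruthi.searchretrieve(
    'https://suche.staatsarchiv.djiktzh.ch/SRU/',
    query='Zurich',
    requests_kwargs={'timeout': 30, 'headers': {'User-Agent': 'sruthi-example'}},
)
print(records.count)
```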
50 changes: 34 additions & 16 deletions sruthi/response.py
@@ -3,6 +3,7 @@
from collections import defaultdict
import re
import warnings
from flatten_dict import flatten
from . import xmlparse
from . import errors

@@ -127,29 +128,46 @@ def _extract_records(self, xml):
            record_data = self.xmlparser.find(xml_rec, './sru:recordData')
            extra_data = self.xmlparser.find(xml_rec, './sru:extraRecordData')

            for elem in record_data.iter():
                record = self._tag_data(record, elem)

            extra = defaultdict()
            for elem in extra_data.iter():
                extra = self._tag_data(extra, elem)
            record['extra'] = dict(extra)

            record.pop('recordData', None)
            record.pop('extraRecordData', None)
            record.update(self._tag_data(record_data, 'sru:recordData') or {})
            record['extra'] = self._tag_data(extra_data, 'sru:extraRecordData')

            record = dict(record)
            new_records.append(record)
        self.records.extend(new_records)

    def _tag_data(self, record, elem):
    def _tag_data(self, elem, parent):
        if not elem:
            return None

        record_data = self.xmlparser.todict(elem, xml_attribs=True).get(parent)
        if not record_data:
            return None

        # check if there is only one element on the top level
        keys = list(record_data.keys())
        if len(record_data) == 1 and len(keys) > 0 and len(record_data[keys[0]]) > 0:
            record_data = record_data[keys[0]]

        record_data.pop('schemaLocation', None)
        record_data.pop('xmlns', None)

        def leaf_reducer(k1, k2):
            # only use key of leaf element
            return k2

        try:
            record_data = flatten(record_data, reducer=leaf_reducer)
        except ValueError:
            # if the keys of the leaf elements are not unique
            # the dict will not be flattened
            pass

        return record_data

    def _remove_namespace(self, elem):
        ns_pattern = re.compile('{.+}')
        tag_name = ns_pattern.sub('', elem.tag)
        if elem.text and elem.text.strip():
            record[tag_name] = elem.text.strip()
        elif len(list(elem)) == 0: # leaf element
            record[tag_name] = None
        return record
        return tag_name


class ExplainResponse(Response):
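The flattening in `_tag_data` relies on `flatten_dict.flatten` with a reducer that keeps only the leaf key, and falls back to the nested dict when that would produce duplicate keys. A small standalone illustration of that behaviour (the sample dict is invented):

```python
from flatten_dict import flatten


def leaf_reducer(k1, k2):
    # ignore the parent path, keep only the key of the leaf element
    return k2


nested = {'did': {'unittitle': 'Zurich', 'unitdate': '1850'}}
print(flatten(nested, reducer=leaf_reducer))
# {'unittitle': 'Zurich', 'unitdate': '1850'}

# if two leaves end up with the same key, flatten() raises ValueError,
# which is why _tag_data keeps the unflattened dict in that case
```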
14 changes: 11 additions & 3 deletions sruthi/sru.py
@@ -3,9 +3,17 @@
from . import client


def searchretrieve(url, query):
    c = client.Client(url)
    return c.searchretrieve(query)
def searchretrieve(url, query, **kwargs):
    search_params = ['query', 'start_record', 'requests_kwargs']
    search_kwargs = {k: v for k, v in kwargs.items() if k in search_params}
    search_kwargs['query'] = query

    # assume all other kwargs are for the client
    client_kwargs = {k: v for k, v in kwargs.items() if k not in search_params}
    client_kwargs['url'] = url

    c = client.Client(**client_kwargs)
    return c.searchretrieve(**search_kwargs)


def explain(url):
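With this change, the module-level `searchretrieve` routes `query`, `start_record` and `requests_kwargs` to `Client.searchretrieve` and treats every other keyword argument as a `Client` constructor option. A sketch of a call that exercises both groups (the values are illustrative only):

```python
import sruthi

records = sruthi.searchretrieve(
    'https://suche.staatsarchiv.djiktzh.ch/SRU/',
    query='Zurich',
    maximum_records=20,    # -> Client(maximum_records=20)
    record_schema='isad',  # -> Client(record_schema='isad')
    start_record=11,       # -> Client.searchretrieve(start_record=11)
)
print(records.count)
```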
32 changes: 32 additions & 0 deletions sruthi/xmlparse.py
@@ -1,5 +1,7 @@
import re
from xml.etree.ElementTree import Element
import defusedxml.ElementTree as etree
import xmltodict
from . import errors


@@ -25,6 +27,20 @@ def __init__(self):
            'ap': 'http://www.archivportal.ch/srw/extension/',
            'zr': 'http://explain.z3950.org/dtd/2.1/',
        }
        self.dict_namespaces = {
            'http://www.loc.gov/zing/srw/': 'sru',
            'http://explain.z3950.org/dtd/2.1/': 'zr',
            'info:srw/extension/2/relevancy-1.0': None,
            'http://www.archivportal.ch/srw/extension/': None,
            'http://www.loc.gov/MARC21/slim': None,
            'http://www.loc.gov/mods/v3': None,
            'http://www.loc.gov/standards/mods/v3/mods-3-6.xsd': None,
            'http://purl.org/dc/elements/1.1/': None,
            'http://www.expertisecentrumdavid.be/xmlschemas/isad.xsd': None,
            'http://www.w3.org/2001/XMLSchema-instance': None,
            'http://www.w3.org/XML/1998/namespace': None,
        }

    def parse(self, content):
        try:
@@ -50,6 +66,22 @@ def findall(self, xml, path):
    def tostring(self, xml):
        return etree.tostring(xml)

    def todict(self, xml, **kwargs):
        if isinstance(xml, XMLNone):
            return None
        if isinstance(xml, Element):
            xml = self.tostring(xml)

        dict_args = {
            'dict_constructor': dict,
            'process_namespaces': True,
            'namespaces': self.dict_namespaces,
            'attr_prefix': '',
            'cdata_key': 'text',
        }
        dict_args.update(kwargs)
        return dict(xmltodict.parse(xml, **dict_args))

    def namespace(self, element):
        m = re.match(r'\{(.*)\}', element.tag)
        return m.group(1) if m else ''
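The new `todict` helper builds on `xmltodict.parse` with `process_namespaces=True` and the `dict_namespaces` map above, so known namespaces are either shortened to a prefix (`sru`, `zr`) or dropped entirely when mapped to `None`. A small standalone illustration, using an invented XML snippet rather than a real SRU response:

```python
import xmltodict

xml = '<rec xmlns="http://purl.org/dc/elements/1.1/"><title>Zurich</title></rec>'

# mapping a namespace URI to None strips it from the resulting keys,
# mirroring what XMLParser.todict does for the namespaces it knows about
parsed = xmltodict.parse(
    xml,
    process_namespaces=True,
    namespaces={'http://purl.org/dc/elements/1.1/': None},
    attr_prefix='',
    cdata_key='text',
)
print(parsed['rec']['title'])  # Zurich
```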
