-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdipwrapper.py
149 lines (129 loc) · 4.87 KB
/
dipwrapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
dipwrapper
~~~~~~~~~~
Python wrapper for the API of the 'Dokumentations- und Informationssystem
für Parlamentsmaterialien (DIP)' of the German Bundestag:
https://dip.bundestag.de/%C3%BCber-dip/hilfe/api
An API key is provided on the web site.
For implementation details of the API see the documentation:
https://dip.bundestag.de/documents/informationsblatt_zur_dip_api.pdf
"""
import requests
import xml.etree.ElementTree as ET
class DIP():
    """
    Represents the API.

    An instance stores the API key and the response format and offers
    three ways of querying a resource type: everything
    (``get_resource_all``), a single document by id
    (``get_resource_id``) or a filtered query
    (``get_resource_multiple``).
    """

    base_url = 'https://search.dip.bundestag.de/api/v1'
    # Resource types accepted by _validate_resource_type.
    doctypes = ['aktivitaet', 'drucksache', 'drucksache-text',
                'person', 'plenarprotokoll', 'plenarprotokoll-text',
                'vorgang', 'vorgangsposition']
    # Response formats this wrapper knows how to parse.
    fformats = ['xml', 'json']
    # Query parameters accepted by _validate_parameter_types.
    parameters = ['cursor', 'format', 'f.id', 'f.datum.start', 'f.datum.end',
                  'f.drucksache', 'f.plenarprotokoll', 'f.vorgang',
                  'f.aktivitaet', 'f.zuordnung']

    def __init__(self, apikey, fformat='json',):
        """
        Store the credentials and the response format.

        apikey:  API key sent with every request.
        fformat: 'json' (default) selects JSON parsing; any other value
                 makes the wrapper parse responses as XML.
        """
        self.apikey = apikey
        self.fformat = fformat

    def _validate_resource_type(self, resource):
        "Make sure provided resource type is valid."
        if resource not in self.doctypes:
            raise ValueError(f"Invalid doctype '{resource}'. "
                             f"Expected one of: {self.doctypes}")

    def _validate_parameter_types(self, parameters):
        """
        Make sure provided parameters are valid.

        Not all types of resources support all query parameters;
        see DIP documentation.
        This is NOT enforced by this wrapper.

        Raises ValueError for the first key that is not listed in
        ``self.parameters``.
        """
        for key in parameters:
            if key not in self.parameters:
                raise ValueError(f"Invalid parameter '{key}'. "
                                 f"Expected one of: {self.parameters}")

    def _extract_documents(self, response):
        """
        Extract meta data of documents from response.

        json: value stored under the 'documents' key (None if absent)
        xml:  list of direct 'document' child elements
        """
        if self.fformat == 'json':
            return response.get('documents')
        return response.findall('document')

    def _extract_cursor(self, response):
        """
        Extract cursor from response.

        Returns None when the response carries no cursor, for both formats.
        """
        if self.fformat == 'json':
            return response.get('cursor')
        # Guard against a missing <cursor> element so that the XML branch
        # mirrors the JSON branch instead of raising AttributeError.
        node = response.find('cursor')
        return None if node is None else node.text

    def _get_and_handle_response(self, url, params):
        """
        Get response using url and params.

        Raises requests.HTTPError if the server answers with an error
        status. The body is parsed based on the configured format:
            json -> dict
            xml  -> xml.etree.ElementTree.Element
        """
        response = requests.get(url, params=params, timeout=5)
        response.raise_for_status()
        if self.fformat == 'json':
            return response.json()
        return ET.fromstring(response.content)

    def _paginate(self, url, params):
        """
        Yield the document list of every page for url/params.

        The cursor returned by a response is sent with the next request.
        Iteration stops as soon as a response returns the same cursor
        that was sent with its request; the documents of that final
        response are not yielded.
        ``params`` is updated in place with the current cursor.
        """
        while True:
            response = self._get_and_handle_response(url, params=params)
            cursor = self._extract_cursor(response)
            if params.get('cursor') == cursor:
                break
            params['cursor'] = cursor
            yield self._extract_documents(response)

    def get_resource_all(self, res_type):
        """
        Get all documents of res_type.

        Returns generator that yields a list of
            json: dict
            xml: xml.etree.ElementTree.Element

        Raises ValueError (on first iteration) if res_type is unknown.
        """
        self._validate_resource_type(res_type)
        url = f'{self.base_url}/{res_type}'
        params = {'format': self.fformat,
                  'apikey': self.apikey}
        yield from self._paginate(url, params)

    def get_resource_id(self, res_type, dpi_id):
        """
        Get meta data for document with dpi_id.

        Returns:
            json -> dict
            xml -> xml.etree.ElementTree.Element

        Raises ValueError if res_type is unknown.
        """
        self._validate_resource_type(res_type)
        url = f'{self.base_url}/{res_type}/{dpi_id}'
        params = {'format': self.fformat,
                  'apikey': self.apikey}
        return self._get_and_handle_response(url, params)

    def get_resource_multiple(self, res_type, parameters):
        """
        Get all documents that match query defined by parameters.

        The API accepts duplication of key which can be provided as follows:
            {'f.id': ['258442', '84394']}

        Returns generator that yields a list of
            json: dict
            xml: xml.etree.ElementTree.Element

        Raises ValueError (on first iteration) if res_type or one of the
        parameter keys is unknown.
        """
        self._validate_resource_type(res_type)
        # Reject unknown keys before they are merged into the request.
        self._validate_parameter_types(parameters)
        url = f'{self.base_url}/{res_type}'
        params = {'format': self.fformat,
                  'apikey': self.apikey}
        params.update(parameters)
        yield from self._paginate(url, params)