forked from Infiziert90/Tsuzuru-Bot
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmerriam_api.py
380 lines (319 loc) · 14.5 KB
/
merriam_api.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
# -*- encoding: utf-8 -*-
import re
import xml.etree.cElementTree as ElementTree
from abc import ABCMeta, abstractmethod, abstractproperty
from urllib.parse import quote, quote_plus
from urllib.request import urlopen
class WordNotFoundException(KeyError):
    """ Raised when a looked-up word has no entry.

    Attributes:
        word: the term that was looked up.
        suggestions: list of alternative spellings (empty when none given).

    The exception message names the word and, when suggestions exist,
    appends them as a comma-separated "Try:" list.
    """

    def __init__(self, word, suggestions=None, *args, **kwargs):
        self.word = word
        self.suggestions = [] if suggestions is None else suggestions
        text = "'{0}' not found.".format(word)
        if self.suggestions:
            hints = ", ".join(self.suggestions)
            text = "{0} Try: {1}".format(text, hints)
        KeyError.__init__(self, text, *args, **kwargs)
class InvalidResponseException(WordNotFoundException):
    """ Raised when the server's response cannot be parsed as XML.

    Behaves like a WordNotFoundException with no suggestions, but carries
    a message that points at the malformed response.
    """

    def __init__(self, word, *args, **kwargs):
        # The parent initialiser is skipped on purpose: it would build the
        # plain "not found" message, so KeyError is initialised directly.
        self.word, self.suggestions = word, []
        KeyError.__init__(
            self,
            "{0} not found. (Malformed XML from server).".format(word),
            *args, **kwargs)
class InvalidAPIKeyException(Exception):
    """ Raised when no API key is set or the server reports an invalid key. """
class MWApiWrapper:
    """ Defines an interface for wrappers to Merriam Webster web APIs.

    Concrete wrappers supply ``base_url`` (the endpoint) and ``parse_xml``
    (which turns the parsed response into entry objects). This class builds
    the request url, fetches and parses the XML and reports missing words.
    """

    # NOTE(review): this is the Python 2 metaclass spelling. Python 3 ignores
    # the attribute, so the abstract members below are not enforced when the
    # class is instantiated.
    __metaclass__ = ABCMeta

    def __init__(self, key=None, urlopen=urlopen):
        """ key is the API key string to use for requests. urlopen is a
        function that accepts a url string and returns a file-like object of
        the results of fetching the url. Defaults to urllib.request.urlopen.
        """
        self.key = key
        self.urlopen = urlopen

    @abstractproperty
    def base_url(self):
        """ The api endpoint url without trailing slash or format (/xml). """
        raise NotImplementedError

    @abstractmethod
    def parse_xml(self, root, word):
        """ Converts the parsed response element `root` for `word` into the
        wrapper's result. Must be provided by subclasses.
        """
        raise NotImplementedError

    def request_url(self, word):
        """ Returns the target url for an API GET request (w/ API key).

        >>> class MWDict(MWApiWrapper):
        ...     base_url = "mw.com/my-api-endpoint"
        ...     def parse_xml(): pass
        >>> MWDict("API-KEY").request_url("word")
        'mw.com/my-api-endpoint/xml/word?key=API-KEY'

        Override this method if you need something else.

        Raises InvalidAPIKeyException when no key has been set.
        """
        if self.key is None:
            raise InvalidAPIKeyException("API key not set")
        qstring = "{0}?key={1}".format(quote(word), quote_plus(self.key))
        return ("{0}/xml/{1}").format(self.base_url, qstring)

    def lookup(self, word):
        """ Fetches `word` from the API and returns `parse_xml`'s result.

        Raises:
            InvalidAPIKeyException: no key is set, or the response is not
                XML and contains "Invalid API key".
            InvalidResponseException: the response is not parseable XML,
                even after escaping stray ampersands.
            WordNotFoundException: the response holds <suggestion> elements
                instead of entries; their texts become the suggestions.
        """
        response = self.urlopen(self.request_url(word))
        try:
            data = response.read()
        finally:
            # Release the connection; tolerate file-likes without close().
            close = getattr(response, "close", None)
            if callable(close):
                close()
        try:
            root = ElementTree.fromstring(data)
        except ElementTree.ParseError:
            # The text checks below need str; urlopen responses yield bytes.
            if isinstance(data, (bytes, bytearray)):
                text = data.decode("utf-8", "replace")
            else:
                text = data
            if "Invalid API key" in text:
                raise InvalidAPIKeyException()
            # Escape ampersands that do not start a predefined entity or a
            # character reference, then retry the parse once.
            text = re.sub(
                r'&(?!(?:amp|lt|gt|quot|apos|#\d+|#x[0-9A-Fa-f]+);)',
                '&amp;', text)
            try:
                root = ElementTree.fromstring(text)
            except ElementTree.ParseError:
                raise InvalidResponseException(word)
        suggestions = root.findall("suggestion")
        if suggestions:
            # Skip empty <suggestion/> nodes so the message can be joined.
            raise WordNotFoundException(
                word, [s.text for s in suggestions if s.text])
        return self.parse_xml(root, word)

    def _flatten_tree(self, root, exclude=None):
        """ Returns a list containing root's own .text followed by the
        (non-empty) .text and .tail of each direct child of root, in
        document order. Deeper descendants are not visited.

        exclude is a list of tag names whose text attributes should be
        excluded. their tails will still be included.
        """
        parts = [root.text] if root.text else []
        for node in root:
            if (not exclude or node.tag not in exclude) and node.text:
                parts.append(node.text)
            if node.tail:
                parts.append(node.tail)
        return parts

    def _stringify_tree(self, *args, **kwargs):
        """ Returns a string of the concatenated results from _flatten_tree.
        """
        return ''.join(self._flatten_tree(*args, **kwargs))
class LearnersDictionary(MWApiWrapper):
    """ Wrapper for the Merriam-Webster learners dictionary XML endpoint. """

    base_url = "http://www.dictionaryapi.com/api/v1/references/learners"

    def parse_xml(self, root, word):
        """ Yields a LearnersDictionaryEntry for every <entry> under root.

        root is the parsed response element. word is the looked-up term; it
        is not used here because each entry is named after its own id
        attribute (with any "[n]" marker and whitespace removed).
        """
        for entry in root.findall("entry"):
            args = {}
            args['illustration_fragments'] = [e.get('id') for e in
                                              entry.findall("art/artref")
                                              if e.get('id')]
            # getattr keeps an entry without a <hw> child from crashing.
            args['headword'] = getattr(entry.find("hw"), 'text', None)
            args['pronunciations'] = self._get_pronunciations(entry)
            sound = entry.find("sound")
            # Compare with None: an Element's truth value reflects whether
            # it has children, and testing it is deprecated. Children with
            # no text are skipped because they cannot form a sound url.
            if sound is not None:
                args['sound_fragments'] = [s.text for s in sound if s.text]
            else:
                args['sound_fragments'] = []
            args['functional_label'] = getattr(entry.find('fl'), 'text', None)
            args['inflections'] = self._get_inflections(entry)
            args['senses'] = self._get_senses(entry)
            yield LearnersDictionaryEntry(
                re.sub(r'(?:\[\d+\])?\s*', '', entry.get('id')),
                args)

    def _get_inflections(self, root):
        """ Returns a generator of Inflections found in root.

        inflection nodes that have <il>also</il> will have their inflected
        form added to the previous inflection entry.
        """
        for node in root.findall("in"):
            label, forms = None, []
            for child in node:
                if child.tag == 'il':
                    if child.text == 'also':
                        pass  # next form will be added to prev inflection-list
                    else:
                        # A new label closes the inflection collected so far.
                        if label is not None or forms != []:
                            yield Inflection(label, forms)
                        label, forms = child.text, []
                if child.tag == 'if':
                    forms.append(child.text)
            if label is not None or forms != []:
                yield Inflection(label, forms)

    def _get_pronunciations(self, root):
        """ Returns list of IPA for regular and 'alternative' pronunciation.

        Text inside <it> children is left out, and leading/trailing commas
        and spaces are stripped from every item.
        """
        pron_list = []
        for path in ("./pr", "./altpr"):
            prons = root.find(path)
            if prons is not None:
                pron_list.extend(self._flatten_tree(prons, exclude=['it']))
        return [p.strip(', ') for p in pron_list]

    def _get_senses(self, root):
        """ Returns a generator yielding one WordSense per <dt> under
        root's <def>: a definition string plus a list of usage example
        strings. Each WordSense should represent a different sense of the
        word.
        """
        for definition in root.findall('./def/dt'):
            # could add support for phrasal verbs here by looking for
            # <gram>phrasal verb</gram> and then looking for the phrase
            # itself in <dre>phrase</dre> in the def node or its parent.
            dstring = self._stringify_tree(definition,
                                           exclude=['vi', 'wsgram',
                                                    'ca', 'dx', 'snote',
                                                    'un'])
            # Drop the leading ":" and turn the remaining ones into ";".
            dstring = re.sub("^:", "", dstring)
            dstring = re.sub(r'(\s*):', r';\1', dstring).strip()
            if not dstring:  # use usage note instead
                un = definition.find('un')
                if un is not None:
                    dstring = self._stringify_tree(un, exclude=['vi'])
            usage = [self._vi_to_text(u).strip()
                     for u in definition.findall('.//vi')]
            yield WordSense(dstring, usage)

    def _vi_to_text(self, root):
        """ Returns the text of an example node with "[=...]" glosses
        (and the whitespace before them) removed.
        """
        example = self._stringify_tree(root)
        return re.sub(r'\s*\[=.*?\]', '', example)
class Inflection(object):
    """ An inflection label together with the inflected forms listed
    under it. Both values are stored exactly as given.
    """

    def __init__(self, label, forms):
        self.label, self.forms = label, forms
class WordSense(object):
    """ One sense of a word: a definition and its usage examples.

    Iterating an instance yields the definition and then the examples, so
    it can be unpacked like a (definition, examples) pair.
    """

    def __init__(self, definition, examples):
        self.definition, self.examples = definition, examples

    def __str__(self):
        # Preview only: 30 characters of the definition, 15 per example.
        preview = self.definition[:30]
        clipped = ", ".join(example[:15] for example in self.examples)
        return "{0}, ex: [{1}]".format(preview, clipped)

    def __repr__(self):
        summary = self.__str__()
        return "WordSense({0})".format(summary)

    def __iter__(self):
        yield self.definition
        yield self.examples
class MWDictionaryEntry(object):
    """ Base class for dictionary entries; knows how to locate audio. """

    def build_sound_url(self, fragment):
        """ Returns the media url for the sound file named by fragment.

        The file lives in a sub-directory named after the fragment's
        leading digits, "gg" or "bix" when it starts with one of those,
        and after its first character otherwise.
        """
        special = re.search(r'^([0-9]+|gg|bix)', fragment)
        if special is None:
            subdir = fragment[0]
        else:
            subdir = special.group(1)
        return "{0}/{1}/{2}".format(
            "http://media.merriam-webster.com/soundc11", subdir, fragment)
class LearnersDictionaryEntry(MWDictionaryEntry):
    """ A single learners-dictionary entry built from an attribute dict.

    attrs is read with .get(), so missing keys become None, except
    "sound_fragments" and "illustration_fragments", which must be iterable
    because they are expanded into url lists right away.
    """

    def __init__(self, word, attrs):
        fetch = attrs.get
        self.word = word
        self.headword = fetch("headword")
        self.alternate_headwords = fetch("alternate_headwords")
        self.pronunciations = fetch("pronunciations")
        self.function = fetch("functional_label")
        self.inflections = fetch("inflections")  # (form, [pr], note,)
        self.senses = fetch("senses")  # list of ("def text", ["examples"])
        self.audio = list(map(self.build_sound_url,
                              fetch("sound_fragments")))
        self.illustrations = list(map(self.build_illustration_url,
                                      fetch("illustration_fragments")))

    def build_illustration_url(self, fragment):
        """ Returns the art url for fragment, with a ".tif" or ".eps"
        extension replaced by ".gif".
        """
        gif_name = re.sub(r'\.(tif|eps)', '.gif', fragment)
        return "{0}/{1}".format("www.learnersdictionary.com/art/ld", gif_name)
class CollegiateDictionaryEntry(MWDictionaryEntry):
    """ A single collegiate-dictionary entry built from an attribute dict.

    attrs is read with .get(), so missing keys become None, except
    "sound_fragments" and "illustration_fragments", which must be iterable
    because they are expanded into url lists right away.
    """

    def __init__(self, word, attrs):
        fetch = attrs.get
        self.word = word
        self.headword = fetch('headword')
        self.function = fetch('functional_label')
        self.pronunciations = fetch("pronunciations")
        self.inflections = fetch("inflections")
        self.senses = fetch("senses")
        self.audio = list(map(self.build_sound_url,
                              fetch("sound_fragments")))
        self.illustrations = list(map(self.build_illustration_url,
                                      fetch("illustration_fragments")))

    def build_illustration_url(self, fragment):
        """ Returns the art page url for fragment, with a ".bmp" extension
        replaced by ".htm".
        """
        page_name = re.sub(r'\.(bmp)', '.htm', fragment)
        return "{0}/{1}".format('http://www.merriam-webster.com/art/dict',
                                page_name)
"""
<!ELEMENT entry
(((subj?, art?, formula?, table?),
hw,
(pr?, pr_alt?, pr_ipa?, pr_wod?, sound?)*,
(ahw, (pr, pr_alt?, pr_ipa?, pr_wod?, sound?)?)*,
vr?),
(fl?, in*, lb*, ((cx, (ss | us)*) | et)*, sl*),
(dx | def)*,
(list? |
(uro*, dro*, ((pl, pt, sa?) |
(note) |
quote+)*)))>
"""
class CollegiateDictionary(MWApiWrapper):
    """ Wrapper for the Merriam-Webster collegiate dictionary XML endpoint.
    """

    base_url = "http://www.dictionaryapi.com/api/v1/references/collegiate"

    def parse_xml(self, root, word):
        """ Yields a CollegiateDictionaryEntry for every <entry> under root.

        root is the parsed response element; word is the looked-up term and
        becomes the name of every yielded entry.
        """
        for entry in root.findall('entry'):
            args = {}
            # getattr keeps an entry without a <hw> child from crashing.
            args['headword'] = getattr(entry.find('hw'), 'text', None)
            args['functional_label'] = getattr(entry.find('fl'), 'text', None)
            args['pronunciations'] = self._get_pronunciations(entry)
            args['inflections'] = self._get_inflections(entry)
            args['senses'] = self._get_senses(entry)
            args['illustration_fragments'] = [e.text for e in
                                              entry.findall("art/bmp")
                                              if e.text]
            sound = entry.find("sound")
            # Compare with None: an Element's truth value reflects whether
            # it has children, and testing it is deprecated. Children with
            # no text are skipped because they cannot form a sound url.
            if sound is not None:
                args['sound_fragments'] = [s.text for s in sound if s.text]
            else:
                args['sound_fragments'] = []
            yield CollegiateDictionaryEntry(word, args)

    def _get_pronunciations(self, root):
        """ Returns the list of text pieces of root's <pr> child (empty
        when there is none). Text inside <it> children is left out.
        """
        prons = root.find("./pr")
        if prons is None:
            return []
        return list(self._flatten_tree(prons, exclude=['it']))

    def _get_inflections(self, root):
        """ Returns a generator of Inflections found in root.

        inflection nodes that have <il>also</il> or <il>or</il> will have
        their inflected form added to the previous inflection entry.
        """
        for node in root.findall("in"):
            label, forms = None, []
            for child in node:
                if child.tag == 'il':
                    if child.text in ['also', 'or']:
                        pass  # next form will be added to prev inflection-list
                    else:
                        # A new label closes the inflection collected so far.
                        if label is not None or forms != []:
                            yield Inflection(label, forms)
                        label, forms = child.text, []
                if child.tag == 'if':
                    forms.append(child.text)
            if label is not None or forms != []:
                yield Inflection(label, forms)

    # DTD excerpt for the elements read by _get_senses:
    #
    # <!ELEMENT def (vt?, date?, sl*, sense, ss?, us?)+ >
    # <!ELEMENT sense (sn?,
    #                 (sp, sp_alt?, sp_ipa?, sp_wod?, sound?)?,
    #                 svr?, sin*, slb*, set?, ssl*, dt*,
    #                 (sd, sin?,
    #                  (sp, sp_alt?, sp_ipa?, sp_wod?, sound?)?,
    #                  slb*, ssl*, dt+)?)>

    def _get_senses(self, root):
        """ Returns a generator yielding one WordSense per <dt> under
        root's <def>: a definition string plus a list of usage example
        strings. Each WordSense should represent a different sense of the
        word.
        """
        for definition in root.findall('./def/dt'):
            # could add support for phrasal verbs here by looking for
            # <gram>phrasal verb</gram> and then looking for the phrase
            # itself in <dre>phrase</dre> in the def node or its parent.
            dstring = self._stringify_tree(definition,
                                           exclude=['vi', 'wsgram',
                                                    'ca', 'dx', 'snote',
                                                    'un'])
            # Drop the leading ":" and turn the remaining ones into ";".
            dstring = re.sub("^:", "", dstring)
            dstring = re.sub(r'(\s*):', r';\1', dstring).strip()
            if not dstring:  # use usage note instead
                un = definition.find('un')
                if un is not None:
                    dstring = self._stringify_tree(un, exclude=['vi'])
            usage = [self._vi_to_text(u).strip()
                     for u in definition.findall('.//vi')]
            yield WordSense(dstring, usage)

    def _vi_to_text(self, root):
        """ Returns the text of an example node with "[=...]" glosses
        (and the whitespace before them) removed.
        """
        example = self._stringify_tree(root)
        return re.sub(r'\s*\[=.*?\]', '', example)