-
Notifications
You must be signed in to change notification settings - Fork 2
/
corpus.py
89 lines (60 loc) · 2.23 KB
/
corpus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
from urllib2 import urlopen
from urllib import urlencode
import json
import cPickle as pickle
from time import time
import sys
import traceback
from pprint import pprint
import nltk
from itertools import *
from collections import defaultdict, Counter
def fetchCorpus(num_sentences=0):
    """Fetch KWIC sentences from Spraakbanken's Korp web service.

    Pages through the ATTASIDOR corpus with the query "<sentence> []" and
    collects every KWIC entry returned.

    num_sentences -- maximum number of sentences to fetch; 0 (the default)
                     means fetch the whole corpus.
    Returns a list of KWIC entries (dicts as returned by the Korp API).

    NOTE(review): assumes Korp's "start"/"end" are inclusive hit indices,
    so a window of start=0, end=999 covers 1000 hits -- confirm against the
    Korp API docs.
    """
    page_size = 1000
    data = {
        "corpus": "ATTASIDOR",
        "cqp": "<sentence> []",
        "start": 0,
        # Inclusive index of the last hit in the first page; clamp to the
        # requested total when fewer than one page was asked for.
        "end": (min(num_sentences, page_size) - 1) if num_sentences else (page_size - 1),
        "show": "lex,prefix,suffix",
    }
    output = []
    last_page = False  # renamed from `exit`, which shadowed the builtin
    while True:
        # Passing urlencode(data) as the second argument makes this a POST.
        result = json.load(urlopen("http://spraakbanken.gu.se/ws/korp",
                                   urlencode(data)))
        kwic = result["kwic"]
        if not kwic:
            break
        output.extend(kwic)
        if last_page:
            break
        # Advance by a full page. The original advanced start/end by 999
        # against a 1000-hit inclusive window, re-fetching the boundary hit
        # of every page.
        data["start"] += page_size
        data["end"] += page_size
        if num_sentences and data["end"] >= num_sentences:
            # Clamp the final window to the last requested hit (inclusive
            # index num_sentences - 1; the original used -2, dropping a hit).
            data["end"] = num_sentences - 1
            last_page = True
    return output
def countAmbiguities(tokens):
    """Print how many distinct word forms are ambiguous.

    A word form is "ambiguous" when it has been seen with more than one
    lemgram across all tokens.

    tokens -- iterable of dicts with at least "word" (the surface form) and
              "lex" (a |-delimited lemgram list such as "|hund..nn.1|hund..nn.2|").
    Prints two lines: the ambiguous count and the unambiguous count.
    """
    lemgrams_by_word = defaultdict(Counter)
    for token in tokens:
        # strip the leading/trailing "|" before splitting the lemgram list
        lemgrams_by_word[token["word"]].update(token["lex"].strip("|").split("|"))
    # Count once instead of materialising two boolean lists with map().count().
    ambiguous = sum(1 for lemgrams in lemgrams_by_word.values() if len(lemgrams) > 1)
    # Function-call form with a pre-formatted string prints identically on
    # Python 2 and 3 (the original used Py2-only print statements).
    print("ambiguous: %d" % ambiguous)
    print("not: %d" % (len(lemgrams_by_word) - ambiguous))
if __name__ == '__main__':
    # Fetch the entire corpus (0 = no limit) and report how many KWIC
    # entries came back. The function-call form of print emits identical
    # output on Python 2 and 3 for a single expression.
    # (Commented-out exploratory analysis code removed; see version history.)
    corpus = fetchCorpus(0)
    print(len(corpus))