-
Notifications
You must be signed in to change notification settings - Fork 0
/
TeproDTO.py
187 lines (148 loc) · 5.83 KB
/
TeproDTO.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
import sys
from TeproAlgo import TeproAlgo
class TeproDTO(object):
    """This class will encapsulate all data that is
    sent back and forth among NLP apps that belong
    to the TEPROLIN platform.

    It stores the original text, the sentences produced by the
    sentence splitter, the token lists of each sentence, the set of
    operations already performed and the operation configuration
    received at construction time."""

    def __init__(self, text: str, conf: dict):
        """Create an empty DTO for text.

        text is the original text to be preprocessed and conf is the
        configuration dict, looked up by operation name in
        getConfiguredAlgoForOper()."""
        # The original text to be preprocessed
        self._text = text
        # The sentence splitter will store each
        # sentence in this list, as a str
        self._sentences = []
        # This is a list of lists of TeproTok(s) with
        # all available information
        self._tokenized = []
        # The set of all performed operations on
        # this DTO object
        self._performedOps = set()
        # This is the configuration dict
        # that comes from Teprolin
        self._opsConf = conf
        # Number of processed tokens in
        # this DTO
        self._proctoks = 0

    def getProcessedTokens(self):
        """Return the number of tokens added so far through
        addSentenceTokens()."""
        return self._proctoks

    def getConfiguredAlgoForOper(self, oper: str):
        """Return the configuration entry stored for oper, or None
        when the configuration dict has no such key."""
        return self._opsConf.get(oper)

    @staticmethod
    def _requireValidOperation(op: str):
        """Raise RuntimeError unless op is one of the operations
        reported by TeproAlgo.getAvailableOperations()."""
        if op not in TeproAlgo.getAvailableOperations():
            # format() keeps the message identical for str values and
            # still produces a RuntimeError when op is not a str.
            raise RuntimeError(
                "Operation '{0}' is not a valid TeproAlgo operation!".format(op))

    def addPerformedOp(self, op: str):
        """Record op as performed on this DTO.

        Raises RuntimeError if op is not a valid TeproAlgo operation."""
        self._requireValidOperation(op)
        self._performedOps.add(op)

    def isOpPerformed(self, op: str) -> bool:
        """Tell whether op was recorded with addPerformedOp().

        Raises RuntimeError if op is not a valid TeproAlgo operation."""
        self._requireValidOperation(op)
        return op in self._performedOps

    def setText(self, text: str):
        """Replace the stored text. Sentences and tokens that were
        already added are left untouched."""
        self._text = text

    def getText(self) -> str:
        """Return the stored text."""
        return self._text

    def getNumberOfSentences(self) -> int:
        """Return the number of sentence strings added so far."""
        return len(self._sentences)

    def getSentenceString(self, i: int):
        """Get the i-th sentence, or None when i is out of range
        (negative indexes are out of range)."""
        if 0 <= i < len(self._sentences):
            return self._sentences[i]

        return None

    def getSentenceTokens(self, i: int):
        """Get the i-th sentence as a list of TeproTok(s), or None
        when i is out of range (negative indexes are out of range)."""
        if 0 <= i < len(self._tokenized):
            return self._tokenized[i]

        return None

    def addSentenceTokens(self, tokens: list):
        """Adds a new list of TeproTok(s) to the internal
        list of tokenized sentences and increases the processed
        token count by len(tokens)."""
        self._tokenized.append(tokens)
        self._proctoks += len(tokens)

    def addSentenceString(self, sentence: str):
        """Adds a str sentence to the internal
        list of sentences."""
        self._sentences.append(sentence)

    def dumpConllX(self, outfile=None):
        """Prints the CoNLL-X format in outfile,
        for the current DTO.

        One record is printed per token (as returned by the token's
        getConllXRecord()) and an empty line follows every sentence.
        When outfile is None the output goes to the sys.stdout that
        is current at call time."""
        # Resolve the default here and not in the signature: a default
        # of sys.stdout would be bound once, at class definition time,
        # and would ignore any later redirection of sys.stdout.
        if outfile is None:
            outfile = sys.stdout

        for sentence in self._tokenized:
            for token in sentence:
                print(token.getConllXRecord(), file=outfile)

            # Sentence separator; flushed so that each sentence
            # reaches the stream as soon as it is complete.
            print(file=outfile, flush=True)

    def jsonDict(self):
        """Returns the dict representation of this DTO
        for JSON encoding.

        The 'sentences' and 'tokenized' values are the internal
        lists themselves, not copies."""
        return {
            'text': self._text,
            'sentences': self._sentences,
            'tokenized': self._tokenized
        }

    @staticmethod
    def _findResyncPoint(fromSent: list, toSent: list,
                         fromStart: int, toStart: int, window: int):
        """Find the first pair of equal tokens after a mismatch.

        Source positions fromStart .. fromStart + window - 1 are
        scanned in order and, for each of them, target positions
        toStart .. toStart + window - 1 (both clipped to the sentence
        lengths). Returns the (i, j) of the first equal pair or None
        if there is none."""
        fromStop = min(fromStart + window, len(fromSent))
        toStop = min(toStart + window, len(toSent))

        for i in range(fromStart, fromStop):
            fromTok = fromSent[i]

            for j in range(toStart, toStop):
                if fromTok == toSent[j]:
                    return (i, j)

        return None

    def alignSentences(self, fromSent: list, sid: int, window: int = 10):
        """Align the tokens of fromSent with the tokens of the
        sentence with index sid of this DTO.

        Tokens are compared with ==. While the two sentences agree,
        positions are paired one to one. On a mismatch, the next pair
        of equal tokens is searched at most window positions ahead in
        both sentences; every skipped source position is paired with
        every skipped target position and the walk resumes after the
        pair that was found.

        Returns the list of (i, j) pairs, i indexing fromSent and j
        indexing the stored sentence. Returns None when sid is out of
        range (negative indexes included, as in getSentenceTokens())
        or when no equal pair is found within the window."""
        if not 0 <= sid < len(self._tokenized):
            return None

        toSent = self._tokenized[sid]
        alignment = []
        # Indexes into fromSent
        i = 0
        # Indexes into toSent
        j = 0

        while i < len(fromSent) and j < len(toSent):
            if fromSent[i] == toSent[j]:
                # Sentences are in sync
                alignment.append((i, j))
                # And advance one position
                i += 1
                j += 1
                continue

            # Alignment is out of sync: look ahead for the next match.
            resync = self._findResyncPoint(fromSent, toSent, i, j, window)

            if resync is None:
                return None

            nextI, nextJ = resync

            # Add all source indexes which do not match
            # with all target indexes which do not match.
            for ii in range(i, nextI):
                for jj in range(j, nextJ):
                    alignment.append((ii, jj))

            # Sentences are in sync again
            alignment.append((nextI, nextJ))
            # And advance one position
            i = nextI + 1
            j = nextJ + 1

        return alignment

    def copyTokenAnnotation(self, fromSent: list, sid: int, align: list, oper: str):
        """Copy the annotation corresponding to oper from fromSent into
        the sentence with sid in self._tokenized.
        Use the align list to map from fromSent into sentence with sid in self._tokenized.

        For every (i, j) pair of align, copyFrom(fromSent[i], align, oper)
        is called on the j-th token of the stored sentence. Nothing is
        done when align is None or when sid is out of range (negative
        indexes included, as in getSentenceTokens())."""
        if align is None:
            return

        if not 0 <= sid < len(self._tokenized):
            return

        toSent = self._tokenized[sid]

        for (i, j) in align:
            toSent[j].copyFrom(fromSent[i], align, oper)