# unitex_tagger.py
import os
import re
from collections import defaultdict

import yaml
from unidecode import unidecode
from unitex.config import UnitexConfig
from unitex.processor import UnitexProcessor
from unitex.resources import load_persistent_alphabet


class TextExtractor(UnitexProcessor):
    """
    The TextExtractor class applies Unitex preprocessing and grammars
    to a text, and gives access to the matches and to the sentences
    that contain them.

    This class inherits from the UnitexProcessor class
    (https://forge.uclouvain.be/watrin/python-unitex). It uses the
    parent import, configuration and processing methods, and adds new
    ones that extract the full sentences where a match occurs.
    """
    def __init__(self, config_file):
        """
        Args:
            config_file (str): A .yaml file containing the parameters
                used to process the text.
        """
# Imports and applies configuration file.
self.config = self._configure(config_file)
        super().__init__(self.config)  # Initialises the parent processor.
# Imports alphabets.
alph = self.config['resources']['alphabet']
self.alphabet_sorted = self.config['resources']['alphabet-sorted']
self.alphabet = load_persistent_alphabet(alph)
# Path to the .snt file.
self.text_path = None
self.dir = None
        # Content of the .snt file.
self.processed_text = None
# Initial text file content.
self.text = None
# Path to the index file generated by locate.
self.index = None
# All processed sentences.
self.all_sentences = None
        # Table (initialised here, not used in this module).
        self.table = None

    def _configure(self, config_file):
        """
        Loads the configuration file into a UnitexConfig object.

        Args:
            config_file (str): A .yaml file containing the parameters
                used to process the text.

        Returns:
            A UnitexConfig object.
        """
        with open(config_file, "r") as f:
            options = yaml.load(f, yaml.Loader)
        config = UnitexConfig()
        config.load(options)
        return config
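    # A minimal sketch of the YAML layout this module relies on,
    # inferred from the keys accessed in __init__ and tag_text (the
    # real python-unitex configuration has more sections; the paths
    # below are hypothetical):
    #
    #   resources:
    #     alphabet: "resources/Alphabet.txt"
    #     alphabet-sorted: "resources/Alphabet_sort.txt"
    #   tools:
    #     concord:
    #       output: null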
    def _get_path(self):
        """
        Derives the working directory ("<name>_snt", next to the text
        file) from the current text path.
        """
        directory, filename = os.path.split(self.text_path)
        name, _extension = os.path.splitext(filename)
        self.dir = os.path.join(directory, "%s_snt" % name)
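    # Example (hypothetical path): with self.text_path set to
    # "corpus/report.snt", self.dir becomes "corpus/report_snt",
    # the directory where Unitex writes its intermediate files.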
    def import_text(self, text):
        """
        Imports the content of a text file.

        Opens a text file in the object and applies all the Unitex
        processing steps, i.e. segmentation into sentences,
        normalisation of the text, tokenisation and application of
        the dictionaries.

        Args:
            text (str): Path to the text file to process.
        """
        # mode flags:
        #   's': segment (apply Sentence.fst2) - not used, done by hand.
        #   'r': replace (apply Replace.fst2)
        #   't': tokenize
        #   'l': lexicalize (apply dictionaries)
        self.open(text, mode="srtl", tagged=False)
        # Extracts the processed text: the parent class stores the
        # path to the generated .snt file in a private attribute.
        processed = self._UnitexProcessor__snt
        self.text_path = processed
        self._get_path()
with open(processed, 'r') as fp:
self.processed_text = fp.read()
        # Splits the text into sentences on the Unitex {S} delimiter.
sentences = self.processed_text.split('{S}')
self.all_sentences = [elt for elt in sentences if elt != ""]
# Extracts initial text.
with open(text, 'r') as fq:
self.text = fq.read()
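    # Minimal usage sketch (hypothetical paths):
    #   extractor = TextExtractor("config/unitex-example.yaml")
    #   extractor.import_text("corpus/report.txt")
    #   print(len(extractor.all_sentences))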
class TextTagger(TextExtractor):
    """
    Tags a text with a Unitex grammar and extracts the tagged terms
    from the resulting HTML.
    """

    def __init__(self, config_file):
        super().__init__(config_file)
        self.regexps = None
        self.tagged_sentences = None
        self.tagged_path = None
        self.extracted = None
        self.initial = None
        self.matched = []
        self.terms = None
        self.terms_dict = None
    def _clean_images(self):
        """
        Strips the <span> elements that tagging may have inserted
        inside <img> blocks, keeping their content, so that the image
        markup stays valid.
        """
        img_clean = re.compile(r'(<img(.|\s)*?)(?=<(img|p|/div))')
        spans_remove = r'<span(.*?)>(.*?)</span>'
        without_token = self.text.replace('{S}', '')
        images = img_clean.findall(without_token)
        for image in images:
            tmp_img = image[0]
            # Replaces each span by its bare content within the block.
            new_image = re.sub(spans_remove, r'\2', image[0])
            without_token = without_token.replace(tmp_img, new_image)
        return without_token
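    # Example of the clean-up (hypothetical snippet):
    #   before: <img src="p1.png"><span class="x">alt</span><p>
    #   after:  <img src="p1.png">alt<p>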
    def _markup_pages_indices(self, html_string):
        """
        Numbers the page <div> elements: each '<div id="page0' tag is
        rewritten, in order, as '<div id="page-0', '<div id="page-1',
        and so on.
        """
        pages_tag = re.compile('((<div id="page)0)')
        current_page = 0
        for tag in pages_tag.findall(html_string):
            incremented_tag = f'{tag[1]}-{current_page}'
            current_page += 1
            # Replaces only the first remaining occurrence each time.
            html_string = pages_tag.sub(incremented_tag, html_string, 1)
        return html_string
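    # Example (hypothetical input): two page divs sharing the raw id,
    #   <div id="page0">...<div id="page0">...
    # become
    #   <div id="page-0">...<div id="page-1">...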
    def tag_text(self, grammar):
        """
        Tags the text using the given grammar.

        Args:
            grammar (str): Path to the compiled grammar (.fst2) used
                to tag the text.
        """
        self.tagged_path = os.path.join(self.dir, "tagged.txt")
        self.config["tools"]["concord"]["output"] = self.tagged_path
        self.tag(grammar, self.tagged_path)
        self.import_text(self.tagged_path)
        cleaned_images = self._clean_images()
        marked_pages = self._markup_pages_indices(cleaned_images)
        marked_pages = self.extract_tags(marked_pages)
        # Unescapes the Unitex-escaped punctuation and transliterates
        # to ASCII before writing the tagged HTML.
        output = unidecode(
            marked_pages.replace(r'\.', '.').replace(r'\,', ','))
        with open('tagged_pdf.html', 'w') as fp:
            fp.write(output)
        return output
    def extract_tags(self, html_string):
        """
        Extracts the tagged terms and numbers their occurrences.

        Terms are tagged with HTML span elements of the form
        <span class="CLASS highlight">TERM</span>. Each such span gets
        a unique id ("CLASS-N"), and the matches are stored in
        self.matched as:
            {class: {term (lowercased): [occurrence indices]}}
        Returns the HTML string with the ids added.
        """
        highlight_re = re.compile(
            r'(<span class="(.*?) highlight")(>(.*?)</span>)')
        # Unescapes the Unitex-escaped punctuation before matching.
        html_string = html_string.replace(r'\.', '.').replace(r'\,', ',')
        matches = highlight_re.findall(html_string)
        counter = {elt[1]: 0 for elt in matches}
        idx_dict = {elt[1]: {} for elt in matches}

        def mark_matches(match):
            # Adds a quoted, per-class numbered id to the span.
            html = (f'{match.group(1)} id="{match.group(2)}'
                    f'-{counter[match.group(2)]}"{match.group(3)}')
            term = match.group(4).lower()
            # Records the occurrence index of this term.
            if term in idx_dict[match.group(2)]:
                idx_dict[match.group(2)][term].append(
                    counter[match.group(2)])
            else:
                idx_dict[match.group(2)][term] = [
                    counter[match.group(2)]]
            counter[match.group(2)] += 1
            return html

        html_string = highlight_re.sub(mark_matches, html_string)
        self.matched = idx_dict
        return html_string
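    # Example of the markup step (hypothetical snippet):
    #   before: <span class="disease highlight">Asthma</span>
    #   after:  <span class="disease highlight" id="disease-0">Asthma</span>
    # with self.matched == {'disease': {'asthma': [0]}}.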
    def _get_terms(self):
        """
        Extracts the tagged terms with their occurrence indices,
        grouped by class.

        Format:
            {class: [
                {term: str, indices: [idx_1, idx_2, ..., idx_n]},
            ]
            }
        """
        final_format = defaultdict(list)
        # self.matched maps each class to {term: [occurrence indices]}
        # (see extract_tags).
        for category, terms in self.matched.items():
            for term, indices in terms.items():
                # Deduplicates and sorts the occurrence indices.
                final_format[category].append({
                    'term': term.strip(),
                    'indices': sorted(set(indices)),
                })
        self.terms = final_format
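    # Example of self.terms after a call (hypothetical values):
    #   {'disease': [{'term': 'asthma', 'indices': [0, 3]}]}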
if __name__ == "__main__":
    my_text = "full_pdf.html"
    my_grammar = "config/graphs/highlight.fst2"
    my_config = "config/unitex-example.yaml"
    tagger = TextTagger(my_config)
    tagger.import_text(my_text)
    tagger.tag_text(my_grammar)
    # Builds and prints the extracted terms grouped by class.
    tagger._get_terms()
    print(dict(tagger.terms))