from io import StringIO

import lxml.etree
from bs4 import BeautifulSoup

from images_layout import get_snippets

parser = lxml.etree.HTMLParser()


def extract_text(body):
    """Extract clean text from an HTML body."""
    soup = BeautifulSoup(body, 'html.parser')
    text = soup.get_text()
    # strip every line, split it into phrases, and drop empty chunks so the
    # result is a single whitespace-normalized string
    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
    text = ' '.join(chunk for chunk in chunks if chunk)
    return text
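
# Illustrative example (made-up input): extract_text('<p>Hello  <b>world</b>\n!</p>')
# returns 'Hello world !' -- tags are stripped and whitespace is collapsed.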


def remove_odd_nodes(tree):
    """Remove wrapper nodes that have exactly one direct child and no text of
    their own (e.g. <body>), replacing each such node with its child."""
    for branch in tree:
        if len(branch) == 1 and branch.text is None:
            branch.getparent().replace(branch, branch[0])
            # the tree was mutated while iterating over it, so restart the pass
            remove_odd_nodes(tree)
    return tree
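
# For instance (illustrative input), in
# <html><body><div><p>x</p><p>y</p></div></body></html> the <body> wraps a
# single <div> and carries no text of its own, so it is replaced by that <div>,
# leaving <html><div><p>x</p><p>y</p></div></html>.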


def _recurse_over_nodes(tree, parent_key, data):
    """Flatten the tree into a single-level list of (path_key, element) pairs
    instead of a deeply nested structure."""
    for branch in tree:
        # build a path key such as 'root_div_p'; the number of '_'-separated
        # parts records how deep the element sits in the tree
        key = '%s_%s' % (parent_key, branch.tag)
        data.append((key, branch))
        if len(branch):
            data = _recurse_over_nodes(branch, key, data)
    return data
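
# On a hypothetical <div> holding <p>x</p> and <span>y</span>, calling
# _recurse_over_nodes(div, 'root', []) yields [('root_p', <p>), ('root_span', <span>)];
# nested elements would get longer keys such as 'root_span_a'.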


def post_process(html_context, answer, url, tokenizer):
    """
    Post-process the model prediction to recover the full HTML chunk that
    holds the answer.
    :param html_context: raw HTML of the page the answer was extracted from
    :param answer: predicted answer whose start and end character offsets in
        the decoded full text are answer[1] and answer[2]
    :param url: page URL, forwarded to get_snippets
    :param tokenizer: the model's tokenizer, reused so decoded offsets match
    :return: (html_snippet, text_snippet, images)
    """
    # encode then decode the full text so it goes through exactly the same
    # tokenization as the model's answer
    full_text = tokenizer.decode(
        tokenizer.encode(extract_text(html_context), max_length=512, truncation=True),
        skip_special_tokens=True, clean_up_tokenization_spaces=True)
    # parse the html into a tree and flatten it to one level of chunk elements
    tree = lxml.etree.parse(StringIO(html_context), parser=parser)
    tree = remove_odd_nodes(tree.getroot())
    paths = _recurse_over_nodes(tree, 'root', [])
    # keep the shallowest elements that carry meaningful text, plus elements
    # one level deeper whose parent was not already kept
    paths_lengths = [len(x[0].split('_')) for x in paths]
    min_depth = min(paths_lengths)
    cleaned = []
    for key, element in paths:
        depth = len(key.split('_'))
        text, tail = element.text, element.tail
        has_text = (text is not None and len(text) > 1) or (tail is not None and len(tail) > 1)
        if depth == min_depth and has_text:
            cleaned.append(element)
        elif depth == min_depth + 1 and element.getparent() not in cleaned:
            cleaned.append(element)
    # serialize the kept elements back into html chunks
    html_elements = [lxml.etree.tostring(d).strip().decode('utf-8') for d in cleaned]
    # decode each chunk's text with the same tokenizer so it can be located
    # inside full_text by plain string search
    texts = [
        tokenizer.decode(
            tokenizer.encode(extract_text(x), max_length=512, truncation=True),
            skip_special_tokens=True, clean_up_tokenization_spaces=True)
        for x in html_elements]
    # map every chunk to its character span inside the full content
    chunks = []
    for i, text in enumerate(texts):
        start = full_text.find(text)
        end = start + len(text)
        chunks.append((i, start, end))
    # pick the chunks whose spans contain the answer's start and end offsets
    start_chunk_answer = [x for x in chunks if x[1] <= answer[1] <= x[2]][0][0]
    end_chunk_answer = [x for x in chunks if x[1] <= answer[2] <= x[2]][0][0]
    chunk_html = [html_elements[x[0]] for x in chunks[start_chunk_answer:end_chunk_answer + 1]]
    # clean the combined snippet and pull out its text and images
    html_snippet = '\n'.join(chunk_html)
    html_snippet, text_snippet, images = get_snippets(html_snippet, url)
    return html_snippet, text_snippet, images
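

if __name__ == '__main__':
    # Minimal usage sketch. The checkpoint name, the sample HTML and URL, and
    # the (text, start, end) shape assumed for `answer` are illustrative
    # assumptions, not part of the original pipeline.
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
    html = '<html><body><div><p>The sky is blue.</p></div></body></html>'
    text = extract_text(html)
    start = text.find('blue')
    answer = ('blue', start, start + len('blue'))
    html_snip, text_snip, images = post_process(html, answer, 'https://example.com', tokenizer)
    print(html_snip)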