word_normalizer.py

import os
import re
import sys
import toolkit as tk
from converter import TSV2dict, dict2TSV


def build_regex_from_list(list_of_strings):
    """
    Create a regex that matches any string from a list of strings.
    """
    # "|" is the regex alternation operator; the entries are treated as regex
    # fragments, so patterns such as \d+ are allowed alongside literal words.
    return "|".join(list_of_strings)
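
# For example, with the approved_words list from the __main__ block below,
# build_regex_from_list(["are", "is", "than", r"\d+"]) returns the pattern
# "are|is|than|\d+", which re.fullmatch() later tests one word at a time.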


def identify_invalid_words(string):
    """
    Return the set of invalid words found in a string.

    Relies on the module-level word_regex and delimiters defined in the
    __main__ block.
    """
    invalid_words = set()
    word_set = make_word_set(string, delimiters)
    for word in word_set:
        # A word is valid only if the entire word matches the approved regex.
        m = re.fullmatch(word_regex, word)
        if not m:
            invalid_words.add(word)
    return invalid_words
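
# Illustrative call, assuming word_regex and delimiters as defined in the
# __main__ block: identify_invalid_words("42 is bigger than x") returns
# {"bigger", "x"}, since every other word fullmatches the approved pattern.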


def make_word_set(string, delimiters):
    """
    Return the set of all words in a string, splitting on the given delimiters.
    """
    # Turn every delimiter into a space, then split on spaces.
    string_stripped = string
    for delimiter in delimiters:
        string_stripped = " ".join(string_stripped.split(delimiter))
    word_set = set(string_stripped.split(" "))
    # Adjacent delimiters leave empty strings behind; drop them.
    word_set.discard("")
    return word_set
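
# For example, make_word_set("alpha, beta-gamma", [",", "-", " "]) returns
# {"alpha", "beta", "gamma"}: every delimiter is turned into a space, and the
# empty strings produced by adjacent delimiters are discarded.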


def normalize_words(style, data_file, original_column, review_file, reference_file):
    """
    Perform word normalization on the char-normalized version of
    original_column in data_file.

    Reads or creates the reference and review dicts, attempts to replace or
    remove invalid words in each data item where possible, and adds any word
    with no specified replacement to the review file for manual review.
    """
    data_dict = TSV2dict(data_file)
    target_column = f"char_normalized_{original_column}"
    new_column = f"word_normalized_{original_column}"
    allowed_words = set()
    # Load the review and reference tables from earlier runs, if present.
    if os.path.isfile(review_file):
        review_dict = TSV2dict(review_file)
    else:
        review_dict = {}
    if os.path.isfile(reference_file):
        reference_dict = TSV2dict(reference_file)
    else:
        reference_dict = {}
    review_dict, reference_dict, allowed_words = tk.update_reference(
        review_dict, reference_dict, allowed_words)
    review_dict = tk.clean_occurrences(review_dict)
    for index, rowdict in data_dict.items():
        data_item = rowdict[target_column]
        # Items flagged by an earlier stage ("! ... !") pass through unchanged.
        m = re.match(r"!\s.+\s!", data_item)
        if m:
            rowdict["word_validation"] = "stopped"
            rowdict[new_column] = data_item
            continue
        if style == "data_loc":
            # HLA Ligand Atlas peptide URLs are accepted as-is.
            m = re.fullmatch(r"https://hla-ligand-atlas\.org/peptide/[a-zA-Z]+",
                             data_item)
            if m:
                rowdict["word_validation"] = "pass"
                rowdict[new_column] = data_item
                rowdict = tk.evaluate_ld(rowdict,
                                         "word",
                                         target_column,
                                         new_column)
                continue
        invalid_words = identify_invalid_words(data_item)
        rowdict["word_validation"] = tk.validate(invalid_words, "string")
        if tk.validate(invalid_words, "boolean"):
            rowdict[new_column] = data_item
        else:
            # Apply known replacements/removals, then re-check the item.
            review_dict, reference_dict, data_item, allowed_words = tk.handle_invalid_items(
                style,
                invalid_words,
                review_dict,
                reference_dict,
                "word",
                data_item,
                allowed_words,
                delimiters)
            invalid_words = identify_invalid_words(data_item)
            # Words explicitly allowed by the reference table are not invalid.
            for word in invalid_words.copy():
                if word in allowed_words:
                    invalid_words.remove(word)
            rowdict["word_validation"] = tk.validate(invalid_words, "string")
            if tk.validate(invalid_words, "boolean"):
                rowdict[new_column] = data_item
            else:
                rowdict[new_column] = f"! Invalid words: {sorted(invalid_words)} !"
        rowdict = tk.evaluate_ld(rowdict,
                                 "word",
                                 target_column,
                                 new_column)
    output_path = os.path.join(style, "output_files", f"w_norm_{style}.tsv")
    dict2TSV(data_dict, output_path)
    # Only write the review/reference tables if they have entries.
    if review_dict:
        dict2TSV(review_dict, review_file)
    if reference_dict:
        dict2TSV(reference_dict, reference_file)
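
# Note on the "! ... !" convention: items that fail validation are written as
# "! Invalid words: [...] !", and the re.match check at the top of the row
# loop passes such items through later stages with word_validation "stopped".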


if __name__ == "__main__":
    style = sys.argv[1]
    input_file = os.path.join(style, "output_files", f"c_norm_{style}.tsv")
    original_column = sys.argv[2]
    review = os.path.join(style, "output_files", "word_review.tsv")
    reference = os.path.join(style, "output_files", "word_reference.tsv")
    # Entries are regex fragments, so \d+ (any integer) is valid alongside
    # the literal words.
    approved_words = [
        "are",
        "is",
        "than",
        r"\d+",
    ]
    delimiters = [",", ".", "-", " ", "(", ")", ":", ";"]
    word_regex = build_regex_from_list(approved_words)
    normalize_words(style, input_file, original_column, review, reference)
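
# Example invocation, assuming a "data_loc" style directory and a hypothetical
# column name:
#     python word_normalizer.py data_loc peptide_id
# This reads data_loc/output_files/c_norm_data_loc.tsv and writes
# data_loc/output_files/w_norm_data_loc.tsv.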