"""Procedure
1. extract word_vocab that appear both in dataset (train, dev, test)
and in word2vec; also extract tag_vocab and char_vocab from training
data; then write vocab into three files, in the format that each word/
tag/char occupies exactly one line, and in the order that more frequent
words/chars are placed in upper lines, so that each word/tag/char may
have an id (line #). To deal with unknown words/chars, add config.UNK
to the first item in word_vocab/char_vocab (with id 0).
@note: this procedure is activated by config.BUILD_VOCAB
@note: id starts at zero
@TODO: remove low frequent words
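
   For illustration (assuming config.UNK is the literal token "<UNK>"),
   word_vocab might begin:
       <UNK>    <- id 0
       the      <- id 1 (most frequent word)
       of       <- id 2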
2. Load the vocab and convert the words/tags/chars in the dataset (train,
   dev, test) into ids, then store the trimmed dataset for later use. In
   the trimmed dataset each line comprises three components
   (word_id, tag_id, char_ids) and sentences are separated by an empty
   line. The word vectors covered by word_vocab are also collected into an
   np.ndarray, whose i-th row corresponds to the i-th word, and saved as a
   compressed archive.
   @note: this procedure is activated by config.TRIM_DATA
   @requires: procedure 1 was run previously
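
   For illustration (ids here are made up), a sentence of two tokens might
   be stored as the lines "12 3 45 67 8" and "5 1 9" (word_id tag_id
   char_ids...), followed by an empty line marking the sentence boundary.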
"""
import codecs
import json
from collections import Counter

import numpy as np

from model import config
from model.utils import readData


def build_vocab():
    """Procedure 1.

    @return: None
    """
    # collect the set of words that have a pretrained vector in word2vec
    vocab_w2v = set()
    with codecs.open(config.w2v_path, "r", "utf-8") as f:
        for line in f:
            spl = line.strip().split()
            if len(spl) == 0:
                continue
            vocab_w2v.add(spl[0])
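    # NOTE: each line of the w2v file is assumed to follow the plain-text
    # word2vec/GloVe format "word v_1 v_2 ... v_{dim_word}"; a binary or
    # headered file would need extra handling here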
    # count words/chars and collect tags from the raw datasets
    word_counter = Counter()
    char_counter = Counter()
    tags = set()
    for word_ls, tag_ls in readData(config.raw_train_path):
        for wd in word_ls:
            if wd in vocab_w2v:
                word_counter[wd] += 1
            char_counter.update(wd)  # Counter.update on a str counts its chars
        tags.update(tag_ls)
    for word_ls, _ in readData(config.raw_dev_path):
        for wd in word_ls:
            if wd in vocab_w2v:
                word_counter[wd] += 1
    for word_ls, _ in readData(config.raw_test_path):
        for wd in word_ls:
            if wd in vocab_w2v:
                word_counter[wd] += 1
    # order by frequency and prepend the unknown token (id 0)
    sorted_wd_cnt = word_counter.most_common()
    sorted_wds = [config.UNK] + [x[0] for x in sorted_wd_cnt]
    sorted_chr_cnt = char_counter.most_common()
    sorted_chrs = [config.UNK] + [x[0] for x in sorted_chr_cnt]
    # save vocab, one item per line; an item's id is its line number
    with codecs.open(config.word_vocab_path, "w", "utf-8") as f:
        for w in sorted_wds:
            f.write(w + "\n")
    with codecs.open(config.tag_vocab_path, "w", "utf-8") as f:
        # sort so the tag -> id mapping is deterministic across runs
        for t in sorted(tags):
            f.write(t + "\n")
    with codecs.open(config.char_vocab_path, "w", "utf-8") as f:
        for c in sorted_chrs:
            f.write(c + "\n")


def trim_data():
    """Procedure 2.

    @return: None
    """
    # load vocab and build maps: word -> id; tag -> id; char -> id
    with codecs.open(config.word_vocab_path, "r", "utf-8") as f:
        word2id = {w.strip(): idx for idx, w in enumerate(f)}
    with codecs.open(config.tag_vocab_path, "r", "utf-8") as f:
        tag2id = {t.strip(): idx for idx, t in enumerate(f)}
    with codecs.open(config.char_vocab_path, "r", "utf-8") as f:
        char2id = {c.strip(): idx for idx, c in enumerate(f)}
    word_vocab_sz = len(word2id)
    tag_vocab_sz = len(tag2id)
    char_vocab_sz = len(char2id)
    # trim data: map every token to a "word_id tag_id char_ids..." line
    fins = (config.raw_train_path, config.raw_dev_path,
            config.raw_test_path)
    fouts = (config.trim_train_path, config.trim_dev_path,
             config.trim_test_path)
    for fin, fout in zip(fins, fouts):
        with codecs.open(fout, "w", "utf-8") as fo:
            for word_ls, tag_ls in readData(fin):
                for word, tag in zip(word_ls, tag_ls):
                    wid = word2id.get(word, 0)  # unknown words map to UNK (id 0)
                    tid = tag2id[tag]
                    cids = [char2id.get(c, 0) for c in word]
                    fields = [str(wid), str(tid)] + [str(cid) for cid in cids]
                    fo.write(" ".join(fields) + "\n")
                fo.write("\n")  # empty line marks a sentence boundary
    # trim word2vec: keep only the vectors of words in word_vocab
    embed_mat = np.empty((word_vocab_sz, config.dim_word), dtype=np.float32)
    embed_mat[0] = np.zeros((config.dim_word,), dtype=np.float32)  # embedding for UNK
    with codecs.open(config.w2v_path, "r", "utf-8") as f:
        for line in f:
            spl = line.strip().split()
            if len(spl) == 0:
                continue
            wd = spl[0]
            if wd in word2id:
                embed_mat[word2id[wd]] = np.asarray(spl[1:], dtype=np.float32)
    np.savez_compressed(config.trim_w2v_path, embed_mat=embed_mat)
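    # downstream code can recover the matrix with, e.g.,
    # np.load(...)["embed_mat"]; note that np.savez_compressed appends
    # ".npz" to the file name if the given path lacks that suffix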
    # record the vocab sizes for later use
    size = {"word_vocab_sz": word_vocab_sz,
            "tag_vocab_sz": tag_vocab_sz,
            "char_vocab_sz": char_vocab_sz}
    with open(config.size_info_path, "w") as f:
        json.dump(size, f)


if __name__ == "__main__":
    # procedure 1
    if config.BUILD_VOCAB:
        print("build vocab...")
        build_vocab()
        print("-done")
    # procedure 2
    if config.TRIM_DATA:
        print("trim data...")
        trim_data()
        print("-done")