-
Notifications
You must be signed in to change notification settings - Fork 110
/
plain_word_vectors.py
51 lines (44 loc) · 1.23 KB
/
plain_word_vectors.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import plac
import gensim
from gensim import utils
class Corpus:
    """Re-iterable stream of tokenised sentences read from a text file.

    Every line of the file is treated as one sentence. The file is reopened
    on each call to ``__iter__``, so the same instance can be iterated
    repeatedly without holding the whole corpus in memory.
    """

    def __init__(self, corpus_file, encoding="utf-8"):
        """Remember where the corpus lives and how to decode it.

        Args:
            corpus_file: Path of the text file to read.
            encoding: Text encoding used to decode the file. Defaults to
                UTF-8 instead of the platform locale so that results do not
                depend on the machine the script runs on.
        """
        self.corpus_file = corpus_file
        self.encoding = encoding

    def __iter__(self):
        """Yield the result of ``utils.simple_preprocess`` for each line."""
        with open(self.corpus_file, encoding=self.encoding) as fd:
            for line in fd:
                yield utils.simple_preprocess(line)
@plac.annotations(
    in_dir=("Location of input corpus file (plain text, one sentence per line)"),
    out_loc=("Location of output file"),
    n_workers=("Number of workers", "option", "n", int),
    size=("Dimension of the word vectors", "option", "d", int),
    window=("Context window size", "option", "w", int),
    min_count=("Min count", "option", "m", int),
    negative=("Number of negative samples", "option", "g", int),
    nr_iter=("Number of iterations", "option", "i", int),
)
def main(
    in_dir,
    out_loc,
    negative=5,
    n_workers=4,
    window=5,
    size=128,
    min_count=10,
    nr_iter=2,
):
    """Train Word2Vec vectors on a text corpus and save them as text.

    Args:
        in_dir: Path handed to ``Corpus``, which opens it as a single text
            file (the name is historical; it is not walked as a directory).
        out_loc: Path the vectors are written to, in the non-binary
            word2vec format.
        negative: Number of negative samples.
        n_workers: Number of worker threads used for training.
        window: Context window size.
        size: Dimension of the word vectors.
        min_count: Minimum count passed to Word2Vec as ``min_count``.
        nr_iter: Number of training iterations over the corpus.
    """
    sentences = Corpus(in_dir)

    # gensim 4.0 renamed two Word2Vec keyword arguments (size -> vector_size,
    # iter -> epochs). Pick the spelling the installed release understands so
    # the script runs on both 3.x and 4.x instead of dying with a TypeError.
    gensim_major = int(gensim.__version__.split(".", 1)[0])
    if gensim_major >= 4:
        renamed_kwargs = {"vector_size": size, "epochs": nr_iter}
    else:
        renamed_kwargs = {"size": size, "iter": nr_iter}

    model = gensim.models.Word2Vec(
        sentences=sentences,
        window=window,
        min_count=min_count,
        workers=n_workers,
        sample=1e-5,
        negative=negative,
        **renamed_kwargs
    )
    model.wv.save_word2vec_format(out_loc, binary=False)
# Script entry point: only runs when executed directly, not when imported.
if __name__ == "__main__":
    # plac builds the command-line parser from main's annotations and
    # invokes main with the parsed arguments.
    plac.call(main)