vocab.py
"""**Vocabulary**.
This module extacts the pretrained vectors and creates a dictionary word to index for the top words.
.. _Google Python Style Guide:
http://google.github.io/styleguide/pyguide.html
"""
import logging
import pickle

import bcolz
import numpy as np
from tqdm import tqdm

from config import hparams

logger = logging.getLogger()


def extract_glove_vocab(glove_path, word_embedding_dim, max_vocab_size):
    """Extracts the pretrained GloVe vectors and creates a word-to-index dictionary for the top words.

    Args:
        glove_path (str): Path to the directory containing the GloVe files.
        word_embedding_dim (int): Word vector dimension.
        max_vocab_size (int): Maximum number of words to keep in the vocabulary.
    """
    words = []
    idx = 1  # index 0 is reserved for the padding vector appended below
    word2idx = {}
    # Flat bcolz array on disk; vectors are appended one by one and reshaped
    # into a (vocab_size + 1, word_embedding_dim) matrix at the end.
    vectors = bcolz.carray(np.zeros(1),
                           rootdir=f'{glove_path}/6B.{word_embedding_dim}.dat',
                           mode='w')
    vectors.append(np.zeros(word_embedding_dim))  # padding vector at index 0
    logger.info("Generating vocabulary dictionary...")
    vocab_size = 0
    with open(f'{glove_path}/glove.6B.{word_embedding_dim}d.txt', 'rb') as file:
        for line in tqdm(file):
            line = line.decode().split()
            word = line[0]
            words.append(word)
            word2idx[word] = idx
            idx += 1
            vector = np.array(line[1:]).astype(float)  # np.float is removed in recent NumPy
            vectors.append(vector)
            vocab_size += 1
            if vocab_size >= max_vocab_size:
                break
    # Drop the dummy leading zero and reshape the flat array into a matrix.
    vectors = bcolz.carray(vectors[1:].reshape((-1, word_embedding_dim)),
                           rootdir=f'{glove_path}/6B.{word_embedding_dim}.dat',
                           mode='w')
    vectors.flush()
    with open(f'{glove_path}/6B.{word_embedding_dim}_words.pkl', 'wb') as words_file:
        pickle.dump(words, words_file)
    with open(f'{glove_path}/6B.{word_embedding_dim}_idx.pkl', 'wb') as idx_file:
        pickle.dump(word2idx, idx_file)


if __name__ == '__main__':
    print('Extracting word2idx and GloVe vectors...')
    extract_glove_vocab(hparams['glove_path'],
                        hparams['model']['embed_size'],
                        hparams['max_vocab_size'])
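
For context, here is a minimal sketch of how the artifacts written by extract_glove_vocab might be consumed downstream: it loads the bcolz vectors and the pickled word2idx mapping and assembles an embedding matrix for an arbitrary target vocabulary. The helper name build_embedding_matrix, the target_vocab argument, and the random fallback for out-of-vocabulary words are illustrative assumptions, not part of this repository.

import pickle

import bcolz
import numpy as np


def build_embedding_matrix(glove_path, word_embedding_dim, target_vocab):
    """Build an embedding matrix for `target_vocab` from the saved GloVe artifacts.

    Words missing from the pretrained vocabulary get a small random vector
    (a common fallback, assumed here rather than taken from this repo).
    """
    # Load the reshaped vector matrix and the word-to-index dictionary saved above.
    vectors = bcolz.open(f'{glove_path}/6B.{word_embedding_dim}.dat')[:]
    with open(f'{glove_path}/6B.{word_embedding_dim}_idx.pkl', 'rb') as idx_file:
        word2idx = pickle.load(idx_file)

    weights = np.zeros((len(target_vocab), word_embedding_dim))
    for i, word in enumerate(target_vocab):
        if word in word2idx:
            # Row layout produced above: row 0 is padding, words start at row 1.
            weights[i] = vectors[word2idx[word]]
        else:
            weights[i] = np.random.normal(scale=0.6, size=(word_embedding_dim,))
    return weights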