-
Notifications
You must be signed in to change notification settings - Fork 6
/
utils.py
52 lines (42 loc) · 1.32 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import cPickle as pickle
import os
import sys
import numpy as np
from tqdm import tqdm
def load_vocabulary(vocab_file):
    """Load a pickled vocabulary object from disk.

    Args:
        vocab_file: Path to the pickle file holding the vocabulary.

    Returns:
        The unpickled vocabulary object.

    Raises:
        SystemExit: With exit status 1 if the file cannot be opened or
            read (``IOError``).
    """
    try:
        with open(vocab_file, 'rb') as f:
            # SECURITY: unpickling can execute arbitrary code; only load
            # vocabulary files that come from a trusted source.
            vocab = pickle.load(f)
    except IOError:
        print('can not load vocabulary')
        # A non-zero status lets calling scripts detect the failure;
        # exiting with 0 would report success to the shell.
        sys.exit(1)
    print('vocabulary loaded')
    return vocab
def load_emb_matrix(vocab_size, emb_size, data, glove_file=None):
    """Build an embedding matrix initialised from pre-trained GloVe vectors.

    Every row starts as uniform random noise in [-1.0, 1.0). Row 0 is then
    zeroed (presumably the padding index -- confirm against the vocabulary
    builder). Rows whose word has a GloVe vector are overwritten with that
    vector; the remaining words keep their random initialisation and are
    counted as out-of-vocabulary.

    Args:
        vocab_size: Number of rows of the returned matrix. Every index
            stored in the vocabulary should be a valid row index.
        emb_size: Embedding dimensionality; must match the dimensionality
            of the vectors in the GloVe file.
        data: Directory containing ``word_vocab.pkl``, a pickled mapping
            from word to row index.
        glove_file: Optional path to the GloVe text file. Defaults to
            ``'../glove.42B.<emb_size>d.txt'`` relative to the current
            working directory.

    Returns:
        A ``numpy`` array of shape ``(vocab_size, emb_size)``.
    """
    if glove_file is None:
        # Historical hard-coded location, kept as the default.
        glove_file = '../glove.42B.{}d.txt'.format(emb_size)

    embedding_weights = {}
    # The context manager closes the file even if a malformed line raises.
    with open(glove_file, 'r') as f:
        loop = tqdm(f)
        loop.set_description('Load Glove')
        for line in loop:
            values = line.split()
            if not values:
                # Skip blank lines instead of failing on values[0].
                continue
            # Each line is "<word> <v1> <v2> ... <vN>".
            embedding_weights[values[0]] = np.asarray(values[1:], dtype='float32')
    print('Total {} word vectors in Glove.'.format(len(embedding_weights)))

    embedding_matrix = np.random.uniform(-1.0, 1.0, (vocab_size, emb_size))
    embedding_matrix[0, :] = np.zeros(emb_size)

    oov_count = 0
    vocab = load_vocabulary(os.path.join(data, 'word_vocab.pkl'))
    for word, i in vocab.items():
        embedding_vector = embedding_weights.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
        else:
            # No pre-trained vector: the row keeps its initial value.
            oov_count += 1
    print('Number of OOV words: %d' % oov_count)
    return embedding_matrix
if __name__ == '__main__':
    # Manual smoke check: load the vocabulary at the path below and print
    # each item yielded by iterating it, one per line.
    vocab_path = '../data/eclipse/word_vocab.pkl'
    for entry in load_vocabulary(vocab_path):
        print(entry)