-
Notifications
You must be signed in to change notification settings - Fork 2
/
load_glove_embeddings.py
45 lines (42 loc) · 2 KB
/
load_glove_embeddings.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
# coding: utf-8
import numpy as np
def load_glove_embeddings(fp, embedding_dim, include_empty_char=True):
    """Load pre-trained GloVe word embeddings from a text file.

    Each line of the file is expected to look like:
        <word> <v1> <v2> ... <v_embedding_dim>
    Words are lower-cased. Malformed lines (e.g. non-numeric vector
    fields) are reported and skipped; blank or truncated lines are
    skipped silently.

    Inputs:  - fp: filepath of pre-trained glove embeddings
             - embedding_dim: dimension of each vector embedding
             - include_empty_char: if True, reserve an extra index for
               the empty string '' mapped to an all-zeros row (useful
               as a padding token for a Keras Embedding layer)
    Outputs:
             - word2index: Dictionary. Word to word-index
             - embedding_matrix: np.ndarray of shape
               (vocab_size, embedding_dim) for a Keras Embedding layer
    """
    # First, build the "word2coefs" and "word2index"
    word2coefs = {}  # word to its corresponding coefficients
    word2index = {}  # word to word-index (row in the embedding matrix)
    with open(fp, encoding='utf-8') as f:
        for line in f:
            try:
                data = [x.strip().lower() for x in line.split()]
                if len(data) <= embedding_dim:
                    continue  # blank line or truncated vector
                # The vector is always the LAST `embedding_dim` fields;
                # everything before it is the token. Indexing data[0]
                # would mis-parse tokens that themselves contain spaces
                # (e.g. '. . .' in the GloVe 840B release).
                coefs = np.asarray(data[-embedding_dim:], dtype='float32')
                word = ' '.join(data[:-embedding_dim])
                word2coefs[word] = coefs
                if word not in word2index:
                    word2index[word] = len(word2index)
            except Exception as e:
                # Best-effort: report and skip the bad line rather than
                # aborting the whole load.
                print('Exception occurred in `load_glove_embeddings`:', e)
                continue
        # End of for loop.
    # End of with open
    if include_empty_char:
        # Reserve the final row index for the empty string; its row in
        # the matrix stays all-zeros (no entry in word2coefs).
        word2index[''] = len(word2index)
    # Second, build the "embedding_matrix"
    # Words not found in embedding index will be all-zeros. Hence, the "+1".
    vocab_size = len(word2coefs) + 1 if include_empty_char else len(word2coefs)
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, idx in word2index.items():
        embedding_vec = word2coefs.get(word)
        # Guard against vectors of the wrong length (e.g. header rows).
        if embedding_vec is not None and embedding_vec.shape[0] == embedding_dim:
            embedding_matrix[idx] = embedding_vec
    # embedding_matrix is already an ndarray; no extra asarray needed.
    return word2index, embedding_matrix