-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathload_w2v.py
31 lines (27 loc) · 1.01 KB
/
load_w2v.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import os
import copy
import torch
import random
import gensim
import linecache
import numpy as np
# load pretrained word emb
def load_pretrained_embedding(glove_dir, word_list, dimension_size=300, encoding='utf-8'):
    """Look up pretrained GloVe vectors for every word in *word_list*.

    Two files are read from *glove_dir*:

    * ``glove_words.txt`` -- one token per line; line ``i`` names the token
      stored on line ``i`` of the vectors file.
    * ``glove.840B.300d.txt`` -- one ``token v1 v2 ...`` record per line,
      fields separated by single spaces.

    The vectors file is streamed once and only the requested lines are
    parsed, so the file is never held in memory as a whole. Both files are
    decoded with *encoding*.

    Args:
        glove_dir: Directory containing the two files above.
        word_list: Sized iterable of words to look up.
        dimension_size: Length of the zero vector substituted for words that
            are not listed in ``glove_words.txt``.
        encoding: Text encoding used for both files.

    Returns:
        A list with one ``np.float32`` array per entry of *word_list*, in the
        same order. Words without a pretrained vector get an all-zero array.

    Raises:
        ValueError: If a requested line of the vectors file is missing or
            does not start with the token the word index says it should.
    """
    words_path = os.path.join(glove_dir, "glove_words.txt")
    vectors_path = os.path.join(glove_dir, "glove.840B.300d.txt")

    with open(words_path, 'r', encoding=encoding) as fopen:
        # A token listed more than once keeps its last line number.
        word2offset = {line.strip(): i for i, line in enumerate(fopen)}

    words = list(word_list)

    # Zero-based line number -> token expected on that line.
    wanted = {}
    for word in words:
        if word in word2offset:
            wanted[word2offset[word]] = word

    # Skip opening the (large) vectors file when nothing was matched.
    pretrained = _read_glove_vectors(vectors_path, wanted, encoding) if wanted else {}

    word_vectors = []
    count = 0
    for word in words:
        if word in word2offset:
            # Copy so repeated words do not share one mutable array.
            word_vectors.append(pretrained[word].copy())
            count += 1
        else:
            # init zero
            word_vectors.append(np.zeros(dimension_size, dtype=np.float32))
    print("Loading {}/{} words from vocab...".format(count, len(words)))
    return word_vectors


def _read_glove_vectors(vectors_path, wanted, encoding):
    """Parse the records on the zero-based line numbers listed in *wanted*.

    *wanted* maps line number -> expected token. Returns ``{token: vector}``
    and raises ``ValueError`` when a line is absent or holds another token.
    """
    vectors = {}
    with open(vectors_path, 'r', encoding=encoding) as fopen:
        for line_no, line in enumerate(fopen):
            if line_no not in wanted:
                continue
            word = wanted[line_no]
            head, sep, tail = line.partition(' ')
            if not (sep and head.strip() == word):
                # The token may itself contain spaces, in which case the
                # first-space split above cuts it short; accept the line if
                # it starts with the whole expected token instead.
                prefix = word + ' '
                if not line.startswith(prefix):
                    raise ValueError(
                        "line {} of {} does not start with the expected "
                        "token {!r}".format(line_no + 1, vectors_path, word))
                tail = line[len(prefix):]
            vectors[word] = np.fromstring(tail.strip(), sep=' ', dtype=np.float32)
            if len(vectors) == len(wanted):
                # Everything requested has been read; skip the rest.
                break
    if len(vectors) != len(wanted):
        absent = sorted(n + 1 for n, w in wanted.items() if w not in vectors)
        raise ValueError(
            "{} has no line(s) {} referenced by the word index".format(
                vectors_path, absent))
    return vectors