-
Notifications
You must be signed in to change notification settings - Fork 2
/
aminovec.py
79 lines (60 loc) · 2.99 KB
/
aminovec.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
class AminoVec():
    """
    Creates the amino acid n-gram embedding.

    Uses the Protein class: every item of ``proteins`` must expose ``n``
    (the n-gram length), ``ngrams`` (an iterable of n-gram token lists)
    and ``source`` (a token list) -- these are the only attributes read.

    The embedding matrix is cached under ``EMB_DIR`` in a file named
    ``<n>-<size>-<window>-<epochs>.npy``.  Unless ``force_build`` is set,
    that cache is tried first and the model is only trained when the
    cache cannot be read.

    Attributes:
        n: n-gram length, taken from the first protein.
        corpus: token lists used to build the Word2Vec vocabulary; each
            one is wrapped in a ``"0" * n`` start and ``"Z" * n`` end token.
        source: per-protein token lists with the end token appended.
        size, window, epochs: hyper-parameters, stored as given.
        filename: cache file stem derived from the four values above.
        vocab: sorted vocabulary of the trained model.  Only populated
            when the embedding is built in this process; ``None`` when
            the embedding was loaded from the cache.
        embedding: the embedding matrix (loaded or freshly built).
    """

    # Width of the per-residue feature vectors written to the metadata
    # file.  The padding symbols '0' and 'Z' get vectors of this length.
    N_FEATURES = 243

    def __init__(self, proteins, size, window, epochs, force_build=False):
        """
        Prepare the corpus and load or build the embedding.

        Args:
            proteins: non-empty sequence of Protein-like objects.
            size: dense vector size handed to Word2Vec.
            window: context window handed to Word2Vec.
            epochs: number of training epochs.
            force_build: when true, skip the cache and always retrain.
        """
        # Word2Vec word embedding
        self.n = proteins[0].n
        start_token = "0" * self.n
        end_token = "Z" * self.n
        self.corpus = [
            [start_token] + ngram + [end_token]
            for protein in proteins
            for ngram in protein.ngrams
        ]
        self.source = [protein.source + [end_token] for protein in proteins]
        self.size = size
        self.window = window
        self.epochs = epochs
        self.filename = '-'.join(
            str(i) for i in [self.n, self.size, self.window, self.epochs]
        )
        # Defined up front so the attribute always exists; it is only
        # filled in by _build_embedding().
        self.vocab = None

        if force_build:
            self.embedding = self._build_embedding()
        else:
            try:
                self.embedding = self.get_embedding()
            except (OSError, ValueError, EOFError):
                # Cache file missing, unreadable or corrupt: rebuild it.
                # Anything else (e.g. KeyboardInterrupt, programming
                # errors) is deliberately allowed to propagate.
                self.embedding = self._build_embedding()

    def _print_parameters(self):
        """Print the hyper-parameters that identify this embedding."""
        print("n: %d - size: %d - window: %d - epochs: %d"
              % (self.n, self.size, self.window, self.epochs))

    def get_embedding(self):
        """
        Load the cached embedding matrix from ``EMB_DIR``.

        Returns:
            The array stored in ``<filename>.npy``.

        Raises:
            OSError: if the cache file does not exist or cannot be read.
        """
        X_wordvec = np.load(os.path.join(EMB_DIR, self.filename + ".npy"))
        print("Retreived embedding...")
        self._print_parameters()
        return X_wordvec

    def _build_embedding(self):
        """
        Train Word2Vec, cache the resulting matrix and write the metadata.

        Sets ``self.vocab`` to the sorted model vocabulary; the rows of
        the returned matrix follow that order.

        Returns:
            The embedding matrix that was saved to ``<filename>.npy``.
        """
        print("Building embedding...")
        self._print_parameters()
        # NOTE(review): `size=` and `model.wv.vocab` look like the
        # pre-4.0 gensim API -- confirm the pinned gensim version.
        model = Word2Vec(self.corpus,
                         size=self.size,  # Dense vector size
                         window=self.window,
                         min_count=1,  # Keep every token, even singletons
                         workers=10)
        # NOTE(review): total_words is given the size of
        # preproc.source_vocab_dict rather than a raw word count, and the
        # model is trained on self.source although its vocabulary was
        # built from self.corpus -- confirm both are intended.
        model.train(self.source,
                    total_words=len(preproc.source_vocab_dict),
                    epochs=self.epochs)
        self.vocab = sorted(model.wv.vocab.keys())
        X_wordvec = model.wv[self.vocab]
        np.save(os.path.join(EMB_DIR, self.filename + ".npy"), X_wordvec)
        print("Saved embedding")
        print("Creating metadata...")
        self._create_metadata("features.csv", self.filename + ".tsv")
        print("Saved metadata")
        return X_wordvec

    def _create_metadata(self, feature_file, metadata_file):
        """
        Write a TSV with one row of averaged features per vocabulary n-gram.

        Args:
            feature_file: CSV inside ``EMB_DIR``; the first row is skipped
                as a header, then each row is ``symbol, value, value, ...``.
            metadata_file: name of the TSV to create inside ``EMB_DIR``.

        Every n-gram in ``self.vocab`` is split into its characters, the
        feature vector of each character is looked up, and the vectors
        are averaged position by position.

        Raises:
            KeyError: if an n-gram contains a symbol with no feature row.
            IndexError: if fewer than ``N_FEATURES`` averaged values are
                available for an n-gram.
        """
        width = self.N_FEATURES

        # Create feature dictionary
        features = {'0': [0.0] * width, 'Z': [1.0] * width}
        # newline='' lets the csv module do its own newline handling.
        with open(os.path.join(EMB_DIR, feature_file), 'r', newline='') as csvfile:
            reader = csv.reader(csvfile, delimiter=',', quotechar='|')
            next(reader, None)  # skip the header row
            for row in reader:
                features[row[0]] = list(map(float, row[1:]))

        # Write metadata
        # The n-gram is passed as a format argument (not spliced into the
        # template) so braces in a token cannot corrupt the format string.
        row_template = "{}" + "\t{}" * width + "\n"
        with open(os.path.join(EMB_DIR, metadata_file), 'w') as mf:
            labels = [str(label) for label in range(width)]
            mf.write("Ngram\t" + "\t".join(labels) + "\n")
            for ngram in self.vocab:
                per_symbol = [features[aa] for aa in ngram]
                means = [str(np.mean(column)) for column in zip(*per_symbol)]
                mf.write(row_template.format(''.join(ngram), *means))