# keras_model.py
from keras.models import Sequential
from keras.layers import Merge
from keras.layers import GRU
from keras.layers import Dense
from keras.layers import Dropout
from keras.optimizers import SGD
# from keras.layers.convolutional import Convolution1D
# from keras.layers.pooling import GlobalAveragePooling1D
from keras.layers.wrappers import Bidirectional
from keras import backend as K
from preprocess import KerasIterator
from preprocess import TweetIterator
from preprocess import text2mat
from utils import loadPickle
import numpy as np
import os
from warnings import warn
from numpy.linalg import norm
from utils import saveTweet2Vec
from utils import loadTweet2Vec
import matplotlib.pyplot as plt
from keras.callbacks import ModelCheckpoint
from keras.callbacks import CSVLogger
from sklearn.metrics.pairwise import euclidean_distances
mlb_file = './models/mlb.pickle'
if os.path.exists(mlb_file):
    mlb = loadPickle(mlb_file)
else:
    warn("{} doesn't exist - need this to generate labels for training: run `./preprocess.py --prepare input.txt` first".format(mlb_file))


class Tweet2Vec:
    def __init__(self, model=None, char=True, chrd=True, word=True, normalize=False):
        '''
        Initialize the model and work out the input/output dimensions
        '''
        self.char = char
        self.chrd = chrd
        self.word = word
        charX, chrdX, wordX, y = next(TweetIterator(['this is to figure out input/output dimensions'], False, 'char_mat', 'chrd_mat', 'word_mat', 'label'))
        self.char_dim = charX.shape[1]
        self.chrd_dim = chrdX.shape[1]
        self.word_dim = wordX.shape[1]
        self.output_dim = y.shape[1]
        self.normalize = normalize
        self.vector_cache_ = {}
        # Number of generator workers. In principle this should be num cores - 1,
        # but more than 1 worker seems to make each worker loop over the same data,
        # which causes severe overfitting - so keep it at 1.
        self.num_workers = 1
        if model is None:
            # nothing specified, generate a model
            self.gen_model()
        elif isinstance(model, str):
            # model is a filename
            self.load(model)
        else:
            # model is a keras model
            self.model = model
        # Merged (multi-input) model in the first branch, single-input model in the second
        if hasattr(self.model.layers[0], 'layers'):
            self.get_vec_ = K.function([layer.input for layer in self.model.layers[0].layers] + [K.learning_phase()], [self.model.layers[-2].output])
            num_expected = len([layer.input for layer in self.model.layers[0].layers])
        else:
            self.get_vec_ = K.function([self.model.layers[0].input, K.learning_phase()], [self.model.layers[-2].output])
            num_expected = 1
        num_actual = len([i for i in [char, chrd, word] if i])
        if num_expected != num_actual:
            warn("Number of expected inputs to your model ({}) and number of actual inputs ({}) are different. Either you need to change your model or change the Tweet2Vec() arguments".format(num_expected, num_actual))

    def gen_model(self):
        '''
        Build the model
        '''
        # word matrix branch
        word_branch = Sequential()
        word_branch.add(Bidirectional(GRU(self.word_dim * 4, input_dim=self.word_dim, return_sequences=False), input_shape=(None, self.word_dim)))
        # word_branch.add(GlobalAveragePooling1D())
        # chrd matrix branch
        chrd_branch = Sequential()
        chrd_branch.add(Bidirectional(GRU(self.chrd_dim * 4, input_dim=self.chrd_dim, return_sequences=False), input_shape=(None, self.chrd_dim)))
        # merge models (concat outputs)
        self.model = Sequential()
        # The order here determines the order of your inputs. This must correspond to the standard (char, chrd, word) order.
        merged = Merge([chrd_branch, word_branch], mode='concat')
        self.model.add(merged)
        # final hidden layer(s)
        self.model.add(Dropout(.1))
        self.model.add(Dense(3000, activation='relu'))
        self.model.add(Dropout(.5))
        self.model.add(Dense(300))
        # output layer
        self.model.add(Dense(self.output_dim, activation='softmax'))
        # loss function/optimizer
        sgd = SGD(lr=.3, decay=.01)
        self.model.compile(loss='categorical_crossentropy', optimizer=sgd)
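
    # Rough data-flow summary of the model built above (derived from the layers;
    # exact tensor shapes depend on the preprocessing in preprocess.py):
    #   chrd matrix (timesteps, chrd_dim) -> Bidirectional GRU -> fixed-length vector
    #   word matrix (timesteps, word_dim) -> Bidirectional GRU -> fixed-length vector
    #   concat -> Dropout(.1) -> Dense(3000, relu) -> Dropout(.5) -> Dense(300)
    #   -> Dense(output_dim, softmax) over the hashtag labels
    # The penultimate Dense(300) activations are what __getitem__ returns as the tweet vector.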

    def fit(self, source, test=None, batch_size=100, samples=None, num_epochs=1, checkpoint=False):
        '''
        Fit the model using the data in `source`
        The generator loops through `source` forever, so `samples` and `num_epochs`
        may be larger than the amount of actual data.
        For the KerasIterator object, specify which matrices it should yield (char, chrd, word);
        these must correspond to the inputs the model expects.
        Note: the inputs always feed to the model in the (char, chrd, word) order.
        '''
        keras_iterator = KerasIterator(source, batch_size, char=self.char, chrd=self.chrd, word=self.word)
        if test is None:
            test_iterator = None
            test_length = None
            # If there is no test set, monitor the training loss instead of val_loss
            checker = ModelCheckpoint('./models/latest_model.keras', monitor='loss', verbose=1, save_best_only=True)
        else:
            test_iterator = KerasIterator(test, batch_size, char=self.char, chrd=self.chrd, word=self.word)
            # test_length = len(test_iterator.tweet_iterator)
            # TODO debugging
            test_length = 1000
            checker = ModelCheckpoint('./models/latest_model.keras', verbose=1, save_best_only=True)
        if checkpoint:
            callbacks = [checker]
        else:
            callbacks = []
        logger = CSVLogger('./models/epoch_history.csv')
        callbacks.append(logger)
        # If not specified, train on ALL data in source
        if samples is None:
            samples = len(keras_iterator.tweet_iterator)
        self.fit_data = self.model.fit_generator(keras_iterator, samples, num_epochs, validation_data=test_iterator, nb_val_samples=test_length, verbose=1, nb_worker=self.num_workers, pickle_safe=False, callbacks=callbacks)
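
    # A minimal call sketch (the paths mirror the __main__ block below; the sample and
    # epoch counts here are illustrative, not a tested configuration):
    #   t2v = Tweet2Vec(char=False, chrd=True, word=True)  # flags must match the merged model's inputs
    #   t2v.fit('./data/train.csv', test='./data/test.csv', samples=10000, num_epochs=5, checkpoint=True)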

    def plot(self, filename='./models/training_loss.png'):
        plt.figure()
        plt.plot(self.fit_data.history['loss'], lw=3, label='train', color='r')
        if 'val_loss' in self.fit_data.history:
            plt.plot(self.fit_data.history['val_loss'], lw=3, label='test', color='b')
        plt.title('Model Loss')
        plt.ylabel('loss')
        plt.xlabel('epoch')
        plt.legend()
        plt.savefig(filename)

    def evaluate(self, source, batch_size=100):
        '''
        Prints the loss on tweets in source
        '''
        keras_iterator = KerasIterator(source, batch_size, char=self.char, chrd=self.chrd, word=self.word)
        num_samples = len(keras_iterator.tweet_iterator)
        loss = self.model.evaluate_generator(keras_iterator, num_samples, nb_worker=self.num_workers, pickle_safe=False)
        print("\nLoss on the {} samples in {} is: {}\n".format(num_samples, source, loss))

    def predict_hashtags(self, source, num_to_validate=None, num_best=1, batch_size=500):
        '''
        Prints the `num_best` top predicted hashtags for `num_to_validate` lines in source
        '''
        raw = TweetIterator(source, True, 'raw_tweet')
        # If not specified, run on ALL tweets in source
        if num_to_validate is None:
            num_to_validate = len(raw)
        x = self.model.predict_generator(KerasIterator(source, batch_size, char=self.char, chrd=self.chrd, word=self.word), num_to_validate, nb_worker=self.num_workers, pickle_safe=False)
        for i, r in zip(x, raw):
            # go through the highest prediction values and output the corresponding hashtags
            if num_best > 1:
                best = i.argsort()[-num_best:][::-1]
            else:
                best = [i.argmax()]
            print("\nTweet: {}".format(r))
            best_hashtags = []
            for b in best:
                label = np.zeros((1, i.shape[0]))
                label[0, b] = 1
                predicted_hashtag = mlb.inverse_transform(label)[0][0]
                best_hashtags.append(predicted_hashtag)
            print("Predicted hashtags: {}\n".format(', '.join(best_hashtags)))

    def __getitem__(self, tweet, batch_size=500):
        '''
        Gets the vector for a tweet, like the word2vec API
        e.g.
            tweet2vec['Raw text of the tweet']
        will return the vector.
        Also works on lists of tweets
        (in fact, this is the recommended way if you are getting lots of vectors,
        because getting one vector seems to be about as slow as getting many).
        Vectors are cached, so asking for the same tweet again is O(1).
        '''
        if type(tweet) == str:
            tweet = [tweet]
        not_cached = [t for t in tweet if t not in self.vector_cache_]
        if not_cached:
            # separate not_cached into batches
            not_cached = [not_cached[i * batch_size:(i + 1) * batch_size] for i in range(len(not_cached) // batch_size + 1)]
            for nc in not_cached:
                if nc:
                    mats_in = []
                    if self.char:
                        charX = []
                        for t in nc:
                            charX.append(text2mat(t, 'char'))
                        mats_in.append(np.stack(charX))
                    if self.chrd:
                        chrdX = []
                        for t in nc:
                            chrdX.append(text2mat(t, 'chrd'))
                        mats_in.append(np.stack(chrdX))
                    if self.word:
                        wordX = []
                        for t in nc:
                            wordX.append(text2mat(t, 'word'))
                        mats_in.append(np.stack(wordX))
                    # learning_phase = 0 -> test mode (no dropout)
                    nc_vectors = self.get_vec_(mats_in + [0])[0]
                    for t, v in zip(nc, nc_vectors):
                        self.vector_cache_[t] = v
                        if self.normalize:
                            norm_v = norm(v)
                            if norm_v == 0:
                                norm_v = 1
                            self.vector_cache_[t] /= norm_v
        return np.array([self.vector_cache_[t] for t in tweet])

    def most_similar(self, tweet, source, batch_size=500):
        '''
        Iterates through `source` and finds the line closest to `tweet`
        (smallest euclidean distance between vectors)
        '''
        best_d = None
        best_t = ''
        target_v = self[tweet]
        batch = []
        i = 0
        for t in TweetIterator(source, False, 'raw_tweet'):
            batch.append(t)
            i += 1
            if i == batch_size:
                # cosine distance alternative:
                # dists = -np.dot(self[batch], target_v.T)
                dists = euclidean_distances(self[batch], target_v)
                best_i = np.argmin(dists)
                d = dists[best_i]
                t = batch[best_i]
                if best_d is None or d < best_d:
                    best_t = t
                    best_d = d
                i = 0
                batch = []
        # handle the final, partially filled batch with the same distance metric
        if batch:
            dists = euclidean_distances(self[batch], target_v)
            best_i = np.argmin(dists)
            d = dists[best_i]
            t = batch[best_i]
            if best_d is None or d < best_d:
                best_t = t
                best_d = d
        return best_t, best_d

    def most_similar_test(self, source1, source2, num_test=10):
        '''
        Another sanity check:
        picks a random tweet in source1 and finds the closest tweet in source2 to it.
        Does so `num_test` times.
        Ideally there is no overlap between the two sources.
        '''
        ti = TweetIterator(source1, False, 'raw_tweet')
        for _ in range(num_test):
            t1 = ti.get_random()
            t2, d = self.most_similar(t1, source2)
            print("\nOriginal tweet: {}\nClosest tweet: {}\nDistance: {}\n".format(t1, t2, d))

    def save(self, filename):
        saveTweet2Vec(self.model, filename)

    def load(self, filename):
        self.model = loadTweet2Vec(filename)


if __name__ == '__main__':
    tweet2vec = Tweet2Vec(char=False, chrd=True, word=True)
    # alternative data files:
    # train = './data/train.csv'
    # test = './data/test.csv'
    train = './data/all_shuffled_train.csv'
    test = './data/all_shuffled_test.csv'
    # samples=None (the default) will train on all input data
    # 6331717 samples in train set
    tweet2vec.fit(train, test=test, samples=10**6, num_epochs=1000, checkpoint=True)
    # tweet2vec.evaluate(test)
    # tweet2vec.most_similar_test(train, test)
    # tweet2vec.plot()
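
    # Further usage sketch (commented out; the tweet text below is illustrative):
    # vecs = tweet2vec[['first example tweet', 'second example tweet']]  # ~ (2, 300) penultimate-layer vectors
    # closest, dist = tweet2vec.most_similar('first example tweet', test)  # nearest line in the test file
    # print(closest, dist)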