import numpy as np
class ClassifierTrainer(object):
  """ The trainer class performs SGD with momentum on a cost function """

  def __init__(self):
    self.step_cache = {}  # for storing velocities in momentum update
    # variables used for SMORMS3
    self.step_cache_square = {}
    self.step_cache_mem = {}
  def train(self, X, y, X_val, y_val,
            model, loss_function,
            reg=0.0,
            learning_rate=1e-2, momentum=0, learning_rate_decay=0.95,
            update='momentum', sample_batches=True,
            num_epochs=30, batch_size=100, acc_frequency=None,
            verbose=False):
"""
Optimize the parameters of a model to minimize a loss function. We use
training data X and y to compute the loss and gradients, and periodically
check the accuracy on the validation set.
Inputs:
- X: Array of training data; each X[i] is a training sample.
- y: Vector of training labels; y[i] gives the label for X[i].
- X_val: Array of validation data
- y_val: Vector of validation labels
- model: Dictionary that maps parameter names to parameter values. Each
parameter value is a numpy array.
- loss_function: A function that can be called in the following ways:
scores = loss_function(X, model, reg=reg)
loss, grads = loss_function(X, model, y, reg=reg)
- reg: Regularization strength. This will be passed to the loss function.
- learning_rate: Initial learning rate to use.
- momentum: Parameter to use for momentum updates.
- learning_rate_decay: The learning rate is multiplied by this after each
epoch.
- update: The update rule to use. One of 'sgd', 'momentum', or 'rmsprop'.
- sample_batches: If True, use a minibatch of data for each parameter update
(stochastic gradient descent); if False, use the entire training set for
each parameter update (gradient descent).
- num_epochs: The number of epochs to take over the training data.
- batch_size: The number of training samples to use at each iteration.
- acc_frequency: If set to an integer, we compute the training and
validation set error after every acc_frequency iterations.
- verbose: If True, print status after each epoch.
Returns a tuple of:
- best_model: The model that got the highest validation accuracy during
training.
- loss_history: List containing the value of the loss function at each
iteration.
- train_acc_history: List storing the training set accuracy at each epoch.
- val_acc_history: List storing the validation set accuracy at each epoch.
"""
    N = X.shape[0]
    print "N: ", N

    if sample_batches:
      iterations_per_epoch = N / batch_size  # using SGD
    else:
      iterations_per_epoch = 1  # using GD
    num_iters = num_epochs * iterations_per_epoch
    epoch = 0
    best_val_acc = 0.0
    best_model = {}
    loss_history = []
    train_acc_history = []
    val_acc_history = []
    #print "number of iterations: ", num_iters
    for it in xrange(num_iters):
      #if it % 10 == 0: print 'starting iteration ', it

      # get batch of data
      if sample_batches:
        batch_mask = np.random.choice(N, batch_size)
        X_batch = X[batch_mask]
        y_batch = y[batch_mask]
      else:
        # no SGD used, full gradient descent
        X_batch = X
        y_batch = y

      # evaluate cost and gradient
      cost, grads = loss_function(X_batch, model, y_batch, reg)
      #print cost, grads['W1'].shape
      loss_history.append(cost)

      # perform a parameter update
      for p in model:
        # compute the parameter step
        if update == 'sgd':
          # vanilla update: x += -learning_rate * dx
          dx = -learning_rate * grads[p]
        elif update == 'momentum':
          if p not in self.step_cache:
            self.step_cache[p] = np.zeros(grads[p].shape)
          # Momentum update (http://cs231n.github.io/neural-networks-3/):
          #   v = mu * v - learning_rate * dx   # integrate velocity
          #   x += v                            # integrate position
          v = self.step_cache[p]
          v = (momentum * v) - (learning_rate * grads[p])
          self.step_cache[p] = v  # store the updated velocity for the next step
          dx = v
        elif update == 'nestrov-momentum':
          if p not in self.step_cache:
            self.step_cache[p] = np.zeros(grads[p].shape)
          # Nesterov momentum (http://cs231n.github.io/neural-networks-3/):
          #   v_prev = v                        # back this up
          #   v = mu * v - learning_rate * dx   # velocity update stays the same
          #   x += -mu * v_prev + (1 + mu) * v  # position update changes form
          v_prev = self.step_cache[p]
          v = (momentum * v_prev) - (learning_rate * grads[p])
          self.step_cache[p] = v  # store the updated velocity for the next step
          dx = (-momentum * v_prev) + (1 + momentum) * v
        elif update == 'rmsprop':
          decay_rate = 0.99  # you could also make this an option
          if p not in self.step_cache:
            self.step_cache[p] = np.zeros(grads[p].shape)
          # RMSProp (http://cs231n.github.io/neural-networks-3/):
          #   cache = decay_rate * cache + (1 - decay_rate) * dx**2
          #   x += -learning_rate * dx / np.sqrt(cache + 1e-8)
          self.step_cache[p] = (decay_rate * self.step_cache[p]) + (1 - decay_rate) * grads[p]**2
          dx = -learning_rate * grads[p] / np.sqrt(self.step_cache[p] + 1e-8)
elif update =="SMORMS3":
#####################################################################
# TODO: implement the SMORMS3 update formula and store the step #
# update into variable dx. #
#####################################################################
###############http://sifter.org/~simon/journal/20150420.html############
''' #didn't sort out yet!
Initialize:
mem = np.ones(model[p]) # Vector of 1's, same shape as p
g = np.zeros(model[p]) # Vector of 0's, same shape as p
g2 = np.zeros(model[p])
Given cost function cost(p), and some trivial epsilon (1e-16), take one step per mini-batch as follows:
grad = grads[p] # Gradient of cost(p) with respect to p
r = 1/(mem+1)
g = (1-r) * g + r * grad
g2 = (1-r) * g2 + r * grad ** 2
p = p - grad*min(lrate, g*g/(g2 + epsilon))/(sqrt(g2)+epsilon)
mem = 1 + mem*(1 - g*g/(g2 + epsilon))
(Where products, divides, and min are element-wise.)
'''
dx=0
#####################################################################
# END OF YOUR CODE #
#####################################################################
        else:
          raise ValueError('Unrecognized update type "%s"' % update)

        # update the parameters
        model[p] += dx

      # every epoch perform an evaluation on the validation set
      first_it = (it == 0)
      epoch_end = (it + 1) % iterations_per_epoch == 0
      acc_check = (acc_frequency is not None and it % acc_frequency == 0)
      if first_it or epoch_end or acc_check:
        if it > 0 and epoch_end:
          # decay the learning rate
          learning_rate *= learning_rate_decay
          epoch += 1

        # evaluate train accuracy
        if N > 1000:
          train_mask = np.random.choice(N, 1000)
          X_train_subset = X[train_mask]
          y_train_subset = y[train_mask]
        else:
          X_train_subset = X
          y_train_subset = y
        scores_train = loss_function(X_train_subset, model)
        y_pred_train = np.argmax(scores_train, axis=1)
        train_acc = np.mean(y_pred_train == y_train_subset)
        train_acc_history.append(train_acc)

        # evaluate val accuracy
        scores_val = loss_function(X_val, model)
        y_pred_val = np.argmax(scores_val, axis=1)
        val_acc = np.mean(y_pred_val == y_val)
        val_acc_history.append(val_acc)

        # keep track of the best model based on validation accuracy
        if val_acc > best_val_acc:
          # make a copy of the model
          best_val_acc = val_acc
          best_model = {}
          for p in model:
            best_model[p] = model[p].copy()

        # print progress if needed
        if verbose:
          print ('Finished epoch %d / %d: cost %f, train: %f, val %f, lr %e'
                 % (epoch, num_epochs, cost, train_acc, val_acc, learning_rate))

    if verbose:
      print 'finished optimization. best validation accuracy: %f' % (best_val_acc, )
    # return the best model and the training history statistics
    return best_model, loss_history, train_acc_history, val_acc_history
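

# ---------------------------------------------------------------------------
# Usage sketch (not part of the original trainer): a minimal, hypothetical
# example of the loss_function interface documented in train(). The linear
# softmax model below (a single 'W1' parameter) and the random toy data are
# assumptions made purely for illustration; swap in a real model and loss.
# ---------------------------------------------------------------------------
def _toy_softmax_loss(X, model, y=None, reg=0.0):
  """Linear softmax classifier with scores = X.dot(W1).
  Returns scores when y is None, otherwise (loss, grads)."""
  W = model['W1']
  scores = X.dot(W)
  if y is None:
    return scores
  # numerically stable softmax probabilities
  shifted = scores - np.max(scores, axis=1, keepdims=True)
  probs = np.exp(shifted)
  probs /= np.sum(probs, axis=1, keepdims=True)
  num_train = X.shape[0]
  loss = -np.mean(np.log(probs[np.arange(num_train), y]))
  loss += 0.5 * reg * np.sum(W * W)
  dscores = probs.copy()
  dscores[np.arange(num_train), y] -= 1
  dscores /= num_train
  grads = {'W1': X.T.dot(dscores) + reg * W}
  return loss, grads


if __name__ == '__main__':
  # Tiny smoke test on random data; all sizes and hyperparameters are arbitrary.
  np.random.seed(0)
  dim, num_classes = 20, 3
  X_train = np.random.randn(500, dim)
  y_train = np.random.randint(num_classes, size=500)
  X_val = np.random.randn(100, dim)
  y_val = np.random.randint(num_classes, size=100)
  model = {'W1': 0.01 * np.random.randn(dim, num_classes)}
  trainer = ClassifierTrainer()
  best_model, loss_history, train_acc_history, val_acc_history = trainer.train(
      X_train, y_train, X_val, y_val, model, _toy_softmax_loss,
      reg=1e-3, learning_rate=1e-2, momentum=0.9, update='momentum',
      num_epochs=5, batch_size=50, verbose=True)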