Compatibility with Keras 2.2.3 and Tensorflow 1.11.0
Showing 6 changed files with 359 additions and 0 deletions.
File renamed without changes.
File renamed without changes.
File renamed without changes.
@@ -0,0 +1,6 @@
# Weight Normalization using Keras

Example code for using Weight Normalization with [Keras](https://keras.io).

```cifar10_cnn.py``` contains the standard CIFAR-10 example from Keras, with lines 64-65 and 78-79 edited to include weight normalization and data-dependent initialization.
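
A minimal usage sketch, assuming ```weightnorm.py``` is on the import path and that ```model```, ```x_train``` and ```y_train``` are an already-built Keras model and training data, as in ```cifar10_cnn.py```:

```python
# Sketch: swap in the weight-normalized optimizer and run the
# data-dependent initialization before training.
from weightnorm import SGDWithWeightnorm, data_based_init

opt = SGDWithWeightnorm(lr=0.01, momentum=0.9, nesterov=True)
model.compile(loss='categorical_crossentropy',
              optimizer=opt,
              metrics=['accuracy'])

# rescale weights and biases using a small batch of real data,
# then train as usual
data_based_init(model, x_train[:100])
model.fit(x_train, y_train, batch_size=32, epochs=10)
```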
@@ -0,0 +1,145 @@
'''
CIFAR-10 example from https://github.com/keras-team/keras/blob/master/examples/cifar10_cnn.py
Now with weight normalization. Lines 64-65 and 78-79 contain the changes w.r.t. original.
'''

from __future__ import print_function
import keras
from keras.datasets import cifar10
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Conv2D, MaxPooling2D
import os

'''Train a simple deep CNN on the CIFAR10 small images dataset.
It gets to 75% validation accuracy in 25 epochs, and 79% after 50 epochs
(it's still underfitting at that point, though).
With weight normalization, a validation accuracy of 75% is already reached
after 10 epochs.
'''

batch_size = 32
num_classes = 10
epochs = 100
data_augmentation = True
num_predictions = 20
save_dir = os.path.join(os.getcwd(), 'saved_models')
model_name = 'keras_cifar10_trained_model.h5'

# The data, split between train and test sets:
(x_train, y_train), (x_test, y_test) = cifar10.load_data()
print('x_train shape:', x_train.shape)
print(x_train.shape[0], 'train samples')
print(x_test.shape[0], 'test samples')

# Convert class vectors to binary class matrices.
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

model = Sequential()
model.add(Conv2D(32, (3, 3), padding='same',
                 input_shape=x_train.shape[1:]))
model.add(Activation('relu'))
model.add(Conv2D(32, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))

model.add(Conv2D(64, (3, 3), padding='same'))
model.add(Activation('relu'))
model.add(Conv2D(64, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))

model.add(Flatten())
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

# let's train the model using SGD + momentum (how original). EDIT: now with weight normalization, so slightly more original ;-)
from weightnorm import SGDWithWeightnorm, AdamWithWeightnorm
opt_wn = SGDWithWeightnorm(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
# opt_wn = AdamWithWeightnorm(lr=0.001, decay=1e-6)

# Compile the model with the weight-normalized optimizer
model.compile(loss='categorical_crossentropy',
              optimizer=opt_wn,
              metrics=['accuracy'])

x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
x_train /= 255
x_test /= 255

# data based initialization of parameters
from weightnorm import data_based_init
data_based_init(model, x_train[:100])
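# (data_based_init runs one forward pass on the batch above and rescales each
# layer's kernel and bias so that layer outputs start out with roughly zero
# mean and unit variance; see weightnorm.py.)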

if not data_augmentation:
    print('Not using data augmentation.')
    model.fit(x_train, y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_data=(x_test, y_test),
              shuffle=True)
else:
    print('Using real-time data augmentation.')
    # This will do preprocessing and realtime data augmentation:
    datagen = ImageDataGenerator(
        featurewise_center=False,  # set input mean to 0 over the dataset
        samplewise_center=False,  # set each sample mean to 0
        featurewise_std_normalization=False,  # divide inputs by std of the dataset
        samplewise_std_normalization=False,  # divide each input by its std
        zca_whitening=False,  # apply ZCA whitening
        zca_epsilon=1e-06,  # epsilon for ZCA whitening
        rotation_range=0,  # randomly rotate images in the range (degrees, 0 to 180)
        # randomly shift images horizontally (fraction of total width)
        width_shift_range=0.1,
        # randomly shift images vertically (fraction of total height)
        height_shift_range=0.1,
        shear_range=0.,  # set range for random shear
        zoom_range=0.,  # set range for random zoom
        channel_shift_range=0.,  # set range for random channel shifts
        # set mode for filling points outside the input boundaries
        fill_mode='nearest',
        cval=0.,  # value used for fill_mode = "constant"
        horizontal_flip=True,  # randomly flip images
        vertical_flip=False,  # randomly flip images
        # set rescaling factor (applied before any other transformation)
        rescale=None,
        # set function that will be applied on each input
        preprocessing_function=None,
        # image data format, either "channels_first" or "channels_last"
        data_format=None,
        # fraction of images reserved for validation (strictly between 0 and 1)
        validation_split=0.0)

    # Compute quantities required for feature-wise normalization
    # (std, mean, and principal components if ZCA whitening is applied).
    datagen.fit(x_train)

    # Fit the model on the batches generated by datagen.flow().
    model.fit_generator(datagen.flow(x_train, y_train,
                                     batch_size=batch_size),
                        epochs=epochs,
                        steps_per_epoch=len(x_train) // batch_size,  # integer division: steps per epoch must be an int
                        validation_data=(x_test, y_test),
                        workers=4)

# Save model and weights
if not os.path.isdir(save_dir):
    os.makedirs(save_dir)
model_path = os.path.join(save_dir, model_name)
model.save(model_path)
print('Saved trained model at %s ' % model_path)

# Score trained model.
scores = model.evaluate(x_test, y_test, verbose=1)
print('Test loss:', scores[0])
print('Test accuracy:', scores[1])
@@ -0,0 +1,208 @@
from keras import backend as K
from keras.optimizers import SGD, Adam
import tensorflow as tf


# adapted from keras.optimizers.SGD
class SGDWithWeightnorm(SGD):
    def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = []

        lr = self.lr
        if self.initial_decay > 0:
            lr *= (1. / (1. + self.decay * K.cast(self.iterations, K.floatx())))
        self.updates.append(K.update_add(self.iterations, 1))

        # momentum
        shapes = [K.get_variable_shape(p) for p in params]
        moments = [K.zeros(shape) for shape in shapes]
        self.weights = [self.iterations] + moments
        for p, g, m in zip(params, grads, moments):

            # if a weight tensor (len > 1) use weight normalized parameterization
            ps = K.get_variable_shape(p)
            if len(ps) > 1:

                # get weight normalization parameters
                V, V_norm, V_scaler, g_param, grad_g, grad_V = get_weightnorm_params_and_grads(p, g)

                # momentum container for the 'g' parameter
                V_scaler_shape = K.get_variable_shape(V_scaler)
                m_g = K.zeros(V_scaler_shape)

                # update g parameters
                v_g = self.momentum * m_g - lr * grad_g  # velocity
                self.updates.append(K.update(m_g, v_g))
                if self.nesterov:
                    new_g_param = g_param + self.momentum * v_g - lr * grad_g
                else:
                    new_g_param = g_param + v_g

                # update V parameters
                v_v = self.momentum * m - lr * grad_V  # velocity
                self.updates.append(K.update(m, v_v))
                if self.nesterov:
                    new_V_param = V + self.momentum * v_v - lr * grad_V
                else:
                    new_V_param = V + v_v

                # if there are constraints we apply them to V, not W
                if getattr(p, 'constraint', None) is not None:
                    new_V_param = p.constraint(new_V_param)

                # wn param updates --> W updates
                add_weightnorm_param_updates(self.updates, new_V_param, new_g_param, p, V_scaler)

            else:  # normal SGD with momentum
                v = self.momentum * m - lr * g  # velocity
                self.updates.append(K.update(m, v))

                if self.nesterov:
                    new_p = p + self.momentum * v - lr * g
                else:
                    new_p = p + v

                # apply constraints
                if getattr(p, 'constraint', None) is not None:
                    new_p = p.constraint(new_p)

                self.updates.append(K.update(p, new_p))
        return self.updates


# adapted from keras.optimizers.Adam
class AdamWithWeightnorm(Adam):
    def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        if self.initial_decay > 0:
            lr *= (1. / (1. + self.decay * K.cast(self.iterations, K.floatx())))

        t = K.cast(self.iterations + 1, K.floatx())
        lr_t = lr * K.sqrt(1. - K.pow(self.beta_2, t)) / (1. - K.pow(self.beta_1, t))

        shapes = [K.get_variable_shape(p) for p in params]
        ms = [K.zeros(shape) for shape in shapes]
        vs = [K.zeros(shape) for shape in shapes]
        self.weights = [self.iterations] + ms + vs

        for p, g, m, v in zip(params, grads, ms, vs):

            # if a weight tensor (len > 1) use weight normalized parameterization
            # this is the only part changed w.r.t. keras.optimizers.Adam
            ps = K.get_variable_shape(p)
            if len(ps) > 1:

                # get weight normalization parameters
                V, V_norm, V_scaler, g_param, grad_g, grad_V = get_weightnorm_params_and_grads(p, g)

                # Adam containers for the 'g' parameter
                V_scaler_shape = K.get_variable_shape(V_scaler)
                m_g = K.zeros(V_scaler_shape)
                v_g = K.zeros(V_scaler_shape)

                # update g parameters
                m_g_t = (self.beta_1 * m_g) + (1. - self.beta_1) * grad_g
                v_g_t = (self.beta_2 * v_g) + (1. - self.beta_2) * K.square(grad_g)
                new_g_param = g_param - lr_t * m_g_t / (K.sqrt(v_g_t) + self.epsilon)
                self.updates.append(K.update(m_g, m_g_t))
                self.updates.append(K.update(v_g, v_g_t))

                # update V parameters
                m_t = (self.beta_1 * m) + (1. - self.beta_1) * grad_V
                v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(grad_V)
                new_V_param = V - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)
                self.updates.append(K.update(m, m_t))
                self.updates.append(K.update(v, v_t))

                # if there are constraints we apply them to V, not W
                if getattr(p, 'constraint', None) is not None:
                    new_V_param = p.constraint(new_V_param)

                # wn param updates --> W updates
                add_weightnorm_param_updates(self.updates, new_V_param, new_g_param, p, V_scaler)

            else:  # do optimization normally
                m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
                v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
                p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

                self.updates.append(K.update(m, m_t))
                self.updates.append(K.update(v, v_t))

                new_p = p_t
                # apply constraints
                if getattr(p, 'constraint', None) is not None:
                    new_p = p.constraint(new_p)
                self.updates.append(K.update(p, new_p))
        return self.updates
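
# Both optimizers treat g and V as independent parameters: each gets its own
# momentum / Adam moment containers, and any constraints are applied to V
# rather than to the stored W.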
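

# Background: weight normalization (Salimans & Kingma, 2016,
# https://arxiv.org/abs/1602.07868) reparameterizes each weight tensor as
#   W = g * V / ||V||,
# with the norm taken over all axes except the last (i.e. per output channel).
# Instead of storing V and g as separate variables, the helpers below keep W
# together with V_scaler = g / ||V|| and recover V, g, and their gradients on
# the fly:
#   grad_g = (grad_W . V) / ||V||
#   grad_V = (g / ||V||) * grad_W - (g * grad_g / ||V||^2) * V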
def get_weightnorm_params_and_grads(p, g):
    ps = K.get_variable_shape(p)

    # construct weight scaler: V_scaler = g/||V||
    V_scaler_shape = (ps[-1],)  # assumes we're using tensorflow!
    V_scaler = K.ones(V_scaler_shape)  # init to ones, so effective parameters don't change

    # get V parameters = ||V||/g * W
    norm_axes = [i for i in range(len(ps) - 1)]
    V = p / tf.reshape(V_scaler, [1] * len(norm_axes) + [-1])

    # split V_scaler into ||V|| and g parameters
    V_norm = tf.sqrt(tf.reduce_sum(tf.square(V), norm_axes))
    g_param = V_scaler * V_norm

    # get grad in V,g parameters
    grad_g = tf.reduce_sum(g * V, norm_axes) / V_norm
    grad_V = tf.reshape(V_scaler, [1] * len(norm_axes) + [-1]) * \
        (g - tf.reshape(grad_g / V_norm, [1] * len(norm_axes) + [-1]) * V)

    return V, V_norm, V_scaler, g_param, grad_g, grad_V


def add_weightnorm_param_updates(updates, new_V_param, new_g_param, W, V_scaler):
    ps = K.get_variable_shape(new_V_param)
    norm_axes = [i for i in range(len(ps) - 1)]

    # update W and V_scaler
    new_V_norm = tf.sqrt(tf.reduce_sum(tf.square(new_V_param), norm_axes))
    new_V_scaler = new_g_param / new_V_norm
    new_W = tf.reshape(new_V_scaler, [1] * len(norm_axes) + [-1]) * new_V_param
    updates.append(K.update(W, new_W))
    updates.append(K.update(V_scaler, new_V_scaler))


# data based initialization for a given Keras model
def data_based_init(model, input):

    # input can be dict, numpy array, or list of numpy arrays
    if type(input) is dict:
        feed_dict = input
    elif type(input) is list:
        feed_dict = {tf_inp: np_inp for tf_inp, np_inp in zip(model.inputs, input)}
    else:
        feed_dict = {model.inputs[0]: input}

    # add learning phase if required
    if model.uses_learning_phase and K.learning_phase() not in feed_dict:
        feed_dict.update({K.learning_phase(): 1})

    # get all layer name, output, weight, bias tuples
    layer_output_weight_bias = []
    for l in model.layers:
        trainable_weights = l.trainable_weights
        if len(trainable_weights) == 2:
            W, b = trainable_weights
            assert l.built
            layer_output_weight_bias.append((l.name, l.get_output_at(0), W, b))  # if more than one node, only use the first
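
    # This loop follows the data-dependent initialization of Salimans & Kingma
    # (2016), Sec. 3: estimate the per-channel mean m and variance v of each
    # layer's output on the given batch, then rescale the kernel by 1/std and
    # shift the bias so the output starts out roughly standardized.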
    # iterate over our list and do data dependent init
    sess = K.get_session()
    for l, o, W, b in layer_output_weight_bias:
        print('Performing data dependent initialization for layer ' + l)
        m, v = tf.nn.moments(o, [i for i in range(len(o.get_shape()) - 1)])
        s = tf.sqrt(v + 1e-10)
        updates = tf.group(W.assign(W / tf.reshape(s, [1] * (len(W.get_shape()) - 1) + [-1])),
                           b.assign((b - m) / s))
        sess.run(updates, feed_dict)