Skip to content


Compatibility with Keras 2.2.3 and Tensorflow 1.11.0
Browse files Browse the repository at this point in the history
  • Loading branch information
krasserm committed Oct 6, 2018
1 parent dff0cd1 commit 1c462bd
Show file tree
Hide file tree
Showing 6 changed files with 359 additions and 0 deletions.
File renamed without changes.
File renamed without changes.
File renamed without changes.
6 changes: 6 additions & 0 deletions keras_2/
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@

# Weight Normalization using Keras

Example code for using Weight Normalization with [Keras](https://keras.io).

`cifar10_cnn.py` contains the standard CIFAR-10 example from Keras, with lines 64 and 69 edited to include weight normalization and data dependent initialization.
145 changes: 145 additions & 0 deletions keras_2/
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
CIFAR-10 example from
Now with weight normalization. Lines 64-65 and 78-79 contain the changes w.r.t. original.

from __future__ import print_function
import keras
from keras.datasets import cifar10
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Conv2D, MaxPooling2D
import os

'''Train a simple deep CNN on the CIFAR10 small images dataset.
It gets to 75% validation accuracy in 25 epochs, and 79% after 50 epochs.
(it's still underfitting at that point, though).
With weight normalization, a validation accuracy of 75% is already reached
after 10 epochs.

batch_size = 32
num_classes = 10
epochs = 100
data_augmentation = True
num_predictions = 20
save_dir = os.path.join(os.getcwd(), 'saved_models')
model_name = 'keras_cifar10_trained_model.h5'

# The data, split between train and test sets:
(x_train, y_train), (x_test, y_test) = cifar10.load_data()
print('x_train shape:', x_train.shape)
print(x_train.shape[0], 'train samples')
print(x_test.shape[0], 'test samples')

# Convert class vectors to binary class matrices.
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

model = Sequential()
model.add(Conv2D(32, (3, 3), padding='same',
model.add(Conv2D(32, (3, 3)))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Conv2D(64, (3, 3), padding='same'))
model.add(Conv2D(64, (3, 3)))
model.add(MaxPooling2D(pool_size=(2, 2)))


# let's train the model using SGD + momentum (how original). EDIT: now with weight normalization, so slightly more original ;-)
from weightnorm import SGDWithWeightnorm, AdamWithWeightnorm
opt_wn = SGDWithWeightnorm(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
#opt_wn = AdamWithWeightnorm(lr=0.001, decay=1e-6)

# Let's train the model using RMSprop

x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
x_train /= 255
x_test /= 255

# data based initialization of parameters
from weightnorm import data_based_init
data_based_init(model, x_train[:100])

if not data_augmentation:
print('Not using data augmentation.'), y_train,
validation_data=(x_test, y_test),
print('Using real-time data augmentation.')
# This will do preprocessing and realtime data augmentation:
datagen = ImageDataGenerator(
featurewise_center=False, # set input mean to 0 over the dataset
samplewise_center=False, # set each sample mean to 0
featurewise_std_normalization=False, # divide inputs by std of the dataset
samplewise_std_normalization=False, # divide each input by its std
zca_whitening=False, # apply ZCA whitening
zca_epsilon=1e-06, # epsilon for ZCA whitening
rotation_range=0, # randomly rotate images in the range (degrees, 0 to 180)
# randomly shift images horizontally (fraction of total width)
# randomly shift images vertically (fraction of total height)
shear_range=0., # set range for random shear
zoom_range=0., # set range for random zoom
channel_shift_range=0., # set range for random channel shifts
# set mode for filling points outside the input boundaries
cval=0., # value used for fill_mode = "constant"
horizontal_flip=True, # randomly flip images
vertical_flip=False, # randomly flip images
# set rescaling factor (applied before any other transformation)
# set function that will be applied on each input
# image data format, either "channels_first" or "channels_last"
# fraction of images reserved for validation (strictly between 0 and 1)

# Compute quantities required for feature-wise normalization
# (std, mean, and principal components if ZCA whitening is applied).

# Fit the model on the batches generated by datagen.flow().
model.fit_generator(datagen.flow(x_train, y_train,
validation_data=(x_test, y_test),

# Save model and weights
if not os.path.isdir(save_dir):
model_path = os.path.join(save_dir, model_name)
print('Saved trained model at %s ' % model_path)

# Score trained model.
scores = model.evaluate(x_test, y_test, verbose=1)
print('Test loss:', scores[0])
print('Test accuracy:', scores[1])
208 changes: 208 additions & 0 deletions keras_2/
Original file line number Diff line number Diff line change
@@ -0,0 +1,208 @@
from keras import backend as K
from keras.optimizers import SGD,Adam
import tensorflow as tf

# adapted from keras.optimizers.SGD
class SGDWithWeightnorm(SGD):
    """SGD with momentum that updates weight tensors in weight-normalized form.

    Parameters with more than one dimension are re-expressed through the
    weight-normalization parameters ``V`` and ``g`` (see
    ``get_weightnorm_params_and_grads``) and the momentum update is applied
    to those; all other parameters get the ordinary SGD-with-momentum update.
    """

    def get_updates(self, loss, params):
        """Build the update ops for one optimization step.

        Args:
            loss: scalar loss tensor to minimize.
            params: list of variables to optimize.

        Returns:
            The list of update ops (also stored in ``self.updates``).
        """
        grads = self.get_gradients(loss, params)
        self.updates = []

        lr = self.lr
        if self.initial_decay > 0:
            lr = lr * (1. / (1. + self.decay * K.cast(self.iterations, K.floatx())))
            # the iteration counter is only read by the decay schedule above,
            # so it is only advanced when decay is enabled
            self.updates.append(K.update_add(self.iterations, 1))

        # momentum
        shapes = [K.get_variable_shape(p) for p in params]
        moments = [K.zeros(shape) for shape in shapes]
        self.weights = [self.iterations] + moments
        for p, g, m in zip(params, grads, moments):

            # if a weight tensor (len > 1) use weight normalized parameterization
            ps = K.get_variable_shape(p)
            if len(ps) > 1:

                # get weight normalization parameters
                V, V_norm, V_scaler, g_param, grad_g, grad_V = \
                    get_weightnorm_params_and_grads(p, g)

                # momentum container for the 'g' parameter
                V_scaler_shape = K.get_variable_shape(V_scaler)
                m_g = K.zeros(V_scaler_shape)

                # update g parameters
                v_g = self.momentum * m_g - lr * grad_g  # velocity
                self.updates.append(K.update(m_g, v_g))
                if self.nesterov:
                    new_g_param = g_param + self.momentum * v_g - lr * grad_g
                else:
                    new_g_param = g_param + v_g

                # update V parameters
                v_v = self.momentum * m - lr * grad_V  # velocity
                self.updates.append(K.update(m, v_v))
                if self.nesterov:
                    new_V_param = V + self.momentum * v_v - lr * grad_V
                else:
                    new_V_param = V + v_v

                # if there are constraints we apply them to V, not W
                if getattr(p, 'constraint', None) is not None:
                    new_V_param = p.constraint(new_V_param)

                # wn param updates --> W updates
                add_weightnorm_param_updates(self.updates, new_V_param, new_g_param, p, V_scaler)

            else:  # normal SGD with momentum
                v = self.momentum * m - lr * g  # velocity
                self.updates.append(K.update(m, v))

                if self.nesterov:
                    new_p = p + self.momentum * v - lr * g
                else:
                    new_p = p + v

                # apply constraints
                if getattr(p, 'constraint', None) is not None:
                    new_p = p.constraint(new_p)

                self.updates.append(K.update(p, new_p))
        return self.updates

# adapted from keras.optimizers.Adam
class AdamWithWeightnorm(Adam):
    """Adam that updates weight tensors in weight-normalized form.

    Parameters with more than one dimension are re-expressed through the
    weight-normalization parameters ``V`` and ``g`` (see
    ``get_weightnorm_params_and_grads``) and the Adam update is applied to
    those; all other parameters get the ordinary Adam update.
    """

    def get_updates(self, loss, params):
        """Build the update ops for one optimization step.

        Args:
            loss: scalar loss tensor to minimize.
            params: list of variables to optimize.

        Returns:
            The list of update ops (also stored in ``self.updates``).
        """
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        if self.initial_decay > 0:
            lr = lr * (1. / (1. + self.decay * K.cast(self.iterations, K.floatx())))

        # bias-corrected step size
        t = K.cast(self.iterations + 1, K.floatx())
        lr_t = lr * K.sqrt(1. - K.pow(self.beta_2, t)) / (1. - K.pow(self.beta_1, t))

        shapes = [K.get_variable_shape(p) for p in params]
        ms = [K.zeros(shape) for shape in shapes]
        vs = [K.zeros(shape) for shape in shapes]
        self.weights = [self.iterations] + ms + vs

        for p, g, m, v in zip(params, grads, ms, vs):

            # if a weight tensor (len > 1) use weight normalized parameterization
            # this is the only part changed w.r.t. keras.optimizers.Adam
            ps = K.get_variable_shape(p)
            if len(ps) > 1:

                # get weight normalization parameters
                V, V_norm, V_scaler, g_param, grad_g, grad_V = \
                    get_weightnorm_params_and_grads(p, g)

                # Adam containers for the 'g' parameter
                V_scaler_shape = K.get_variable_shape(V_scaler)
                m_g = K.zeros(V_scaler_shape)
                v_g = K.zeros(V_scaler_shape)

                # update g parameters
                m_g_t = (self.beta_1 * m_g) + (1. - self.beta_1) * grad_g
                v_g_t = (self.beta_2 * v_g) + (1. - self.beta_2) * K.square(grad_g)
                new_g_param = g_param - lr_t * m_g_t / (K.sqrt(v_g_t) + self.epsilon)
                self.updates.append(K.update(m_g, m_g_t))
                self.updates.append(K.update(v_g, v_g_t))

                # update V parameters
                m_t = (self.beta_1 * m) + (1. - self.beta_1) * grad_V
                v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(grad_V)
                new_V_param = V - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)
                self.updates.append(K.update(m, m_t))
                self.updates.append(K.update(v, v_t))

                # if there are constraints we apply them to V, not W
                if getattr(p, 'constraint', None) is not None:
                    new_V_param = p.constraint(new_V_param)

                # wn param updates --> W updates
                add_weightnorm_param_updates(self.updates, new_V_param, new_g_param, p, V_scaler)

            else:  # do optimization normally
                m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
                v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
                p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

                self.updates.append(K.update(m, m_t))
                self.updates.append(K.update(v, v_t))

                new_p = p_t
                # apply constraints
                if getattr(p, 'constraint', None) is not None:
                    new_p = p.constraint(new_p)
                self.updates.append(K.update(p, new_p))
        return self.updates

def get_weightnorm_params_and_grads(p, g):
    """Re-express weight tensor ``p`` and its gradient ``g`` in weight-norm terms.

    Returns the tuple ``(V, V_norm, V_scaler, g_param, grad_g, grad_V)``
    where ``V_scaler`` is a freshly created variable initialised to ones,
    so that ``V`` initially equals ``p``.
    """
    w_shape = K.get_variable_shape(p)
    # every axis except the last one is reduced over
    reduce_axes = list(range(len(w_shape) - 1))
    bcast_shape = [1] * len(reduce_axes) + [-1]

    # weight scaler V_scaler = g/||V||, one entry per slice along the last
    # axis (assumes we're using tensorflow!); init to ones, so effective
    # parameters don't change
    V_scaler = K.ones((w_shape[-1],))
    scaler_bcast = tf.reshape(V_scaler, bcast_shape)

    # V parameters = ||V||/g * W
    V = p / scaler_bcast

    # split V_scaler into ||V|| and g parameters
    V_norm = tf.sqrt(tf.reduce_sum(tf.square(V), reduce_axes))
    g_param = V_scaler * V_norm

    # gradients w.r.t. g and V
    grad_g = tf.reduce_sum(g * V, reduce_axes) / V_norm
    along_V = tf.reshape(grad_g / V_norm, bcast_shape) * V
    grad_V = scaler_bcast * (g - along_V)

    return V, V_norm, V_scaler, g_param, grad_g, grad_V

def add_weightnorm_param_updates(updates, new_V_param, new_g_param, W, V_scaler):
    """Append the ops that write updated weight-norm parameters back.

    From the new ``V`` and ``g`` values, computes the new scaler
    ``g/||V||`` and the new effective weights ``scaler * V``, then appends
    an update of ``W`` followed by an update of ``V_scaler`` to ``updates``.
    """
    rank = len(K.get_variable_shape(new_V_param))
    reduce_axes = list(range(rank - 1))

    # norm of the new V over every axis except the last one
    squared_norm = tf.reduce_sum(tf.square(new_V_param), reduce_axes)
    scaler = new_g_param / tf.sqrt(squared_norm)

    # broadcast the per-unit scaler over the leading axes of V
    scaler_bcast = tf.reshape(scaler, [1] * len(reduce_axes) + [-1])
    updates.append(K.update(W, scaler_bcast * new_V_param))
    updates.append(K.update(V_scaler, scaler))

# data based initialization for a given Keras model
def data_based_init(model, input):
    """Data-dependent initialization of a Keras model's weights and biases.

    For every layer with exactly two trainable weights (unpacked as ``W``
    and ``b``), the layer's output is evaluated on the given data, and its
    mean ``m`` and standard deviation ``s`` over all axes but the last are
    used to assign ``W <- W / s`` and ``b <- (b - m) / s``. Layers are
    processed in model order, one session run per layer.

    Args:
        model: the Keras model to initialize.
        input: a feed dict, a numpy array (fed to the first model input),
            or a list of numpy arrays (one per model input).
    """
    # input can be dict, numpy array, or list of numpy arrays
    if isinstance(input, dict):
        # copy so that adding the learning phase does not mutate the caller's dict
        feed_dict = dict(input)
    elif isinstance(input, list):
        feed_dict = {tf_inp: np_inp for tf_inp, np_inp in zip(model.inputs, input)}
    else:
        feed_dict = {model.inputs[0]: input}

    # add learning phase if required
    if model.uses_learning_phase and K.learning_phase() not in feed_dict:
        feed_dict.update({K.learning_phase(): 1})

    # get all layer name, output, weight, bias tuples
    layer_output_weight_bias = []
    for l in model.layers:
        trainable_weights = l.trainable_weights
        if len(trainable_weights) == 2:
            W, b = trainable_weights
            # if more than one node, only use the first
            layer_output_weight_bias.append((l.name, l.get_output_at(0), W, b))

    # iterate over our list and do data dependent init
    sess = K.get_session()
    for l, o, W, b in layer_output_weight_bias:
        print('Performing data dependent initialization for layer ' + l)
        m, v = tf.nn.moments(o, list(range(len(o.get_shape()) - 1)))
        s = tf.sqrt(v + 1e-10)  # small constant guards against division by zero
        updates = tf.group(
            W.assign(W / tf.reshape(s, [1] * (len(W.get_shape()) - 1) + [-1])),
            b.assign((b - m) / s))
        sess.run(updates, feed_dict)

0 comments on commit 1c462bd

Please sign in to comment.