import numpy as np
class ClassifierTrainer(object):
  """ The trainer class performs SGD with momentum on a cost function """

  def __init__(self):
    self.step_cache = {}  # for storing velocities in momentum update
    # variables used for SMORMS3
    self.step_cache_square = {}
    self.step_cache_mem = {}
  def train(self, X, y, X_val, y_val,
            model, loss_function,
            reg=0.0,
            learning_rate=1e-2, momentum=0, learning_rate_decay=0.95,
            update='momentum', sample_batches=True,
            num_epochs=30, batch_size=100, acc_frequency=None,
            verbose=False):
"""
Optimize the parameters of a model to minimize a loss function. We use
training data X and y to compute the loss and gradients, and periodically
check the accuracy on the validation set.
Inputs:
- X: Array of training data; each X[i] is a training sample.
- y: Vector of training labels; y[i] gives the label for X[i].
- X_val: Array of validation data
- y_val: Vector of validation labels
- model: Dictionary that maps parameter names to parameter values. Each
parameter value is a numpy array.
- loss_function: A function that can be called in the following ways:
scores = loss_function(X, model, reg=reg)
loss, grads = loss_function(X, model, y, reg=reg)
- reg: Regularization strength. This will be passed to the loss function.
- learning_rate: Initial learning rate to use.
- momentum: Parameter to use for momentum updates.
- learning_rate_decay: The learning rate is multiplied by this after each
epoch.
- update: The update rule to use. One of 'sgd', 'momentum', or 'rmsprop'.
- sample_batches: If True, use a minibatch of data for each parameter update
(stochastic gradient descent); if False, use the entire training set for
each parameter update (gradient descent).
- num_epochs: The number of epochs to take over the training data.
- batch_size: The number of training samples to use at each iteration.
- acc_frequency: If set to an integer, we compute the training and
validation set error after every acc_frequency iterations.
- verbose: If True, print status after each epoch.
Returns a tuple of:
- best_model: The model that got the highest validation accuracy during
training.
- loss_history: List containing the value of the loss function at each
iteration.
- train_acc_history: List storing the training set accuracy at each epoch.
- val_acc_history: List storing the validation set accuracy at each epoch.
"""
    N = X.shape[0]
    print "N: ", N

    if sample_batches:
      iterations_per_epoch = N / batch_size  # using SGD
    else:
      iterations_per_epoch = 1  # using GD
    num_iters = num_epochs * iterations_per_epoch
    epoch = 0
    best_val_acc = 0.0
    best_model = {}
    loss_history = []
    train_acc_history = []
    val_acc_history = []
    #print "number of iterations: ", num_iters
    for it in xrange(num_iters):
      #if it % 10 == 0: print 'starting iteration ', it

      # get batch of data
      if sample_batches:
        batch_mask = np.random.choice(N, batch_size)
        X_batch = X[batch_mask]
        y_batch = y[batch_mask]
      else:
        # no SGD used, full gradient descent
        X_batch = X
        y_batch = y

      # evaluate cost and gradient
      cost, grads = loss_function(X_batch, model, y_batch, reg)
      #print cost, grads['W1'].shape
      loss_history.append(cost)

      # perform a parameter update
      for p in model:
        # compute the parameter step
        if update == 'sgd':
          # vanilla update: x += -learning_rate * dx
          dx = -learning_rate * grads[p]
        elif update == 'momentum':
          if p not in self.step_cache:
            self.step_cache[p] = np.zeros(grads[p].shape)
          # Momentum update (http://cs231n.github.io/neural-networks-3/):
          #   v = mu * v - learning_rate * dx   # integrate velocity
          #   x += v                            # integrate position
          v = self.step_cache[p]
          v = (momentum * v) - (learning_rate * grads[p])
          self.step_cache[p] = v  # store the updated velocity for the next step
          dx = v
        elif update == 'nestrov-momentum':
          if p not in self.step_cache:
            self.step_cache[p] = np.zeros(grads[p].shape)
          # Nesterov momentum (http://cs231n.github.io/neural-networks-3/):
          #   v_prev = v                        # back this up
          #   v = mu * v - learning_rate * dx   # velocity update stays the same
          #   x += -mu * v_prev + (1 + mu) * v  # position update changes form
          v_prev = self.step_cache[p]
          v = (momentum * v_prev) - (learning_rate * grads[p])
          self.step_cache[p] = v  # store the updated velocity for the next step
          dx = (-momentum * v_prev) + (1 + momentum) * v
        elif update == 'rmsprop':
          decay_rate = 0.99  # you could also make this an option
          if p not in self.step_cache:
            self.step_cache[p] = np.zeros(grads[p].shape)
          # RMSProp (http://cs231n.github.io/neural-networks-3/):
          #   cache = decay_rate * cache + (1 - decay_rate) * dx**2
          #   x += -learning_rate * dx / np.sqrt(cache + 1e-8)
          self.step_cache[p] = (decay_rate * self.step_cache[p]) + (1 - decay_rate) * grads[p]**2
          dx = -learning_rate * grads[p] / np.sqrt(self.step_cache[p] + 1e-8)
elif update =="SMORMS3":
#####################################################################
# TODO: implement the SMORMS3 update formula and store the step #
# update into variable dx. #
#####################################################################
###############http://sifter.org/~simon/journal/20150420.html############
''' #didn't sort out yet!
Initialize:
mem = np.ones(model[p]) # Vector of 1's, same shape as p
g = np.zeros(model[p]) # Vector of 0's, same shape as p
g2 = np.zeros(model[p])
Given cost function cost(p), and some trivial epsilon (1e-16), take one step per mini-batch as follows:
grad = grads[p] # Gradient of cost(p) with respect to p
r = 1/(mem+1)
g = (1-r) * g + r * grad
g2 = (1-r) * g2 + r * grad ** 2
p = p - grad*min(lrate, g*g/(g2 + epsilon))/(sqrt(g2)+epsilon)
mem = 1 + mem*(1 - g*g/(g2 + epsilon))
(Where products, divides, and min are element-wise.)
'''
dx=0
#####################################################################
# END OF YOUR CODE #
#####################################################################
        else:
          raise ValueError('Unrecognized update type "%s"' % update)

        # update the parameters
        model[p] += dx

      # every epoch perform an evaluation on the validation set
      first_it = (it == 0)
      epoch_end = (it + 1) % iterations_per_epoch == 0
      acc_check = (acc_frequency is not None and it % acc_frequency == 0)
      if first_it or epoch_end or acc_check:
        if it > 0 and epoch_end:
          # decay the learning rate
          learning_rate *= learning_rate_decay
          epoch += 1

        # evaluate train accuracy
        if N > 1000:
          train_mask = np.random.choice(N, 1000)
          X_train_subset = X[train_mask]
          y_train_subset = y[train_mask]
        else:
          X_train_subset = X
          y_train_subset = y
        scores_train = loss_function(X_train_subset, model)
        y_pred_train = np.argmax(scores_train, axis=1)
        train_acc = np.mean(y_pred_train == y_train_subset)
        train_acc_history.append(train_acc)

        # evaluate val accuracy
        scores_val = loss_function(X_val, model)
        y_pred_val = np.argmax(scores_val, axis=1)
        val_acc = np.mean(y_pred_val == y_val)
        val_acc_history.append(val_acc)

        # keep track of the best model based on validation accuracy
        if val_acc > best_val_acc:
          # make a copy of the model
          best_val_acc = val_acc
          best_model = {}
          for p in model:
            best_model[p] = model[p].copy()

        # print progress if needed
        if verbose:
          print ('Finished epoch %d / %d: cost %f, train: %f, val %f, lr %e'
                 % (epoch, num_epochs, cost, train_acc, val_acc, learning_rate))

    if verbose:
      print 'finished optimization. best validation accuracy: %f' % (best_val_acc, )
    # return the best model and the training history statistics
    return best_model, loss_history, train_acc_history, val_acc_history
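

# ---------------------------------------------------------------------------
# Usage sketch (not part of the original trainer): a minimal, hypothetical
# example of the loss_function interface documented in train(). The linear
# softmax model below (a single 'W1' parameter) and the random toy data are
# assumptions made purely for illustration; swap in a real model and loss.
# ---------------------------------------------------------------------------
def _toy_softmax_loss(X, model, y=None, reg=0.0):
  """Linear softmax classifier with scores = X.dot(W1).
  Returns scores when y is None, otherwise (loss, grads)."""
  W = model['W1']
  scores = X.dot(W)
  if y is None:
    return scores
  # numerically stable softmax probabilities
  shifted = scores - np.max(scores, axis=1, keepdims=True)
  probs = np.exp(shifted)
  probs /= np.sum(probs, axis=1, keepdims=True)
  num_train = X.shape[0]
  loss = -np.mean(np.log(probs[np.arange(num_train), y]))
  loss += 0.5 * reg * np.sum(W * W)
  dscores = probs.copy()
  dscores[np.arange(num_train), y] -= 1
  dscores /= num_train
  grads = {'W1': X.T.dot(dscores) + reg * W}
  return loss, grads


if __name__ == '__main__':
  # Tiny smoke test on random data; all sizes and hyperparameters are arbitrary.
  np.random.seed(0)
  dim, num_classes = 20, 3
  X_train = np.random.randn(500, dim)
  y_train = np.random.randint(num_classes, size=500)
  X_val = np.random.randn(100, dim)
  y_val = np.random.randint(num_classes, size=100)
  model = {'W1': 0.01 * np.random.randn(dim, num_classes)}
  trainer = ClassifierTrainer()
  best_model, loss_history, train_acc_history, val_acc_history = trainer.train(
      X_train, y_train, X_val, y_val, model, _toy_softmax_loss,
      reg=1e-3, learning_rate=1e-2, momentum=0.9, update='momentum',
      num_epochs=5, batch_size=50, verbose=True)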