# -*- coding: utf-8 -*-
"""189hw4.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1wjD3bz4g-OCXtrplceR-Y9Azek1fauT0
"""
from google.colab import drive
drive.mount('/content/gdrive')
!unzip -q "/content/gdrive/My Drive/Sophomore/hw4.zip"
# imports
import numpy as np
import matplotlib.pyplot as plt
from scipy import io
from numpy import linalg as LA
from scipy.stats import norm
#set random seed
np.random.seed(42)
wine = io.loadmat('data.mat')
reg_factor = 0.1
step_size = 0.0001
# from hw1
def shuffle_and_partition(data, labels, val_size):
    data_size = len(data)
    # a fractional val_size is interpreted as a proportion of the data
    if val_size < 1.0:
        val_size = int(data_size * val_size)
    training_size = data_size - val_size
    index_perm = np.random.permutation(data_size)
    train_index = index_perm[:training_size]
    val_index = index_perm[training_size:]
    train_pts = data[train_index]
    train_labels = labels[train_index]
    val_pts = data[val_index]
    val_labels = labels[val_index]
    return train_pts, train_labels, val_pts, val_labels
wine_feature = wine["X"]
wine_label = wine["y"]
wine_train_feature, wine_train_label, wine_val_feature, wine_val_label = shuffle_and_partition(
wine_feature, wine_label, 0.20)
# batch gradient descent: setup
d = len(wine_train_feature[0])   # number of features
n = len(wine_train_feature)      # number of training points
n_2 = len(wine_val_feature)      # number of validation points
# normalize the training and validation features using the training statistics
for i in range(d):
    mean = np.mean(wine_train_feature[:, i])
    sd = np.std(wine_train_feature[:, i])
    wine_train_feature[:, i] = (wine_train_feature[:, i] - mean) / sd
    wine_val_feature[:, i] = (wine_val_feature[:, i] - mean) / sd
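# A vectorized alternative to the per-column loop above (a sketch only; it is
# not called here, so the data is not standardized twice). Broadcasting
# computes all column means/stds at once, and the same training statistics
# are reused for the held-out split.
def standardize(train, other):
    mean = train.mean(axis=0)
    sd = train.std(axis=0)
    return (train - mean) / sd, (other - mean) / sd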
# add a fictitious dimension to the design matrix
fictitious = np.ones((1, n))
fictitious_2 = np.ones((1, n_2))
wine_train_feature = np.concatenate((wine_train_feature, fictitious.T), axis=1)
wine_val_feature = np.concatenate((wine_val_feature, fictitious_2.T), axis=1)
# function for computing the logistic function s(Xw)
def logistic_fn(X, w):
    # s has one entry per row of X
    n = len(X)
    s = np.zeros((n, 1))
    for i in range(n):
        s[i] = 1 / (1 + np.exp(-np.dot(X[i], w)))
    return s
from scipy.special import expit, logit
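# logistic_fn above loops over rows; a vectorized sketch using scipy's expit
# (imported above) computes s(Xw) in one call and is numerically stabler for
# large |Xw|. Not used below, so the original code paths stay intact.
def logistic_fn_vec(X, w):
    return expit(X @ w)  # expit(z) = 1 / (1 + exp(-z))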
# start batch gradient descent
reg_factor = 0.1
step_size = 0.0001
w = np.zeros((len(wine_train_feature[0]), 1))  # fictitious dimension already included
num_iter = 8000
cost_fn = []
X = wine_train_feature
y = wine_train_label
n = len(wine_train_feature)
for i in range(num_iter):
    s = logistic_fn(X, w)
    # gradient step on the l2-regularized logistic loss:
    # w <- w + eta * X^T (y - s) - 2 * eta * lambda * w
    w = w + step_size * np.matmul(X.T, np.subtract(y, s)) - 2 * step_size * reg_factor * w
    # cost J(w) = -y^T log(s) - (1 - y)^T log(1 - s) + lambda * ||w||^2
    ones = np.ones((n, 1))
    part_1 = np.matmul(y.T, np.log(s))
    part_2 = np.matmul(np.subtract(ones, y).T, np.log(np.subtract(ones, s)))
    cost = (-1) * part_1 - part_2 + reg_factor * np.sum(np.square(w))
    cost_fn.append(cost)
cost_fn_flatten_bgd = []
for cost in cost_fn:
    cost_fn_flatten_bgd.append(cost[0][0])
plt.figure()
plt.plot(list(range(num_iter)), cost_fn_flatten_bgd)
plt.title("Cost Function vs Number of Iterations (BGD)")
plt.xlabel("Number of Iterations")
plt.ylabel("Cost Function")
w_bgd = w.reshape(-1)  # flatten to a 1-D weight vector (d + 1 entries)
w_bgd
# PART 3 Stochastic Gradient Descent
def s_i(x_i, w):
    # scalar sigmoid for a single sample
    s = 1 / (1 + np.exp(-np.dot(x_i, w)))
    return s
# start stochastic gradient descent
w = np.zeros(len(wine_train_feature[0]))
reg_factor = 0.01
step_size = 0.01
cost_fn = []
X = wine_train_feature
y = wine_train_label
n = len(wine_train_feature)
# one pass over the training set, one sample per update
for i in range(n):
    x_i = X[i]
    y_i = y[i]
    si = s_i(x_i, w)
    # stochastic step: w <- w + eta * (y_i - s_i) * x_i - 2 * eta * lambda * w
    w = w + step_size * (y_i - si) * x_i - 2 * step_size * reg_factor * w
    # track the full training cost after each update
    ones = np.ones((n, 1))
    w2 = w.reshape((len(w), 1))
    s = logistic_fn(X, w2)
    part_1 = np.matmul(y.T, np.log(s))
    part_2 = np.matmul(np.subtract(ones, y).T, np.log(np.subtract(ones, s)))
    cost = (-1) * part_1 - part_2 + reg_factor * np.sum(np.square(w))
    cost_fn.append(cost)
cost_fn_flatten_sgd = []
for cost in cost_fn:
    cost_fn_flatten_sgd.append(cost[0][0])
w_sgd = w
plt.figure()
plt.plot(list(range(n)), cost_fn_flatten_sgd)
plt.title("Cost Function vs Number of Iterations (SGD)")
plt.xlabel("Number of Iterations")
plt.ylabel("Cost Function")
# PART 5 Decaying SGD
reg_factor = 0.01
w = np.zeros(len(wine_train_feature[0]))
n = len(wine_train_feature)
num_iter = n
cost_fn = []
X = wine_train_feature
y = wine_train_label
delta = 1
for i in range(num_iter):
    x_i = X[i % n]
    y_i = y[i % n]
    si = s_i(x_i, w)
    # decaying step size: eta_i = delta / (i + 0.3)
    step_size = delta / (i + 0.3)
    w = w + step_size * (y_i - si) * x_i - 2 * step_size * reg_factor * w
    ones = np.ones((n, 1))
    w2 = w.reshape((len(w), 1))
    s = logistic_fn(X, w2)
    part_1 = np.matmul(y.T, np.log(s))
    part_2 = np.matmul(np.subtract(ones, y).T, np.log(np.subtract(ones, s)))
    cost = (-1) * part_1 - part_2 + reg_factor * np.sum(np.square(w))
    cost_fn.append(cost)
cost_fn_flatten_sgd_decay = []
for cost in cost_fn:
    cost_fn_flatten_sgd_decay.append(cost[0][0])
w_sgd_decay = w
n = len(wine_train_feature)
plt.figure()
plt.plot(list(range(n)), cost_fn_flatten_sgd_decay, label="With decaying step size")
plt.plot(list(range(n)), cost_fn_flatten_sgd, label="Without decaying step size")
plt.legend()
plt.title(
    "Cost Function vs Number of Iterations: Decaying vs Constant Step Size (SGD)")
plt.xlabel("Number of Iterations")
plt.ylabel("Cost Function")
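# Hedged aside (not part of the original assignment flow): a quick sanity
# check of the three weight vectors trained above, measured as accuracy on
# the held-out validation split, thresholding s(Xw) at 0.5.
def val_accuracy(w_vec):
    probs = logistic_fn(wine_val_feature, w_vec).reshape(-1)
    preds = (probs > 0.5).astype(int)
    return np.mean(preds == wine_val_label.reshape(-1))

print("BGD validation accuracy:         ", val_accuracy(w_bgd))
print("SGD validation accuracy:         ", val_accuracy(w_sgd))
print("Decaying SGD validation accuracy:", val_accuracy(w_sgd_decay))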
# normalize wine_feature
for i in range(d):
    mean = np.mean(wine_feature[:, i])
    sd = np.std(wine_feature[:, i])
    wine_feature[:, i] = (wine_feature[:, i] - mean) / sd
fictitious = np.ones((1, len(wine_feature)))
wine_feature = np.concatenate((wine_feature, fictitious.T), axis=1)
# PART 6 Train best classifier
# use batch gradient descent on the full data set
reg_factor = 0.1
step_size = 0.0001
full_len = len(wine_feature)
w = np.zeros((len(wine_feature[0]), 1))
num_iter = 8000
X = wine_feature
y = wine_label
for i in range(num_iter):
    s = logistic_fn(X, w)
    w = w + step_size * np.matmul(X.T, np.subtract(y, s)) - 2 * step_size * reg_factor * w
best_w = w.reshape(-1)
# use stochastic gradient descent (multiple passes over the training split)
w2 = np.zeros(len(wine_train_feature[0]))
reg_factor = 0.01
num_iter = 8000
step_size = 0.01
cost_fn = []
X = wine_train_feature
y = wine_train_label
n = len(wine_train_feature)
for i in range(num_iter):
    x_i = X[i % n]
    y_i = y[i % n]
    si = s_i(x_i, w2)
    w2 = w2 + step_size * (y_i - si) * x_i - 2 * step_size * reg_factor * w2
best_w_sgd = w2
wine_test = wine["X_test"]
# normalize wine_test
for i in range(d):
    mean = np.mean(wine_test[:, i])
    sd = np.std(wine_test[:, i])
    wine_test[:, i] = (wine_test[:, i] - mean) / sd
fictitious = np.ones((1, len(wine_test)))
wine_test = np.concatenate((wine_test, fictitious.T), axis=1)
pred_y = logistic_fn(wine_test, best_w)
pred_y = pred_y.reshape((len(wine_test),))
final_prediction = (pred_y > 0.5).astype(np.int32)
import pandas as pd
def results_to_csv(y_test):
    y_test = y_test.astype(int)
    df = pd.DataFrame({'Category': y_test})
    df.index += 1  # ensures that the index starts at 1
    df.to_csv('submission.csv', index_label='Id')
results_to_csv(final_prediction)
# predictions made by stochastic gradient descent
# (note: this second call overwrites the submission.csv written above)
pred_y = logistic_fn(wine_test, best_w_sgd)
pred_y = pred_y.reshape((len(wine_test),))
final_prediction2 = (pred_y > 0.5).astype(np.int32)
results_to_csv(final_prediction2)
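# Hedged aside (not in the original flow): fraction of test points on which
# the BGD and SGD classifiers agree, as a quick consistency check.
print("BGD/SGD prediction agreement:", np.mean(final_prediction == final_prediction2))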
# Question 5
def lp_norm_fn(w1, w2, p):
    # ||(w1, w2)||_p = (|w1|^p + |w2|^p)^(1/p)
    result = (abs(w1)**p + abs(w2)**p) ** (1/p)
    return result
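# The three parts below repeat the same contour-plot recipe with different p,
# so a small helper consolidates them (a sketch; the helper name is my own,
# and it reproduces the original grid, figsize, and colorbar exactly).
def plot_lp_contours(p, delta=0.05):
    plt.figure(figsize=(5, 5))
    x = np.arange(-3, 3, delta)
    y = np.arange(-3, 3, delta)
    X, Y = np.meshgrid(x, y)
    Z = lp_norm_fn(X, Y, p)
    plt.contour(X, Y, Z)
    plt.colorbar()
    plt.show()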
# part a) p = 0.5
plot_lp_contours(0.5)
# part b) p = 1
plot_lp_contours(1)
# part c) p = 2
plot_lp_contours(2)