# LAB5_assignment5.py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
matplotlib.style.use('ggplot') # Look Pretty

def plotDecisionBoundary(model, X, y):
    fig = plt.figure()
    ax = fig.add_subplot(111)

    padding = 0.6
    resolution = 0.0025
    colors = ['royalblue', 'forestgreen', 'ghostwhite']

    # Calculate the boundaries
    x_min, x_max = X[:, 0].min(), X[:, 0].max()
    y_min, y_max = X[:, 1].min(), X[:, 1].max()
    x_range = x_max - x_min
    y_range = y_max - y_min
    x_min -= x_range * padding
    y_min -= y_range * padding
    x_max += x_range * padding
    y_max += y_range * padding

    # Create a 2D grid matrix. The values stored in the matrix
    # are the predictions of the class at said location
    xx, yy = np.meshgrid(np.arange(x_min, x_max, resolution),
                         np.arange(y_min, y_max, resolution))

    # What class does the classifier say?
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    # Plot the contour map
    cs = plt.contourf(xx, yy, Z, cmap=plt.cm.terrain)

    # Plot the original points as well...
    for label in range(len(np.unique(y))):
        indices = np.where(y == label)
        plt.scatter(X[indices, 0], X[indices, 1], c=colors[label],
                    label=str(label), alpha=0.8)

    p = model.get_params()
    plt.axis('tight')
    plt.title('K = ' + str(p['n_neighbors']))
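
# NOTE: plotDecisionBoundary assumes X is a two-column NumPy array and that
# model was fitted on those same two features, since the mesh grid it
# predicts over is strictly 2D. That is why the PCA projection further down
# is required before calling it.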
#
# TODO: Load up the dataset into a variable called X. Check the .head() and
# compare it to the file you opened in a text editor. Make sure you're
# loading your data properly--don't fail on the 1st step!
#
x = pd.read_csv('C:/Users/mckinns/Documents/GitHub/DAT210x/Module5/Datasets/wheat.data',
                sep=',', header=0)
# TODO: Copy the 'wheat_type' series slice out of X, and into a series
# called 'y'. Then drop the original 'wheat_type' column from X.
#
# .. your code here ..
y = x['wheat_type']  # single brackets -> a Series, so .cat.codes works below
x.drop(labels=['id', 'wheat_type'], axis=1, inplace=True)
# TODO: Do a quick, "ordinal" conversion of 'y'. In actuality our
# classification isn't ordinal, but just as an experiment...
#
y = y.astype('category').cat.codes
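# .cat.codes numbers the categories in sorted order; for the three wheat
# varieties in this file that should come out as canadian -> 0, kama -> 1,
# rosa -> 2.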
#
# TODO: Basic nan munging. Fill each row's nans with the mean of the feature
#
x.isnull().sum()  # inspect which features actually contain NaNs
x['compactness'] = x['compactness'].fillna(x.compactness.mean())
x['width'] = x['width'].fillna(x.width.mean())
x['groove'] = x['groove'].fillna(x.groove.mean())
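# An equivalent one-pass alternative (idempotent after the fills above):
x = x.fillna(x.mean())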
#
# TODO: Split X into training and testing data sets using train_test_split().
# INFO: Use 0.33 test size, and use random_state=1. This is important
# so that your answers are verifiable. In the real world, you wouldn't
# specify a random_state.
#
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=1)
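# (If the answers didn't need to be verifiable, passing stratify=y here would
# also keep the wheat-type class balance in both splits.)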
#
# TODO: Create an instance of SKLearn's Normalizer class and then train it
# using its .fit() method against your *training* data.
#
# NOTE: The reason you only fit against your training data is because in a
# real-world situation, you'll only have your training data to train with!
# In this lab setting, you have both train+test data; but in the wild,
# you'll only have your training data, and then unlabeled data you want to
# apply your models to.
#
from sklearn import preprocessing
normalizer = preprocessing.Normalizer().fit(X_train)
norm_X_train = normalizer.transform(X_train)
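# NOTE: Normalizer rescales each *sample* (row) to unit length; it is not a
# per-feature scaler like StandardScaler.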
#
# TODO: With your trained pre-processor, transform both your training AND
# testing data.
#
# NOTE: Any testing data has to be transformed with your preprocessor
# that has been fit against your training data, so that it exists in the same
# feature-space as the original data used to train your models.
#
# .. your code here ..
norm_X_test = normalizer.transform(X_test)
#
# TODO: Just like your preprocessing transformation, create a PCA
# transformation as well. Fit it against your training data, and then
# project your training and testing features into PCA space using the
# PCA model's .transform() method.
#
# NOTE: This has to be done because the only way to visualize the decision
# boundary in 2D would be if your KNN algo ran in 2D as well:
#
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca_X_train = pca.fit_transform(norm_X_train)
pca_X_test = pca.transform(norm_X_test)
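# Optional sanity check: how much of the normalized features' variance the
# 2D projection retains.
print(pca.explained_variance_ratio_)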
#
# TODO: Create and train a KNeighborsClassifier. Start with K=9 neighbors.
# NOTE: Be sure to train your classifier against the pre-processed, PCA-
# transformed training data above! You do not, of course, need to transform
# your labels.
#
# .. your code here ..
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=9)
y_train = np.ravel(y_train)
knn.fit(pca_X_train, y_train)

# HINT: Ensure your KNeighbors classifier object from earlier is called 'knn'
plotDecisionBoundary(knn, pca_X_train, y_train)
#------------------------------------
#
# TODO: Display the accuracy score of your test data/labels, computed by
# your KNeighbors model.
#
# NOTE: You do NOT have to run .predict before calling .score, since
# .score will take care of running your predictions for you automatically.
#
predictions = knn.predict(pca_X_test)
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, predictions))
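# Equivalently, per the NOTE above, .score() runs the predictions for you:
print(knn.score(pca_X_test, y_test))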
#
# BONUS: Instead of the ordinal conversion, try to get this assignment
# working with a proper pandas get_dummies feature encoding. HINT: you
# might have to update some of the plotDecisionBoundary code. (A sketch of
# one approach follows after plt.show() below.)
plt.show()
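
#
# A sketch of one possible approach to the BONUS above (the helper name and
# its arguments are illustrative, not part of the course code): pd.get_dummies
# turns wheat_type into one 0/1 indicator column per variety.
# KNeighborsClassifier will fit such an indicator matrix, but .predict() then
# returns one column per class, so plotDecisionBoundary would need those
# predictions collapsed back to a single integer code before contourf, e.g.
# with np.argmax:
def bonus_get_dummies_sketch(features, wheat_type):
    y_dummies = pd.get_dummies(wheat_type)    # one indicator column per variety
    clf = KNeighborsClassifier(n_neighbors=9)
    clf.fit(features, y_dummies)
    # Collapse the (n_samples, n_classes) indicator predictions to codes:
    return np.argmax(clf.predict(features), axis=1)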