hw1.cs4641_kNN_LSOA.py
#Sheena Ganju, CS 4641 HW 1
#k-nearest neighbors (kNN) implementation using scikit-learn,
#help from http://www.kdnuggets.com/2016/10/beginners-guide-neural-networks-python-scikit-learn.html/2
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import learning_curve
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import time
#Read data in using pandas
trainDataSet = pd.read_csv("london_crime_by_lsoa.csv", sep=",", header= None, low_memory= False)
print("Dataset: ", trainDataSet.head())
#encode text data to integers using getDummies
traindata = pd.get_dummies(trainDataSet)
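# Hedged illustration (an addition, not part of the original script) of what
# get_dummies does: each text column is expanded into one 0/1 indicator column
# per distinct value, e.g. pd.get_dummies(pd.DataFrame({"cat": ["Burglary", "Drugs"]}))
# produces the columns cat_Burglary and cat_Drugs.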
# Use major_category, month, and year to predict whether a record is violent or not
# with a kNN classifier; split the data using train_test_split
X = traindata.values[1:50000,:3]
Y = traindata.values[1:50000,4]
### split the data into train and test sets for the classifier
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.33, random_state= 20)
#start timer
t0 = time.time()
#find the best k by plotting test accuracy for each candidate value
kArray = [1, 2, 3, 4, 5]
ansArray = []
for each in kArray:
    clf = KNeighborsClassifier(n_neighbors = each)
    clf.fit(X_train, Y_train)
    f = clf.predict(X_test)
    g = accuracy_score(Y_test, f)*100
    ansArray.append(g)
fig = plt.figure()
ax= fig.add_subplot(111)
ax.set_title("K vs Accuracy")
ax.plot(kArray, ansArray)
plt.show()
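# A minimal cross-validation sketch (an addition, not part of the original
# selection loop above): score each k with 5-fold cross_val_score on the
# training split instead of a single train/test evaluation, which is less
# sensitive to one particular split.
cvAccuracy = []
for each in kArray:
    scores = cross_val_score(KNeighborsClassifier(n_neighbors = each), X_train, Y_train, cv = 5)
    cvAccuracy.append(scores.mean()*100)
print("Cross-validated accuracy by k: " + str(cvAccuracy))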
#fit the final classifier on the training split only so the test score is not inflated
clf = KNeighborsClassifier(n_neighbors = 5, weights = "distance")
clf.fit(X_train, Y_train)
print("Classifier score, training: " + str(clf.score(X_train, Y_train)))
print("Classifier score, testing: " + str(clf.score(X_test, Y_test)))
train_prediction = clf.predict(X_train)
trainaccuracy = accuracy_score(Y_train, train_prediction)*100
print("The training accuracy for this is " + str(trainaccuracy))
#output
Y_prediction = clf.predict(X_test)
accuracy = accuracy_score(Y_test, Y_prediction)*100
print("The test classification works with " + str(accuracy) + "% accuracy")
#classification precision score, metrics log loss
from sklearn.metrics import precision_score
from sklearn.metrics import log_loss
precision = precision_score(Y_test, Y_prediction, average = "weighted")*100
#log_loss expects predicted class probabilities rather than hard labels
Y_probabilities = clf.predict_proba(X_test)
loss = log_loss(Y_test, Y_probabilities, labels = clf.classes_)
print("Precision: " + str(precision))
print("Log loss: " + str(loss))
#time program took to run
print(str(time.time() - t0) + " seconds wall time.")
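# Hedged addition (not in the original script): learning_curve is imported
# above; this sketch plots mean training and cross-validation accuracy as the
# training set grows, which helps show whether the k = 5 model is over- or
# under-fitting.
train_sizes, train_scores, valid_scores = learning_curve(
    KNeighborsClassifier(n_neighbors = 5, weights = "distance"), X_train, Y_train, cv = 5)
fig = plt.figure()
ax = fig.add_subplot(111)
ax.set_title("kNN Learning Curve")
ax.set_xlabel("Training examples")
ax.set_ylabel("Accuracy")
ax.plot(train_sizes, train_scores.mean(axis = 1), label = "training score")
ax.plot(train_sizes, valid_scores.mean(axis = 1), label = "cross-validation score")
ax.legend()
plt.show()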