-
Notifications
You must be signed in to change notification settings - Fork 496
/
Copy pathk_nn.py
95 lines (80 loc) · 3.06 KB
/
k_nn.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import collections
import numpy
import requests
TRAIN_RATIO = 0.8
def k_nn(train_x, train_y, test_x, k):
"""
Predicts the class of given data point using k-NN algorithm
:param train_x: Features of training dataset
:param train_y: Class of training dataset
:param test_x: Featues of testing data point
:param k: Value of 'k' in k-NN algorithm
:return: Predicted class of test_x
"""
# Calculate the distance of current point from all other points.
# Complexity: O(nf), where f is number of features
dist = numpy.sqrt(sum(numpy.square(train_x - test_x).T)).T
# Add labels with corresponding distances
dist_with_label = numpy.hstack([train_y, dist]).T
# Get index of smallest k elements. Complexity: O(n + klog(k))
dist_with_label_sorted = numpy.argpartition(dist_with_label, -k).T[:k]
# Arrange 'dist_with_label' according to sorted index of distance
inter = dist_with_label[:, dist_with_label_sorted[:, 1]].reshape([2, k])
# Get the top labels in 1D array
top_labels = numpy.asarray(inter[0]).reshape([k])
# Get the most common neighbor. Complexity: O(k)
k_nn_label = collections.Counter(top_labels).most_common(1)[0][0]
return k_nn_label
def load_iris_data(shuffle=False):
"""
Loads iris dataset from internet
:param shuffle: True will result in shuffling of dataset
:return: Tuple of features and corresponding classes from iris dataset
"""
req = requests.get('https://archive.ics.uci.edu/'
'ml/machine-learning-databases/'
'iris/iris.data')
req_html = req.text
data_str = req_html.strip().split('\n')
data = []
for item in data_str:
item = item.split(',')
data.append(item)
data = numpy.matrix(data)
if shuffle:
numpy.random.shuffle(data)
data_x = data[:, :-1].astype(float)
data_y = data[:, -1]
return data_x, data_y
def k_nn_test(tr_x, tr_y, te_x, te_y, k):
"""
Tests the k-NN algorithm for given dataset and returns error ratio
:param tr_x: Features of training dataset
:param tr_y: Class of training dataset
:param te_x: Features of testing dataset
:param te_y: Actual class of testing dataset
:param k: Value of "k" in k-NN algorithm
:return: Error ratio for given dataset and model
"""
test_size = len(te_x)
error = 0.
for i in range(test_size):
x = te_x[i]
y_actual = te_y[i]
y_predicted = k_nn(tr_x, tr_y, x, k)
if y_actual != y_predicted:
error += 1
return error / test_size
def main():
data_x, data_y = load_iris_data(shuffle=True)
# Calculate size of train and test partitions
train_size = int(len(data_x) * TRAIN_RATIO)
test_size = len(data_x) - train_size
# Generate train and test partitions
train_x, train_y = data_x[:train_size], data_y[:train_size]
test_x, test_y = data_x[test_size + 1:], data_y[test_size + 1:]
# Test for error
error_ratio = k_nn_test(train_x, train_y, test_x, test_y, 7)
print(error_ratio)
if __name__ == '__main__':
main()