-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsimplekNN.py
executable file
·122 lines (109 loc) · 4.42 KB
/
simplekNN.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
"""An implementation of a k-nearest neighbor classifier."""
import random
from Classifier import Classifier as _classifier
class kNN(_classifier):
    """A k-nearest-neighbor classifier.

    Training simply stores the training set; classification finds the
    k stored examples closest to the query (per a caller-supplied
    distance function) and returns the majority label among them.

    In the example below, we create a k=1 nearest neighbor classifier
    that measures the distance between examples as the number of
    discrete features that don't match. We train the classifier on
    two training examples and test it on a third.
    >>> from ML1050.Example import LabeledExample
    >>> from ML1050.TrainingSet import createTrainingSet
    >>> example1 = LabeledExample(['y', 'a', 'a', 'a'])
    >>> example2 = LabeledExample(['n', 'b', 'b', 'b'])
    >>> trainingSet = createTrainingSet([example1, example2])
    >>> testExample = LabeledExample([None, 'a', 'a', 'b'])
    >>> def d(ex1, ex2):
    ...     dist = len(ex1)
    ...     for i in range(len(ex1)):
    ...         if ex1[i] == ex2[i]:
    ...             dist -= 1
    ...     return dist
    ...
    >>> k = kNN(1, d)
    >>> k.train(trainingSet)
    >>> k.test(testExample)
    'y'
    """
    def __init__(self, k, distanceFunction):
        """Create a classifier.

        k: number of neighbors consulted per classification.
        distanceFunction: callable(example1, example2) -> numeric
            distance; smaller means more similar.
        """
        self.k = k
        self.dist = distanceFunction
        # Scratch mapping {distance: [examples]} rebuilt on each test();
        # kept on the instance for post-hoc inspection.
        self.distances = {}
    def train(self, trainingSet):
        """Training is trivial for kNN. Just store the training set."""
        self.trainingSet = trainingSet
    def test(self, testExample):
        """Return the majority label of the k nearest training examples.

        Ties at the cut-off distance are broken by random shuffle; ties
        in the vote are broken by the sort order of (count, label).
        """
        # Bucket every training example by its distance from the query,
        # keyed by that distance.
        self.distances = {}
        for example in self.trainingSet:
            d = self.dist(example, testExample)
            self.distances.setdefault(d, []).append(example)
        # Walk the buckets from nearest to farthest, collecting exactly
        # k neighbors.  sorted() is used instead of list.sort() on
        # dict.keys() because dict.keys() returns a view (not a list)
        # on Python 3 and has no .sort() method.
        kNearestNeighbors = []
        neighborsNeeded = self.k
        for d in sorted(self.distances):
            neighborsAtD = self.distances[d]
            if len(neighborsAtD) < neighborsNeeded:
                # The whole bucket fits; keep collecting.  neighborsNeeded
                # stays > 0 here, so no early-exit check is required.
                kNearestNeighbors.extend(neighborsAtD)
                neighborsNeeded -= len(neighborsAtD)
            else:  # We have at least as many neighbors as we need
                # Break the tie at the cut-off distance randomly.
                random.shuffle(neighborsAtD)
                kNearestNeighbors.extend(neighborsAtD[0:neighborsNeeded])
                break
        # Each of the k nearest neighbors gets a vote for its label.
        votes = {}
        for neighbor in kNearestNeighbors:
            votes[neighbor.label] = votes.get(neighbor.label, 0) + 1
        listOfVotes = [ (num, label) for label, num in votes.items() ]
        listOfVotes.sort()
        numVotes, classLabel = listOfVotes[-1]
        return classLabel
    def validate(self, validation):
        """Return the error rate (fraction misclassified) on a
        labeled validation set."""
        error = 0.0
        for example in validation:
            if self.test(example) != example.label:
                error += 1
        error = error/len(validation)
        return error
    def __call__(self, example):
        """
        Calling an instance like a function is used to test a new example
        >>> from ML1050.Example import LabeledExample
        >>> from ML1050.TrainingSet import createTrainingSet
        >>> example3 = LabeledExample(['+', 'c', 'c', 'c', 'q'])
        >>> example4 = LabeledExample(['-', 'a', 'b', 'd', 'r'])
        >>> example5 = LabeledExample(['+', 'e', 'e', 'd', 'q'])
        >>> example6 = LabeledExample(['+', 'c', 'e', 'e', 'r'])
        >>> example7 = LabeledExample(['+', 'c', 'e', 'e', 'r'])
        >>> example8 = LabeledExample(['-', 'a', 'a', 'b', 'r'])
        >>> trainingSet = createTrainingSet([example3, example4, example5, example6, example7, example8])
        >>> testExample = LabeledExample([None, 'c', 'e', 'd', 'q'])
        >>> def d(ex1, ex2):
        ...     dist = len(ex1)
        ...     for i in range(len(ex1)):
        ...         if ex1[i] == ex2[i]:
        ...             dist -= 1
        ...     return dist
        ...
        >>> k = kNN(3, d)
        >>> k.train(trainingSet)
        >>> k(testExample)
        '+'
        """
        return self.test(example)
def _test():
"""Run the tests in the documentation strings."""
import doctest
return doctest.testmod(verbose=True)
# Script entry point: run the doctests, but skip them when the module is
# being exercised interactively inside IPython (which injects the global
# name __IP), since verbose doctest output would clutter the session.
if __name__ == "__main__":
    try:
        __IP # Are we running IPython?
    except NameError:
        _test() # If not, run the tests