-
Notifications
You must be signed in to change notification settings - Fork 0
/
KnnGrid.py
70 lines (57 loc) · 1.9 KB
/
KnnGrid.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
from collections import OrderedDict
import numpy as np
import pandas as pd
from pandas import DataFrame
from pandas import Series
import sklearn as sk
import sklearn.model_selection as model_selection
from sklearn.model_selection import ShuffleSplit
import sklearn.feature_selection as feature_selection
import sklearn.neighbors as neighbors
import sklearn.pipeline as pipeline
import warnings
warnings.filterwarnings("ignore")
import pcaextractor
import MaclearnUtilities
from MaclearnUtilities import safeFactorize
import RestrictedData
## Module-level aliases for the data collections exposed by RestrictedData,
## so later code can refer to them without the module prefix.
## NOTE(review): the structure of these objects is defined in RestrictedData
## and is not visible here -- confirm against that module.
xs, xnorms, annots, ys, ynums = (
    RestrictedData.xs,
    RestrictedData.xnorms,
    RestrictedData.annots,
    RestrictedData.ys,
    RestrictedData.ynums,
)
## Note: sklearn has built-in support (e.g. model_selection.GridSearchCV)
## for tuning model parameters over a grid of candidate values.
## Here the search is done manually instead, to make each step explicit.
def pandaize(f):
    """Decorate ``f`` so that it can be called with pandas-style inputs.

    The returned wrapper takes ``(estimator, X, y, **kwargs)``, converts
    ``X`` with ``np.array`` and ``y`` with ``safeFactorize``, and then calls
    ``f`` with the converted values. ``estimator`` and all keyword arguments
    are passed through unchanged, and the wrapper returns whatever ``f``
    returns.

    ``functools.wraps`` is applied so the wrapper keeps the ``__name__`` and
    ``__doc__`` of ``f`` and exposes the original as ``__wrapped__``.
    """
    # Local import keeps this block self-contained.
    import functools

    @functools.wraps(f)
    def pandaized(estimator, X, y, **kwargs):
        return f(estimator, np.array(X), safeFactorize(y), **kwargs)

    return pandaized
@pandaize
def cross_val_score_pd(estimator, X, y, **kwargs):
    """Call ``model_selection.cross_val_score`` on pandas-style inputs.

    The ``@pandaize`` decorator converts ``X`` with ``np.array`` and ``y``
    with ``safeFactorize`` before this body runs. Every extra keyword
    argument (for example ``cv``) is forwarded to ``cross_val_score``
    unchanged, and its result is returned as-is.
    """
    scores = model_selection.cross_val_score(estimator, X, y, **kwargs)
    return scores
def fsKnnFitterGenerator(k, nFeatures=10):
    """Build an unfitted feature-selection + k-nearest-neighbors pipeline.

    Parameters
    ----------
    k : int
        Number of neighbors used by the ``KNeighborsClassifier`` step.
    nFeatures : int, optional
        Number of features kept by the ``SelectKBest`` step. Defaults to
        10, the value that was previously hard-coded, so existing callers
        that pass only ``k`` get the same pipeline as before.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Pipeline with the steps ``'featsel'`` and ``'classifier'``.
    """
    ## NOTE(review): f_regression is scikit-learn's regression score
    ## function; f_classif is the one documented for classification
    ## targets. Kept unchanged here to preserve existing results -- confirm
    ## whether this choice is intentional.
    featureSelector = feature_selection.SelectKBest(
        feature_selection.f_regression, k=nFeatures)
    classifier = neighbors.KNeighborsClassifier(n_neighbors=k)
    return pipeline.Pipeline([
        ('featsel', featureSelector),
        ('classifier', classifier),
    ])
## One ShuffleSplit splitter per data set key in xnorms: 5 random splits,
## 20% of the samples held out each time, fixed seed for repeatability.
cvSchedules = {
    setName: ShuffleSplit(n_splits=5, test_size=0.2, random_state=123)
    for setName in xnorms
}

## Candidate neighbor counts to compare.
ks = [3, 5, 9, 15]


def _meanCvScore(nNeighbors, setName):
    """Return the mean cross-validation score of the k-NN pipeline.

    Builds a fresh pipeline with ``nNeighbors`` neighbors and scores it on
    the ``setName`` entries of ``xnorms`` / ``ys`` using the splits from
    ``cvSchedules[setName]``.
    """
    foldScores = cross_val_score_pd(
        estimator=fsKnnFitterGenerator(nNeighbors),
        X=xnorms[setName],
        y=ys[setName],
        cv=cvSchedules[setName].split(xnorms[setName]),
    )
    return np.mean(foldScores)


## One OrderedDict per candidate k, mapping data set key -> mean CV score.
knnModels = [
    OrderedDict((setName, _meanCvScore(k, setName)) for setName in xnorms)
    for k in ks
]

## Rows are indexed by k; columns are the data set keys of xnorms.
knnCvAccs = DataFrame(knnModels, index=ks)