"""
Airbnb New User Bookings Comptetition
https://www.kaggle.com/c/airbnb-recruiting-new-user-bookings
Author: Sandro Vega Pons ([email protected])
Routines to do feature selection.
"""
import xgboost
import numpy as np
from sklearn.linear_model import RandomizedLogisticRegression
from sklearn.ensemble import RandomForestClassifier
def xgb_feat_selection(X_train, y_train, X_valid, y_valid, random_state):
    """
    Feature selection based on the scores given to the features by an
    XGB classifier.
    """
    # Parameters of the xgb classifier used for feature selection
    params = {'eta': 0.09,
              'max_depth': 6,
              'subsample': 0.5,
              'colsample_bytree': 0.5,
              'objective': 'multi:softprob',
              'eval_metric': 'mlogloss',
              'num_class': 12,
              'seed': random_state}
    num_rounds = 1000
    xg_train = xgboost.DMatrix(X_train, label=y_train)
    xg_valid = xgboost.DMatrix(X_valid, label=y_valid)
    watchlist = [(xg_train, 'train'), (xg_valid, 'validation')]
    # Training the model, stopping at the best iteration
    xgb = xgboost.train(params, xg_train, num_rounds, watchlist,
                        early_stopping_rounds=10)
    # Getting the score of each feature. Scores come back as a dict of the
    # form {'fn': x}, meaning the n-th feature has score x; features the
    # model never split on are absent and keep a score of 0.
    f_score = xgb.get_fscore()
    feats = np.zeros(X_train.shape[1])
    for k, v in f_score.items():
        feats[int(k[1:])] = v
    # Normalizing the scores to [0, 1]
    feats = feats / float(np.max(feats))
    np.save('save/feat_sel_xgb.npy', feats)
    return feats
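
# A minimal usage sketch (synthetic data for illustration only; the `save/`
# directory must already exist, since the function writes
# save/feat_sel_xgb.npy):
#
#     rng = np.random.RandomState(0)
#     X = rng.rand(500, 30)
#     y = rng.randint(0, 12, size=500)
#     scores = xgb_feat_selection(X[:400], y[:400], X[400:], y[400:],
#                                 random_state=0)
#     # scores has shape (30,), normalized so the best feature scores 1.0
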
def log_reg_feat_selection(X_train, y_train, X_valid, y_valid, random_state):
    """
    Feature selection based on the scores given to the features by the
    RandomizedLogisticRegression algorithm. X_valid and y_valid are unused;
    they are kept so all scorers share the same signature.
    """
    rlr = RandomizedLogisticRegression(C=[0.001, 0.01, 0.1, 1.],
                                       sample_fraction=0.7,
                                       n_resampling=200,
                                       selection_threshold=0.25,
                                       verbose=5, n_jobs=-1,
                                       random_state=random_state)
    rlr.fit(X_train, y_train)
    np.save('save/feat_sel_log_reg.npy', rlr.scores_)
    return rlr.scores_
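
# RandomizedLogisticRegression is gone from scikit-learn >= 0.21. On newer
# versions, a rough substitute (not equivalent, just a sketch in the same
# stability-selection spirit) is to average logistic-regression coefficient
# magnitudes over bootstrap resamples:
#
#     from sklearn.linear_model import LogisticRegression
#     from sklearn.utils import resample
#     scores = np.zeros(X_train.shape[1])
#     for seed in range(200):
#         Xb, yb = resample(X_train, y_train, random_state=seed)
#         lr = LogisticRegression(C=0.1, max_iter=1000).fit(Xb, yb)
#         scores += np.abs(lr.coef_).max(axis=0)
#     scores /= scores.max()
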
def random_forest_selection(X_train, y_train, X_valid, y_valid, random_state):
    """
    Feature selection based on the scores given to the features by the
    RandomForest algorithm. X_valid and y_valid are unused; they are kept
    so all scorers share the same signature.
    """
    rf = RandomForestClassifier(n_estimators=400, n_jobs=-1,
                                criterion='gini',
                                max_depth=15,
                                max_features=0.2,
                                max_leaf_nodes=20,
                                min_samples_split=50,
                                random_state=random_state, verbose=10)
    rf.fit(X_train, y_train)
    feat_imp = rf.feature_importances_
    np.save('save/feat_sel_random_forest.npy', feat_imp)
    # Alternative: binarize here instead of in apply_feat_sel_by_percent
    # th = np.sort(feat_imp)[::-1][int(len(feat_imp)*7./10.)]
    # feats = feat_imp > th
    return feat_imp
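
# Note when picking a threshold downstream: the three scorers are not on a
# common scale. xgb_feat_selection max-normalizes to [0, 1],
# RandomizedLogisticRegression.scores_ are selection frequencies in [0, 1],
# and feature_importances_ sum to 1. A quick way to inspect the saved scores
# (file names as written above):
#
#     for name in ('xgb', 'log_reg', 'random_forest'):
#         s = np.load('save/feat_sel_%s.npy' % name)
#         print(name, s.min(), s.max(), s.sum())
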
def apply_feat_sel(X_train, X_valid, X_test, f_type='xgboost', th=0.001):
    """
    Selects the features with the highest scores according to a precomputed
    feature scoring.

    Parameters:
    ----------
    X_train: training set
    X_valid: validation set
    X_test: test set
    f_type: string
        Type of feature scoring system to be used. Possible values are:
        'xgboost', for feature selection based on xgboost scoring.
        'log_reg', for feature selection based on logistic regression scoring.
        'random_forest', for feature selection based on random forest scoring.
    th: float
        Threshold. Features with a score higher than th are kept; the others
        are discarded. The scorers are on different scales, so th should be
        chosen per scorer.

    Return:
    ------
    X_train, X_valid, X_test: Train, validation and test sets after feature
        selection.
    """
    if f_type == 'xgboost':
        scores = np.load('save/feat_sel_xgb.npy')
    elif f_type == 'log_reg':
        scores = np.load('save/feat_sel_log_reg.npy')
    elif f_type == 'random_forest':
        scores = np.load('save/feat_sel_random_forest.npy')
    else:
        raise ValueError('Unknown f_type: %s' % f_type)
    feats = scores > th
    X_train = X_train[:, feats]
    X_valid = X_valid[:, feats]
    X_test = X_test[:, feats]
    print('Keeping %d features from the original %d'
          % (X_train.shape[1], feats.shape[0]))
    return X_train, X_valid, X_test
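
# Example (assumes one of the scoring functions above has already been run,
# so the corresponding .npy file exists under save/):
#
#     X_train, X_valid, X_test = apply_feat_sel(X_train, X_valid, X_test,
#                                               f_type='xgboost', th=0.01)
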
def apply_feat_sel_by_percent(X_train, X_valid, X_test, f_type='xgboost',
                              percent=0.7):
    """
    Selects the features with the highest scores according to a precomputed
    feature scoring. The top percent features are kept.

    Parameters:
    ----------
    X_train: training set
    X_valid: validation set
    X_test: test set
    f_type: string
        Type of feature scoring system to be used. Possible values are:
        'xgboost', for feature selection based on xgboost scoring.
        'log_reg', for feature selection based on logistic regression scoring.
        'random_forest', for feature selection based on random forest scoring.
    percent: float in [0, 1]
        percent*100 gives the percentage of features to keep.

    Return:
    ------
    X_train, X_valid, X_test: Train, validation and test sets after feature
        selection.
    """
    if f_type == 'xgboost':
        scores = np.load('save/feat_sel_xgb.npy')
    elif f_type == 'log_reg':
        scores = np.load('save/feat_sel_log_reg.npy')
    elif f_type == 'random_forest':
        scores = np.load('save/feat_sel_random_forest.npy')
    else:
        raise ValueError('Unknown f_type: %s' % f_type)
    # Threshold at the percent cut-off of the descending score ranking
    th = np.sort(scores)[::-1][int(len(scores) * percent)]
    feats = scores > th
    X_train = X_train[:, feats]
    X_valid = X_valid[:, feats]
    X_test = X_test[:, feats]
    print('Keeping %d features from the original %d'
          % (X_train.shape[1], feats.shape[0]))
    return X_train, X_valid, X_test
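

if __name__ == '__main__':
    # A minimal end-to-end sketch on synthetic data (illustration only: the
    # shapes, labels and percent below are arbitrary assumptions, not values
    # from the competition pipeline).
    import os
    os.makedirs('save', exist_ok=True)  # the scorers write their output here

    rng = np.random.RandomState(42)
    X = rng.rand(600, 40)
    y = rng.randint(0, 12, size=600)
    X_train, y_train = X[:400], y[:400]
    X_valid, y_valid = X[400:500], y[400:500]
    X_test = X[500:]

    # Score the features with the random forest, then keep the top 70%.
    random_forest_selection(X_train, y_train, X_valid, y_valid,
                            random_state=42)
    X_train, X_valid, X_test = apply_feat_sel_by_percent(
        X_train, X_valid, X_test, f_type='random_forest', percent=0.7)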