"""
Airbnb New User Bookings Competition
https://www.kaggle.com/c/airbnb-recruiting-new-user-bookings
Author: Sandro Vega Pons ([email protected])
Classifier based on xgboost.
"""
import os
import pickle
import numpy as np
from xgboost.sklearn import XGBClassifier
from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn.metrics import log_loss
from sklearn.utils import compute_sample_weight
from letor_metrics import ndcg_score


def clf_xgboost(data, cl_weight=None, random_state=0, ext_name="", verbose=True):
"""
XGBoost classifier
The function applies the classifier twice:
- First: Fit the classifier to (X_train, y_train) and predict on (X_valid).
The prediction is stored in 'save/valid' folder.
- Second: Fit the classifier to (X, y) = (X_train + X_valid, y_train + y_valid)
and predict on (X_test). The prediction is stored in 'save/test'
folder.
Parameters:
----------
data: list
[X_train, y_train, X_valid, y_valid, X_test]
cl_weight: None or Dictionary
Class weights, e.g. {0:1, 1:1.5, 2:1.6...} => weight for class 0 is 1,
for class 1 is 1.5, for class 2 is 1.6, and so on.
random_state: numpy RandomState
RandomState used for reproducibility
ext_name: string
Extra string to be used in the name of the stored prediction, e.g. it
can be used to identify specific parameter values that were used.
Result:
------
y_valid_pred: numpy ndarray shape=(n_samples_validation, n_classes)
Labels of the predictions for the validation set.
y_test_pred: numpy ndarray shape=(n_samples_test, n_classes)
Labels of the predictions for the test set.
Save:
----
y_valid_pred: it is stored in save/valid folder
y_test_pred: it is stored in save/test folder
"""
    # Accept either an int seed or a numpy RandomState; XGBClassifier's
    # `seed` parameter expects an integer.
    if not isinstance(random_state, np.random.RandomState):
        random_state = np.random.RandomState(random_state)
    xgb = XGBClassifier(max_depth=6, learning_rate=0.01, n_estimators=10000,
                        objective='multi:softprob', gamma=1., min_child_weight=1.,
                        max_delta_step=5., subsample=0.7, colsample_bytree=0.7,
                        reg_alpha=0., reg_lambda=1.,
                        seed=random_state.randint(0, 10000))
    X_train, y_train, X_valid, y_valid, X_test = data
    ### Working on (X_train => X_valid) ###
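    # Fit the scaler on the training fold only and reuse the fitted statistics
    # for the validation fold, so no information leaks from validation data.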
    ss = StandardScaler()
    XX_train = ss.fit_transform(X_train)
    XX_valid = ss.transform(X_valid)
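    # One-hot encode the validation labels: the NDCG@5 computation below
    # compares these binary relevance vectors with predicted probabilities.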
    lb = LabelBinarizer()
    lb.fit(y_train)
    yb_valid = lb.transform(y_valid)
    if cl_weight is None:
        xgb.fit(XX_train, y_train,
                eval_set=[(XX_valid, y_valid)],
                eval_metric='mlogloss',
                early_stopping_rounds=25, verbose=verbose)
    else:
        # Computing sample weights from class weights
        sw_train = compute_sample_weight(class_weight=cl_weight, y=y_train)
        xgb.fit(XX_train, y_train,
                sample_weight=sw_train,
                eval_set=[(XX_valid, y_valid)],
                eval_metric='mlogloss',
                early_stopping_rounds=25, verbose=verbose)
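    # Early stopping monitored mlogloss on the validation fold; best_iteration
    # is the boosting round with the lowest validation loss.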
    best_iter = xgb.best_iteration
    y_valid_pred = xgb.predict_proba(XX_valid, ntree_limit=best_iter)
    ndcg_xg = np.mean([ndcg_score(tr, pr, k=5) for tr, pr in
                       zip(yb_valid.tolist(), y_valid_pred.tolist())])
    print('NDCG: %s' % ndcg_xg)
    logloss_xg = log_loss(y_valid, y_valid_pred)
    print('Log-loss: %s' % logloss_xg)
    rnd = random_state.randint(1000, 9999)
    os.makedirs('save/valid', exist_ok=True)
    with open('save/valid/v_XGB_%s_%s_%s_%s' % (ext_name, rnd, round(ndcg_xg, 4),
                                                round(logloss_xg, 4)), 'wb') as f:
        pickle.dump(y_valid_pred, f)
    ### Working on (X => X_test) ###
    X = np.vstack((X_train, X_valid))
    y = np.hstack((y_train, y_valid))
    XX = ss.fit_transform(X)
    XX_test = ss.transform(X_test)
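    # No held-out fold remains for early stopping on the combined data, so
    # reuse the round count found above, with a small margin for the slightly
    # larger training set.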
    xgb.n_estimators = best_iter + 20
    if cl_weight is None:
        xgb.fit(XX, y)
    else:
        sw = compute_sample_weight(class_weight=cl_weight, y=y)
        xgb.fit(XX, y, sample_weight=sw)
    y_test_pred = xgb.predict_proba(XX_test)
    os.makedirs('save/test', exist_ok=True)
    with open('save/test/t_XGB_%s_%s' % (ext_name, rnd), 'wb') as f:
        pickle.dump(y_test_pred, f)
    return y_valid_pred, y_test_pred
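

if __name__ == '__main__':
    # Minimal usage sketch on synthetic stand-in data; this block is not part
    # of the original competition pipeline, which feeds preprocessed Airbnb
    # features through `data`. Sizes and class weights are illustrative only.
    rng = np.random.RandomState(42)
    X_all = rng.rand(300, 10)
    y_all = rng.randint(0, 3, size=300)
    data = [X_all[:200], y_all[:200],        # training fold
            X_all[200:250], y_all[200:250],  # validation fold (early stopping)
            X_all[250:]]                     # unlabeled "test" fold
    y_valid_pred, y_test_pred = clf_xgboost(
        data, cl_weight={0: 1., 1: 1.5, 2: 1.2}, random_state=rng,
        ext_name='demo', verbose=False)
    print('valid predictions: %s, test predictions: %s'
          % (y_valid_pred.shape, y_test_pred.shape))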