# coding:utf-8
"""Gradient Boosting Machines with a second-order (Taylor) approximation, as in XGBoost.

References:
    https://arxiv.org/pdf/1603.02754v3.pdf
    http://www.saedsayad.com/docs/xgboost.pdf
    https://homes.cs.washington.edu/~tqchen/pdf/BoostedTree.pdf
    http://stats.stackexchange.com/questions/202858/loss-function-approximation-with-taylor-expansion
"""
import numpy as np
from scipy.special import expit  # the logistic (sigmoid) function

from mla.base import BaseEstimator
from mla.ensemble.base import mse_criterion
from mla.ensemble.tree import Tree


class Loss:
    """Base class for loss functions."""

    def __init__(self, regularization=1.0):
        self.regularization = regularization

    def grad(self, actual, predicted):
        """First-order gradient."""
        raise NotImplementedError()

    def hess(self, actual, predicted):
        """Second-order derivative."""
        raise NotImplementedError()

    def approximate(self, actual, predicted):
        """Approximate the optimal leaf value."""
        return self.grad(actual, predicted).sum() / (self.hess(actual, predicted).sum() + self.regularization)

    def transform(self, pred):
        """Transform prediction values."""
        return pred

    def gain(self, actual, predicted):
        """Calculate the gain used for split search."""
        numerator = self.grad(actual, predicted).sum() ** 2
        denominator = self.hess(actual, predicted).sum() + self.regularization
        return 0.5 * (numerator / denominator)
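
# Following the XGBoost references above: a second-order Taylor expansion of
# the loss around the current prediction gives, for a leaf with gradient sum G
# and Hessian sum H, the per-leaf objective
#
#     obj(w) = G * w + 0.5 * (H + lambda) * w ** 2
#
# which is minimized at w* = -G / (H + lambda), reducing the objective by
# 0.5 * G ** 2 / (H + lambda). The `grad` implementations below return the
# *negative* gradient, so `approximate` yields w* without an explicit minus
# sign, and `gain` is the per-node reduction used to score candidate splits.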


class LeastSquaresLoss(Loss):
    """Least squares loss."""

    def grad(self, actual, predicted):
        # The residual, i.e. the negative gradient of 0.5 * (actual - predicted) ** 2
        return actual - predicted

    def hess(self, actual, predicted):
        # The second derivative of the squared error is constant (1)
        return np.ones_like(actual)
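
# With least-squares loss, `approximate` therefore reduces to the regularized
# mean of the residuals in a leaf: sum(actual - predicted) / (n_samples + lambda).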


class LogisticLoss(Loss):
    """Logistic loss, for labels in {-1, 1}."""

    def grad(self, actual, predicted):
        # Negative gradient of log(1 + exp(-actual * predicted))
        return actual * expit(-actual * predicted)

    def hess(self, actual, predicted):
        expits = expit(predicted)
        return expits * (1 - expits)

    def transform(self, output):
        # Apply the logistic (sigmoid) function to turn raw scores into probabilities
        return expit(output)
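
# Note: for labels y in {-1, 1}, the second derivative of log(1 + exp(-y * f))
# with respect to f is sigma(y * f) * sigma(-y * f); since this product is the
# same for either sign of y, it simplifies to sigma(f) * (1 - sigma(f)), which
# is why `hess` above can ignore `actual`.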


class GradientBoosting(BaseEstimator):
    """Gradient boosting trees with Taylor's expansion approximation (as in xgboost)."""

    def __init__(self, n_estimators, learning_rate=0.1, max_features=10, max_depth=2, min_samples_split=10):
        self.min_samples_split = min_samples_split
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.max_features = max_features
        self.n_estimators = n_estimators
        self.trees = []
        self.loss = None

    def fit(self, X, y=None):
        self._setup_input(X, y)
        self.y_mean = np.mean(y)
        self._train()

    def _train(self):
        # Initialize the model with zeros
        y_pred = np.zeros(self.n_samples, np.float32)
        # Alternatively, initialize with the target mean:
        # y_pred = np.full(self.n_samples, self.y_mean)

        for n in range(self.n_estimators):
            residuals = self.loss.grad(self.y, y_pred)
            tree = Tree(regression=True, criterion=mse_criterion)
            # Pass multiple target values to the tree learner
            targets = {
                # Residual values
                "y": residuals,
                # Actual target values
                "actual": self.y,
                # Predictions from the previous step
                "y_pred": y_pred,
            }
            tree.train(
                self.X,
                targets,
                max_features=self.max_features,
                min_samples_split=self.min_samples_split,
                max_depth=self.max_depth,
                loss=self.loss,
            )
            predictions = tree.predict(self.X)
            # Shrink each tree's contribution by the learning rate
            y_pred += self.learning_rate * predictions
            self.trees.append(tree)

    def _predict(self, X=None):
        y_pred = np.zeros(X.shape[0], np.float32)
        for tree in self.trees:
            y_pred += self.learning_rate * tree.predict(X)
        return y_pred

    def predict(self, X=None):
        return self.loss.transform(self._predict(X))


class GradientBoostingRegressor(GradientBoosting):
    def fit(self, X, y=None):
        self.loss = LeastSquaresLoss()
        super(GradientBoostingRegressor, self).fit(X, y)


class GradientBoostingClassifier(GradientBoosting):
    def fit(self, X, y=None):
        # Convert labels from {0, 1} to {-1, 1}, as expected by LogisticLoss
        y = (y * 2) - 1
        self.loss = LogisticLoss()
        super(GradientBoostingClassifier, self).fit(X, y)
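

# A minimal usage sketch, assuming the surrounding `mla` package
# (BaseEstimator, Tree) is installed and behaves as used above; the synthetic
# data and hyperparameters here are illustrative only.
if __name__ == "__main__":
    rng = np.random.RandomState(1111)

    # Regression: a noisy linear function of two features.
    X = rng.uniform(-1, 1, size=(200, 2))
    y = 3.0 * X[:, 0] - 2.0 * X[:, 1] + rng.normal(scale=0.1, size=200)
    reg = GradientBoostingRegressor(n_estimators=50, learning_rate=0.1, max_features=2, max_depth=3)
    reg.fit(X, y)
    print("train MSE:", np.mean((reg.predict(X) - y) ** 2))

    # Binary classification: labels in {0, 1}; predict() returns probabilities.
    yc = (X[:, 0] + X[:, 1] > 0).astype(int)
    clf = GradientBoostingClassifier(n_estimators=50, learning_rate=0.1, max_features=2, max_depth=3)
    clf.fit(X, yc)
    print("train accuracy:", np.mean((clf.predict(X) > 0.5).astype(int) == yc))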