ENH: added splitter_mode for Random Forest (#1223)
* ENH: added splitter_mode for Random Forest

* condition for splitter mode

* Update onedal/ensemble/forest.cpp

* added warnings

* updated tests

* Add splitter_mode to RandomForest _parameter_constraints

---------

Co-authored-by: Alexander Andreev <[email protected]>
samir-nasibli and Alexsandruss authored Mar 27, 2023
1 parent ef57673 commit 4abff0d
Showing 4 changed files with 92 additions and 7 deletions.
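For context, a minimal usage sketch of the new parameter (not part of the commit): the sklearnex.preview.ensemble import path is assumed from the files touched below, and 'random' splitting is only honored on GPU with oneDAL >= 2023.1.1; otherwise the estimator warns and falls back to 'best'.

# Hedged usage sketch, assuming sklearnex.preview.ensemble re-exports the
# estimator defined in sklearnex/preview/ensemble/forest.py below.
from sklearn.datasets import make_classification
from sklearnex.preview.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=100, n_features=4, random_state=0)

# splitter_mode='best' keeps the exhaustive split search (the default);
# 'random' picks split points at random on supported GPU devices.
clf = RandomForestClassifier(n_estimators=10, splitter_mode='random')
clf.fit(X, y)
print(clf.predict(X[:5]))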
18 changes: 18 additions & 0 deletions onedal/ensemble/forest.cpp
@@ -109,6 +109,21 @@ auto get_infer_mode(const py::dict& params) {
    return result_mode;
}

#if defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20230101
auto get_splitter_mode(const py::dict& params) {
    using namespace decision_forest;
    auto mode = params["splitter_mode"].cast<std::string>();
    if (mode == "best") {
        return splitter_mode::best;
    }
    else if (mode == "random") {
        return splitter_mode::random;
    }
    else
        ONEDAL_PARAM_DISPATCH_THROW_INVALID_VALUE(mode);
}
#endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION>=20230101

auto get_variable_importance_mode(const py::dict& params) {
    using namespace decision_forest;

@@ -171,6 +186,9 @@ struct params2desc {
            .set_min_bin_size(params["min_bin_size"].cast<std::int64_t>())
            .set_memory_saving_mode(params["memory_saving_mode"].cast<bool>())
            .set_bootstrap(params["bootstrap"].cast<bool>())
#if defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20230101
            .set_splitter_mode(get_splitter_mode(params))
#endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION>=20230101
            .set_error_metric_mode(get_error_metric_mode(params))
            .set_variable_importance_mode(get_variable_importance_mode(params));

8 changes: 8 additions & 0 deletions onedal/ensemble/forest.py
@@ -77,6 +77,7 @@ def __init__(
            max_bins,
            min_bin_size,
            infer_mode,
            splitter_mode,
            voting_mode,
            error_metric_mode,
            variable_importance_mode,
@@ -102,6 +103,7 @@ def __init__(
        self.max_bins = max_bins
        self.min_bin_size = min_bin_size
        self.infer_mode = infer_mode
        self.splitter_mode = splitter_mode
        self.voting_mode = voting_mode
        self.error_metric_mode = error_metric_mode
        self.variable_importance_mode = variable_importance_mode
@@ -230,6 +232,8 @@ def _get_onedal_params(self, data):
        if self.is_classification:
            onedal_params['class_count'] = 0 if self.classes_ is None else len(
                self.classes_)
        if daal_check_version((2023, 'P', 101)):
            onedal_params['splitter_mode'] = self.splitter_mode
        return onedal_params

    def _check_parameters(self):
@@ -434,6 +438,7 @@ def __init__(self,
                 max_bins=256,
                 min_bin_size=1,
                 infer_mode='class_responses',
                 splitter_mode='best',
                 voting_mode='weighted',
                 error_metric_mode='none',
                 variable_importance_mode='none',
@@ -460,6 +465,7 @@ def __init__(self,
            max_bins=max_bins,
            min_bin_size=min_bin_size,
            infer_mode=infer_mode,
            splitter_mode=splitter_mode,
            voting_mode=voting_mode,
            error_metric_mode=error_metric_mode,
            variable_importance_mode=variable_importance_mode,
@@ -516,6 +522,7 @@ def __init__(self,
                 max_bins=256,
                 min_bin_size=1,
                 infer_mode='class_responses',
                 splitter_mode='best',
                 voting_mode='weighted',
                 error_metric_mode='none',
                 variable_importance_mode='none',
@@ -542,6 +549,7 @@ def __init__(self,
            max_bins=max_bins,
            min_bin_size=min_bin_size,
            infer_mode=infer_mode,
            splitter_mode=splitter_mode,
            voting_mode=voting_mode,
            error_metric_mode=error_metric_mode,
            variable_importance_mode=variable_importance_mode,
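The _get_onedal_params change above only forwards the new key when the installed oneDAL build can accept it. A minimal illustrative sketch of that gate (the surrounding dict is hypothetical; only daal_check_version and the key name come from the diff):

# Illustrative sketch of the version gate used in _get_onedal_params.
from daal4py.sklearn._utils import daal_check_version

onedal_params = {'fptype': 'float'}  # hypothetical subset of the real params
if daal_check_version((2023, 'P', 101)):
    # Older oneDAL builds never receive the key, so the #if-guarded
    # get_splitter_mode() in forest.cpp above is never asked to read it.
    onedal_params['splitter_mode'] = 'random'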
25 changes: 25 additions & 0 deletions onedal/ensemble/tests/test_random_forest.py
@@ -18,6 +18,7 @@
import numpy as np
from numpy.testing import assert_allclose

from daal4py.sklearn._utils import daal_check_version
from onedal.ensemble import RandomForestClassifier, RandomForestRegressor
from onedal.tests.utils._device_selection import get_queues

@@ -42,3 +43,27 @@ def test_rf_regression(queue):
        max_depth=2, random_state=0).fit(X, y, queue=queue)
    assert_allclose(
        [-6.83], rf.predict([[0, 0, 0, 0]], queue=queue), atol=1e-2)


@pytest.mark.skipif(not daal_check_version((2023, 'P', 101)),
                    reason='requires OneDAL 2023.1.1')
@pytest.mark.parametrize('queue', get_queues('gpu'))
def test_rf_classifier_random_splitter(queue):
    X, y = make_classification(n_samples=100, n_features=4,
                               n_informative=2, n_redundant=0,
                               random_state=0, shuffle=False)
    rf = RandomForestClassifier(
        max_depth=2, random_state=0,
        splitter_mode='random').fit(X, y, queue=queue)
    assert_allclose([1], rf.predict([[0, 0, 0, 0]], queue=queue))


@pytest.mark.parametrize('queue', get_queues('gpu'))
def test_rf_regression_random_splitter(queue):
    X, y = make_regression(n_samples=100, n_features=4, n_informative=2,
                           random_state=0, shuffle=False)
    rf = RandomForestRegressor(
        max_depth=2, random_state=0,
        splitter_mode='random').fit(X, y, queue=queue)
    assert_allclose(
        [-6.83], rf.predict([[0, 0, 0, 0]], queue=queue), atol=1e-2)
48 changes: 41 additions & 7 deletions sklearnex/preview/ensemble/forest.py
@@ -58,7 +58,7 @@
from scipy import sparse as sp

if sklearn_check_version('1.2'):
    from sklearn.utils._param_validation import Interval
    from sklearn.utils._param_validation import Interval, StrOptions


class BaseRandomForest(ABC):
@@ -193,7 +193,8 @@ class RandomForestClassifier(sklearn_RandomForestClassifier, BaseRandomForest):
        _parameter_constraints: dict = {
            **sklearn_RandomForestClassifier._parameter_constraints,
            "max_bins": [Interval(numbers.Integral, 2, None, closed="left")],
            "min_bin_size": [Interval(numbers.Integral, 1, None, closed="left")]
            "min_bin_size": [Interval(numbers.Integral, 1, None, closed="left")],
            "splitter_mode": [StrOptions({"best", "random"})]
        }

    if sklearn_check_version('1.0'):
@@ -218,7 +219,8 @@ def __init__(
                ccp_alpha=0.0,
                max_samples=None,
                max_bins=256,
                min_bin_size=1):
                min_bin_size=1,
                splitter_mode='best'):
            super(RandomForestClassifier, self).__init__(
                n_estimators=n_estimators,
                criterion=criterion,
@@ -243,6 +245,7 @@ def __init__(
            self.max_bins = max_bins
            self.min_bin_size = min_bin_size
            self.min_impurity_split = None
            self.splitter_mode = splitter_mode
            # self._estimator = DecisionTreeClassifier()
    else:
        def __init__(self,
@@ -266,7 +269,8 @@ def __init__(self,
                     ccp_alpha=0.0,
                     max_samples=None,
                     max_bins=256,
                     min_bin_size=1):
                     min_bin_size=1,
                     splitter_mode='best'):
            super(RandomForestClassifier, self).__init__(
                n_estimators=n_estimators,
                criterion=criterion,
@@ -294,6 +298,7 @@ def __init__(self,
            self.max_bins = max_bins
            self.min_bin_size = min_bin_size
            self.min_impurity_split = None
            self.splitter_mode = splitter_mode
            # self._estimator = DecisionTreeClassifier()

        def fit(self, X, y, sample_weight=None):
@@ -529,6 +534,11 @@ def _estimators_(self):
    def _onedal_cpu_supported(self, method_name, *data):
        if method_name == 'ensemble.RandomForestClassifier.fit':
            ready, X, y, sample_weight = self._onedal_ready(*data)
            if self.splitter_mode == 'random':
                warnings.warn("'random' splitter mode supports GPU devices only "
                              "and requires oneDAL version >= 2023.1.1. "
                              "Using 'best' mode instead.", RuntimeWarning)
                self.splitter_mode = 'best'
            if not ready:
                return False
            elif sp.issparse(X):
@@ -570,6 +580,11 @@ def _onedal_cpu_supported(self, method_name, *data):
    def _onedal_gpu_supported(self, method_name, *data):
        if method_name == 'ensemble.RandomForestClassifier.fit':
            ready, X, y, sample_weight = self._onedal_ready(*data)
            if self.splitter_mode == 'random' and \
                    not daal_check_version((2023, 'P', 101)):
                warnings.warn("'random' splitter mode requires OneDAL >= 2023.1.1. "
                              "Using 'best' mode instead.", RuntimeWarning)
                self.splitter_mode = 'best'
            if not ready:
                return False
            elif sp.issparse(X):
@@ -687,6 +702,8 @@ def _onedal_fit(self, X, y, sample_weight=None, queue=None):
            'min_bin_size': self.min_bin_size,
            'max_samples': self.max_samples
        }
        if daal_check_version((2023, 'P', 101)):
            onedal_params['splitter_mode'] = self.splitter_mode
        self._cached_estimators_ = None

        # Compute
@@ -729,7 +746,8 @@ class RandomForestRegressor(sklearn_RandomForestRegressor, BaseRandomForest):
        _parameter_constraints: dict = {
            **sklearn_RandomForestRegressor._parameter_constraints,
            "max_bins": [Interval(numbers.Integral, 2, None, closed="left")],
            "min_bin_size": [Interval(numbers.Integral, 1, None, closed="left")]
            "min_bin_size": [Interval(numbers.Integral, 1, None, closed="left")],
            "splitter_mode": [StrOptions({"best", "random"})]
        }

    if sklearn_check_version('1.0'):
@@ -754,7 +772,8 @@ def __init__(
                ccp_alpha=0.0,
                max_samples=None,
                max_bins=256,
                min_bin_size=1):
                min_bin_size=1,
                splitter_mode='best'):
            super(RandomForestRegressor, self).__init__(
                n_estimators=n_estimators,
                criterion=criterion,
@@ -778,6 +797,7 @@ def __init__(
            self.max_bins = max_bins
            self.min_bin_size = min_bin_size
            self.min_impurity_split = None
            self.splitter_mode = splitter_mode
    else:
        def __init__(self,
                     n_estimators=100, *,
@@ -799,7 +819,8 @@ def __init__(self,
                     ccp_alpha=0.0,
                     max_samples=None,
                     max_bins=256,
                     min_bin_size=1):
                     min_bin_size=1,
                     splitter_mode='best'):
            super(RandomForestRegressor, self).__init__(
                n_estimators=n_estimators,
                criterion=criterion,
@@ -826,6 +847,7 @@ def __init__(self,
            self.max_bins = max_bins
            self.min_bin_size = min_bin_size
            self.min_impurity_split = None
            self.splitter_mode = splitter_mode

    @property
    def _estimators_(self):
@@ -902,6 +924,11 @@ def _onedal_ready(self, X, y, sample_weight):
    def _onedal_cpu_supported(self, method_name, *data):
        if method_name == 'ensemble.RandomForestRegressor.fit':
            ready, X, y, sample_weight = self._onedal_ready(*data)
            if self.splitter_mode == 'random':
                warnings.warn("'random' splitter mode supports GPU devices only "
                              "and requires oneDAL version >= 2023.1.1. "
                              "Using 'best' mode instead.", RuntimeWarning)
                self.splitter_mode = 'best'
            if not ready:
                return False
            elif not (self.oob_score and daal_check_version(
@@ -947,6 +974,11 @@ def _onedal_cpu_supported(self, method_name, *data):
    def _onedal_gpu_supported(self, method_name, *data):
        if method_name == 'ensemble.RandomForestRegressor.fit':
            ready, X, y, sample_weight = self._onedal_ready(*data)
            if self.splitter_mode == 'random' and \
                    not daal_check_version((2023, 'P', 101)):
                warnings.warn("'random' splitter mode requires OneDAL >= 2023.1.1. "
                              "Using 'best' mode instead.", RuntimeWarning)
                self.splitter_mode = 'best'
            if not ready:
                return False
            elif not (self.oob_score and daal_check_version(
@@ -1035,6 +1067,8 @@ def _onedal_fit(self, X, y, sample_weight=None, queue=None):
            'variable_importance_mode': 'mdi',
            'max_samples': self.max_samples
        }
        if daal_check_version((2023, 'P', 101)):
            onedal_params['splitter_mode'] = self.splitter_mode
        self._cached_estimators_ = None
        self._onedal_estimator = self._onedal_regressor(**onedal_params)
        self._onedal_estimator.fit(X, y, sample_weight, queue=queue)
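To illustrate the fallback wired into _onedal_cpu_supported above: requesting the random splitter on a CPU-only machine (or with an older oneDAL) is expected to emit the new RuntimeWarning and reset the estimator to 'best'. A hedged sketch, assuming the sklearnex.preview.ensemble import path and a CPU-only run:

import warnings

from sklearn.datasets import make_regression
from sklearnex.preview.ensemble import RandomForestRegressor  # path assumed

X, y = make_regression(n_samples=100, n_features=4, random_state=0)

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    reg = RandomForestRegressor(n_estimators=10, splitter_mode='random')
    reg.fit(X, y)

# On CPU the fit should have warned and flipped the splitter back.
print([str(w.message) for w in caught if issubclass(w.category, RuntimeWarning)])
print(reg.splitter_mode)  # expected: 'best' when no supported GPU is present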
