ENH: added splitter_mode for Random Forest (#1223)
* ENH: added splitter_mode for Random Forest

* condition for splitter mode

* Update onedal/ensemble/forest.cpp

* added warnings

* updated tests

* Add splitter_mode to RandomForest _parameter_constraints

---------

Co-authored-by: Alexander Andreev <[email protected]>
samir-nasibli and Alexsandruss authored Mar 27, 2023
1 parent ef57673 commit 4abff0d
Showing 4 changed files with 92 additions and 7 deletions.
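For context, a minimal usage sketch of the new parameter (not part of the commit): the sklearnex.preview.ensemble import path is assumed from the files touched below, and 'random' splitting is only honored on GPU with oneDAL >= 2023.1.1; otherwise the estimator warns and falls back to 'best'.

# Hedged usage sketch, assuming sklearnex.preview.ensemble re-exports the
# estimator defined in sklearnex/preview/ensemble/forest.py below.
from sklearn.datasets import make_classification
from sklearnex.preview.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=100, n_features=4, random_state=0)

# splitter_mode='best' keeps the exhaustive split search (the default);
# 'random' picks split points at random on supported GPU devices.
clf = RandomForestClassifier(n_estimators=10, splitter_mode='random')
clf.fit(X, y)
print(clf.predict(X[:5]))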
18 changes: 18 additions & 0 deletions onedal/ensemble/forest.cpp
@@ -109,6 +109,21 @@ auto get_infer_mode(const py::dict& params) {
    return result_mode;
}

#if defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20230101
auto get_splitter_mode(const py::dict& params) {
    using namespace decision_forest;
    auto mode = params["splitter_mode"].cast<std::string>();
    if (mode == "best") {
        return splitter_mode::best;
    }
    else if (mode == "random") {
        return splitter_mode::random;
    }
    else
        ONEDAL_PARAM_DISPATCH_THROW_INVALID_VALUE(mode);
}
#endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION>=20230101

auto get_variable_importance_mode(const py::dict& params) {
    using namespace decision_forest;

@@ -171,6 +186,9 @@ struct params2desc {
            .set_min_bin_size(params["min_bin_size"].cast<std::int64_t>())
            .set_memory_saving_mode(params["memory_saving_mode"].cast<bool>())
            .set_bootstrap(params["bootstrap"].cast<bool>())
#if defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20230101
            .set_splitter_mode(get_splitter_mode(params))
#endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION>=20230101
            .set_error_metric_mode(get_error_metric_mode(params))
            .set_variable_importance_mode(get_variable_importance_mode(params));

8 changes: 8 additions & 0 deletions onedal/ensemble/forest.py
@@ -77,6 +77,7 @@ def __init__(
            max_bins,
            min_bin_size,
            infer_mode,
            splitter_mode,
            voting_mode,
            error_metric_mode,
            variable_importance_mode,
@@ -102,6 +103,7 @@ def __init__(
        self.max_bins = max_bins
        self.min_bin_size = min_bin_size
        self.infer_mode = infer_mode
        self.splitter_mode = splitter_mode
        self.voting_mode = voting_mode
        self.error_metric_mode = error_metric_mode
        self.variable_importance_mode = variable_importance_mode
@@ -230,6 +232,8 @@ def _get_onedal_params(self, data):
        if self.is_classification:
            onedal_params['class_count'] = 0 if self.classes_ is None else len(
                self.classes_)
        if daal_check_version((2023, 'P', 101)):
            onedal_params['splitter_mode'] = self.splitter_mode
        return onedal_params

    def _check_parameters(self):
@@ -434,6 +438,7 @@ def __init__(self,
                 max_bins=256,
                 min_bin_size=1,
                 infer_mode='class_responses',
                 splitter_mode='best',
                 voting_mode='weighted',
                 error_metric_mode='none',
                 variable_importance_mode='none',
@@ -460,6 +465,7 @@ def __init__(self,
            max_bins=max_bins,
            min_bin_size=min_bin_size,
            infer_mode=infer_mode,
            splitter_mode=splitter_mode,
            voting_mode=voting_mode,
            error_metric_mode=error_metric_mode,
            variable_importance_mode=variable_importance_mode,
@@ -516,6 +522,7 @@ def __init__(self,
                 max_bins=256,
                 min_bin_size=1,
                 infer_mode='class_responses',
                 splitter_mode='best',
                 voting_mode='weighted',
                 error_metric_mode='none',
                 variable_importance_mode='none',
@@ -542,6 +549,7 @@ def __init__(self,
            max_bins=max_bins,
            min_bin_size=min_bin_size,
            infer_mode=infer_mode,
            splitter_mode=splitter_mode,
            voting_mode=voting_mode,
            error_metric_mode=error_metric_mode,
            variable_importance_mode=variable_importance_mode,
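The _get_onedal_params change above only forwards the new key when the installed oneDAL build can accept it. A minimal illustrative sketch of that gate (the surrounding dict is hypothetical; only daal_check_version and the key name come from the diff):

# Illustrative sketch of the version gate used in _get_onedal_params.
from daal4py.sklearn._utils import daal_check_version

onedal_params = {'fptype': 'float'}  # hypothetical subset of the real params
if daal_check_version((2023, 'P', 101)):
    # Older oneDAL builds never receive the key, so the #if-guarded
    # get_splitter_mode() in forest.cpp above is never asked to read it.
    onedal_params['splitter_mode'] = 'random'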
25 changes: 25 additions & 0 deletions onedal/ensemble/tests/test_random_forest.py
@@ -18,6 +18,7 @@
import numpy as np
from numpy.testing import assert_allclose

from daal4py.sklearn._utils import daal_check_version
from onedal.ensemble import RandomForestClassifier, RandomForestRegressor
from onedal.tests.utils._device_selection import get_queues

@@ -42,3 +43,27 @@ def test_rf_regression(queue):
        max_depth=2, random_state=0).fit(X, y, queue=queue)
    assert_allclose(
        [-6.83], rf.predict([[0, 0, 0, 0]], queue=queue), atol=1e-2)


@pytest.mark.skipif(not daal_check_version((2023, 'P', 101)),
                    reason='requires OneDAL 2023.1.1')
@pytest.mark.parametrize('queue', get_queues('gpu'))
def test_rf_classifier_random_splitter(queue):
    X, y = make_classification(n_samples=100, n_features=4,
                               n_informative=2, n_redundant=0,
                               random_state=0, shuffle=False)
    rf = RandomForestClassifier(
        max_depth=2, random_state=0,
        splitter_mode='random').fit(X, y, queue=queue)
    assert_allclose([1], rf.predict([[0, 0, 0, 0]], queue=queue))


@pytest.mark.parametrize('queue', get_queues('gpu'))
def test_rf_regression_random_splitter(queue):
    X, y = make_regression(n_samples=100, n_features=4, n_informative=2,
                           random_state=0, shuffle=False)
    rf = RandomForestRegressor(
        max_depth=2, random_state=0,
        splitter_mode='random').fit(X, y, queue=queue)
    assert_allclose(
        [-6.83], rf.predict([[0, 0, 0, 0]], queue=queue), atol=1e-2)
48 changes: 41 additions & 7 deletions sklearnex/preview/ensemble/forest.py
@@ -58,7 +58,7 @@
from scipy import sparse as sp

if sklearn_check_version('1.2'):
    from sklearn.utils._param_validation import Interval
    from sklearn.utils._param_validation import Interval, StrOptions


class BaseRandomForest(ABC):
@@ -193,7 +193,8 @@ class RandomForestClassifier(sklearn_RandomForestClassifier, BaseRandomForest):
        _parameter_constraints: dict = {
            **sklearn_RandomForestClassifier._parameter_constraints,
            "max_bins": [Interval(numbers.Integral, 2, None, closed="left")],
            "min_bin_size": [Interval(numbers.Integral, 1, None, closed="left")]
            "min_bin_size": [Interval(numbers.Integral, 1, None, closed="left")],
            "splitter_mode": [StrOptions({"best", "random"})]
        }

    if sklearn_check_version('1.0'):
@@ -218,7 +219,8 @@ def __init__(
                ccp_alpha=0.0,
                max_samples=None,
                max_bins=256,
                min_bin_size=1):
                min_bin_size=1,
                splitter_mode='best'):
            super(RandomForestClassifier, self).__init__(
                n_estimators=n_estimators,
                criterion=criterion,
@@ -243,6 +245,7 @@ def __init__(
            self.max_bins = max_bins
            self.min_bin_size = min_bin_size
            self.min_impurity_split = None
            self.splitter_mode = splitter_mode
            # self._estimator = DecisionTreeClassifier()
    else:
        def __init__(self,
@@ -266,7 +269,8 @@ def __init__(self,
                     ccp_alpha=0.0,
                     max_samples=None,
                     max_bins=256,
                     min_bin_size=1):
                     min_bin_size=1,
                     splitter_mode='best'):
            super(RandomForestClassifier, self).__init__(
                n_estimators=n_estimators,
                criterion=criterion,
@@ -294,6 +298,7 @@ def __init__(self,
            self.max_bins = max_bins
            self.min_bin_size = min_bin_size
            self.min_impurity_split = None
            self.splitter_mode = splitter_mode
            # self._estimator = DecisionTreeClassifier()

        def fit(self, X, y, sample_weight=None):
@@ -529,6 +534,11 @@ def _estimators_(self):
    def _onedal_cpu_supported(self, method_name, *data):
        if method_name == 'ensemble.RandomForestClassifier.fit':
            ready, X, y, sample_weight = self._onedal_ready(*data)
            if self.splitter_mode == 'random':
                warnings.warn("'random' splitter mode supports GPU devices only "
                              "and requires oneDAL version >= 2023.1.1. "
                              "Using 'best' mode instead.", RuntimeWarning)
                self.splitter_mode = 'best'
            if not ready:
                return False
            elif sp.issparse(X):
@@ -570,6 +580,11 @@ def _onedal_cpu_supported(self, method_name, *data):
    def _onedal_gpu_supported(self, method_name, *data):
        if method_name == 'ensemble.RandomForestClassifier.fit':
            ready, X, y, sample_weight = self._onedal_ready(*data)
            if self.splitter_mode == 'random' and \
                    not daal_check_version((2023, 'P', 101)):
                warnings.warn("'random' splitter mode requires OneDAL >= 2023.1.1. "
                              "Using 'best' mode instead.", RuntimeWarning)
                self.splitter_mode = 'best'
            if not ready:
                return False
            elif sp.issparse(X):
@@ -687,6 +702,8 @@ def _onedal_fit(self, X, y, sample_weight=None, queue=None):
            'min_bin_size': self.min_bin_size,
            'max_samples': self.max_samples
        }
        if daal_check_version((2023, 'P', 101)):
            onedal_params['splitter_mode'] = self.splitter_mode
        self._cached_estimators_ = None

        # Compute
@@ -729,7 +746,8 @@ class RandomForestRegressor(sklearn_RandomForestRegressor, BaseRandomForest):
        _parameter_constraints: dict = {
            **sklearn_RandomForestRegressor._parameter_constraints,
            "max_bins": [Interval(numbers.Integral, 2, None, closed="left")],
            "min_bin_size": [Interval(numbers.Integral, 1, None, closed="left")]
            "min_bin_size": [Interval(numbers.Integral, 1, None, closed="left")],
            "splitter_mode": [StrOptions({"best", "random"})]
        }

    if sklearn_check_version('1.0'):
@@ -754,7 +772,8 @@ def __init__(
                ccp_alpha=0.0,
                max_samples=None,
                max_bins=256,
                min_bin_size=1):
                min_bin_size=1,
                splitter_mode='best'):
            super(RandomForestRegressor, self).__init__(
                n_estimators=n_estimators,
                criterion=criterion,
@@ -778,6 +797,7 @@ def __init__(
            self.max_bins = max_bins
            self.min_bin_size = min_bin_size
            self.min_impurity_split = None
            self.splitter_mode = splitter_mode
    else:
        def __init__(self,
                     n_estimators=100, *,
@@ -799,7 +819,8 @@ def __init__(self,
                     ccp_alpha=0.0,
                     max_samples=None,
                     max_bins=256,
                     min_bin_size=1):
                     min_bin_size=1,
                     splitter_mode='best'):
            super(RandomForestRegressor, self).__init__(
                n_estimators=n_estimators,
                criterion=criterion,
@@ -826,6 +847,7 @@ def __init__(self,
            self.max_bins = max_bins
            self.min_bin_size = min_bin_size
            self.min_impurity_split = None
            self.splitter_mode = splitter_mode

    @property
    def _estimators_(self):
@@ -902,6 +924,11 @@ def _onedal_ready(self, X, y, sample_weight):
    def _onedal_cpu_supported(self, method_name, *data):
        if method_name == 'ensemble.RandomForestRegressor.fit':
            ready, X, y, sample_weight = self._onedal_ready(*data)
            if self.splitter_mode == 'random':
                warnings.warn("'random' splitter mode supports GPU devices only "
                              "and requires oneDAL version >= 2023.1.1. "
                              "Using 'best' mode instead.", RuntimeWarning)
                self.splitter_mode = 'best'
            if not ready:
                return False
            elif not (self.oob_score and daal_check_version(
@@ -947,6 +974,11 @@ def _onedal_cpu_supported(self, method_name, *data):
    def _onedal_gpu_supported(self, method_name, *data):
        if method_name == 'ensemble.RandomForestRegressor.fit':
            ready, X, y, sample_weight = self._onedal_ready(*data)
            if self.splitter_mode == 'random' and \
                    not daal_check_version((2023, 'P', 101)):
                warnings.warn("'random' splitter mode requires OneDAL >= 2023.1.1. "
                              "Using 'best' mode instead.", RuntimeWarning)
                self.splitter_mode = 'best'
            if not ready:
                return False
            elif not (self.oob_score and daal_check_version(
@@ -1035,6 +1067,8 @@ def _onedal_fit(self, X, y, sample_weight=None, queue=None):
            'variable_importance_mode': 'mdi',
            'max_samples': self.max_samples
        }
        if daal_check_version((2023, 'P', 101)):
            onedal_params['splitter_mode'] = self.splitter_mode
        self._cached_estimators_ = None
        self._onedal_estimator = self._onedal_regressor(**onedal_params)
        self._onedal_estimator.fit(X, y, sample_weight, queue=queue)
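To illustrate the fallback wired into _onedal_cpu_supported above: requesting the random splitter on a CPU-only machine (or with an older oneDAL) is expected to emit the new RuntimeWarning and reset the estimator to 'best'. A hedged sketch, assuming the sklearnex.preview.ensemble import path and a CPU-only run:

import warnings

from sklearn.datasets import make_regression
from sklearnex.preview.ensemble import RandomForestRegressor  # path assumed

X, y = make_regression(n_samples=100, n_features=4, random_state=0)

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    reg = RandomForestRegressor(n_estimators=10, splitter_mode='random')
    reg.fit(X, y)

# On CPU the fit should have warned and flipped the splitter back.
print([str(w.message) for w in caught if issubclass(w.category, RuntimeWarning)])
print(reg.splitter_mode)  # expected: 'best' when no supported GPU is present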
