GH-7118 - add score_each_iteration, score_tree_interval, disable_trai…

…ning_metrics to Python, R and documentation (#6038) * eif scoring history - add score_each_iteration and score_tree_interval to API * eif scoring history - test python and R api * eif scoring history - add score_each_iteration and score_tree_interval to the documentation * add possibility to disable training metrics API * eif - test scoring with large data in python
h2oai · Nov 21, 2023 · 64249c3 · 64249c3
1 parent 8d9304b
commit 64249c3
Show file tree

Hide file tree

Showing 9 changed files with 183 additions and 4 deletions.
diff --git a/h2o-algos/src/main/java/hex/schemas/ExtendedIsolationForestV3.java b/h2o-algos/src/main/java/hex/schemas/ExtendedIsolationForestV3.java
@@ -17,12 +17,15 @@ public static final class ExtendedIsolationForestParametersV3 extends ModelParam
                 "ignored_columns",
                 "ignore_const_cols",
                 "categorical_encoding",
+                "score_each_iteration",
+                "score_tree_interval",
 
                 // Extended Isolation Forest specific
                 "ntrees",
                 "sample_size",
                 "extension_level",
                 "seed",
+                "disable_training_metrics"
         };
 
         @API(help = "Number of Extended Isolation Forest trees.", gridable = true)
@@ -37,5 +40,11 @@ public static final class ExtendedIsolationForestParametersV3 extends ModelParam
 
         @API(help = "Seed for pseudo random number generator (if applicable)", gridable = true)
         public long seed;
+
+        @API(help="Score the model after every so many trees. Disabled if set to 0.", level = API.Level.secondary, gridable = false)
+        public int score_tree_interval;
+
+        @API(help = "Disable calculating training metrics (expensive on large datasets)")
+        public boolean disable_training_metrics;
     }
 }
diff --git a/h2o-docs/src/product/data-science/algo-params/score_each_iteration.rst b/h2o-docs/src/product/data-science/algo-params/score_each_iteration.rst
@@ -3,7 +3,7 @@
 ``score_each_iteration``
 ------------------------
 
-- Available in: GBM, DRF, Deep Learning, GLM, GAM, PCA, GLRM, Naïve-Bayes, K-Means, XGBoost, Isolation Forest, Uplift DRF
+- Available in: GBM, DRF, Deep Learning, GLM, GAM, PCA, GLRM, Naïve-Bayes, K-Means, XGBoost, Isolation Forest, Extended Isolation Forest, Uplift DRF
 - Hyperparameter: no
 
 

diff --git a/h2o-docs/src/product/data-science/algo-params/score_tree_interval.rst b/h2o-docs/src/product/data-science/algo-params/score_tree_interval.rst
@@ -3,7 +3,7 @@
 ``score_tree_interval``
 ------------------------
 
-- Available in: GBM, DRF, XGBoost, Isolation Forest, Uplift DRF
+- Available in: GBM, DRF, XGBoost, Isolation Forest, Extended Isolation Forest, Uplift DRF
 - Hyperparameter: no
 
 Description
@@ -84,4 +84,4 @@ Example
 		cars_gbm.train(x = predictors, y = response, training_frame = train, validation_frame = valid)
 
 		# print the model score every 5 trees
-		cars_gbm.scoring_history()
+		cars_gbm.scoring_history()
diff --git a/h2o-docs/src/product/data-science/eif.rst b/h2o-docs/src/product/data-science/eif.rst
@@ -53,11 +53,15 @@ Algorithm-specific parameters
 
 -  `sample_size <algo-params/sample_size.html>`__: The number of randomly sampled observations used to train each Extended Isolation Forest tree. This option defaults to ``256``.
 
+-  **disable_training_metrics**: Disable calculating training metrics (expensive on large datasets). This option defaults to ``True`` (training metrics are not calculated).
+
 Shared tree-algorithm parameters
 ''''''''''''''''''''''''''''''''
 
 -  `ntrees <algo-params/ntrees.html>`__: Specify the number of trees. This option defaults to ``100``.
 
+-  `score_tree_interval <algo-params/score_tree_interval.html>`__: Score the model after every so many trees. This value is set to 0 (disabled) by default.
+
 Common parameters
 '''''''''''''''''
 
@@ -77,6 +81,8 @@ Common parameters
 
 -  `model_id <algo-params/model_id.html>`__: Specify a custom name for the model to use as a reference. By default, H2O automatically generates a destination key.
 
+-  `score_each_iteration <algo-params/score_each_iteration.html>`__: (Optional) Enable this option to score during each iteration of the model training (disabled by default).
+
 -  `seed <algo-params/seed.html>`__: Specify the random number generator (RNG) seed for algorithm components dependent on randomization. The seed is consistent for each H2O instance so that you can create models with the same starting conditions in alternative configurations. This option defaults to ``-1`` (time-based random number).
 
 -  `training_frame <algo-params/training_frame.html>`__: *Required* Specify the dataset used to build the model. 

diff --git a/h2o-py/h2o/estimators/extended_isolation_forest.py b/h2o-py/h2o/estimators/extended_isolation_forest.py
@@ -36,10 +36,13 @@ def __init__(self,
                  ignored_columns=None,  # type: Optional[List[str]]
                  ignore_const_cols=True,  # type: bool
                  categorical_encoding="auto",  # type: Literal["auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder", "sort_by_response", "enum_limited"]
+                 score_each_iteration=False,  # type: bool
+                 score_tree_interval=0,  # type: int
                  ntrees=100,  # type: int
                  sample_size=256,  # type: int
                  extension_level=0,  # type: int
                  seed=-1,  # type: int
+                 disable_training_metrics=True,  # type: bool
                  ):
         """
         :param model_id: Destination id for this model; auto-generated if not specified.
@@ -58,6 +61,12 @@ def __init__(self,
                Defaults to ``"auto"``.
         :type categorical_encoding: Literal["auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder",
                "sort_by_response", "enum_limited"]
+        :param score_each_iteration: Whether to score during each iteration of model training.
+               Defaults to ``False``.
+        :type score_each_iteration: bool
+        :param score_tree_interval: Score the model after every so many trees. Disabled if set to 0.
+               Defaults to ``0``.
+        :type score_tree_interval: int
         :param ntrees: Number of Extended Isolation Forest trees.
                Defaults to ``100``.
         :type ntrees: int
@@ -71,6 +80,9 @@ def __init__(self,
         :param seed: Seed for pseudo random number generator (if applicable)
                Defaults to ``-1``.
         :type seed: int
+        :param disable_training_metrics: Disable calculating training metrics (expensive on large datasets)
+               Defaults to ``True``.
+        :type disable_training_metrics: bool
         """
         super(H2OExtendedIsolationForestEstimator, self).__init__()
         self._parms = {}
@@ -79,10 +91,13 @@ def __init__(self,
         self.ignored_columns = ignored_columns
         self.ignore_const_cols = ignore_const_cols
         self.categorical_encoding = categorical_encoding
+        self.score_each_iteration = score_each_iteration
+        self.score_tree_interval = score_tree_interval
         self.ntrees = ntrees
         self.sample_size = sample_size
         self.extension_level = extension_level
         self.seed = seed
+        self.disable_training_metrics = disable_training_metrics
 
     @property
     def training_frame(self):
@@ -176,6 +191,34 @@ def categorical_encoding(self, categorical_encoding):
         assert_is_type(categorical_encoding, None, Enum("auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder", "sort_by_response", "enum_limited"))
         self._parms["categorical_encoding"] = categorical_encoding
 
+    @property
+    def score_each_iteration(self):
+        """
+        Whether to score during each iteration of model training.
+
+        Type: ``bool``, defaults to ``False``.
+        """
+        return self._parms.get("score_each_iteration")
+
+    @score_each_iteration.setter
+    def score_each_iteration(self, score_each_iteration):
+        assert_is_type(score_each_iteration, None, bool)
+        self._parms["score_each_iteration"] = score_each_iteration
+
+    @property
+    def score_tree_interval(self):
+        """
+        Score the model after every so many trees. Disabled if set to 0.
+
+        Type: ``int``, defaults to ``0``.
+        """
+        return self._parms.get("score_tree_interval")
+
+    @score_tree_interval.setter
+    def score_tree_interval(self, score_tree_interval):
+        assert_is_type(score_tree_interval, None, int)
+        self._parms["score_tree_interval"] = score_tree_interval
+
     @property
     def ntrees(self):
         """
@@ -276,4 +319,18 @@ def seed(self, seed):
         assert_is_type(seed, None, int)
         self._parms["seed"] = seed
 
+    @property
+    def disable_training_metrics(self):
+        """
+        Disable calculating training metrics (expensive on large datasets)
+
+        Type: ``bool``, defaults to ``True``.
+        """
+        return self._parms.get("disable_training_metrics")
+
+    @disable_training_metrics.setter
+    def disable_training_metrics(self, disable_training_metrics):
+        assert_is_type(disable_training_metrics, None, bool)
+        self._parms["disable_training_metrics"] = disable_training_metrics
+
 
diff --git a/h2o-py/tests/testdir_algos/isoforextended/pyunit_isoforextended_metrics_large.py b/h2o-py/tests/testdir_algos/isoforextended/pyunit_isoforextended_metrics_large.py
@@ -0,0 +1,31 @@
+from __future__ import print_function
+import sys, os
+sys.path.insert(1, os.path.join("..","..",".."))
+import h2o
+from tests import pyunit_utils, assert_equals
+from h2o.estimators.extended_isolation_forest import H2OExtendedIsolationForestEstimator
+
+
+def extended_isolation_forest_metrics_large():
+    print("Extended Isolation Forest Anomaly Metrics Test On Large Data")
+
+    train = h2o.import_file(pyunit_utils.locate("bigdata/laptop/creditcardfraud/creditcardfraud.csv"))
+
+    eif_model = H2OExtendedIsolationForestEstimator(ntrees=100, seed=0xBEEF, sample_size=256, extension_level=1, disable_training_metrics=False)
+    eif_model.train(training_frame=train)
+    metrics_by_python = eif_model.predict(train).mean()
+    average_mean_length = metrics_by_python[0, 1]
+    average_anomaly_score = metrics_by_python[0, 0]
+
+    print(metrics_by_python)
+    print(eif_model)
+
+    perf = eif_model.model_performance()
+    assert_equals(perf.mean_score(), average_mean_length, "Mean score metric is not correct", 1e-3)
+    assert_equals(perf.mean_normalized_score(), average_anomaly_score, "Anomaly score metric is not correct", 1e-3)
+
+
+if __name__ == "__main__":
+    pyunit_utils.standalone_test(extended_isolation_forest_metrics_large)
+else:
+    extended_isolation_forest_metrics_large()
diff --git a/h2o-py/tests/testdir_algos/isoforextended/pyunit_isoforextended_scoring_history.py b/h2o-py/tests/testdir_algos/isoforextended/pyunit_isoforextended_scoring_history.py
@@ -0,0 +1,35 @@
+from __future__ import print_function
+import sys, os
+sys.path.insert(1, os.path.join("..","..",".."))
+import h2o
+from tests import pyunit_utils, assert_equals
+from h2o.estimators.extended_isolation_forest import H2OExtendedIsolationForestEstimator
+
+
+def extended_isolation_forest_scoring_history():
+    print("Extended Isolation Forest Scoring History Test")
+
+    train = h2o.import_file(pyunit_utils.locate("smalldata/anomaly/single_blob.csv"))
+
+    eif_model = H2OExtendedIsolationForestEstimator(ntrees=10, seed=0xBEEF, sample_size=255, extension_level=1)
+    eif_model.train(training_frame=train)
+    print(eif_model.scoring_history())
+    assert_equals(None, eif_model.scoring_history(), "No scoring history by default")
+
+    eif_model = H2OExtendedIsolationForestEstimator(ntrees=10, seed=0xBEEF, sample_size=255, extension_level=1,
+                                                    score_each_iteration=True, disable_training_metrics=False)
+    eif_model.train(training_frame=train)
+    print(eif_model.scoring_history())
+    assert_equals(11, len(eif_model.scoring_history()), "There should be one empty row and one row for each tree")
+
+    eif_model = H2OExtendedIsolationForestEstimator(ntrees=10, seed=0xBEEF, sample_size=255, extension_level=1,
+                                                    score_tree_interval=3, disable_training_metrics=False)
+    eif_model.train(training_frame=train)
+    print(eif_model.scoring_history())
+    assert_equals(5, len(eif_model.scoring_history()), "There should be one empty row and one row for each interval")
+
+
+if __name__ == "__main__":
+    pyunit_utils.standalone_test(extended_isolation_forest_scoring_history)
+else:
+    extended_isolation_forest_scoring_history()
diff --git a/h2o-r/h2o-package/R/extendedisolationforest.R b/h2o-r/h2o-package/R/extendedisolationforest.R
@@ -11,12 +11,15 @@
 #' @param ignore_const_cols \code{Logical}. Ignore constant columns. Defaults to TRUE.
 #' @param categorical_encoding Encoding scheme for categorical features Must be one of: "AUTO", "Enum", "OneHotInternal", "OneHotExplicit",
 #'        "Binary", "Eigen", "LabelEncoder", "SortByResponse", "EnumLimited". Defaults to AUTO.
+#' @param score_each_iteration \code{Logical}. Whether to score during each iteration of model training. Defaults to FALSE.
+#' @param score_tree_interval Score the model after every so many trees. Disabled if set to 0. Defaults to 0.
 #' @param ntrees Number of Extended Isolation Forest trees. Defaults to 100.
 #' @param sample_size Number of randomly sampled observations used to train each Extended Isolation Forest tree. Defaults to 256.
 #' @param extension_level Maximum is N - 1 (N = numCols). Minimum is 0. Extended Isolation Forest with extension_Level = 0 behaves like
 #'        Isolation Forest. Defaults to 0.
 #' @param seed Seed for random numbers (affects certain parts of the algo that are stochastic and those might or might not be enabled by default).
 #'        Defaults to -1 (time-based random number).
+#' @param disable_training_metrics \code{Logical}. Disable calculating training metrics (expensive on large datasets). Defaults to TRUE.
 #' @examples
 #' \dontrun{
 #' library(h2o)
@@ -53,10 +56,13 @@ h2o.extendedIsolationForest <- function(training_frame,
                                         model_id = NULL,
                                         ignore_const_cols = TRUE,
                                         categorical_encoding = c("AUTO", "Enum", "OneHotInternal", "OneHotExplicit", "Binary", "Eigen", "LabelEncoder", "SortByResponse", "EnumLimited"),
+                                        score_each_iteration = FALSE,
+                                        score_tree_interval = 0,
                                         ntrees = 100,
                                         sample_size = 256,
                                         extension_level = 0,
-                                        seed = -1)
+                                        seed = -1,
+                                        disable_training_metrics = TRUE)
 {
   # Validate required training_frame first and other frame args: should be a valid key or an H2OFrame object
   training_frame <- .validate.H2OFrame(training_frame, required=TRUE)
@@ -73,6 +79,10 @@ h2o.extendedIsolationForest <- function(training_frame,
     parms$ignore_const_cols <- ignore_const_cols
   if (!missing(categorical_encoding))
     parms$categorical_encoding <- categorical_encoding
+  if (!missing(score_each_iteration))
+    parms$score_each_iteration <- score_each_iteration
+  if (!missing(score_tree_interval))
+    parms$score_tree_interval <- score_tree_interval
   if (!missing(ntrees))
     parms$ntrees <- ntrees
   if (!missing(sample_size))
@@ -81,6 +91,8 @@ h2o.extendedIsolationForest <- function(training_frame,
     parms$extension_level <- extension_level
   if (!missing(seed))
     parms$seed <- seed
+  if (!missing(disable_training_metrics))
+    parms$disable_training_metrics <- disable_training_metrics
 
   # Error check and build model
   model <- .h2o.modelJob('extendedisolationforest', parms, h2oRestApiVersion=3, verbose=FALSE)
@@ -90,10 +102,13 @@ h2o.extendedIsolationForest <- function(training_frame,
                                                         x,
                                                         ignore_const_cols = TRUE,
                                                         categorical_encoding = c("AUTO", "Enum", "OneHotInternal", "OneHotExplicit", "Binary", "Eigen", "LabelEncoder", "SortByResponse", "EnumLimited"),
+                                                        score_each_iteration = FALSE,
+                                                        score_tree_interval = 0,
                                                         ntrees = 100,
                                                         sample_size = 256,
                                                         extension_level = 0,
                                                         seed = -1,
+                                                        disable_training_metrics = TRUE,
                                                         segment_columns = NULL,
                                                         segment_models_id = NULL,
                                                         parallelism = 1)
@@ -115,6 +130,10 @@ h2o.extendedIsolationForest <- function(training_frame,
     parms$ignore_const_cols <- ignore_const_cols
   if (!missing(categorical_encoding))
     parms$categorical_encoding <- categorical_encoding
+  if (!missing(score_each_iteration))
+    parms$score_each_iteration <- score_each_iteration
+  if (!missing(score_tree_interval))
+    parms$score_tree_interval <- score_tree_interval
   if (!missing(ntrees))
     parms$ntrees <- ntrees
   if (!missing(sample_size))
@@ -123,6 +142,8 @@ h2o.extendedIsolationForest <- function(training_frame,
     parms$extension_level <- extension_level
   if (!missing(seed))
     parms$seed <- seed
+  if (!missing(disable_training_metrics))
+    parms$disable_training_metrics <- disable_training_metrics
 
   # Build segment-models specific parameters
   segment_parms <- list()

diff --git a/h2o-r/tests/testdir_algos/isoforextended/runit_isoforextended_scoring_history.R b/h2o-r/tests/testdir_algos/isoforextended/runit_isoforextended_scoring_history.R
@@ -0,0 +1,20 @@
+setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
+source("../../../scripts/h2o-r-test-setup.R")
+
+
+
+test.ExtendedIsolationForest.scoring_history <- function() {
+    single_blob.hex <-
+      h2o.importFile(path = locate("smalldata/anomaly/single_blob.csv"),
+                   destination_frame = "single_blob.hex")
+
+    exisofor.model <- h2o.extendedIsolationForest(training_frame = single_blob.hex, score_each_iteration=TRUE, ntrees=10, disable_training_metrics=FALSE)
+    print(exisofor.model)
+    expect_equal(nrow(h2o.scoreHistory(exisofor.model)), 11)
+
+    exisofor.model <- h2o.extendedIsolationForest(training_frame = single_blob.hex, score_tree_interval=3, ntrees=10, disable_training_metrics=FALSE)
+    print(exisofor.model)
+    expect_equal(nrow(h2o.scoreHistory(exisofor.model)), 5)
+}
+
+doTest("ExtendedIsolationForest: Smoke Test For Scoring History", test.ExtendedIsolationForest.scoring_history)