h2oai · mn-mikke · Sep 30, 2020 · mn-mikke · Sep 30, 2020 · sebhrusen
@@ -30,8 +30,10 @@ trait AutoMLConfiguration extends AlgorithmConfigurations {
         defaultValueFieldPrefix = "",
         typeExceptions = Map("sort_metric" -> classOf[H2OAutoMLSortMetric]),
         defaultValueSource = source,
-        defaultValuesOfCommonParameters = defaultValuesOfCommonParameters ++
-          Map("monotoneConstraints" -> new util.HashMap[String, Double](), "ignoredCols" -> ignoredCols.defaultValue),
+        defaultValuesOfCommonParameters = defaultValuesOfCommonParameters ++ Map(
+          "monotoneConstraints" -> new util.HashMap[String, Double](),
+          "ignoredCols" -> ignoredCols.defaultValue,
+          "preProcessing" -> null),
         generateParamTag = false)
   }
 

@@ -28,6 +28,7 @@ object AutoMLIgnoredParameters {
       "blending_frame",
       "leaderboard_frame",
       "monotone_constraints",
+      "preprocessing",
       "stopping_criteria",
       "modeling_plan",
       "algo_parameters")

@@ -68,7 +68,8 @@ class H2OAutoML(override val uid: String)
     // Removing "include_algos", "exclude_algos" from H2OAutoMLBuildModelsParams since an effective set of algorithms
     // needs to be calculated and stored into "include_algos". The "exclude_algos" are then reset to null and both
     // altered parameters are added to the result.
-    val essentialParameters = getH2OAutoMLBuildModelsParams() - ("include_algos", "exclude_algos")
+    val essentialParameters =
+      getH2OAutoMLBuildModelsParams() ++ getPreProcessingParams() - ("include_algos", "exclude_algos")
 
     essentialParameters ++ Map("include_algos" -> determineIncludedAlgos(), "exclude_algos" -> null) ++ extra
   }

@@ -23,3 +23,4 @@ trait H2OAutoMLParams
   with H2OAutoMLInputParams
   with H2OAutoMLStoppingCriteriaParams
   with HasMonotoneConstraints
+  with HasPreProcessing
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package ai.h2o.sparkling.ml.params
+
+import ai.h2o.automl.preprocessing.PreprocessingStepDefinition
+import ai.h2o.sparkling.H2OFrame
+import org.apache.spark.expose.Logging
+
+/**
+ * Mixin that declares the `preProcessing` parameter and contributes it, under the backend
+ * key `preprocessing`, to the parameter map built for the backend.
+ */
+trait HasPreProcessing extends H2OAlgoParamsBase {
+  private val preProcessing = nullableStringArrayParam(
+    "preProcessing",
+    "The list of pre-processing steps to run. Only 'TargetEncoding' is currently supported.")
+
+  setDefault(preProcessing -> null)
+
+  /** Returns the current value of `preProcessing`; the default is `null`. */
+  def getPreProcessing(): Array[String] = $(preProcessing)
+
+  /**
+   * Validates `value` against the `PreprocessingStepDefinition.Type` enum (a `null` value is
+   * allowed) and stores the validated result in the `preProcessing` parameter.
+   */
+  def setPreProcessing(value: Array[String]): this.type = {
+    val checkedSteps =
+      EnumParamValidator.getValidatedEnumValues[PreprocessingStepDefinition.Type](value, nullEnabled = true)
+    set(preProcessing, checkedSteps)
+  }
+
+  /** Extends the inherited algorithm parameters with the `preprocessing` entry. */
+  override private[sparkling] def getH2OAlgorithmParams(trainingFrame: H2OFrame): Map[String, Any] = {
+    val inherited = super.getH2OAlgorithmParams(trainingFrame)
+    inherited ++ getPreProcessingParams()
+  }
+
+  /**
+   * Builds a single-entry map keyed by `preprocessing`. The value is `null` when no steps are
+   * configured; otherwise it is an array holding one `Map("type" -> <enum value>)` per step name.
+   */
+  private[sparkling] def getPreProcessingParams(): Map[String, Any] = {
+    val stepDefinitions = Option(getPreProcessing())
+      .map(_.map(stepName => Map("type" -> PreprocessingStepDefinition.Type.valueOf(stepName))))
+      .orNull
+    Map("preprocessing" -> stepDefinitions)
+  }
+
+  /** Adds the `preProcessing` -> `preprocessing` name mapping to the inherited ones. */
+  override private[sparkling] def getSWtoH2OParamNameMap(): Map[String, String] = {
+    val ownMapping = Map("preProcessing" -> "preprocessing")
+    super.getSWtoH2OParamNameMap() ++ ownMapping
+  }
+}
@@ -82,4 +82,18 @@ class H2OAutoMLTestSuite extends FunSuite with Matchers with SharedH2OTestContex
     val model = automl.fit(dataset)
     model.transform(dataset).collect()
   }
+
+  // Fits AutoML with the TargetEncoding pre-processing step on a string-typed label column and
+  // expects at least one leaderboard entry whose model_id contains "TargetEncoder".
+  test("TargetEncoding configuration on AutoML is propagated to backend") {
+    val trainingData = dataset.withColumn("CAPSULE", 'CAPSULE.cast("string"))
+    val automl = new H2OAutoML()
+      .setLabelCol("CAPSULE")
+      .setIgnoredCols(Array("ID"))
+      .setExcludeAlgos(Array("GLM"))
+      .setPreProcessing(Array("TargetEncoding"))
+      .setNfolds(3)
+      .setMaxModels(15)
+
+    automl.fit(trainingData)
+
+    val leaderboard = automl.getLeaderboard()
+    val teModelCount = leaderboard.filter('model_id.like("%TargetEncoder%")).count()
+    assert(teModelCount > 0)
+  }
 }
@@ -21,6 +21,7 @@
 from ai.h2o.sparkling.ml.params.H2OAutoMLStoppingCriteriaParams import H2OAutoMLStoppingCriteriaParams
 from ai.h2o.sparkling.ml.params.H2OCommonParams import H2OCommonParams
 from ai.h2o.sparkling.ml.params.HasMonotoneConstraints import HasMonotoneConstraints
+from ai.h2o.sparkling.ml.params.HasPreProcessing import HasPreProcessing
 from pyspark.ml.param import *
 
 
@@ -30,6 +31,7 @@ class H2OAutoMLParams(
     H2OAutoMLBuildModelsParams,
     H2OAutoMLInputParams,
     H2OAutoMLStoppingCriteriaParams,
-    HasMonotoneConstraints
+    HasMonotoneConstraints,
+    HasPreProcessing
 ):
     pass
@@ -0,0 +1,33 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from ai.h2o.sparkling.ml.params.H2OTypeConverters import H2OTypeConverters
+from pyspark.ml.param import *
+
+
+class HasPreProcessing(Params):
+    preProcessing = Param(
+        Params._dummy(),
+        "preProcessing",
+        "The list of pre-processing steps to run. Only 'TargetEncoding' is currently supported.",
+        H2OTypeConverters.toNullableListEnumString("ai.h2o.automl.preprocessing.PreprocessingStepDefinition$Type"))
+
+    def getPreProcessing(self):
+        return self.getOrDefault(self.preProcessing)
+
+    def setPreProcessing(self, value):
+        return self._set(preProcessing=value)
@@ -91,8 +91,8 @@ def testH2OAutoMLRegressorBehavesTheSameAsGenericH2OAutoMLOnNumericLabelColumn(p
     referenceModel = automl.fit(trainingDateset)
     referenceDataset = referenceModel.transform(testingDataset)
 
-    classifier = setParametersForTesting(H2OAutoMLRegressor())
-    model = classifier.fit(trainingDateset)
+    regressor = setParametersForTesting(H2OAutoMLRegressor())
+    model = regressor.fit(trainingDateset)
     result = model.transform(testingDataset)
 
     unit_test_utils.assert_data_frames_are_identical(referenceDataset, result)
@@ -110,3 +110,19 @@ def testH2OAutoMLClassifierBehavesDiffenrentlyThanH2OAutoMLRegressor(prostateDat
     classificationDataset = classificationModel.transform(testingDataset).drop("detailed_prediction")
 
     unit_test_utils.assert_data_frames_have_different_values(regressionDataset, classificationDataset)
+
+
+def testH2OAutoMLClassifierIsAbleToUseTargetEncoding(prostateDataset):
+    classifierWithTE = setParametersForTesting(H2OAutoMLClassifier(preProcessing=["TargetEncoding"])).setMaxModels(10)
+    classifierWithTE.fit(prostateDataset)
+    leaderboard = classifierWithTE.getLeaderboard()
+    numberOfTEModels = leaderboard.filter(leaderboard.model_id.like("%TargetEncoder%")).count()
+    assert numberOfTEModels > 0
+
+
+def testH2OAutoMLRegressorReturnsDifferentResultWithTargetEncoder(prostateDataset):
+    regressorWithTE = setParametersForTesting(H2OAutoMLRegressor(preProcessing=["TargetEncoding"])).setMaxModels(10)
+    regressorWithTE.fit(prostateDataset)
+    leaderboard = regressorWithTE.getLeaderboard()
+    numberOfTEModels = leaderboard.filter(leaderboard.model_id.like("%TargetEncoder%")).count()
+    assert numberOfTEModels > 0