diff --git a/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/common/AutoMLConfiguration.scala b/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/common/AutoMLConfiguration.scala index d72ae72757..18020bdf54 100644 --- a/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/common/AutoMLConfiguration.scala +++ b/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/common/AutoMLConfiguration.scala @@ -30,8 +30,10 @@ trait AutoMLConfiguration extends AlgorithmConfigurations { defaultValueFieldPrefix = "", typeExceptions = Map("sort_metric" -> classOf[H2OAutoMLSortMetric]), defaultValueSource = source, - defaultValuesOfCommonParameters = defaultValuesOfCommonParameters ++ - Map("monotoneConstraints" -> new util.HashMap[String, Double](), "ignoredCols" -> ignoredCols.defaultValue), + defaultValuesOfCommonParameters = defaultValuesOfCommonParameters ++ Map( + "monotoneConstraints" -> new util.HashMap[String, Double](), + "ignoredCols" -> ignoredCols.defaultValue, + "preProcessing" -> null), generateParamTag = false) } diff --git a/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/common/AutoMLIgnoredParameters.scala b/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/common/AutoMLIgnoredParameters.scala index 599cf912d5..5f8dd7c6d2 100644 --- a/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/common/AutoMLIgnoredParameters.scala +++ b/api-generation/src/main/scala/ai/h2o/sparkling/api/generation/common/AutoMLIgnoredParameters.scala @@ -28,6 +28,7 @@ object AutoMLIgnoredParameters { "blending_frame", "leaderboard_frame", "monotone_constraints", + "preprocessing", "stopping_criteria", "modeling_plan", "algo_parameters") diff --git a/ml/src/main/scala/ai/h2o/sparkling/ml/algos/H2OAutoML.scala b/ml/src/main/scala/ai/h2o/sparkling/ml/algos/H2OAutoML.scala index 40ddd69e49..de085f3d31 100644 --- a/ml/src/main/scala/ai/h2o/sparkling/ml/algos/H2OAutoML.scala +++ b/ml/src/main/scala/ai/h2o/sparkling/ml/algos/H2OAutoML.scala @@ -68,7 +68,8 @@ class H2OAutoML(override val uid: String) // Removing "include_algos", "exclude_algos" from s H2OAutoMLBuildModelsParams since an effective set algorithms // needs to be calculated and stored into "include_algos". The "exclude_algos" are then reset to null and both // altered parameters are added to the result. - val essentialParameters = getH2OAutoMLBuildModelsParams() - ("include_algos", "exclude_algos") + val essentialParameters = + getH2OAutoMLBuildModelsParams() ++ getPreProcessingParams() - ("include_algos", "exclude_algos") essentialParameters ++ Map("include_algos" -> determineIncludedAlgos(), "exclude_algos" -> null) ++ extra } diff --git a/ml/src/main/scala/ai/h2o/sparkling/ml/params/H2OAutoMLParams.scala b/ml/src/main/scala/ai/h2o/sparkling/ml/params/H2OAutoMLParams.scala index e42b4a4d0f..d944226dac 100644 --- a/ml/src/main/scala/ai/h2o/sparkling/ml/params/H2OAutoMLParams.scala +++ b/ml/src/main/scala/ai/h2o/sparkling/ml/params/H2OAutoMLParams.scala @@ -23,3 +23,4 @@ trait H2OAutoMLParams with H2OAutoMLInputParams with H2OAutoMLStoppingCriteriaParams with HasMonotoneConstraints + with HasPreProcessing diff --git a/ml/src/main/scala/ai/h2o/sparkling/ml/params/HasPreProcessing.scala b/ml/src/main/scala/ai/h2o/sparkling/ml/params/HasPreProcessing.scala new file mode 100644 index 0000000000..0e20207160 --- /dev/null +++ b/ml/src/main/scala/ai/h2o/sparkling/ml/params/HasPreProcessing.scala @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package ai.h2o.sparkling.ml.params + +import ai.h2o.automl.preprocessing.PreprocessingStepDefinition +import ai.h2o.sparkling.H2OFrame +import org.apache.spark.expose.Logging + +trait HasPreProcessing extends H2OAlgoParamsBase { + private val preProcessing = nullableStringArrayParam( + "preProcessing", + "The list of pre-processing steps to run. Only 'TargetEncoding' is currently supported.") + + setDefault(preProcessing -> null) + + def getPreProcessing(): Array[String] = $(preProcessing) + + def setPreProcessing(value: Array[String]): this.type = { + type EnumType = PreprocessingStepDefinition.Type + val validated = EnumParamValidator.getValidatedEnumValues[EnumType](value, nullEnabled = true) + set(preProcessing, validated) + } + + override private[sparkling] def getH2OAlgorithmParams(trainingFrame: H2OFrame): Map[String, Any] = { + super.getH2OAlgorithmParams(trainingFrame) ++ getPreProcessingParams() + } + + private[sparkling] def getPreProcessingParams(): Map[String, Any] = { + val value = getPreProcessing() + val valueToBackend = if (value == null) { + null + } else { + value.map { enumValue => + val stepType = PreprocessingStepDefinition.Type.valueOf(enumValue) + Map("type" -> stepType) + } + } + Map("preprocessing" -> valueToBackend) + } + + override private[sparkling] def getSWtoH2OParamNameMap(): Map[String, String] = { + super.getSWtoH2OParamNameMap() ++ Map("preProcessing" -> "preprocessing") + } +} diff --git a/ml/src/test/scala/ai/h2o/sparkling/ml/algos/H2OAutoMLTestSuite.scala b/ml/src/test/scala/ai/h2o/sparkling/ml/algos/H2OAutoMLTestSuite.scala index 6089130148..22413c5d10 100644 --- a/ml/src/test/scala/ai/h2o/sparkling/ml/algos/H2OAutoMLTestSuite.scala +++ b/ml/src/test/scala/ai/h2o/sparkling/ml/algos/H2OAutoMLTestSuite.scala @@ -82,4 +82,18 @@ class H2OAutoMLTestSuite extends FunSuite with Matchers with SharedH2OTestContex val model = automl.fit(dataset) model.transform(dataset).collect() } + + test("TargetEncoding configuration on AutoML is propagated to backend") { + val automl = new H2OAutoML() + .setLabelCol("CAPSULE") + .setIgnoredCols(Array("ID")) + .setExcludeAlgos(Array("GLM")) + .setPreProcessing(Array("TargetEncoding")) + .setNfolds(3) + .setMaxModels(15) + + automl.fit(dataset.withColumn("CAPSULE", 'CAPSULE.cast("string"))) + val numberOfModelsWithTE = automl.getLeaderboard().filter('model_id.like("%TargetEncoder%")).count() + assert(numberOfModelsWithTE > 0) + } } diff --git a/py/src/ai/h2o/sparkling/ml/params/H2OAutoMLParams.py b/py/src/ai/h2o/sparkling/ml/params/H2OAutoMLParams.py index 254ed34916..e3fd2c0bf3 100644 --- a/py/src/ai/h2o/sparkling/ml/params/H2OAutoMLParams.py +++ b/py/src/ai/h2o/sparkling/ml/params/H2OAutoMLParams.py @@ -21,6 +21,7 @@ from ai.h2o.sparkling.ml.params.H2OAutoMLStoppingCriteriaParams import H2OAutoMLStoppingCriteriaParams from ai.h2o.sparkling.ml.params.H2OCommonParams import H2OCommonParams from ai.h2o.sparkling.ml.params.HasMonotoneConstraints import HasMonotoneConstraints +from ai.h2o.sparkling.ml.params.HasPreProcessing import HasPreProcessing from pyspark.ml.param import * @@ -30,6 +31,7 @@ class H2OAutoMLParams( H2OAutoMLBuildModelsParams, H2OAutoMLInputParams, H2OAutoMLStoppingCriteriaParams, - HasMonotoneConstraints + HasMonotoneConstraints, + HasPreProcessing ): pass diff --git a/py/src/ai/h2o/sparkling/ml/params/HasPreProcessing.py b/py/src/ai/h2o/sparkling/ml/params/HasPreProcessing.py new file mode 100644 index 0000000000..b0a6676d6b --- /dev/null +++ b/py/src/ai/h2o/sparkling/ml/params/HasPreProcessing.py @@ -0,0 +1,33 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from ai.h2o.sparkling.ml.params.H2OTypeConverters import H2OTypeConverters +from pyspark.ml.param import * + + +class HasPreProcessing(Params): + preProcessing = Param( + Params._dummy(), + "preProcessing", + "The list of pre-processing steps to run. Only 'TargetEncoding' is currently supported.", + H2OTypeConverters.toNullableListEnumString("ai.h2o.automl.preprocessing.PreprocessingStepDefinition$Type")) + + def getPreProcessing(self): + return self.getOrDefault(self.preProcessing) + + def setPreProcessing(self, value): + return self._set(preProcessing=value) diff --git a/py/tests/unit/with_runtime_sparkling/test_automl.py b/py/tests/unit/with_runtime_sparkling/test_automl.py index f24da5f45d..6d4355eecf 100644 --- a/py/tests/unit/with_runtime_sparkling/test_automl.py +++ b/py/tests/unit/with_runtime_sparkling/test_automl.py @@ -91,8 +91,8 @@ def testH2OAutoMLRegressorBehavesTheSameAsGenericH2OAutoMLOnNumericLabelColumn(p referenceModel = automl.fit(trainingDateset) referenceDataset = referenceModel.transform(testingDataset) - classifier = setParametersForTesting(H2OAutoMLRegressor()) - model = classifier.fit(trainingDateset) + regressor = setParametersForTesting(H2OAutoMLRegressor()) + model = regressor.fit(trainingDateset) result = model.transform(testingDataset) unit_test_utils.assert_data_frames_are_identical(referenceDataset, result) @@ -110,3 +110,19 @@ def testH2OAutoMLClassifierBehavesDiffenrentlyThanH2OAutoMLRegressor(prostateDat classificationDataset = classificationModel.transform(testingDataset).drop("detailed_prediction") unit_test_utils.assert_data_frames_have_different_values(regressionDataset, classificationDataset) + + +def testH2OAutoMLClassifierIsAbleToUseTargetEncoding(prostateDataset): + classifierWithTE = setParametersForTesting(H2OAutoMLClassifier(preProcessing=["TargetEncoding"])).setMaxModels(10) + classifierWithTE.fit(prostateDataset) + leaderboard = classifierWithTE.getLeaderboard() + numberOfTEModels = leaderboard.filter(leaderboard.model_id.like("%TargetEncoder%")).count() + assert numberOfTEModels > 0 + + +def testH2OAutoMLRegressorReturnsDifferentResultWithTargetEncoder(prostateDataset): + regressorWithTE = setParametersForTesting(H2OAutoMLRegressor(preProcessing=["TargetEncoding"])).setMaxModels(10) + regressorWithTE.fit(prostateDataset) + leaderboard = regressorWithTE.getLeaderboard() + numberOfTEModels = leaderboard.filter(leaderboard.model_id.like("%TargetEncoder%")).count() + assert numberOfTEModels > 0