Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SW-2454] Expose preprocessing Parameter on AutoML #2337

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,10 @@ trait AutoMLConfiguration extends AlgorithmConfigurations {
defaultValueFieldPrefix = "",
typeExceptions = Map("sort_metric" -> classOf[H2OAutoMLSortMetric]),
defaultValueSource = source,
defaultValuesOfCommonParameters = defaultValuesOfCommonParameters ++
Map("monotoneConstraints" -> new util.HashMap[String, Double](), "ignoredCols" -> ignoredCols.defaultValue),
defaultValuesOfCommonParameters = defaultValuesOfCommonParameters ++ Map(
"monotoneConstraints" -> new util.HashMap[String, Double](),
"ignoredCols" -> ignoredCols.defaultValue,
"preProcessing" -> null),
generateParamTag = false)
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ object AutoMLIgnoredParameters {
"blending_frame",
"leaderboard_frame",
"monotone_constraints",
"preprocessing",
"stopping_criteria",
"modeling_plan",
"algo_parameters")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,8 @@ class H2OAutoML(override val uid: String)
// Removing "include_algos", "exclude_algos" from H2OAutoMLBuildModelsParams since an effective set of algorithms
// needs to be calculated and stored into "include_algos". The "exclude_algos" are then reset to null and both
// altered parameters are added to the result.
val essentialParameters = getH2OAutoMLBuildModelsParams() - ("include_algos", "exclude_algos")
val essentialParameters =
getH2OAutoMLBuildModelsParams() ++ getPreProcessingParams() - ("include_algos", "exclude_algos")

essentialParameters ++ Map("include_algos" -> determineIncludedAlgos(), "exclude_algos" -> null) ++ extra
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,4 @@ trait H2OAutoMLParams
with H2OAutoMLInputParams
with H2OAutoMLStoppingCriteriaParams
with HasMonotoneConstraints
with HasPreProcessing
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package ai.h2o.sparkling.ml.params

import ai.h2o.automl.preprocessing.PreprocessingStepDefinition
import ai.h2o.sparkling.H2OFrame
import org.apache.spark.expose.Logging

/**
  * Parameter mixin that adds the `preProcessing` parameter to an H2O algorithm and forwards it
  * to the backend under the key `preprocessing`.
  */
trait HasPreProcessing extends H2OAlgoParamsBase {

  // Nullable array-of-strings parameter; the default (null) is registered right below.
  private val preProcessing = nullableStringArrayParam(
    "preProcessing",
    "The list of pre-processing steps to run. Only 'TargetEncoding' is currently supported.")

  setDefault(preProcessing -> null)

  /** Returns the currently configured pre-processing steps (null unless set). */
  def getPreProcessing(): Array[String] = $(preProcessing)

  /**
    * Stores the pre-processing steps after passing them through `EnumParamValidator` for the
    * `PreprocessingStepDefinition.Type` enum. A null value is accepted (`nullEnabled = true`).
    *
    * @param value names of the pre-processing steps, or null
    * @return this instance, to allow call chaining
    */
  def setPreProcessing(value: Array[String]): this.type = {
    val checkedSteps =
      EnumParamValidator.getValidatedEnumValues[PreprocessingStepDefinition.Type](value, nullEnabled = true)
    set(preProcessing, checkedSteps)
  }

  /** Extends the inherited algorithm parameters with the `preprocessing` entry. */
  override private[sparkling] def getH2OAlgorithmParams(trainingFrame: H2OFrame): Map[String, Any] = {
    val inherited = super.getH2OAlgorithmParams(trainingFrame)
    inherited ++ getPreProcessingParams()
  }

  /**
    * Builds the single-entry map sent to the backend. Each configured step name is converted to
    * its `PreprocessingStepDefinition.Type` constant and wrapped as `Map("type" -> constant)`;
    * when no steps are configured, the entry's value is null.
    */
  private[sparkling] def getPreProcessingParams(): Map[String, Any] = {
    val backendSteps = Option(getPreProcessing())
      .map(_.map(stepName => Map("type" -> PreprocessingStepDefinition.Type.valueOf(stepName))))
      .orNull
    Map("preprocessing" -> backendSteps)
  }

  /** Adds the `preProcessing` -> `preprocessing` name translation to the inherited mapping. */
  override private[sparkling] def getSWtoH2OParamNameMap(): Map[String, String] = {
    val ownMapping = Map("preProcessing" -> "preprocessing")
    super.getSWtoH2OParamNameMap() ++ ownMapping
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -82,4 +82,18 @@ class H2OAutoMLTestSuite extends FunSuite with Matchers with SharedH2OTestContex
val model = automl.fit(dataset)
model.transform(dataset).collect()
}

test("TargetEncoding configuration on AutoML is propagated to backend") {
val automl = new H2OAutoML()
.setLabelCol("CAPSULE")
.setIgnoredCols(Array("ID"))
.setExcludeAlgos(Array("GLM"))
.setPreProcessing(Array("TargetEncoding"))
.setNfolds(3)
.setMaxModels(15)

automl.fit(dataset.withColumn("CAPSULE", 'CAPSULE.cast("string")))
val numberOfModelsWithTE = automl.getLeaderboard().filter('model_id.like("%TargetEncoder%")).count()
assert(numberOfModelsWithTE > 0)
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@sebhrusen I'm struggling to make a proper assertion that the target encoder configuration got propagated to the H2O-3 backend correctly. I always get models without TE in the name:

    +---+---------------------------------------------------+------------------+------------------+------------------+--------------------+-------------------+-------------------+
    |   |model_id                                           |auc               |logloss           |aucpr             |mean_per_class_error|rmse               |mse                |
    +---+---------------------------------------------------+------------------+------------------+------------------+--------------------+-------------------+-------------------+
    |0  |XGBoost_grid__1_AutoML_20200930_144425_model_2     |0.8008983329014425|0.5324630843647561|0.7264049743268661|0.2481068785810947  |0.4206946467535924 |0.17698398580712993|
    |1  |StackedEnsemble_BestOfFamily_AutoML_20200930_144425|0.796089948461029 |0.5352290055942586|0.7157876486963061|0.2413549854596758  |0.4218637819513173 |0.1779690505222686 |
    |2  |GBM_3_AutoML_20200930_144425                       |0.794304799746624 |0.5428050914514395|0.7136431558249916|0.240001727563272   |0.4233602336874402 |0.17923388746788396|
    |3  |XGBoost_2_AutoML_20200930_144425                   |0.7897555497970113|0.5438787026540678|0.6745633505945536|0.23801502980046646 |0.4241036764691094 |0.179863928394615  |
    |4  |StackedEnsemble_AllModels_AutoML_20200930_144425   |0.788546255506608 |0.5456330425471603|0.6993100693507803|0.25662952405631856 |0.4271499395420181 |0.1824570708507497 |
    |5  |XRT_1_AutoML_20200930_144425                       |0.7869914485618036|0.5458060203482348|0.7085552022729108|0.2466816388816907  |0.4275499590875706 |0.1827989675157833 |
    |6  |XGBoost_1_AutoML_20200930_144425                   |0.7866603322680027|0.5514808802580531|0.6793692069162894|0.2721200080619619  |0.42849949489139705|0.18361181712218239|
    |7  |XGBoost_3_AutoML_20200930_144425                   |0.7852926780109988|0.5585836539217749|0.7115903473071291|0.2751720365091705  |0.43035151169917557|0.18520242362176567|
    |8  |GBM_2_AutoML_20200930_144425                       |0.7832771875269932|0.5586083399653526|0.7057832950302982|0.25236820131870663 |0.4333758325064237 |0.1878146122006358 |
    |9  |GBM_4_AutoML_20200930_144425                       |0.779678097376983 |0.5586810781201778|0.7046138968774106|0.26643344562494603 |0.42958672754465704|0.1845447564825274 |
    |10 |DRF_1_AutoML_20200930_144425                       |0.7746105784457689|0.6439713162064343|0.6941656782674943|0.27183208084996113 |0.43279714728695945|0.18731337069973006|
    |11 |GBM_grid__1_AutoML_20200930_144425_model_1         |0.7733436987129654|0.566404361998668 |0.7024741690635927|0.307866171431862   |0.43590550114090165|0.19001360592490063|
    |12 |GBM_1_AutoML_20200930_144425                       |0.7721056117013619|0.585228079727758 |0.6925117235220278|0.3010422965074429  |0.44199090396413526|0.19535595918703344|
    |13 |DeepLearning_1_AutoML_20200930_144425              |0.7687080705997524|0.6047802837929463|0.6842854311283253|0.28547983069879934 |0.4470894540833909 |0.1998889799525845 |
    |14 |XGBoost_grid__1_AutoML_20200930_144425_model_1     |0.7667357691975469|0.5698147128763724|0.6416898453775174|0.25798278195272234 |0.4382533915528264 |0.19206603520755497|
    |15 |GBM_5_AutoML_20200930_144425                       |0.7561256514353172|0.5771390347876815|0.6219361203422313|0.27766260689297745 |0.44333348493674946|0.19654457886616306|
    |16 |DeepLearning_grid__1_AutoML_20200930_144425_model_1|0.750712619849702 |0.8332323707654468|0.6709959427755803|0.32626472027871356 |0.4750086911835548 |0.22563325669991374|
    +---+---------------------------------------------------+------------------+------------------+------------------+--------------------+-------------------+-------------------+

The parameters sent to H2OBackend are:

{
  "input_spec": {
    "response_column": "CAPSULE",
    "fold_column": null,
    "weights_column": null,
    "sort_metric": "AUTO",
    "training_frame": "frame_rdd_133-929173483"
  },
  "build_models": {
    "exploitation_ratio": 0,
    "preprocessing": [
      {
        "type": "TargetEncoding"
      }
    ],
    "include_algos": [
      "DRF",
      "GBM",
      "DeepLearning",
      "StackedEnsemble",
      "XGBoost"
    ],
    "exclude_algos": null
  },
  "build_control": {
    "class_sampling_factors": null,
    "keep_cross_validation_fold_assignment": false,
    "max_after_balance_size": 5,
    "balance_classes": false,
    "stopping_criteria": {
      "stopping_rounds": 3,
      "seed": -1,
      "max_runtime_secs_per_model": 0,
      "max_runtime_secs": 0,
      "max_models": 15,
      "stopping_tolerance": -1,
      "stopping_metric": "AUTO"
    },
    "export_checkpoints_dir": null,
    "nfolds": 3,
    "keep_cross_validation_predictions": false,
    "project_name": null,
    "keep_cross_validation_models": false
  }
}

Do you have any idea what I'm doing wrong? I went over the tests in your PR h2oai/h2o-3#4927, but I haven't noticed any special configuration.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

that's a good point: I don't know if we will want to change the model's name when they're trained with TE, currently it's not the case.
I think you do everything right: checking if a model uses TE is not simple today, especially as AutoML will apply TE only under certain conditions (the training dataset must have categorical columns, which themselves need to fulfill certain cardinality constraints).
The easiest is to check backend logs (look for preprocessors property in model parameters) and/or download the model's json representation.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks @sebhrusen!

I don't know if we will want to change the model's name when they're trained with TE, currently it's not the case.

I got inspired by your tests here https://github.com/h2oai/h2o-3/pull/4927/files#diff-9f262b275056f042a5247e16d4bf59c9R35, but apparently there is no relation between the keys in your test and model_id in the leaderboard.

The easiest is to check backend logs (look for preprocessors property in model parameters) and/or download the model's json representation.

I will try to investigate json details of the model.

}
}
4 changes: 3 additions & 1 deletion py/src/ai/h2o/sparkling/ml/params/H2OAutoMLParams.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from ai.h2o.sparkling.ml.params.H2OAutoMLStoppingCriteriaParams import H2OAutoMLStoppingCriteriaParams
from ai.h2o.sparkling.ml.params.H2OCommonParams import H2OCommonParams
from ai.h2o.sparkling.ml.params.HasMonotoneConstraints import HasMonotoneConstraints
from ai.h2o.sparkling.ml.params.HasPreProcessing import HasPreProcessing
from pyspark.ml.param import *


Expand All @@ -30,6 +31,7 @@ class H2OAutoMLParams(
H2OAutoMLBuildModelsParams,
H2OAutoMLInputParams,
H2OAutoMLStoppingCriteriaParams,
HasMonotoneConstraints
HasMonotoneConstraints,
HasPreProcessing
):
pass
33 changes: 33 additions & 0 deletions py/src/ai/h2o/sparkling/ml/params/HasPreProcessing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from ai.h2o.sparkling.ml.params.H2OTypeConverters import H2OTypeConverters
from pyspark.ml.param import *


class HasPreProcessing(Params):
    """
    Params mixin declaring the ``preProcessing`` parameter together with its getter and setter.
    """

    # The type converter is configured with the name of the backend enum
    # ``PreprocessingStepDefinition$Type``.
    preProcessing = Param(
        parent=Params._dummy(),
        name="preProcessing",
        doc="The list of pre-processing steps to run. Only 'TargetEncoding' is currently supported.",
        typeConverter=H2OTypeConverters.toNullableListEnumString(
            "ai.h2o.automl.preprocessing.PreprocessingStepDefinition$Type"))

    def getPreProcessing(self):
        """Return the value of ``preProcessing``, or its default when it was not set."""
        currentValue = self.getOrDefault(self.preProcessing)
        return currentValue

    def setPreProcessing(self, value):
        """Set ``preProcessing`` to ``value`` and return the result of ``_set``."""
        newValues = {"preProcessing": value}
        return self._set(**newValues)
20 changes: 18 additions & 2 deletions py/tests/unit/with_runtime_sparkling/test_automl.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,8 +91,8 @@ def testH2OAutoMLRegressorBehavesTheSameAsGenericH2OAutoMLOnNumericLabelColumn(p
referenceModel = automl.fit(trainingDateset)
referenceDataset = referenceModel.transform(testingDataset)

classifier = setParametersForTesting(H2OAutoMLRegressor())
model = classifier.fit(trainingDateset)
regressor = setParametersForTesting(H2OAutoMLRegressor())
model = regressor.fit(trainingDateset)
result = model.transform(testingDataset)

unit_test_utils.assert_data_frames_are_identical(referenceDataset, result)
Expand All @@ -110,3 +110,19 @@ def testH2OAutoMLClassifierBehavesDiffenrentlyThanH2OAutoMLRegressor(prostateDat
classificationDataset = classificationModel.transform(testingDataset).drop("detailed_prediction")

unit_test_utils.assert_data_frames_have_different_values(regressionDataset, classificationDataset)


def testH2OAutoMLClassifierIsAbleToUseTargetEncoding(prostateDataset):
    # Train an AutoML classifier with target encoding requested and expect at least one
    # leaderboard model whose id contains "TargetEncoder".
    # NOTE(review): the review thread on this change suggests model ids may not carry a
    # TE marker at all - confirm this assertion can actually pass.
    estimator = setParametersForTesting(H2OAutoMLClassifier(preProcessing=["TargetEncoding"]))
    estimator = estimator.setMaxModels(10)
    estimator.fit(prostateDataset)

    board = estimator.getLeaderboard()
    teModels = board.filter(board.model_id.like("%TargetEncoder%"))
    teModelCount = teModels.count()
    assert teModelCount > 0


def testH2OAutoMLRegressorReturnsDifferentResultWithTargetEncoder(prostateDataset):
    # Train an AutoML regressor with target encoding requested and expect at least one
    # leaderboard model whose id contains "TargetEncoder".
    # NOTE(review): despite the test name, no comparison against a run without target
    # encoding is made here; only the leaderboard model ids are inspected.
    estimator = setParametersForTesting(H2OAutoMLRegressor(preProcessing=["TargetEncoding"]))
    estimator = estimator.setMaxModels(10)
    estimator.fit(prostateDataset)

    board = estimator.getLeaderboard()
    teModels = board.filter(board.model_id.like("%TargetEncoder%"))
    teModelCount = teModels.count()
    assert teModelCount > 0