-
Notifications
You must be signed in to change notification settings - Fork 359
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[GH-3018] Add UpliftDRF #5698
base: master
Are you sure you want to change the base?
[GH-3018] Add UpliftDRF #5698
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -29,23 +29,26 @@ import hex.schemas.DRFModelV3.DRFModelOutputV3 | |
import hex.schemas.DeepLearningModelV3.DeepLearningModelOutputV3 | ||
import hex.schemas.DeepLearningV3.{DeepLearningParametersV3 => DLParamsV3} | ||
import hex.schemas.ExtendedIsolationForestModelV3.ExtendedIsolationForestModelOutputV3 | ||
import hex.schemas.ExtendedIsolationForestV3.{ExtendedIsolationForestParametersV3 => ExtIFParamsV3} | ||
import hex.schemas.GAMModelV3.GAMModelOutputV3 | ||
import hex.schemas.GBMModelV3.GBMModelOutputV3 | ||
import hex.schemas.GLMModelV3.GLMModelOutputV3 | ||
import hex.schemas.IsolationForestModelV3.IsolationForestModelOutputV3 | ||
import hex.schemas.IsolationForestV3.{IsolationForestParametersV3 => IFParamsV3} | ||
import hex.schemas.ExtendedIsolationForestV3.{ExtendedIsolationForestParametersV3 => ExtIFParamsV3} | ||
import hex.schemas.KMeansModelV3.KMeansModelOutputV3 | ||
import hex.schemas.KMeansV3.{KMeansParametersV3 => KMeansParamsV3} | ||
import hex.schemas.RuleFitModelV3.RuleFitModelOutputV3 | ||
import hex.schemas.RuleFitV3.RuleFitParametersV3 | ||
import hex.schemas.UpliftDRFModelV3.UpliftDRFModelOutputV3 | ||
import hex.schemas.UpliftDRFV3.{UpliftDRFParametersV3 => UpliftParams} | ||
import hex.schemas.XGBoostModelV3.XGBoostModelOutputV3 | ||
import hex.schemas.XGBoostV3.{XGBoostParametersV3 => XGBParamsV3} | ||
import hex.schemas.{DRFV3, GAMV3, GBMV3, GLMV3} | ||
import hex.tree.drf.DRFModel.DRFParameters | ||
import hex.tree.gbm.GBMModel.GBMParameters | ||
import hex.tree.isofor.IsolationForestModel.{IsolationForestParameters => IFParameters} | ||
import hex.tree.isoforextended.ExtendedIsolationForestModel.{ExtendedIsolationForestParameters => ExtIFParams} | ||
import hex.tree.uplift.UpliftDRFModel.UpliftDRFParameters | ||
import hex.tree.xgboost.XGBoostModel.XGBoostParameters | ||
|
||
import java.util | ||
|
@@ -97,6 +100,10 @@ class AlgorithmConfigurations extends MultipleAlgorithmsConfiguration { | |
val gamFields = Seq(ignoredCols, betaConstraints, gamCols) | ||
val gbmFields = Seq(monotonicity, calibrationDataFrame, ignoredCols) | ||
val drfFields = Seq(calibrationDataFrame, ignoredCols) | ||
val upliftDrfFields = Seq( | ||
ExplicitField("treatment_column", "HasTreatmentCol", "treatment"), | ||
ExplicitField("response_column", "HasLabelCol", "label"), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why do we need to specify label column explicitly? Isn't uplift DRF just another supervised algorithm? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The response column is used the same way as in the supervised algorithm. |
||
ignoredCols) | ||
val kmeansFields = Seq(userPoints, ignoredCols) | ||
val coxPHFields = Seq(ignoredCols, interactionPairs) | ||
val ifFields = Seq(ignoredCols, calibrationDataFrame, validationLabelCol) | ||
|
@@ -118,6 +125,7 @@ class AlgorithmConfigurations extends MultipleAlgorithmsConfiguration { | |
("H2OXGBoostParams", classOf[XGBParamsV3], classOf[XGBoostParameters], xgboostFields, noDeprecation), | ||
("H2OGBMParams", classOf[GBMV3.GBMParametersV3], classOf[GBMParameters], gbmFields, noDeprecation), | ||
("H2ODRFParams", classOf[DRFV3.DRFParametersV3], classOf[DRFParameters], drfFields, noDeprecation), | ||
("H2OUpliftDRFParams", classOf[UpliftParams], classOf[UpliftDRFParameters], upliftDrfFields, noDeprecation), | ||
("H2OGLMParams", classOf[GLMV3.GLMParametersV3], classOf[GLMParameters], glmFields, noDeprecation), | ||
("H2OGAMParams", classOf[GAMV3.GAMParametersV3], classOf[GAMParameters], gamFields, noDeprecation), | ||
("H2ODeepLearningParams", classOf[DLParamsV3], classOf[DeepLearningParameters], dlFields, noDeprecation), | ||
|
@@ -158,6 +166,12 @@ class AlgorithmConfigurations extends MultipleAlgorithmsConfiguration { | |
("H2OXGBoost", classOf[XGBoostParameters], treeSupervised, Seq(withDistribution), None), | ||
("H2OGBM", classOf[GBMParameters], treeSupervised, Seq(withDistribution), None), | ||
("H2ODRF", classOf[DRFParameters], treeSupervised, Seq(withDistribution), None), | ||
( | ||
"H2OUpliftDRF", | ||
classOf[UpliftDRFParameters], | ||
treeUnsupervised, | ||
Seq(withDistribution, "H2OUpliftDRFExtras"), | ||
None), | ||
("H2OGLM", classOf[GLMParameters], cvSupervised, Seq(withFamily), Some("H2OGLMMetrics")), | ||
("H2OGAM", classOf[GAMParameters], cvSupervised, Seq(withFamily), None), | ||
("H2ODeepLearning", classOf[DeepLearningParameters], cvSupervised, Seq(withDistribution), None), | ||
|
@@ -202,6 +216,7 @@ class AlgorithmConfigurations extends MultipleAlgorithmsConfiguration { | |
("H2OXGBoostModelOutputs", classOf[XGBoostModelOutputV3]), | ||
("H2OGBMModelOutputs", classOf[GBMModelOutputV3]), | ||
("H2ODRFModelOutputs", classOf[DRFModelOutputV3]), | ||
("H2OUpliftDRFModelOutputs", classOf[UpliftDRFModelOutputV3]), | ||
("H2OGLMModelOutputs", classOf[GLMModelOutputV3]), | ||
("H2OGAMModelOutputs", classOf[GAMModelOutputV3]), | ||
("H2ODeepLearningModelOutputs", classOf[DeepLearningModelOutputV3]), | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,6 +19,7 @@ | |
\begin{itemize} | ||
\item DeepLearning | ||
\item DRF | ||
\item UpliftDRF | ||
\item GBM | ||
\item XGBoost | ||
\item AutoML | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,144 @@ | ||
.. _uplift_drf: | ||
|
||
Train Distributed Uplift Random Forest (Uplift DRF) Model in Sparkling Water | ||
---------------------------------------------------------------------------- | ||
|
||
Introduction | ||
~~~~~~~~~~~~ | ||
Distributed Uplift Random Forest (Uplift DRF) is a classification tool for modeling uplift - the incremental impact of a treatment. Only binomial classification (distribution="bernoulli") is currently supported. | ||
Uplift DRF can be applied in fields where we operate with two groups of subjects. The first group, let’s call it treatment, receives some kind of treatment (e.g. marketing campaign, medicine, …), and the second group, let’s call it control, is kept separate from the treatment. We also gather information about their response: whether they bought a product, recovered from a disease, or similar. Then, Uplift DRF trains so-called uplift trees. | ||
For a more comprehensive description, see the `H2O-3 Distributed Uplift Random Forest (Uplift DRF) documentation <https://docs.h2o.ai/h2o/latest-stable/h2o-docs/data-science/upliftdrf.html>`__. | ||
|
||
Example | ||
~~~~~~~ | ||
|
||
The following section describes how to train the Distributed Uplift Random Forest (Uplift DRF) model in Sparkling Water in Scala & Python following the same example as H2O-3 documentation mentioned above. See also :ref:`parameters_H2OUpliftDRF` | ||
and :ref:`model_details_H2OUpliftDRFMOJOModel`. | ||
|
||
.. content-tabs:: | ||
|
||
.. tab-container:: Scala | ||
:title: Scala | ||
|
||
First, let's start Sparkling Shell as | ||
|
||
.. code:: shell | ||
|
||
./bin/sparkling-shell | ||
|
||
Start H2O cluster inside the Spark environment | ||
|
||
.. code:: scala | ||
|
||
import ai.h2o.sparkling._ | ||
import java.net.URI | ||
val hc = H2OContext.getOrCreate() | ||
|
||
Load the data into a Spark DataFrame and split it into training and testing datasets | ||
|
||
.. code:: scala | ||
|
||
import org.apache.spark.SparkFiles | ||
val datasetUrl = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/uplift/criteo_uplift_13k.csv" | ||
spark.sparkContext.addFile(datasetUrl) //for example purposes, on a real cluster it's better to load directly from distributed storage | ||
val sparkDF = spark.read.option("header", "true").option("inferSchema", "true").csv(SparkFiles.get("criteo_uplift_13k.csv")) | ||
val Array(trainingDF, testingDF) = sparkDF.randomSplit(Array(0.8, 0.2)) | ||
|
||
Train the model. You can configure all the available Distributed Uplift Random Forest (Uplift DRF) arguments using provided setters. | ||
|
||
.. code:: scala | ||
|
||
import ai.h2o.sparkling.ml.algos.H2OUpliftDRF | ||
|
||
val predictorColumns = Array("f1", "f2", "f3", "f4", "f5", "f6", "f7", "f8") | ||
val responseColumn = "conversion" | ||
val treatmentColumn = "treatment" | ||
|
||
val algo = new H2OUpliftDRF() | ||
.setNtrees(10) | ||
.setMaxDepth(5) | ||
.setTreatmentCol(treatmentColumn) | ||
.setUpliftMetric("KL") | ||
.setMinRows(10) | ||
.setSeed(1234) | ||
.setAuucType("qini") | ||
.setLabelCol(responseColumn) | ||
.setFeaturesCols(predictorColumns :+ treatmentColumn :+ responseColumn) | ||
|
||
val model = algo.fit(trainingDF) | ||
|
||
Run Predictions | ||
|
||
.. code:: scala | ||
|
||
model.transform(testingDF).show(truncate = false) | ||
|
||
View model summary containing info about trained trees etc. | ||
|
||
.. code:: scala | ||
|
||
model.getModelSummary() | ||
|
||
You can also get other model details by calling methods listed in :ref:`model_details_H2OUpliftDRFMOJOModel`. | ||
|
||
|
||
.. tab-container:: Python | ||
:title: Python | ||
|
||
First, let's start PySparkling Shell as | ||
|
||
.. code:: shell | ||
|
||
./bin/pysparkling | ||
|
||
Start H2O cluster inside the Spark environment | ||
|
||
.. code:: python | ||
|
||
from pysparkling import * | ||
hc = H2OContext.getOrCreate() | ||
|
||
Parse the data using H2O and convert them to Spark Frame | ||
|
||
.. code:: python | ||
|
||
import h2o | ||
frame = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/uplift/criteo_uplift_13k.csv") | ||
sparkDF = hc.asSparkFrame(frame) | ||
[trainingDF, testingDF] = sparkDF.randomSplit([0.8, 0.2]) | ||
|
||
Train the model. You can configure all the available UpliftDRF arguments using provided setters or constructor parameters. | ||
|
||
.. code:: python | ||
|
||
from pysparkling.ml import H2OUpliftDRF | ||
|
||
treatmentColumn = "treatment" | ||
responseColumn = "conversion" | ||
predictors = ["f1", "f2", "f3", "f4", "f5", "f6", "f7", "f8", treatmentColumn, responseColumn] | ||
|
||
algo = H2OUpliftDRF(featuresCols=predictors, | ||
ntrees = 10, | ||
maxDepth = 5, | ||
treatmentCol = treatmentColumn, | ||
upliftMetric = "KL", | ||
minRows = 10, | ||
seed = 1234, | ||
auucType = "qini", | ||
labelCol = responseColumn) | ||
|
||
model = algo.fit(trainingDF) | ||
|
||
Run Predictions | ||
|
||
.. code:: python | ||
|
||
model.transform(testingDF).show(truncate = False) | ||
|
||
View model summary containing info about trained trees etc. | ||
|
||
.. code:: python | ||
|
||
model.getModelSummary() | ||
|
||
You can also get other model details by calling methods listed in :ref:`model_details_H2OUpliftDRFMOJOModel`. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why do we need an explicit field for treatment? Could we just add another rule to
ParameterNameConverter
?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, for the uplift algorithm, the new treatment column is crucial. However I am not sure how this algorithm configuration works, so I am not sure if this is the correct way to add the treatment column.