From 4ee272cc4f28ac315a9f8f80d8ce2033bfd1a6b3 Mon Sep 17 00:00:00 2001 From: Veronika Maurerova Date: Tue, 17 Oct 2023 18:49:14 +0200 Subject: [PATCH 1/5] Fix Uplift MOJO API, add tests --- .../src/main/java/hex/generic/Generic.java | 1 + .../main/java/hex/generic/GenericModel.java | 4 ++ .../src/product/data-science/upliftdrf.rst | 3 +- .../algos/upliftdrf/UpliftDrfMojoReader.java | 2 +- .../uplift/pyunit_uplift_rf_mojo.py | 70 +++++++++++++++++++ .../runit_uplift_compare_h2o_vs_upliftRF.R | 2 +- .../uplift/runit_uplift_rf_mojo.R | 60 ++++++++++++++++ 7 files changed, 139 insertions(+), 3 deletions(-) create mode 100644 h2o-py/tests/testdir_algos/uplift/pyunit_uplift_rf_mojo.py create mode 100644 h2o-r/tests/testdir_algos/uplift/runit_uplift_rf_mojo.R diff --git a/h2o-algos/src/main/java/hex/generic/Generic.java b/h2o-algos/src/main/java/hex/generic/Generic.java index 1885dc41c87f..3d819acc1737 100644 --- a/h2o-algos/src/main/java/hex/generic/Generic.java +++ b/h2o-algos/src/main/java/hex/generic/Generic.java @@ -39,6 +39,7 @@ public class Generic extends ModelBuilder`__. +Uplift DRF supports importing and exporting `MOJOs <../save-and-load-model.html#supported-mojos>`__. 
+ Uplift DRF demo ~~~~~~~~~~~~~~~ diff --git a/h2o-genmodel/src/main/java/hex/genmodel/algos/upliftdrf/UpliftDrfMojoReader.java b/h2o-genmodel/src/main/java/hex/genmodel/algos/upliftdrf/UpliftDrfMojoReader.java index a8b3a877b540..81dfbb9c32e9 100644 --- a/h2o-genmodel/src/main/java/hex/genmodel/algos/upliftdrf/UpliftDrfMojoReader.java +++ b/h2o-genmodel/src/main/java/hex/genmodel/algos/upliftdrf/UpliftDrfMojoReader.java @@ -4,7 +4,7 @@ import java.io.IOException; /** - */ + */ public class UpliftDrfMojoReader extends SharedTreeMojoReader { @Override diff --git a/h2o-py/tests/testdir_algos/uplift/pyunit_uplift_rf_mojo.py b/h2o-py/tests/testdir_algos/uplift/pyunit_uplift_rf_mojo.py new file mode 100644 index 000000000000..e53ac047fff5 --- /dev/null +++ b/h2o-py/tests/testdir_algos/uplift/pyunit_uplift_rf_mojo.py @@ -0,0 +1,70 @@ +import sys, os + +sys.path.insert(1, os.path.join("..", "..", "..")) +import h2o +from tests import pyunit_utils, assert_equals +from h2o.estimators import H2OUpliftRandomForestEstimator + + +def uplift_random_forest_mojo(): + print("Uplift Distributed Random Forest MOJO Test") + seed = 12345 + + treatment_column = "treatment" + response_column = "outcome" + x_names = ["feature_"+str(x) for x in range(1, 13)] + + train_h2o = h2o.upload_file(pyunit_utils.locate("smalldata/uplift/upliftml_train.csv")) + train_h2o[treatment_column] = train_h2o[treatment_column].asfactor() + train_h2o[response_column] = train_h2o[response_column].asfactor() + + n_samples = train_h2o.shape[0] + + uplift_model = H2OUpliftRandomForestEstimator( + ntrees=10, + max_depth=5, + treatment_column=treatment_column, + uplift_metric="KL", + distribution="bernoulli", + min_rows=10, + nbins=1000, + seed=seed, + sample_rate=0.99, + auuc_type="gain" + ) + uplift_model.train(y=response_column, x=x_names, training_frame=train_h2o) + prediction = uplift_model.predict(train_h2o) + + assert_equals(n_samples, prediction.shape[0], "Not correct shape") + assert_equals(3, 
prediction.shape[1], "Not correct shape") + print(prediction) + uplift_predict = prediction['uplift_predict'].as_data_frame(use_pandas=True)["uplift_predict"] + + path = pyunit_utils.locate("results") + + assert os.path.isdir(path), "Expected save directory {0} to exist, but it does not. ".format(path) + model_path = uplift_model.download_mojo(path=path) + + assert os.path.isfile(model_path), "Expected load file {0} to exist, but it does not. ".format(model_path) + mojo_model = h2o.upload_mojo(model_path) + prediction_reloaded = mojo_model.predict(train_h2o) + + assert_equals(n_samples, prediction.shape[0], "Not correct shape") + assert_equals(3, prediction.shape[1], "Not correct shape") + print(prediction_reloaded) + uplift_predict_reloaded = prediction_reloaded['uplift_predict'].as_data_frame(use_pandas=True)["uplift_predict"] + + assert_equals(uplift_predict[0], uplift_predict_reloaded[0], "Output is not the same after reload") + assert_equals(uplift_predict[5], uplift_predict_reloaded[5], "Output is not the same after reload") + assert_equals(uplift_predict[33], uplift_predict_reloaded[33], "Output is not the same after reload") + assert_equals(uplift_predict[256], uplift_predict_reloaded[256], "Output is not the same after reload") + assert_equals(uplift_predict[499], uplift_predict_reloaded[499], "Output is not the same after reload") + assert_equals(uplift_predict[512], uplift_predict_reloaded[512], "Output is not the same after reload") + assert_equals(uplift_predict[750], uplift_predict_reloaded[750], "Output is not the same after reload") + assert_equals(uplift_predict[999], uplift_predict_reloaded[999], "Output is not the same after reload") + + +if __name__ == "__main__": + pyunit_utils.standalone_test(uplift_random_forest_mojo) +else: + uplift_random_forest_mojo() diff --git a/h2o-r/tests/testdir_algos/uplift/runit_uplift_compare_h2o_vs_upliftRF.R b/h2o-r/tests/testdir_algos/uplift/runit_uplift_compare_h2o_vs_upliftRF.R index 0495fb72c55f..925d03257ce2 
100644 --- a/h2o-r/tests/testdir_algos/uplift/runit_uplift_compare_h2o_vs_upliftRF.R +++ b/h2o-r/tests/testdir_algos/uplift/runit_uplift_compare_h2o_vs_upliftRF.R @@ -78,7 +78,7 @@ test.uplift.vs.h2oUplift <- function() { h2oQini <- qini(h2oPerf) print(paste("H2O:", h2oQini, "upliftRF:", upliftQini$Qini)) - diff = abs(h2oQini$Qini - upliftQini$Qini) + diff <- abs(h2oQini$Qini - upliftQini$Qini) print(paste("Diff:", diff)) expect_true(diff < 10e-1) } diff --git a/h2o-r/tests/testdir_algos/uplift/runit_uplift_rf_mojo.R b/h2o-r/tests/testdir_algos/uplift/runit_uplift_rf_mojo.R new file mode 100644 index 000000000000..d560d5de2541 --- /dev/null +++ b/h2o-r/tests/testdir_algos/uplift/runit_uplift_rf_mojo.R @@ -0,0 +1,60 @@ +setwd(normalizePath(dirname(R.utils::commandArgs(asValues = TRUE)$"f"))) +source("../../../scripts/h2o-r-test-setup.R") +library(uplift) + + +test.uplift <- function() { + ntrees <- 10 + mtries <- 6 + seed <- 42 + uplift_metric <- "KL" + set.seed(seed) + + # Test data preparation for each implementation + train <- sim_pte(n = 2000, p = 6, rho = 0, sigma = sqrt(2), beta.den = 4) + train$treat <- ifelse(train$treat == 1, 1, 0) + test <- sim_pte(n = 1000, p = 6, rho = 0, sigma = sqrt(2), beta.den = 4) + test$treat <- ifelse(test$treat == 1, 1, 0) + + trainh2o <- train + trainh2o$treat <- as.factor(train$treat) + trainh2o$y <- as.factor(train$y) + trainh2o <- as.h2o(trainh2o) + + testh2o <- test + testh2o$treat <- as.factor(test$treat) + testh2o$y <- as.factor(test$y) + testh2o <- as.h2o(testh2o) + + model <- h2o.upliftRandomForest( + x = c("X1", "X2", "X3", "X4", "X5", "X6"), + y = "y", + training_frame = trainh2o, + validation_frame = testh2o, + treatment_column = "treat", + uplift_metric = uplift_metric, + auuc_type = "qini", + distribution = "bernoulli", + ntrees = ntrees, + mtries = mtries, + max_depth = 10, + min_rows = 10, + nbins = 100, + seed = seed) + + print(model) + pred.uplift <- h2o.predict(model, testh2o) + print(pred.uplift) + + tmpdir <- 
tempdir() + print(tmpdir) + modelfile <- h2o.download_mojo(model, path=tmpdir) + print(modelfile) + modelpath <- paste0(tmpdir, "/", modelfile) + print(modelpath) + model.mojo <- h2o.import_mojo(modelpath) + pred.mojo <- h2o.predict(model.mojo, testh2o) + print(pred.mojo) +} + +doTest("Uplift Random Forest Test: Test H2O RF uplift", test.uplift) From 9da727dbf76b237aa0ec851fc56a31aaabc95c4a Mon Sep 17 00:00:00 2001 From: Veronika Maurerova Date: Wed, 18 Oct 2023 16:00:55 +0200 Subject: [PATCH 2/5] Add tests, fix predict --- .../main/java/hex/generic/GenericModel.java | 14 +++++- .../java/hex/generic/GenericModelOutput.java | 8 +++- h2o-core/src/main/java/hex/AUUC.java | 1 - h2o-core/src/main/java/hex/Model.java | 3 ++ h2o-core/src/main/java/hex/ModelMetrics.java | 1 + .../java/hex/genmodel/MojoPipelineWriter.java | 3 ++ .../genmodel/descriptor/ModelDescriptor.java | 6 +++ .../descriptor/ModelDescriptorBuilder.java | 14 ++++++ .../uplift/pyunit_uplift_rf_mojo.py | 44 ++++++++++++------ .../uplift/runit_uplift_rf_mojo.R | 33 ++++++++++--- .../uplift/testthat-problems.rds | Bin 0 -> 451 bytes 11 files changed, 103 insertions(+), 24 deletions(-) create mode 100644 h2o-r/tests/testdir_algos/uplift/testthat-problems.rds diff --git a/h2o-algos/src/main/java/hex/generic/GenericModel.java b/h2o-algos/src/main/java/hex/generic/GenericModel.java index 57dd1ecc5370..2e2a4e42374f 100644 --- a/h2o-algos/src/main/java/hex/generic/GenericModel.java +++ b/h2o-algos/src/main/java/hex/generic/GenericModel.java @@ -185,8 +185,15 @@ private void predict(EasyPredictModelWrapper wrapper, AdaptFrameParameters adapt final String offsetColumn = adaptFrameParameters.getOffsetColumn(); final String weightsColumn = adaptFrameParameters.getWeightsColumn(); final String responseColumn = adaptFrameParameters.getResponseColumn(); + final String treatmentColumn = adaptFrameParameters.getTreatmentColumn(); final boolean isClassifier = wrapper.getModel().isClassifier(); - final float[] yact = new 
float[1]; + final boolean isUplift = treatmentColumn != null; + final float[] yact; + if (isUplift) { + yact = new float[2]; + } else { + yact = new float[1]; + } for (int row = 0; row < cs[0]._len; row++) { RowData rowData = new RowData(); RowDataUtils.extractChunkRow(cs, _fr._names, types, row, rowData); @@ -210,6 +217,9 @@ private void predict(EasyPredictModelWrapper wrapper, AdaptFrameParameters adapt yact[0] = (float) idx; } else yact[0] = ((Number) response).floatValue(); + if (isUplift){ + yact[1] = (float) rowData.get(treatmentColumn); + } _mb.perRow(result, yact, weight, offset, GenericModel.this); } } @@ -289,7 +299,7 @@ public String getResponseColumn() { return genModel.isSupervised() ? genModel.getResponseName() : null; } @Override - public String getTreatmentColumn() {return null;} + public String getTreatmentColumn() {return descriptor != null ? descriptor.treatmentColumn() : null;} @Override public double missingColumnsType() { return Double.NaN; diff --git a/h2o-algos/src/main/java/hex/generic/GenericModelOutput.java b/h2o-algos/src/main/java/hex/generic/GenericModelOutput.java index f1d583a5a0a6..b25d71629bd0 100644 --- a/h2o-algos/src/main/java/hex/generic/GenericModelOutput.java +++ b/h2o-algos/src/main/java/hex/generic/GenericModelOutput.java @@ -27,6 +27,7 @@ public GenericModelOutput(final ModelDescriptor modelDescriptor) { _hasOffset = modelDescriptor.offsetColumn() != null; _hasWeights = modelDescriptor.weightsColumn() != null; _hasFold = modelDescriptor.foldColumn() != null; + _hasTreatment = modelDescriptor.treatmentColumn() != null; _modelClassDist = modelDescriptor.modelClassDist(); _priorClassDist = modelDescriptor.priorClassDist(); _names = modelDescriptor.columnNames(); @@ -36,6 +37,7 @@ public GenericModelOutput(final ModelDescriptor modelDescriptor) { _defaultThreshold = modelDescriptor.defaultThreshold(); _original_model_identifier = modelDescriptor.algoName(); _original_model_full_name = modelDescriptor.algoFullName(); + } public 
GenericModelOutput(final ModelDescriptor modelDescriptor, final ModelAttributes modelAttributes, @@ -298,5 +300,9 @@ private static TwoDimTable convertTable(final Table convertedTable){ return table; } - + + @Override + public boolean hasTreatment() { + return super.hasTreatment(); + } } diff --git a/h2o-core/src/main/java/hex/AUUC.java b/h2o-core/src/main/java/hex/AUUC.java index 842ec0869ad9..78dd39720aae 100644 --- a/h2o-core/src/main/java/hex/AUUC.java +++ b/h2o-core/src/main/java/hex/AUUC.java @@ -79,7 +79,6 @@ public AUUC(Vec probs, Vec y, Vec uplift, AUUCType auucType, int nbins) { public AUUC(AUUCBuilder bldr, AUUCType auucType) { this(bldr, true, auucType); } - public AUUC(double[] customThresholds, Vec probs, Vec y, Vec uplift, AUUCType auucType) { this(new AUUCImpl(customThresholds).doAll(probs, y, uplift)._bldr, auucType); diff --git a/h2o-core/src/main/java/hex/Model.java b/h2o-core/src/main/java/hex/Model.java index ddc56c7938ac..2123d5096b8d 100755 --- a/h2o-core/src/main/java/hex/Model.java +++ b/h2o-core/src/main/java/hex/Model.java @@ -1162,6 +1162,7 @@ public String[] features() { public String weightsName () { return _hasWeights ?_names[weightsIdx()]:null;} public String offsetName () { return _hasOffset ?_names[offsetIdx()]:null;} public String foldName () { return _hasFold ?_names[foldIdx()]:null;} + public String treatmentName() { return _hasTreatment ? 
_names[treatmentIdx()]: null;} public InteractionBuilder interactionBuilder() { return null; } // Vec layout is [c1,c2,...,cn, w?, o?, f?, u?, r] // cn are predictor cols, r is response, w is weights, o is offset, f is fold and t is treatment - these are optional @@ -3469,6 +3470,8 @@ protected class H2OModelDescriptor implements ModelDescriptor { @Override public String weightsColumn() { return _output.weightsName(); } @Override + public String treatmentColumn() { return _output.treatmentName(); } + @Override public String foldColumn() { return _output.foldName(); } @Override public ModelCategory getModelCategory() { return _output.getModelCategory(); } diff --git a/h2o-core/src/main/java/hex/ModelMetrics.java b/h2o-core/src/main/java/hex/ModelMetrics.java index fa226427ffb3..da293128fa0e 100755 --- a/h2o-core/src/main/java/hex/ModelMetrics.java +++ b/h2o-core/src/main/java/hex/ModelMetrics.java @@ -435,6 +435,7 @@ public double[] perRow(double ds[], float yact[], double weight, double offset, assert(weight == 1 && offset == 0); return perRow(ds, yact, m); } + public void reduce( T mb ) { _sumsqe += mb._sumsqe; _count += mb._count; diff --git a/h2o-genmodel/src/main/java/hex/genmodel/MojoPipelineWriter.java b/h2o-genmodel/src/main/java/hex/genmodel/MojoPipelineWriter.java index de712b60c933..94cae181cca5 100644 --- a/h2o-genmodel/src/main/java/hex/genmodel/MojoPipelineWriter.java +++ b/h2o-genmodel/src/main/java/hex/genmodel/MojoPipelineWriter.java @@ -145,6 +145,9 @@ public String foldColumn() { return null; } + @Override + public String treatmentColumn() { return null; } + @Override public ModelCategory getModelCategory() { return _finalModel._category; diff --git a/h2o-genmodel/src/main/java/hex/genmodel/descriptor/ModelDescriptor.java b/h2o-genmodel/src/main/java/hex/genmodel/descriptor/ModelDescriptor.java index d227ff32da8a..2c4d09948cad 100644 --- a/h2o-genmodel/src/main/java/hex/genmodel/descriptor/ModelDescriptor.java +++ 
b/h2o-genmodel/src/main/java/hex/genmodel/descriptor/ModelDescriptor.java @@ -47,6 +47,12 @@ public interface ModelDescriptor { */ String foldColumn(); + + /** + * @return A {@link String} with the name of the treatment column used. Null if there was no treatment used during training. + */ + String treatmentColumn(); + /** * Model's category. * diff --git a/h2o-genmodel/src/main/java/hex/genmodel/descriptor/ModelDescriptorBuilder.java b/h2o-genmodel/src/main/java/hex/genmodel/descriptor/ModelDescriptorBuilder.java index 8542ba7a3037..781bba7d9769 100644 --- a/h2o-genmodel/src/main/java/hex/genmodel/descriptor/ModelDescriptorBuilder.java +++ b/h2o-genmodel/src/main/java/hex/genmodel/descriptor/ModelDescriptorBuilder.java @@ -44,6 +44,7 @@ public static class MojoModelDescriptor implements ModelDescriptor, Serializable private final double[] _modelClassDistrib; private final String _offsetColumn; private final String _weightsColumn; + private final String _treatmentColumn; private final String[][] _domains; private final String[][] _origDomains; private final String[] _names; @@ -76,6 +77,11 @@ private MojoModelDescriptor(final MojoModel mojoModel, final String fullAlgorith } else { _weightsColumn = null; } + if (modelAttributes != null) { + _treatmentColumn = (String) modelAttributes.getParameterValueByName("treatment_column");; + } else { + _treatmentColumn = null; + } } @Override @@ -113,6 +119,11 @@ public String foldColumn() { return null; } + @Override + public String treatmentColumn() { + return _treatmentColumn; + } + @Override public ModelCategory getModelCategory() { return _category; @@ -248,6 +259,9 @@ public String foldColumn() { return null; } + @Override + public String treatmentColumn() { return null; } + @Override public ModelCategory getModelCategory() { return _category; diff --git a/h2o-py/tests/testdir_algos/uplift/pyunit_uplift_rf_mojo.py b/h2o-py/tests/testdir_algos/uplift/pyunit_uplift_rf_mojo.py index e53ac047fff5..f7c60876abad 100644 --- 
a/h2o-py/tests/testdir_algos/uplift/pyunit_uplift_rf_mojo.py +++ b/h2o-py/tests/testdir_algos/uplift/pyunit_uplift_rf_mojo.py @@ -21,7 +21,7 @@ def uplift_random_forest_mojo(): n_samples = train_h2o.shape[0] uplift_model = H2OUpliftRandomForestEstimator( - ntrees=10, + ntrees=5, max_depth=5, treatment_column=treatment_column, uplift_metric="KL", @@ -32,12 +32,14 @@ def uplift_random_forest_mojo(): sample_rate=0.99, auuc_type="gain" ) + uplift_model.train(y=response_column, x=x_names, training_frame=train_h2o) + print(uplift_model) + prediction = uplift_model.predict(train_h2o) assert_equals(n_samples, prediction.shape[0], "Not correct shape") assert_equals(3, prediction.shape[1], "Not correct shape") - print(prediction) uplift_predict = prediction['uplift_predict'].as_data_frame(use_pandas=True)["uplift_predict"] path = pyunit_utils.locate("results") @@ -47,22 +49,36 @@ def uplift_random_forest_mojo(): assert os.path.isfile(model_path), "Expected load file {0} to exist, but it does not. 
".format(model_path) mojo_model = h2o.upload_mojo(model_path) - prediction_reloaded = mojo_model.predict(train_h2o) + print(mojo_model) + + prediction_mojo = mojo_model.predict(train_h2o) assert_equals(n_samples, prediction.shape[0], "Not correct shape") assert_equals(3, prediction.shape[1], "Not correct shape") - print(prediction_reloaded) - uplift_predict_reloaded = prediction_reloaded['uplift_predict'].as_data_frame(use_pandas=True)["uplift_predict"] - - assert_equals(uplift_predict[0], uplift_predict_reloaded[0], "Output is not the same after reload") - assert_equals(uplift_predict[5], uplift_predict_reloaded[5], "Output is not the same after reload") - assert_equals(uplift_predict[33], uplift_predict_reloaded[33], "Output is not the same after reload") - assert_equals(uplift_predict[256], uplift_predict_reloaded[256], "Output is not the same after reload") - assert_equals(uplift_predict[499], uplift_predict_reloaded[499], "Output is not the same after reload") - assert_equals(uplift_predict[512], uplift_predict_reloaded[512], "Output is not the same after reload") - assert_equals(uplift_predict[750], uplift_predict_reloaded[750], "Output is not the same after reload") - assert_equals(uplift_predict[999], uplift_predict_reloaded[999], "Output is not the same after reload") + print(prediction_mojo) + uplift_predict_mojo = prediction_mojo['uplift_predict'].as_data_frame(use_pandas=True)["uplift_predict"] + + assert_equals(uplift_predict[0], uplift_predict_mojo[0], "Output is not the same with MOJO") + assert_equals(uplift_predict[5], uplift_predict_mojo[5], "Output is not the same with MOJO") + assert_equals(uplift_predict[33], uplift_predict_mojo[33], "Output is not the same with MOJO") + assert_equals(uplift_predict[256], uplift_predict_mojo[256], "Output is not the same with MOJO") + assert_equals(uplift_predict[499], uplift_predict_mojo[499], "Output is not the same with MOJO") + assert_equals(uplift_predict[512], uplift_predict_mojo[512], "Output is not the 
same with MOJO") + assert_equals(uplift_predict[750], uplift_predict_mojo[750], "Output is not the same with MOJO") + assert_equals(uplift_predict[999], uplift_predict_mojo[999], "Output is not the same with MOJO") + perf_model = uplift_model.model_performance() + print(perf_model) + perf_model_auuc = perf_model.auuc() + + perf_mojo = mojo_model.model_performance() + print(perf_mojo) + perf_mojo_auuc = perf_mojo.auuc() + + assert_equals(perf_model_auuc, perf_mojo_auuc, "AUUC is not the same with MOJO") + + os.remove(model_path) + if __name__ == "__main__": pyunit_utils.standalone_test(uplift_random_forest_mojo) diff --git a/h2o-r/tests/testdir_algos/uplift/runit_uplift_rf_mojo.R b/h2o-r/tests/testdir_algos/uplift/runit_uplift_rf_mojo.R index d560d5de2541..882f1075951b 100644 --- a/h2o-r/tests/testdir_algos/uplift/runit_uplift_rf_mojo.R +++ b/h2o-r/tests/testdir_algos/uplift/runit_uplift_rf_mojo.R @@ -4,7 +4,7 @@ library(uplift) test.uplift <- function() { - ntrees <- 10 + ntrees <- 6 mtries <- 6 seed <- 42 uplift_metric <- "KL" @@ -44,17 +44,38 @@ test.uplift <- function() { print(model) pred.uplift <- h2o.predict(model, testh2o) - print(pred.uplift) + pred.uplift.df <- as.data.frame(pred.uplift) tmpdir <- tempdir() - print(tmpdir) modelfile <- h2o.download_mojo(model, path=tmpdir) - print(modelfile) modelpath <- paste0(tmpdir, "/", modelfile) - print(modelpath) + model.mojo <- h2o.import_mojo(modelpath) + print(model.mojo) pred.mojo <- h2o.predict(model.mojo, testh2o) - print(pred.mojo) + pred.mojo.df <- as.data.frame(pred.mojo) + + expect_equal(pred.mojo.df[1,1], pred.uplift.df[1,1]) + expect_equal(pred.mojo.df[2,1], pred.uplift.df[2,1]) + expect_equal(pred.mojo.df[10,1], pred.uplift.df[10,1]) + expect_equal(pred.mojo.df[42,1], pred.uplift.df[42,1]) + expect_equal(pred.mojo.df[550,1], pred.uplift.df[550,1]) + expect_equal(pred.mojo.df[666,1], pred.uplift.df[666,1]) + + perf.uplift <- h2o.performance(model) + print(perf.uplift) + auuc.uplift <- 
h2o.auuc(perf.uplift) + print(auuc.uplift) + + perf.mojo <- h2o.performance(model.mojo) + print(perf.mojo) + auuc.mojo <- h2o.auuc(perf.mojo) + print(auuc.mojo) + + expect_equal(auuc.uplift, auuc.mojo) + + on.exit(unlink(modelpath,recursive=TRUE)) + on.exit(unlink(tmpdir,recursive=TRUE)) } doTest("Uplift Random Forest Test: Test H2O RF uplift", test.uplift) diff --git a/h2o-r/tests/testdir_algos/uplift/testthat-problems.rds b/h2o-r/tests/testdir_algos/uplift/testthat-problems.rds new file mode 100644 index 0000000000000000000000000000000000000000..b888363003e28eaeb19cedc412bb28991c565a97 GIT binary patch literal 451 zcmV;!0X+U6iwFP!000001C3KnPXaLzE$o8$g(lwoNcso#;E{tdF(Dp|8UxWA3`@b9 z(ynb+^v{E{yX{go9&9o@?M(XS&71Dq9wDSjViJ*Ngj;fg)*vLxeGBclILBlc{qq>I zEV;^3!Dn_%rLvTLW+13lv`j73hZ6+I`I^mMuVA=|wC6%iK-8iKOK#s^K}#;@2G1*(53yp)jCHxyrdDHJ!-^+hSS@yv z^s=HB1nZLc2tb$ z532jYFl(vZYMAccp_>jVUq`W`AbyH=hIWp2QER^?VZn@hN|6sHD5dcNrJ$L#)sf37 zPx33~ Date: Wed, 25 Oct 2023 16:56:52 +0200 Subject: [PATCH 3/5] Implement Generic logic behind Python/R uplift MOJO API --- .../java/hex/generic/GenericModelOutput.java | 79 +++++++++++++++++- h2o-core/src/main/java/hex/AUUC.java | 37 +++++--- .../ModelMetricsBinomialUpliftGeneric.java | 20 +++++ .../ModelMetricsBinomialUpliftGenericV3.java | 29 +++++++ .../ModelMetricsBinomialUpliftV3.java | 5 +- .../META-INF/services/water.api.Schema | 1 + .../genmodel/attributes/ModelAttributes.java | 2 + .../MojoModelMetricsBinomialUplift.java | 19 +++++ .../uplift/pyunit_uplift_rf_mojo.py | 10 ++- .../pyunit_generic_model_mojo_upliftdrf.py | 73 ++++++++++++++++ .../uplift/testthat-problems.rds | Bin 451 -> 0 bytes 11 files changed, 256 insertions(+), 19 deletions(-) create mode 100644 h2o-core/src/main/java/hex/ModelMetricsBinomialUpliftGeneric.java create mode 100644 h2o-core/src/main/java/water/api/schemas3/ModelMetricsBinomialUpliftGenericV3.java create mode 100644 
h2o-genmodel/src/main/java/hex/genmodel/attributes/metrics/MojoModelMetricsBinomialUplift.java create mode 100644 h2o-py/tests/testdir_generic_model/pyunit_generic_model_mojo_upliftdrf.py delete mode 100644 h2o-r/tests/testdir_algos/uplift/testthat-problems.rds diff --git a/h2o-algos/src/main/java/hex/generic/GenericModelOutput.java b/h2o-algos/src/main/java/hex/generic/GenericModelOutput.java index b25d71629bd0..73a3477d09ef 100644 --- a/h2o-algos/src/main/java/hex/generic/GenericModelOutput.java +++ b/h2o-algos/src/main/java/hex/generic/GenericModelOutput.java @@ -5,6 +5,7 @@ import hex.genmodel.attributes.metrics.*; import hex.genmodel.descriptor.ModelDescriptor; import hex.tree.isofor.ModelMetricsAnomaly; +import water.util.ArrayUtils; import water.util.Log; import water.util.TwoDimTable; @@ -192,6 +193,15 @@ ordinalMetrics._logloss, customMetric(ordinalMetrics), metricsCoxPH._sigma, metricsCoxPH._mae, metricsCoxPH._root_mean_squared_log_error, metricsCoxPH._mean_residual_deviance, customMetric(mojoMetrics), metricsCoxPH._concordance, metricsCoxPH._concordant, metricsCoxPH._discordant, metricsCoxPH._tied_y); + case BinomialUplift: + assert mojoMetrics instanceof MojoModelMetricsBinomialUplift; + MojoModelMetricsBinomialUplift metricsUplift = (MojoModelMetricsBinomialUplift) mojoMetrics; + AUUC.AUUCType auucType = AUUC.AUUCType.valueOf((String) modelAttributes.getParameterValueByName("auuc_type")); + AUUC auuc = createAUUC(auucType, metricsUplift._thresholds_and_metric_scores, metricsUplift._auuc_table, metricsUplift._aecu_table); + return new ModelMetricsBinomialUpliftGeneric(null, null, metricsUplift._nobs, _domains[_domains.length - 1], + metricsUplift._ate, metricsUplift._att, metricsUplift._atc, metricsUplift._sigma, auuc, customMetric(metricsUplift), + convertTable(metricsUplift._thresholds_and_metric_scores), convertTable(metricsUplift._auuc_table), + convertTable(metricsUplift._aecu_table), metricsUplift._description); case Unknown: case Clustering: case 
AutoEncoder: @@ -285,7 +295,7 @@ private static TwoDimTable[] convertTables(final Table[] inputTables) { } return tables; } - + private static TwoDimTable convertTable(final Table convertedTable){ if(convertedTable == null) return null; final TwoDimTable table = new TwoDimTable(convertedTable.getTableHeader(), convertedTable.getTableDescription(), @@ -297,10 +307,75 @@ private static TwoDimTable convertTable(final Table convertedTable){ table.set(j, i, convertedTable.getCell(i,j)); } } - return table; } + private static AUUC createAUUC(AUUC.AUUCType auucType, Table thresholds_and_metric_scores, Table auuc_table, Table aecu_table){ + int nbins = thresholds_and_metric_scores.rows(); + double[] ths = new double[nbins]; + long[] freq = new long[nbins]; + AUUC.AUUCType[] auucTypes = AUUC.AUUCType.values(); + double[][] uplift = new double[auucTypes.length][nbins]; + double[][] upliftNorm = new double[auucTypes.length][nbins]; + double[][] upliftRand = new double[auucTypes.length][nbins]; + double[] auuc = new double[auucTypes.length]; + double[] auucNorm = new double[auucTypes.length]; + double[] auucRand = new double[auucTypes.length]; + double[] aecu = new double[auucTypes.length]; + + String[] thrHeader = thresholds_and_metric_scores.getColHeaders(); + // threshold column index + int thrIndex = ArrayUtils.find(thrHeader, "thresholds"); + int freqIndex = ArrayUtils.find(thrHeader, "n"); + + // uplift type indices + int[] upliftIndices = new int[auucTypes.length]; + int[] upliftNormIndices = new int[auucTypes.length]; + int[] upliftRandIndices = new int[auucTypes.length]; + for (int i = 1; i < auucTypes.length; i++) { + String auucTypeName = auucTypes[i].name(); + upliftIndices[i] = ArrayUtils.find(thrHeader, auucTypeName); + upliftNormIndices[i] = ArrayUtils.find(thrHeader, auucTypeName+"_normalized"); + upliftRandIndices[i] = ArrayUtils.find(thrHeader, auucTypeName+"_random"); + // AUTO setting + if(auucTypeName.equals(AUUC.AUUCType.nameAuto())){ + upliftIndices[0] = 
upliftIndices[i]; + upliftNormIndices[0] = upliftNormIndices[i]; + upliftRandIndices[0] = upliftRandIndices[i]; + } + } + // fill thresholds and uplift values from table + for (int i = 0; i < thresholds_and_metric_scores.rows(); i++) { + ths[i] = (double) thresholds_and_metric_scores.getCell(thrIndex, i); + freq[i] = (long) thresholds_and_metric_scores.getCell(freqIndex, i); + for (int j = 0; j < auucTypes.length; j++) { + uplift[j][i] = (double) thresholds_and_metric_scores.getCell(upliftIndices[j], i); + upliftNorm[j][i] = (double) thresholds_and_metric_scores.getCell(upliftNormIndices[j], i); + upliftRand[j][i] = (double) thresholds_and_metric_scores.getCell(upliftRandIndices[j], i); + } + } + // fill auuc values and aecu values + String[] auucHeader = auuc_table.getColHeaders(); + String[] aecuHeader = aecu_table.getColHeaders(); + for (int i = 1; i < auucTypes.length; i++) { + AUUC.AUUCType type = auucTypes[i]; + String auucTypeName = type.name(); + int colIndex = ArrayUtils.find(auucHeader, auucTypeName); + auuc[i] = (double) auuc_table.getCell(colIndex, 0); + auucNorm[i] = (double) auuc_table.getCell(colIndex, 1); + auucRand[i] = (double) auuc_table.getCell(colIndex, 2); + colIndex = ArrayUtils.find(aecuHeader, auucTypeName); + aecu[i] = (double) aecu_table.getCell(colIndex, 0); + if(auucTypeName.equals(AUUC.AUUCType.nameAuto())){ + auuc[0] = auuc[i]; + auucNorm[0] = auucNorm[i]; + auucRand[0] = auucRand[i]; + aecu[0] = aecu[i]; + } + } + return new AUUC(ths, freq, auuc, auucNorm, auucRand, aecu, auucType, uplift, upliftNorm, upliftRand); + } + @Override public boolean hasTreatment() { return super.hasTreatment(); diff --git a/h2o-core/src/main/java/hex/AUUC.java b/h2o-core/src/main/java/hex/AUUC.java index 78dd39720aae..149334adfe1f 100644 --- a/h2o-core/src/main/java/hex/AUUC.java +++ b/h2o-core/src/main/java/hex/AUUC.java @@ -10,7 +10,6 @@ import water.fvec.Frame; import water.fvec.Vec; import water.util.ArrayUtils; -import water.util.Log; import 
java.util.Arrays; import java.util.Iterator; @@ -52,7 +51,7 @@ public class AUUC extends Iced { public long frequency( int idx ) { return _frequency[idx]; } public double uplift( int idx) { return _uplift[_auucTypeIndx][idx]; } - private int getIndexByAUUCType(AUUCType type){ + public int getIndexByAUUCType(AUUCType type){ return ArrayUtils.find(AUUC.AUUCType.VALUES, type); } @@ -204,6 +203,25 @@ public AUUC() { _upliftNormalized = new double[AUUCType.values().length][]; _upliftRandom = new double[AUUCType.values().length][]; } + + public AUUC(double[] ths, long[] freq, double[] auuc, double[] auucNorm, double[] auucRand, double[] aecu, + AUUCType auucType, double[][] uplift, double[][] upliftNorm, double[][] upliftRand) { + _nBins = ths.length; + _n = freq[freq.length-1]; + _ths = ths; + _frequencyCumsum = freq; + _treatment = _control = _yTreatment = _yControl = _frequency = new long[0]; + _auucs = auuc; + _auucsNormalized = auucNorm; + _auucsRandom = auucRand; + _aecu = aecu; + _maxIdx = -1; + _auucType = auucType; + _auucTypeIndx = getIndexByAUUCType(_auucType); + _uplift = uplift; + _upliftNormalized = upliftNorm; + _upliftRandom = upliftRand; + } public static double[] calculateQuantileThresholds(int groups, Vec preds) { Frame fr = null; @@ -442,19 +460,14 @@ public enum AUUCType { * @return metric value */ abstract double exec(long treatment, long control, long yTreatment, long yControl ); public double exec(AUUC auc, int idx) { return exec(auc.treatment(idx),auc.control(idx),auc.yTreatment(idx),auc.yControl(idx)); } - + public static final AUUCType[] VALUES = values(); - public static AUUCType fromString(String strRepr) { - for (AUUCType tc : AUUCType.values()) { - if (tc.toString().equalsIgnoreCase(strRepr)) { - return tc; - } - } - return null; - } + public static final AUUCType[] VALUES_WITHOUT_AUTO = ArrayUtils.remove(values().clone(), ArrayUtils.find(AUUCType.values(), AUTO)); - public double maxCriterion(AUUC auuc) { return exec(auuc, 
maxCriterionIdx(auuc)); } + public static String nameAuto(){ + return qini.name(); + } /** Convert a criterion into a threshold index that maximizes the criterion * @return Threshold index that maximizes the criterion diff --git a/h2o-core/src/main/java/hex/ModelMetricsBinomialUpliftGeneric.java b/h2o-core/src/main/java/hex/ModelMetricsBinomialUpliftGeneric.java new file mode 100644 index 000000000000..7bafbbbe50ce --- /dev/null +++ b/h2o-core/src/main/java/hex/ModelMetricsBinomialUpliftGeneric.java @@ -0,0 +1,20 @@ +package hex; + +import water.fvec.Frame; +import water.util.TwoDimTable; + +public class ModelMetricsBinomialUpliftGeneric extends ModelMetricsBinomialUplift { + + + public final TwoDimTable _thresholds_and_metric_scores; + public final TwoDimTable _auuc_table; + public final TwoDimTable _aecu_table; + + public ModelMetricsBinomialUpliftGeneric(Model model, Frame frame, long nobs, String[] domain, double ate, double att, double atc, double sigma, AUUC auuc, CustomMetric customMetric, TwoDimTable thresholds_and_metric_scores, TwoDimTable auuc_table, TwoDimTable aecu_table, final String description) { + super(model, frame, nobs, domain, ate, att, atc, sigma, auuc, customMetric); + _thresholds_and_metric_scores = thresholds_and_metric_scores; + _auuc_table = auuc_table; + _aecu_table = aecu_table; + _description = description; + } +} diff --git a/h2o-core/src/main/java/water/api/schemas3/ModelMetricsBinomialUpliftGenericV3.java b/h2o-core/src/main/java/water/api/schemas3/ModelMetricsBinomialUpliftGenericV3.java new file mode 100644 index 000000000000..a070f76d08ad --- /dev/null +++ b/h2o-core/src/main/java/water/api/schemas3/ModelMetricsBinomialUpliftGenericV3.java @@ -0,0 +1,29 @@ +package water.api.schemas3; + +import hex.ModelMetricsBinomialUpliftGeneric; + +public class ModelMetricsBinomialUpliftGenericV3> + extends ModelMetricsBinomialUpliftV3 { + + @Override + public S fillFromImpl(ModelMetricsBinomialUpliftGeneric modelMetrics) { + 
super.fillFromImpl(modelMetrics); + this.AUUC = modelMetrics._auuc.auuc(); + this.auuc_normalized = modelMetrics._auuc.auucNormalized(); + this.ate = modelMetrics.ate(); + this.att = modelMetrics.att(); + this.atc = modelMetrics.atc(); + this.qini = modelMetrics.qini(); + + if (modelMetrics._auuc_table != null) { // Possibly overwrites whatever has been set in the ModelMetricsBinomialV3 + this.auuc_table = new TwoDimTableV3().fillFromImpl(modelMetrics._auuc_table); + } + if (modelMetrics._aecu_table != null) { // Possibly overwrites whatever has been set in the ModelMetricsBinomialV3 + this.aecu_table = new TwoDimTableV3().fillFromImpl(modelMetrics._aecu_table); + } + if (modelMetrics._thresholds_and_metric_scores != null) { // Possibly overwrites whatever has been set in the ModelMetricsBinomialV3 + this.thresholds_and_metric_scores = new TwoDimTableV3().fillFromImpl(modelMetrics._thresholds_and_metric_scores); + } + return (S) this; + } +} diff --git a/h2o-core/src/main/java/water/api/schemas3/ModelMetricsBinomialUpliftV3.java b/h2o-core/src/main/java/water/api/schemas3/ModelMetricsBinomialUpliftV3.java index f1c102da6092..685b344cede4 100644 --- a/h2o-core/src/main/java/water/api/schemas3/ModelMetricsBinomialUpliftV3.java +++ b/h2o-core/src/main/java/water/api/schemas3/ModelMetricsBinomialUpliftV3.java @@ -57,8 +57,7 @@ public S fillFromImpl(ModelMetricsBinomialUplift modelMetrics) { qini = auuc.qini(); // Fill TwoDimTable String[] thresholds = new String[auuc._nBins]; - AUUCType metrics[] = AUUCType.VALUES; - metrics = ArrayUtils.remove(metrics, Arrays.asList(metrics).indexOf(AUUCType.AUTO)); + AUUCType metrics[] = AUUCType.VALUES_WITHOUT_AUTO; int metricsLength = metrics.length; long[] n = new long[auuc._nBins]; double[][] uplift = new double[metricsLength][]; @@ -89,7 +88,7 @@ public S fillFromImpl(ModelMetricsBinomialUplift modelMetrics) { types [i + 1 + 2 * metricsLength] = "double"; formats [i + 1 + 2 * metricsLength] = "%f"; } - colHeaders[i + 1 + 2 * 
metricsLength] = "n"; types[i + 1 + 2 * metricsLength] = "int"; formats[i + 1 + 2 * metricsLength] = "%d"; + colHeaders[i + 1 + 2 * metricsLength] = "n"; types[i + 1 + 2 * metricsLength] = "long"; formats[i + 1 + 2 * metricsLength] = "%d"; colHeaders[i + 2 + 2 * metricsLength] = "idx"; types[i + 2 + 2 * metricsLength] = "int"; formats[i + 2 + 2 * metricsLength] = "%d"; TwoDimTable thresholdsByMetrics = new TwoDimTable("Metrics for Thresholds", "Cumulative Uplift metrics for a given percentile", new String[auuc._nBins], colHeaders, types, formats, null ); for (i = 0; i < auuc._nBins; i++) { diff --git a/h2o-core/src/main/resources/META-INF/services/water.api.Schema b/h2o-core/src/main/resources/META-INF/services/water.api.Schema index 70ff82ef399e..75286a3d4080 100644 --- a/h2o-core/src/main/resources/META-INF/services/water.api.Schema +++ b/h2o-core/src/main/resources/META-INF/services/water.api.Schema @@ -72,6 +72,7 @@ water.api.schemas3.ModelMetricsBinomialGLMGenericV3 water.api.schemas3.ModelMetricsBinomialV3 water.api.schemas3.ModelMetricsBinomialGenericV3 water.api.schemas3.ModelMetricsBinomialUpliftV3 +water.api.schemas3.ModelMetricsBinomialUpliftGenericV3 water.api.schemas3.ModelMetricsClusteringV3 water.api.schemas3.ModelMetricsHGLMV3 water.api.schemas3.ModelMetricsHGLMGenericV3 diff --git a/h2o-genmodel/src/main/java/hex/genmodel/attributes/ModelAttributes.java b/h2o-genmodel/src/main/java/hex/genmodel/attributes/ModelAttributes.java index d18abf2f2ff3..eeb2a8dab5a7 100644 --- a/h2o-genmodel/src/main/java/hex/genmodel/attributes/ModelAttributes.java +++ b/h2o-genmodel/src/main/java/hex/genmodel/attributes/ModelAttributes.java @@ -91,6 +91,8 @@ private static MojoModelMetrics determineModelMetricsType(final MojoModel mojoMo } else return new MojoModelMetricsOrdinal(); case CoxPH: return new MojoModelMetricsRegressionCoxPH(); + case BinomialUplift: + return new MojoModelMetricsBinomialUplift(); case Unknown: case Clustering: case AutoEncoder: diff --git 
a/h2o-genmodel/src/main/java/hex/genmodel/attributes/metrics/MojoModelMetricsBinomialUplift.java b/h2o-genmodel/src/main/java/hex/genmodel/attributes/metrics/MojoModelMetricsBinomialUplift.java new file mode 100644 index 000000000000..4366d0a0b806 --- /dev/null +++ b/h2o-genmodel/src/main/java/hex/genmodel/attributes/metrics/MojoModelMetricsBinomialUplift.java @@ -0,0 +1,19 @@ +package hex.genmodel.attributes.metrics; + +import hex.genmodel.attributes.SerializedName; +import hex.genmodel.attributes.Table; + +public class MojoModelMetricsBinomialUplift extends MojoModelMetricsSupervised { + + @SerializedName("AUUC") + public double _auuc; + public double _normalized_auuc; + @SerializedName("Qini") + public double _qini; + public double _ate; + public double _att; + public double _atc; + public Table _thresholds_and_metric_scores; + public Table _auuc_table; + public Table _aecu_table; +} diff --git a/h2o-py/tests/testdir_algos/uplift/pyunit_uplift_rf_mojo.py b/h2o-py/tests/testdir_algos/uplift/pyunit_uplift_rf_mojo.py index f7c60876abad..937deb9c84e2 100644 --- a/h2o-py/tests/testdir_algos/uplift/pyunit_uplift_rf_mojo.py +++ b/h2o-py/tests/testdir_algos/uplift/pyunit_uplift_rf_mojo.py @@ -69,14 +69,20 @@ def uplift_random_forest_mojo(): perf_model = uplift_model.model_performance() print(perf_model) - perf_model_auuc = perf_model.auuc() perf_mojo = mojo_model.model_performance() print(perf_mojo) - perf_mojo_auuc = perf_mojo.auuc() + perf_model_auuc = perf_model.auuc() + perf_mojo_auuc = perf_mojo.auuc() assert_equals(perf_model_auuc, perf_mojo_auuc, "AUUC is not the same with MOJO") + perf_model_qini = perf_model.qini() + perf_mojo_qini = perf_mojo.qini() + assert_equals(perf_model_qini, perf_mojo_qini, "Qini is not the same with MOJO") + + + os.remove(model_path) diff --git a/h2o-py/tests/testdir_generic_model/pyunit_generic_model_mojo_upliftdrf.py b/h2o-py/tests/testdir_generic_model/pyunit_generic_model_mojo_upliftdrf.py new file mode 100644 index 
000000000000..5cca75e40e17 --- /dev/null +++ b/h2o-py/tests/testdir_generic_model/pyunit_generic_model_mojo_upliftdrf.py @@ -0,0 +1,73 @@ +import tempfile +import os +import sys +sys.path.insert(1,"../../") + +import h2o +from h2o.display import H2OTableDisplay, capture_output +from h2o.estimators import H2OUpliftRandomForestEstimator, H2OGenericEstimator +from tests import pyunit_utils +from tests.testdir_generic_model import compare_output, compare_params + + +def test(x, y, treatment_column, output_test, strip_part, algo_name, generic_algo_name): + + seed = 12345 + + train_h2o = h2o.upload_file(pyunit_utils.locate("smalldata/uplift/upliftml_train.csv")) + train_h2o[treatment_column] = train_h2o[treatment_column].asfactor() + train_h2o[y] = train_h2o[y].asfactor() + + n_samples = train_h2o.shape[0] + + uplift_model = H2OUpliftRandomForestEstimator( + ntrees=5, + max_depth=5, + treatment_column=treatment_column, + uplift_metric="KL", + distribution="bernoulli", + min_rows=10, + nbins=1000, + seed=seed, + sample_rate=0.99, + auuc_type="gain" + ) + + uplift_model.train(y=y, x=x, training_frame=train_h2o) + print(uplift_model) + + # comparison fails when using pandas due to spaces formatting + with H2OTableDisplay.pandas_rendering_enabled(False), capture_output() as (original_output, _): + uplift_model.show() + print(original_output.getvalue()) + + original_model_filename = tempfile.mkdtemp() + original_model_filename = uplift_model.download_mojo(original_model_filename) + + model = H2OGenericEstimator.from_file(original_model_filename) + assert model is not None + with H2OTableDisplay.pandas_rendering_enabled(False), capture_output() as (generic_output, _): + model.show() + print(generic_output.getvalue()) + compare_params(uplift_model, model) + + output_test(original_output.getvalue(), generic_output.getvalue(), strip_part, algo_name, generic_algo_name) + predictions = model.predict(train_h2o) + assert predictions is not None + assert predictions.nrows == n_samples 
+ assert model._model_json["output"]["model_summary"] is not None + assert len(model._model_json["output"]["model_summary"]._cell_values) > 0 + + generic_mojo_filename = tempfile.mkdtemp("zip", "genericMojo"); + generic_mojo_filename = model.download_mojo(path=generic_mojo_filename) + assert os.path.getsize(generic_mojo_filename) == os.path.getsize(original_model_filename) + + +def mojo_model_test_binomial_uplift(): + test(["feature_"+str(x) for x in range(1, 13)], "outcome", "treatment", compare_output, "Model Summary: ", + 'ModelMetricsBinomialUplift: upliftdrf', 'ModelMetricsBinomialUpliftGeneric: generic') + + +pyunit_utils.run_tests([ + mojo_model_test_binomial_uplift, +]) diff --git a/h2o-r/tests/testdir_algos/uplift/testthat-problems.rds b/h2o-r/tests/testdir_algos/uplift/testthat-problems.rds deleted file mode 100644 index b888363003e28eaeb19cedc412bb28991c565a97..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 451 zcmV;!0X+U6iwFP!000001C3KnPXaLzE$o8$g(lwoNcso#;E{tdF(Dp|8UxWA3`@b9 z(ynb+^v{E{yX{go9&9o@?M(XS&71Dq9wDSjViJ*Ngj;fg)*vLxeGBclILBlc{qq>I zEV;^3!Dn_%rLvTLW+13lv`j73hZ6+I`I^mMuVA=|wC6%iK-8iKOK#s^K}#;@2G1*(53yp)jCHxyrdDHJ!-^+hSS@yv z^s=HB1nZLc2tb$ z532jYFl(vZYMAccp_>jVUq`W`AbyH=hIWp2QER^?VZn@hN|6sHD5dcNrJ$L#)sf37 zPx33~ Date: Thu, 26 Oct 2023 09:18:52 +0200 Subject: [PATCH 4/5] Add R test --- .../main/java/hex/generic/GenericModel.java | 2 - .../runit_generic_model_mojo_upliftdrf.R | 67 +++++++++++++++++++ 2 files changed, 67 insertions(+), 2 deletions(-) create mode 100644 h2o-r/tests/testdir_algos/generic/runit_generic_model_mojo_upliftdrf.R diff --git a/h2o-algos/src/main/java/hex/generic/GenericModel.java b/h2o-algos/src/main/java/hex/generic/GenericModel.java index 2e2a4e42374f..949a47c48221 100644 --- a/h2o-algos/src/main/java/hex/generic/GenericModel.java +++ b/h2o-algos/src/main/java/hex/generic/GenericModel.java @@ -109,8 +109,6 @@ public ModelMetrics.MetricBuilder makeMetricBuilder(String[] domain) { 
case AnomalyDetection: return new ModelMetricsAnomaly.MetricBuilderAnomaly(); case BinomialUplift: - // Solve thresholds, move threshods from Uplift Output to Model Output to calculate metrics - // return new ModelMetricsBinomialUplift.MetricBuilderBinomialUplift(domain, _output.default_auuc_thresholds); return new ModelMetricsBinomialUplift.MetricBuilderBinomialUplift(domain, null); default: throw H2O.unimpl(); diff --git a/h2o-r/tests/testdir_algos/generic/runit_generic_model_mojo_upliftdrf.R b/h2o-r/tests/testdir_algos/generic/runit_generic_model_mojo_upliftdrf.R new file mode 100644 index 000000000000..81f8a501938d --- /dev/null +++ b/h2o-r/tests/testdir_algos/generic/runit_generic_model_mojo_upliftdrf.R @@ -0,0 +1,67 @@ +source("generic_model_test_common.R") +setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f"))) +source("../../../scripts/h2o-r-test-setup.R") +library(uplift) + + +test.model.generic.drf <- function() { + ntrees <- 6 + mtries <- 6 + seed <- 42 + uplift_metric <- "KL" + set.seed(seed) + + # Test data preparation for each implementation + train <- sim_pte(n = 2000, p = 6, rho = 0, sigma = sqrt(2), beta.den = 4) + train$treat <- ifelse(train$treat == 1, 1, 0) + test <- sim_pte(n = 1000, p = 6, rho = 0, sigma = sqrt(2), beta.den = 4) + test$treat <- ifelse(test$treat == 1, 1, 0) + + trainh2o <- train + trainh2o$treat <- as.factor(train$treat) + trainh2o$y <- as.factor(train$y) + trainh2o <- as.h2o(trainh2o) + + testh2o <- test + testh2o$treat <- as.factor(test$treat) + testh2o$y <- as.factor(test$y) + testh2o <- as.h2o(testh2o) + + original_model <- h2o.upliftRandomForest( + x = c("X1", "X2", "X3", "X4", "X5", "X6"), + y = "y", + training_frame = trainh2o, + validation_frame = testh2o, + treatment_column = "treat", + uplift_metric = uplift_metric, + auuc_type = "qini", + distribution = "bernoulli", + ntrees = ntrees, + mtries = mtries, + max_depth = 10, + min_rows = 10, + nbins = 100, + seed = seed) + + print(original_model) + + 
mojo_original_name <- h2o.download_mojo(model = original_model, path = tempdir()) + mojo_original_path <- paste0(tempdir(),"/",mojo_original_name) + + generic_model <- h2o.genericModel(mojo_original_path) + print(generic_model) + + original_output <- capture.output(print(original_model)) + generic_output <- capture.output(print(generic_model)) + compare_output(original_output, generic_output, + c("Extract .+ frame","H2OBinomialUpliftModel: upliftdrf", "Model ID", "H2OBinomialUpliftMetrics: upliftdrf"), + c("H2OBinomialUpliftModel: generic", "Model ID", "H2OBinomialUpliftMetrics: generic")) + + generic_model_preds <- h2o.predict(generic_model, testh2o) + expect_equal(length(generic_model_preds), 3) + expect_equal(h2o.nrow(generic_model_preds), 1000) + generic_model_path <- h2o.download_mojo(model = generic_model, path = tempdir()) + expect_equal(file.size(paste0(tempdir(),"/",generic_model_path)), file.size(mojo_original_path)) +} + +doTest("Generic model from DRF MOJO", test.model.generic.drf ) From 4e93d9b78e45c31d425cfec7a8ba7bab6bc03719 Mon Sep 17 00:00:00 2001 From: Veronika Maurerova Date: Wed, 1 Nov 2023 11:40:18 +0100 Subject: [PATCH 5/5] Clear code --- h2o-core/src/main/java/hex/AUUC.java | 2 +- .../java/hex/genmodel/algos/upliftdrf/UpliftDrfMojoReader.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/h2o-core/src/main/java/hex/AUUC.java b/h2o-core/src/main/java/hex/AUUC.java index 149334adfe1f..83a637f05b19 100644 --- a/h2o-core/src/main/java/hex/AUUC.java +++ b/h2o-core/src/main/java/hex/AUUC.java @@ -51,7 +51,7 @@ public class AUUC extends Iced { public long frequency( int idx ) { return _frequency[idx]; } public double uplift( int idx) { return _uplift[_auucTypeIndx][idx]; } - public int getIndexByAUUCType(AUUCType type){ + private int getIndexByAUUCType(AUUCType type){ return ArrayUtils.find(AUUC.AUUCType.VALUES, type); } diff --git a/h2o-genmodel/src/main/java/hex/genmodel/algos/upliftdrf/UpliftDrfMojoReader.java 
b/h2o-genmodel/src/main/java/hex/genmodel/algos/upliftdrf/UpliftDrfMojoReader.java index 81dfbb9c32e9..a8b3a877b540 100644 --- a/h2o-genmodel/src/main/java/hex/genmodel/algos/upliftdrf/UpliftDrfMojoReader.java +++ b/h2o-genmodel/src/main/java/hex/genmodel/algos/upliftdrf/UpliftDrfMojoReader.java @@ -4,7 +4,7 @@ import java.io.IOException; /** - */ + */ public class UpliftDrfMojoReader extends SharedTreeMojoReader { @Override