From 57bc954808ce17db8c5eff70ad507387194c659b Mon Sep 17 00:00:00 2001
From: wendycwong
Date: Tue, 29 Oct 2024 12:38:07 -0700
Subject: [PATCH 1/2] GH-8487: implement HGLM gaussian [nocheck] (#16403)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* GH-8487: add HGLM as a separate toolbox.

GH-8487: crafting HGLM parameters.
GH-8487: implement EM algo.
GH-8487: forming the fixed matrices and vectors.
GH-8487: add test to make sure correct initialization of fixed, random coefficients, sigma values and T matrix.
GH-8487: Finished implementing EM to estimate fixed coefficients, random coefficients, tmat and tauEVar.
GH-8487: finished implementing prediction but still need to figure out the model metrics calculation.
GH-8487: Adding support for models without random intercept.
GH-8487: adding normalization and denormalization of coefficients for fixed and random.
GH-8487: Completed prediction implementation and added tests to make sure prediction is correct when standardize=true/false, random_intercept=true/false.
GH-8487: fixing model metric classes.
GH-8487: add python and R tests.
GH-8487: adding hooks to generate synthetic data.
GH-8487: added scoring history, model summary, coefficient tables.
GH-8487: added modelmetrics for validation frame.
GH-8487: From experiments to find the best tauEVar calculation process, the one in equation 10 is best.
GH-8487: add capability in Python client to extract scoring history, model summary, model metrics, model coefficients (fixed and random), icc, T matrix, residual variance.
GH-8487: done checking scoring history, model summary and model metrics.
GH-8487: added R client test for utility functions.
GH-8487: use lambda_ instead of Lambda in pyunit_benign_glm.py
GH-8487: remove standardize from HGLM as the convention does not do standardization.
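For reviewers trying out the new toolbox, here is a minimal usage sketch of the Python client pieces this patch introduces (h2o-py/h2o/estimators/hglm.py plus the accessors listed in the commit message). The dataset, column names, and the exact estimator/accessor names (H2OHGLMEstimator, coefs_random, icc, residual_variance, matrix_T) are assumptions inferred from the commit message and the diffstat rather than a verified API; the generated estimator file in this patch is the authoritative reference.

import h2o
from h2o.estimators import H2OHGLMEstimator   # assumed export added by this patch

h2o.init()
# Hypothetical training frame with response y, predictors x1/x3/x5, and an enum group column.
train = h2o.import_file("hglm_demo.csv")

hglm = H2OHGLMEstimator(random_columns=["x1"],   # random slope for x1
                        group_column="group",    # level-2 grouping factor
                        random_intercept=True,
                        method="em",             # EM algorithm implemented in this PR
                        max_iterations=50)
hglm.train(x=["x1", "x3", "x5"], y="y", training_frame=train)

print(hglm.coef())                # fixed coefficients
print(hglm.coefs_random())        # random coefficients per group level (assumed accessor)
print(hglm.icc())                 # intra-class correlation (assumed accessor)
print(hglm.residual_variance())   # tauEVar, the residual noise variance (assumed accessor)
print(hglm.matrix_T())            # T matrix, covariance of the random effects (assumed accessor)
print(hglm.scoring_history())     # standard scoring-history accessor

If the accessor names differ, the same quantities should still be reachable through the model output populated by the new HGLMModelV3 schema (e.g. via the model's JSON output in the Python client).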
Co-authored-by: Veronika Maurerová --- h2o-algos/src/main/java/hex/DataInfo.java | 20 +- .../src/main/java/hex/api/RegisterAlgos.java | 2 +- h2o-algos/src/main/java/hex/gam/GAM.java | 3 +- .../hex/gam/MatrixFrameUtils/GamUtils.java | 23 - .../main/java/hex/glm/ComputationState.java | 102 -- h2o-algos/src/main/java/hex/glm/GLM.java | 1079 ++------------- .../main/java/hex/glm/GLMMetricBuilder.java | 102 +- h2o-algos/src/main/java/hex/glm/GLMModel.java | 148 +-- h2o-algos/src/main/java/hex/glm/GLMTask.java | 1159 +---------------- .../java/hex/hglm/ComputationStateHGLM.java | 150 +++ h2o-algos/src/main/java/hex/hglm/HGLM.java | 364 ++++++ .../src/main/java/hex/hglm/HGLMModel.java | 302 +++++ .../src/main/java/hex/hglm/HGLMScore.java | 142 ++ .../src/main/java/hex/hglm/HGLMTask.java | 439 +++++++ .../src/main/java/hex/hglm/HGLMUtils.java | 166 +++ .../main/java/hex/hglm/MetricBuilderHGLM.java | 142 ++ .../src/main/java/hex/schemas/GLMModelV3.java | 21 - .../src/main/java/hex/schemas/GLMV3.java | 26 +- .../main/java/hex/schemas/HGLMModelV3.java | 151 +++ .../src/main/java/hex/schemas/HGLMV3.java | 125 ++ .../java/hex/schemas/ModelSelectionV3.java | 6 +- .../META-INF/services/water.api.Schema | 4 + .../java/hex/generic/GenericModelTest.java | 1 - .../test/java/hex/glm/GLMBasicTestHGLM.java | 94 -- .../test/java/hex/glm/GLMCheckpointTest.java | 4 +- .../test/java/hex/glm/GLMConstrainedTest.java | 4 +- .../src/test/java/hex/hglm/HGLMBasicTest.java | 905 +++++++++++++ .../src/test/java/hex/hglm/HGLMUtilTest.java | 231 ++++ .../java/water/test/util/PojoUtilsTest.java | 20 +- h2o-bindings/bin/custom/R/gen_glm.py | 5 - h2o-bindings/bin/custom/R/gen_hglm.py | 124 ++ h2o-bindings/bin/custom/python/gen_hglm.py | 106 ++ h2o-bindings/bin/gen_R.py | 2 + h2o-bindings/bin/gen_python.py | 1 + ...odelMetricHGLMGaussianGaussianGeneric.java | 16 - .../src/main/java/hex/ModelMetricsHGLM.java | 158 --- .../hex/ModelMetricsHGLMGaussianGaussian.java | 58 - .../java/hex/ModelMetricsHGLMGeneric.java | 14 - .../java/hex/ModelMetricsRegressionHGLM.java | 118 ++ .../ModelMetricsRegressionHGLMGeneric.java | 13 + ...lMetricsHGLMGaussianGaussianGenericV3.java | 13 - .../ModelMetricsHGLMGaussianGaussianV3.java | 11 - .../schemas3/ModelMetricsHGLMGenericV3.java | 12 - .../api/schemas3/ModelMetricsHGLMV3.java | 96 -- .../ModelMetricsRegressionHGLMGenericV3.java | 21 + .../ModelMetricsRegressionHGLMV3.java | 45 + .../src/main/java/water/util/ArrayUtils.java | 231 +++- .../META-INF/services/water.api.Schema | 6 +- .../src/test/java/hex/ModelMetricsTest.java | 87 +- .../test/java/water/util/ArrayUtilsTest.java | 299 +++++ .../src/main/java/hex/ModelCategory.java | 1 - .../src/main/java/hex/genmodel/GenModel.java | 1 - h2o-py/docs/modeling.rst | 6 + h2o-py/h2o/estimators/__init__.py | 13 +- h2o-py/h2o/estimators/glm.py | 86 +- h2o-py/h2o/estimators/hglm.py | 685 ++++++++++ h2o-py/h2o/estimators/model_selection.py | 4 +- h2o-py/h2o/explanation/_explain.py | 3 - .../h2o/model/extensions/scoring_history.py | 3 - h2o-py/h2o/model/metrics_base.py | 39 +- h2o-py/h2o/model/model_base.py | 12 +- h2o-py/h2o/sklearn/__init__.py | 7 +- h2o-py/tests/pyunit_utils/__init__.py | 2 +- ...m_tests.py => utils_for_glm_hglm_tests.py} | 46 +- ...se_lessthan_linear_constraints_binomial.py | 10 +- ...nstraints_binomial_objective_likelihood.py | 8 +- ...6722_equality_constraints_only_binomial.py | 6 +- ...se_lessthan_linear_constraints_binomial.py | 10 +- ..._equality_lessthan_constraints_binomial.py | 14 +- ..._equality_lessthan_constraints_binomial.py 
| 14 +- ..._tight_linear_constraints_only_binomial.py | 14 +- ..._loose_beta_linear_constraints_binomial.py | 10 +- ..._loose_only_linear_constraints_binomial.py | 10 +- ...t_GH_6722_separate_linear_beta_gaussian.py | 1 - ...ta_equality_linear_constraints_binomial.py | 18 +- ...ht_equality_linear_constraints_binomial.py | 14 +- ..._tight_linear_constraints_only_binomial.py | 14 +- ...pyunit_PUBDEV_6876_HGLM_compare_R_large.py | 34 - .../pyunit_PUBDEV_6876_HGLM_initial_values.py | 42 - ...7_1p5_noise_var_scoring_history_summary.py | 89 ++ ...H_8487_2_noise_var_init_beta_ubeta_tmat.py | 65 + ...487_2_noise_var_scoring_history_summary.py | 89 ++ ..._3_noise_variance_random_intercept_only.py | 33 + ..._noise_variance_scoring_history_summary.py | 86 ++ .../hglm/pyunit_GH_8487_coefficients_check.py | 50 + ..._noise_variance_scoring_history_summary.py | 89 ++ ...87_p5_noise_var_scoring_history_summary.py | 89 ++ .../pyunit_generate_synthetic_HGLM_data.py | 106 ++ .../testdir_misc/explain/pyunit_explain.py | 16 - .../testdir_multi_jvm/test_rest_api.py | 2 +- h2o-r/H2O_Load.R | 2 +- h2o-r/h2o-DESCRIPTION.template | 2 +- h2o-r/h2o-package/R/classes.R | 24 +- h2o-r/h2o-package/R/explain.R | 4 - h2o-r/h2o-package/R/glm.R | 44 +- h2o-r/h2o-package/R/hglm.R | 411 ++++++ h2o-r/h2o-package/R/kvstore.R | 12 - h2o-r/h2o-package/R/models.R | 22 +- h2o-r/h2o-package/R/modelselection.R | 2 +- h2o-r/h2o-package/R/segment.R | 2 +- h2o-r/h2o-package/pkgdown/_pkgdown.yml | 10 + h2o-r/scripts/h2o-r-test-setup.R | 2 +- .../runit_PUBDEV_6876_HGLM_setInitialValues.R | 42 - .../runit_PUBDEV_6876_HGLM_testAgainstR1.R | 56 - .../hglm/runit_GH_8487_coefs_check.R | 37 + .../testdir_misc/explain/runit_explain.R | 17 - .../src/main/java/water/TestUtil.java | 41 +- 107 files changed, 6405 insertions(+), 3537 deletions(-) create mode 100644 h2o-algos/src/main/java/hex/hglm/ComputationStateHGLM.java create mode 100644 h2o-algos/src/main/java/hex/hglm/HGLM.java create mode 100644 h2o-algos/src/main/java/hex/hglm/HGLMModel.java create mode 100644 h2o-algos/src/main/java/hex/hglm/HGLMScore.java create mode 100644 h2o-algos/src/main/java/hex/hglm/HGLMTask.java create mode 100644 h2o-algos/src/main/java/hex/hglm/HGLMUtils.java create mode 100644 h2o-algos/src/main/java/hex/hglm/MetricBuilderHGLM.java create mode 100644 h2o-algos/src/main/java/hex/schemas/HGLMModelV3.java create mode 100644 h2o-algos/src/main/java/hex/schemas/HGLMV3.java delete mode 100644 h2o-algos/src/test/java/hex/glm/GLMBasicTestHGLM.java create mode 100644 h2o-algos/src/test/java/hex/hglm/HGLMBasicTest.java create mode 100644 h2o-algos/src/test/java/hex/hglm/HGLMUtilTest.java create mode 100644 h2o-bindings/bin/custom/R/gen_hglm.py create mode 100644 h2o-bindings/bin/custom/python/gen_hglm.py delete mode 100644 h2o-core/src/main/java/hex/ModelMetricHGLMGaussianGaussianGeneric.java delete mode 100644 h2o-core/src/main/java/hex/ModelMetricsHGLM.java delete mode 100644 h2o-core/src/main/java/hex/ModelMetricsHGLMGaussianGaussian.java delete mode 100644 h2o-core/src/main/java/hex/ModelMetricsHGLMGeneric.java create mode 100644 h2o-core/src/main/java/hex/ModelMetricsRegressionHGLM.java create mode 100644 h2o-core/src/main/java/hex/ModelMetricsRegressionHGLMGeneric.java delete mode 100644 h2o-core/src/main/java/water/api/schemas3/ModelMetricsHGLMGaussianGaussianGenericV3.java delete mode 100644 h2o-core/src/main/java/water/api/schemas3/ModelMetricsHGLMGaussianGaussianV3.java delete mode 100644 h2o-core/src/main/java/water/api/schemas3/ModelMetricsHGLMGenericV3.java 
delete mode 100644 h2o-core/src/main/java/water/api/schemas3/ModelMetricsHGLMV3.java create mode 100644 h2o-core/src/main/java/water/api/schemas3/ModelMetricsRegressionHGLMGenericV3.java create mode 100644 h2o-core/src/main/java/water/api/schemas3/ModelMetricsRegressionHGLMV3.java create mode 100644 h2o-py/h2o/estimators/hglm.py rename h2o-py/tests/pyunit_utils/{utils_for_glm_tests.py => utils_for_glm_hglm_tests.py} (81%) delete mode 100644 h2o-py/tests/testdir_algos/glm/pyunit_PUBDEV_6876_HGLM_compare_R_large.py delete mode 100644 h2o-py/tests/testdir_algos/glm/pyunit_PUBDEV_6876_HGLM_initial_values.py create mode 100644 h2o-py/tests/testdir_algos/hglm/pyunit_GH_8487_1p5_noise_var_scoring_history_summary.py create mode 100644 h2o-py/tests/testdir_algos/hglm/pyunit_GH_8487_2_noise_var_init_beta_ubeta_tmat.py create mode 100644 h2o-py/tests/testdir_algos/hglm/pyunit_GH_8487_2_noise_var_scoring_history_summary.py create mode 100644 h2o-py/tests/testdir_algos/hglm/pyunit_GH_8487_3_noise_variance_random_intercept_only.py create mode 100644 h2o-py/tests/testdir_algos/hglm/pyunit_GH_8487_3_noise_variance_scoring_history_summary.py create mode 100644 h2o-py/tests/testdir_algos/hglm/pyunit_GH_8487_coefficients_check.py create mode 100644 h2o-py/tests/testdir_algos/hglm/pyunit_GH_8487_p05_noise_variance_scoring_history_summary.py create mode 100644 h2o-py/tests/testdir_algos/hglm/pyunit_GH_8487_p5_noise_var_scoring_history_summary.py create mode 100644 h2o-py/tests/testdir_algos/hglm/pyunit_generate_synthetic_HGLM_data.py create mode 100644 h2o-r/h2o-package/R/hglm.R delete mode 100644 h2o-r/tests/testdir_algos/glm/runit_PUBDEV_6876_HGLM_setInitialValues.R delete mode 100644 h2o-r/tests/testdir_algos/glm/runit_PUBDEV_6876_HGLM_testAgainstR1.R create mode 100644 h2o-r/tests/testdir_algos/hglm/runit_GH_8487_coefs_check.R diff --git a/h2o-algos/src/main/java/hex/DataInfo.java b/h2o-algos/src/main/java/hex/DataInfo.java index 119a438b82e9..8829bce1628a 100644 --- a/h2o-algos/src/main/java/hex/DataInfo.java +++ b/h2o-algos/src/main/java/hex/DataInfo.java @@ -373,7 +373,7 @@ public DataInfo validDinfo(Frame valid) { public double[] denormalizeBeta(double [] beta) { int N = fullN()+1; - assert (beta.length % N) == 0:"beta len = " + beta.length + " expected multiple of" + N; + assert (beta.length % N) == 0:"beta len = " + beta.length + " expected multiple of " + N; int nclasses = beta.length/N; beta = MemoryManager.arrayCopyOf(beta,beta.length); if (_predictor_transform == DataInfo.TransformType.STANDARDIZE) { @@ -1078,24 +1078,6 @@ public final double innerProduct(DataInfo.Row row) { return res; } - /*** - * For HGLM, will perform multiplication of w*data part and not the random columns. 
- * @param w - * @param rowContent - * @param catOffsets - * @return - */ - public double[] scalarProduct(double w, double[] rowContent, int catOffsets) { // multiple a row with scaler w - rowContent[0] = w; // intercept term - for (int i = 0; i < nBins; ++i) { - rowContent[binIds[i]+1] = w; // value is absolute - } - - for (int i = 0; i < numVals.length; ++i) - rowContent[i+catOffsets+1] += numVals[i]*w; - - return rowContent; - } public final double twoNormSq() { assert !_intercept; assert numIds == null; diff --git a/h2o-algos/src/main/java/hex/api/RegisterAlgos.java b/h2o-algos/src/main/java/hex/api/RegisterAlgos.java index b31bbcc39834..5135a8911347 100644 --- a/h2o-algos/src/main/java/hex/api/RegisterAlgos.java +++ b/h2o-algos/src/main/java/hex/api/RegisterAlgos.java @@ -2,7 +2,6 @@ import hex.ModelBuilder; import hex.anovaglm.ANOVAGLM; -import hex.modelselection.ModelSelection; import hex.psvm.PSVM; import hex.tree.TreeHandler; import water.api.AlgoAbstractRegister; @@ -39,6 +38,7 @@ public void registerEndPoints(RestApiContext context) { new hex.modelselection.ModelSelection (true), new hex.isotonic .IsotonicRegression(true), new hex.tree.dt .DT (true), + new hex.hglm .HGLM (true), new hex.adaboost. AdaBoost (true) }; diff --git a/h2o-algos/src/main/java/hex/gam/GAM.java b/h2o-algos/src/main/java/hex/gam/GAM.java index 9f598be447e4..f0ddda0cd257 100644 --- a/h2o-algos/src/main/java/hex/gam/GAM.java +++ b/h2o-algos/src/main/java/hex/gam/GAM.java @@ -46,8 +46,7 @@ import static hex.glm.GLMModel.GLMParameters.GLMType.gam; import static hex.util.LinearAlgebraUtils.generateOrthogonalComplement; import static hex.util.LinearAlgebraUtils.generateQR; -import static water.util.ArrayUtils.expandArray; -import static water.util.ArrayUtils.subtract; +import static water.util.ArrayUtils.*; public class GAM extends ModelBuilder { diff --git a/h2o-algos/src/main/java/hex/gam/MatrixFrameUtils/GamUtils.java b/h2o-algos/src/main/java/hex/gam/MatrixFrameUtils/GamUtils.java index aab8dc90e379..1e2b9e3c0473 100644 --- a/h2o-algos/src/main/java/hex/gam/MatrixFrameUtils/GamUtils.java +++ b/h2o-algos/src/main/java/hex/gam/MatrixFrameUtils/GamUtils.java @@ -139,29 +139,6 @@ else if (!name1ContainsResp && standarNContainsResp) // if name1 does not conta return equalNames; } - public static void copy2DArray(double[][] src_array, double[][] dest_array) { - int numRows = src_array.length; - for (int colIdx = 0; colIdx < numRows; colIdx++) { // save zMatrix for debugging purposes or later scoring on training dataset - System.arraycopy(src_array[colIdx], 0, dest_array[colIdx], 0, - src_array[colIdx].length); - } - } - - // copy a square array - public static double[][] copy2DArray(double[][] src_array) { - double[][] dest_array = MemoryManager.malloc8d(src_array.length, src_array[0].length); - copy2DArray(src_array, dest_array); - return dest_array; - } - - public static void copy2DArray(int[][] src_array, int[][] dest_array) { - int numRows = src_array.length; - for (int colIdx = 0; colIdx < numRows; colIdx++) { // save zMatrix for debugging purposes or later scoring on training dataset - System.arraycopy(src_array[colIdx], 0, dest_array[colIdx], 0, - src_array[colIdx].length); - } - } - public static void copyCVGLMtoGAMModel(GAMModel model, GLMModel glmModel, GAMParameters parms, String foldColumn) { // copy over cross-validation metrics model._output._cross_validation_metrics = glmModel._output._cross_validation_metrics; diff --git a/h2o-algos/src/main/java/hex/glm/ComputationState.java 
b/h2o-algos/src/main/java/hex/glm/ComputationState.java index 1aaaf697e3bc..9ed0482d77ee 100644 --- a/h2o-algos/src/main/java/hex/glm/ComputationState.java +++ b/h2o-algos/src/main/java/hex/glm/ComputationState.java @@ -16,7 +16,6 @@ import water.H2ORuntime; import water.Job; import water.MemoryManager; -import water.fvec.Frame; import water.util.ArrayUtils; import water.util.IcedHashMap; import water.util.Log; @@ -49,7 +48,6 @@ public final class ComputationState { private boolean _dispersionEstimated; boolean _allIn; int _iter; - int _iterHGLM_GLMMME; // keep track of iterations used in estimating fixed/random coefficients private double _lambda = 0; private double _lambdaMax = Double.NaN; private GLMGradientInfo _ginfo; // gradient info excluding l1 penalty @@ -73,15 +71,6 @@ public final class ComputationState { ConstraintsGram[] _gramLess = null; private final GLM.BetaInfo _modelBetaInfo; private double[] _beta; // vector of coefficients corresponding to active data - private double[] _ubeta; // HGLM, store coefficients of random effects; - private double[] _psi; // HGLM, psi - private double[] _phi; // HGLM, size random columns levels - private double _tau; // HGLM for ei - private double _correction_HL; // HGLM - double[] _sumEtaSquareConvergence; // HGLM: sotre sumEtaSquare, convergence - double[] _likelihoodInfo; // HGLM: stores 4 elements: hlik, pvh, pbvh, cAIC - public String[] _randCoeffNames; // store random coefficient names - private Frame _priorw_wpsi; // weight calculated for psi final DataInfo _dinfo; private GLMGradientSolver _gslvr; private final Job _job; @@ -105,11 +94,6 @@ public ComputationState(Job job, GLMParameters parms, DataInfo dinfo, BetaConstr _nbetas = bi._nBetas; _betaLengthPerClass = dinfo.fullN()+1; _totalBetaLength = _betaLengthPerClass * _nbetas; - if (_parms._HGLM) { - _sumEtaSquareConvergence = new double[2]; - if (_parms._calc_like) - _likelihoodInfo = new double[4]; - } _modelBetaInfo = bi; } @@ -224,84 +208,14 @@ void copyCheckModel2State(GLMModel model, int[][] _gamColIndices) { } } } - - public void set_sumEtaSquareConvergence(double[] sumInfo) { - _sumEtaSquareConvergence = sumInfo; - } - - /*** - * Copy GLM coefficients stored in beta to _beta of computationState - * @param beta: store coefficients to be copied from - * @param startIdx: first index of beta to copy from - * @param len: length of coefficients to copy from beta - * @param interceptFirst: true if the first index of beta stored the intercept term - */ - public void set_beta_HGLM(double[] beta, int startIdx, int len, boolean interceptFirst) { - if (_beta==null) - _beta = new double[len]; - if (interceptFirst) { - int lastIndex = len-1; - System.arraycopy(beta, startIdx+1, _beta, 0, lastIndex); - _beta[lastIndex] = beta[startIdx]; - } else { - System.arraycopy(beta, startIdx, _beta, 0, len); - } - } - - public void set_likelihoodInfo(double hlik, double pvh, double pbvh, double cAIC) { - _likelihoodInfo[0] = hlik; - _likelihoodInfo[1] = pvh; - _likelihoodInfo[2] = pbvh; - _likelihoodInfo[3] = cAIC; - } - public void set_ubeta_HGLM(double[] ubeta, int startIdx, int len) { - if (_ubeta==null) - _ubeta = new double[len]; - System.arraycopy(ubeta, startIdx, _ubeta, 0, len); - } - public void setZValues(double[] zValues, boolean dispersionEstimated) { _zValues = zValues; _dispersionEstimated = dispersionEstimated; } - public double[] get_psi() { - return _psi; - } - - public double get_correction_HL() { - return _correction_HL; - } - - public double[] get_phi() { - return _phi; - } - - 
public Frame get_priorw_wpsi() { - return _priorw_wpsi; - } - - public double get_tau() { - return _tau; - } - public boolean getLambdaNull() { return _lambdaNull; } - public void set_tau(double tau) { - _tau=tau; - } - - public void set_psi(double[] psi) { - assert _psi.length==psi.length:"Length of _psi and psi should be the same."; - System.arraycopy(psi, 0, _psi, 0, psi.length); - } - - public void set_phi(double[] phi) { - assert _phi.length==phi.length:"Length of _phi and phi should be the same."; - System.arraycopy(phi, 0, _phi, 0, phi.length); - } - public GLMGradientSolver gslvr(){return _gslvr;} public double lambda(){return _lambda;} public double alpha() {return _alpha;} @@ -339,9 +253,6 @@ public void setLambda(double lambda) { return betaMultinomial(_activeClass,_beta); return _beta; } - public double[] ubeta(){ - return _ubeta; // could be null. Be careful - } public GLMGradientInfo ginfo(){return _ginfo == null?(_ginfo = gslvr().getGradient(beta())):_ginfo;} public BetaConstraint activeBC(){return _activeBC;} public double likelihood() {return _likelihood;} @@ -1060,19 +971,6 @@ protected void setIter(int iteration) { protected void setActiveDataMultinomialNull() { _activeDataMultinomial = null; } protected void setActiveDataNull() { _activeData = null; } protected void setLambdaSimple(double lambda) { _lambda=lambda; } - - protected void setHGLMComputationState(double [] beta, double[] ubeta, double[] psi, double[] phi, - double hlcorrection, double tau, Frame wpsi, String[] randCoeffNames){ - _beta = Arrays.copyOf(beta, beta.length); - _ubeta = Arrays.copyOf(ubeta, ubeta.length); - _randCoeffNames = Arrays.copyOf(randCoeffNames, randCoeffNames.length); - _psi = Arrays.copyOf(psi, psi.length); - _phi = Arrays.copyOf(phi, phi.length); - _correction_HL = hlcorrection; - _tau = tau; - _priorw_wpsi = wpsi; // store prior_weight and calculated wpsi value for coefficients of random columns - _iterHGLM_GLMMME = 0; - } public double [] expandBeta(double [] beta) { // for multinomials int fullCoefLen = (_dinfo.fullN() + 1) * _nbetas; diff --git a/h2o-algos/src/main/java/hex/glm/GLM.java b/h2o-algos/src/main/java/hex/glm/GLM.java index 29b51f5db4ff..7b94b7d9b125 100644 --- a/h2o-algos/src/main/java/hex/glm/GLM.java +++ b/h2o-algos/src/main/java/hex/glm/GLM.java @@ -2,7 +2,6 @@ import Jama.Matrix; import hex.*; -import hex.gam.MatrixFrameUtils.GamUtils; import hex.glm.GLMModel.GLMOutput; import hex.glm.GLMModel.GLMParameters.Family; import hex.glm.GLMModel.GLMParameters.Link; @@ -21,13 +20,7 @@ import hex.optimization.L_BFGS.ProgressMonitor; import hex.optimization.L_BFGS.Result; import hex.optimization.OptimizationUtils.*; -import hex.svd.SVD; -import hex.svd.SVDModel; -import hex.svd.SVDModel.SVDParameters; import hex.util.CheckpointUtils; -import hex.util.LinearAlgebraUtils; -import hex.util.LinearAlgebraUtils.BMulTask; -import hex.util.LinearAlgebraUtils.FindMaxIndex; import jsr166y.CountedCompleter; import org.joda.time.format.DateTimeFormat; import org.joda.time.format.DateTimeFormatter; @@ -38,8 +31,6 @@ import water.fvec.InteractionWrappedVec; import water.fvec.Vec; import water.parser.BufferedString; -import water.rapids.Rapids; -import water.rapids.Val; import water.udf.CFuncRef; import water.util.*; @@ -62,7 +53,7 @@ import static hex.glm.GLMModel.GLMParameters.Influence.dfbetas; import static hex.glm.GLMModel.GLMParameters.Solver.IRLSM; import static hex.glm.GLMUtils.*; -import static water.fvec.Vec.T_NUM; +import static water.util.ArrayUtils.copy2DArray; /** * Created by 
tomasnykodym on 8/27/14. @@ -75,10 +66,7 @@ public class GLM extends ModelBuilder { static NumberFormat devFormatter = new DecimalFormat(".##"); private static final DateTimeFormatter DATE_TIME_FORMATTER = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss"); public static final int SCORING_INTERVAL_MSEC = 15000; // scoreAndUpdateModel every minute unless score every iteration is set - public int[] _randC; // contains categorical column levels for random columns for HGLM public String _generatedWeights = null; - public String[] _randCoeffNames = null; - public String[] _randomColNames = null; public double[][][] _penaltyMatrix = null; public String[][] _gamColnames = null; public int[][] _gamColIndices = null; // corresponding column indices in dataInfo @@ -547,10 +535,6 @@ protected void checkMemoryFootPrint(DataInfo activeData) { HeartBeat hb = H2O.SELF._heartbeat; long mem_usage = (long) (hb._cpus_allowed * (p * p + activeData.largestCat()) * 8/*doubles*/ * (1 + .5 * Math.log((double) _train.lastVec().nChunks()) / Math.log(2.))); //one gram per core long max_mem = hb.get_free_mem(); - if (_parms._HGLM) { // add check to check memories used by arrays. - int expandedRandColValues = ArrayUtils.sum(_randC); - mem_usage = expandedRandColValues*expandedRandColValues*5+(_nobs+expandedRandColValues)*5; // rough estimate - } if (mem_usage > max_mem) { String msg = "Gram matrices (one per thread) won't fit in the driver node's memory (" + PrettyPrint.bytes(mem_usage) + " > " + PrettyPrint.bytes(max_mem) @@ -569,8 +553,6 @@ static class ScoringHistory { private ArrayList _scoringTimes = new ArrayList<>(); private ArrayList _likelihoods = new ArrayList<>(); private ArrayList _objectives = new ArrayList<>(); - private ArrayList _convergence = new ArrayList<>(); // HGLM: ratio of sum(eta0-eta.i)^2/sum(eta.i^2) - private ArrayList _sumEtaiSquare = new ArrayList<>(); // HGLM: sum(eta.i^2) private ArrayList _lambdas; // thest are only used when _parms.generate_scoring_history=true private ArrayList _lambdaDevTrain; private ArrayList _lambdaDevTest; @@ -619,15 +601,6 @@ public synchronized void addIterationScore(boolean updateTrain, boolean updateVa _lambdaDevTest.add(devValid/nobsValid); } - public synchronized void addIterationScore(int iter, double[] sumEtaInfo) { - if (_scoringIters.size() > 0 && _scoringIters.get(_scoringIters.size() - 1) >= iter) - return; // do not record twice, happens for the last iteration, need to record scoring history in checkKKTs because of gaussian fam. 
- _scoringIters.add(iter); - _scoringTimes.add(System.currentTimeMillis()); - _sumEtaiSquare.add(sumEtaInfo[0]); - _convergence.add(sumEtaInfo[0]/sumEtaInfo[1]); - } - public synchronized TwoDimTable to2dTable(GLMParameters parms, double[] xvalDev, double[] xvalSE) { String[] cnames = new String[]{"timestamp", "duration", "iterations", "negative_log_likelihood", "objective"}; String[] ctypes = new String[]{"string", "string", "int", "double", "double"}; @@ -671,33 +644,14 @@ public synchronized TwoDimTable to2dTable(GLMParameters parms, double[] xvalDev, return res; } - public synchronized TwoDimTable to2dTableHGLM() { - String[] cnames = new String[]{"timestamp", "duration", "iterations", "sum(etai-eta0)^2", "convergence"}; - String[] ctypes = new String[]{"string", "string", "int", "double", "double"}; - String[] cformats = new String[]{"%s", "%s", "%d", "%.5f", "%.5f"}; - TwoDimTable res = new TwoDimTable("Scoring History", "", new String[_scoringIters.size()], cnames, ctypes, cformats, ""); - for (int i = 0; i < _scoringIters.size(); ++i) { - int col = 0; - res.set(i, col++, DATE_TIME_FORMATTER.print(_scoringTimes.get(i))); - res.set(i, col++, PrettyPrint.msecs(_scoringTimes.get(i) - _scoringTimes.get(0), true)); - res.set(i, col++, _scoringIters.get(i)); - res.set(i, col++, _sumEtaiSquare.get(i)); - res.set(i, col++, _convergence.get(i)); - } - return res; - } - void restoreFromCheckpoint(TwoDimTable sHist, int[] colIndices, boolean hglm) { + void restoreFromCheckpoint(TwoDimTable sHist, int[] colIndices) { int numRows = sHist.getRowDim(); for (int rowInd = 0; rowInd < numRows; rowInd++) { // if lambda_search is enabled, _sc is not updated _scoringIters.add((Integer) sHist.get(rowInd, colIndices[0])); _scoringTimes.add(DATE_TIME_FORMATTER.parseMillis((String) sHist.get(rowInd, colIndices[1]))); _likelihoods.add((Double) sHist.get(rowInd, colIndices[2])); _objectives.add((Double) sHist.get(rowInd, colIndices[3])); - if (hglm) { // for HGLM family - _convergence.add((Double) sHist.get(rowInd, colIndices[4])); - _sumEtaiSquare.add((Double) sHist.get(rowInd, colIndices[5])); - } } } } @@ -962,13 +916,6 @@ public void init(boolean expensive) { if (_parms._plug_values == null && _parms.missingValuesHandling() == MissingValuesHandling.PlugValues) { error("_missing_values_handling", "No plug values frame provided for Missing Values Handling = PlugValues."); } - if (_parms._HGLM) { - for (int randInx:_parms._random_columns) { - if (!_parms.train().vec(randInx).isCategorical()) { - error("HGLM random_columns", "Must contain categorical columns."); - } - } - } if (_parms._max_iterations == 0) { warn("max_iterations", "for GLM, must be >= 1 (or -1 for unlimited or default setting) " + "to obtain proper model. 
Setting it to be 0 will only return the correct coefficient names and an empty" + @@ -1001,7 +948,7 @@ public void init(boolean expensive) { _parms._generate_scoring_history); _train.bulkRollups(); // make sure we have all the rollups computed in parallel _t0 = System.currentTimeMillis(); - if ((_parms._lambda_search || !_parms._intercept || _parms._lambda == null || _parms._lambda[0] > 0) && !_parms._HGLM) + if ((_parms._lambda_search || !_parms._intercept || _parms._lambda == null || _parms._lambda[0] > 0)) _parms._use_all_factor_levels = true; if (_parms._family == Family.AUTO) { if (_nclass == 1) { @@ -1110,7 +1057,7 @@ public void init(boolean expensive) { if (_parms._init_dispersion_parameter <= 0) error("init_dispersion_parameter", " must exceed 0.0."); - boolean standardizeQ = _parms._HGLM?false:_parms._standardize; + boolean standardizeQ = _parms._standardize; _dinfo = new DataInfo(_train.clone(), _valid, 1, _parms._use_all_factor_levels || _parms._lambda_search, standardizeQ ? DataInfo.TransformType.STANDARDIZE : DataInfo.TransformType.NONE, DataInfo.TransformType.NONE, _parms.missingValuesHandling() == MissingValuesHandling.Skip, _parms.imputeMissing(), @@ -1226,70 +1173,67 @@ public void init(boolean expensive) { if(_offset != null) vecs.add(_offset); vecs.add(_response); double[] beta = getNullBeta(); - if (_parms._HGLM) { - setHGLMInitValues(beta); - _parms._lambda = new double[]{0}; // disable elastic-net regularization - } else { - if (_parms._startval != null) { // allow user start set initial beta values - if (_parms._startval.length != beta.length) { - throw new IllegalArgumentException("Initial coefficient length (" + _parms._startval.length + ") does not " + - "equal to actual GLM coefficient length(" + beta.length + ").\n The order of coefficients should" + - " be the following:\n"+String.join("\n", _dinfo._adaptedFrame._names)+"\n Intercept.\n " + - "Run your model without specifying startval to find out the actual coefficients names and " + - "lengths."); - } else { - System.arraycopy(_parms._startval, 0, beta, 0, beta.length); - } - } else if (_parms._linear_constraints != null && _parms._init_optimal_glm) { // start value is not assigned - beta = genInitBeta(); - } - GLMGradientInfo ginfo = gam.equals(_parms._glmType) ? new GLMGradientSolver(_job, _parms, _dinfo, 0, - _state.activeBC(), _betaInfo, _penaltyMatrix, _gamColIndices).getGradient(beta) : new GLMGradientSolver(_job, - _parms, _dinfo, 0, _state.activeBC(), _betaInfo).getGradient(beta); // gradient with L2 penalty, no constraints - _lmax = lmax(ginfo._gradient); - _gmax = _lmax*Math.max(1e-2, _parms._alpha[0]); // each alpha should have its own best lambda - _state.setLambdaMax(_lmax); - _state.setgMax(_gmax); - if (_parms._lambda_min_ratio == -1) { - _parms._lambda_min_ratio = (_nobs >> 4) > _dinfo.fullN() ? 
1e-4 : 1e-2; - if (_parms._alpha[0] == 0) - _parms._lambda_min_ratio *= 1e-2; // smalelr lambda min for ridge as we are starting quite high - } - _betaStart = new double[beta.length]; - System.arraycopy(beta, 0, _betaStart, 0, beta.length); - _state.updateState(beta, ginfo); - if (_parms._lambda == null) { // no lambda given, we will base lambda as a fraction of lambda max - if (_parms._lambda_search) { - _parms._lambda = new double[_parms._nlambdas]; - double dec = Math.pow(_parms._lambda_min_ratio, 1.0 / (_parms._nlambdas - 1)); - _parms._lambda[0] = _lmax; - double l = _lmax; - for (int i = 1; i < _parms._nlambdas; ++i) - _parms._lambda[i] = (l *= dec); - // todo set the null submodel - } else - _parms._lambda = new double[]{10 * _parms._lambda_min_ratio * _lmax}; - } - if (!Double.isNaN(_lambdaCVEstimate)) { // in main model, shrink the lambda range to search - for (int i = 0; i < _parms._lambda.length; ++i) - if (_parms._lambda[i] < _lambdaCVEstimate) { - _parms._lambda = Arrays.copyOf(_parms._lambda, i + 1); - break; - } - _parms._lambda[_parms._lambda.length - 1] = _lambdaCVEstimate; - _parms._lambda[_parms._lambda.length - 1] = _lambdaCVEstimate; + if (_parms._startval != null) { // allow user start set initial beta values + if (_parms._startval.length != beta.length) { + throw new IllegalArgumentException("Initial coefficient length (" + _parms._startval.length + ") does not " + + "equal to actual GLM coefficient length(" + beta.length + ").\n The order of coefficients should" + + " be the following:\n" + String.join("\n", _dinfo._adaptedFrame._names) + "\n Intercept.\n " + + "Run your model without specifying startval to find out the actual coefficients names and " + + "lengths."); + } else { + System.arraycopy(_parms._startval, 0, beta, 0, beta.length); } + } else if (_parms._linear_constraints != null && _parms._init_optimal_glm) { // start value is not assigned + beta = genInitBeta(); + } + + GLMGradientInfo ginfo = gam.equals(_parms._glmType) ? new GLMGradientSolver(_job, _parms, _dinfo, 0, + _state.activeBC(), _betaInfo, _penaltyMatrix, _gamColIndices).getGradient(beta) : new GLMGradientSolver(_job, + _parms, _dinfo, 0, _state.activeBC(), _betaInfo).getGradient(beta); // gradient with L2 penalty, no constraints + _lmax = lmax(ginfo._gradient); + _gmax = _lmax * Math.max(1e-2, _parms._alpha[0]); // each alpha should have its own best lambda + _state.setLambdaMax(_lmax); + _state.setgMax(_gmax); + if (_parms._lambda_min_ratio == -1) { + _parms._lambda_min_ratio = (_nobs >> 4) > _dinfo.fullN() ? 
1e-4 : 1e-2; + if (_parms._alpha[0] == 0) + _parms._lambda_min_ratio *= 1e-2; // smalelr lambda min for ridge as we are starting quite high + } + _betaStart = new double[beta.length]; + System.arraycopy(beta, 0, _betaStart, 0, beta.length); + _state.updateState(beta, ginfo); + if (_parms._lambda == null) { // no lambda given, we will base lambda as a fraction of lambda max + if (_parms._lambda_search) { + _parms._lambda = new double[_parms._nlambdas]; + double dec = Math.pow(_parms._lambda_min_ratio, 1.0 / (_parms._nlambdas - 1)); + _parms._lambda[0] = _lmax; + double l = _lmax; + for (int i = 1; i < _parms._nlambdas; ++i) + _parms._lambda[i] = (l *= dec); + // todo set the null submodel + } else + _parms._lambda = new double[]{10 * _parms._lambda_min_ratio * _lmax}; + } + if (!Double.isNaN(_lambdaCVEstimate)) { // in main model, shrink the lambda range to search + for (int i = 0; i < _parms._lambda.length; ++i) + if (_parms._lambda[i] < _lambdaCVEstimate) { + _parms._lambda = Arrays.copyOf(_parms._lambda, i + 1); + break; + } + _parms._lambda[_parms._lambda.length - 1] = _lambdaCVEstimate; + _parms._lambda[_parms._lambda.length - 1] = _lambdaCVEstimate; } - if(_parms._objective_epsilon == -1) { - if(_parms._lambda_search) + + if (_parms._objective_epsilon == -1) { + if (_parms._lambda_search) _parms._objective_epsilon = 1e-4; else // lower default objective epsilon for non-standardized problems (mostly to match classical tools) - _parms._objective_epsilon = _parms._lambda[0] == 0?1e-6:1e-4; + _parms._objective_epsilon = _parms._lambda[0] == 0 ? 1e-6 : 1e-4; } - if(_parms._gradient_epsilon == -1) { + if (_parms._gradient_epsilon == -1) { _parms._gradient_epsilon = _parms._lambda[0] == 0 ? 1e-6 : 1e-4; - if(_parms._lambda_search) _parms._gradient_epsilon *= 1e-2; + if (_parms._lambda_search) _parms._gradient_epsilon *= 1e-2; } // check for correct setting for Tweedie ML dispersion parameter setting @@ -1304,7 +1248,7 @@ public void init(boolean expensive) { // likelihood calculation for gaussian, gamma, negativebinomial and tweedie families requires dispersion parameter estimation // _dispersion_parameter_method: gaussian - pearson (default); gamma, negativebinomial, tweedie - ml. - if(!_parms._HGLM && _parms._calc_like) { + if(_parms._calc_like) { switch (_parms._family) { case gaussian: _parms._compute_p_values = true; @@ -1451,153 +1395,6 @@ void checkAssignLinearConstraints() { } } - /** - * initialize the following parameters for HGLM from either user initial inputs or from a GLM model if user did not - * provide any starting values. - * If user initial inputs are provided, it should be in the form of a big double array and the values should be - * stacked according to the following sequence: - * - beta, ubeta, phi, psi, tau, init_sig_e, init_sig_u - * - * @param beta - * - * Internal H2O method. Do not use. 
- */ - public void setHGLMInitValues(double[] beta) { - _randC = new int[_parms._random_columns.length]; // _randC, array of random columns cardinalities - _randomColNames = new String[_parms._random_columns.length]; // store random column names - for (int rcInd = 0; rcInd < _parms._random_columns.length; rcInd++) { - _randC[rcInd] = _parms.train().vec(_parms._random_columns[rcInd]).cardinality(); - _randomColNames[rcInd] = _parms.train().name(_parms._random_columns[rcInd]); - } - int fixedEffectSize = beta.length; - int randomEffectSize = ArrayUtils.sum(_randC); - _randCoeffNames = findExpandedRandColNames(); // column names for expanded random columns - double tau=0; // store estimate of sig_e - double[] phi = new double[randomEffectSize]; - double[] psi = new double[randomEffectSize]; - double[] ubeta = new double[randomEffectSize]; - double hlcorrection = 0; // fixed for different distributions - Vec tempVec = Vec.makeOne(randomEffectSize); // vector to store prior weights for random effects/columns - // randWeights stores prior weight (0 col), calculated weight for random columns (1 col), zmi (intermediate values) - Frame randWeights = new Frame(tempVec.makeOnes(3)); - randWeights.setNames(new String[]{"prior_weghts", "wpsi", "zmi"}); // add column names - if (_parms._startval==null) { - GLMModel tempModel = runGLMModel(_parms._standardize, Family.gaussian, Link.family_default, _parms._train, - _parms._response_column, null, _parms._ignored_columns, false); - System.arraycopy(tempModel.beta(), 0, beta, 0, beta.length); - hex.ModelMetricsRegressionGLM tMetric = (hex.ModelMetricsRegressionGLM) tempModel._output._training_metrics; - double init_sig_e = 0.6*tMetric.residual_deviance()/tMetric.residual_degrees_of_freedom(); - double init_sig_u = init_sig_e*0.66; - init_sig_e = restrictMag(init_sig_e); // make sure initial values are not too small - init_sig_u = restrictMag(init_sig_u); - Arrays.fill(phi, init_sig_u/_randC.length); - tau = init_sig_e; - _state.setHGLMComputationState(beta, ubeta, psi, phi, hlcorrection, tau, randWeights, _randCoeffNames); - tempModel.remove(); - tMetric.remove(); - } else { - copyUserInitialValues(fixedEffectSize, randomEffectSize, beta, ubeta, phi, hlcorrection, psi, randWeights, - _randCoeffNames); - _parms._startval = null; // reset back to null - } - } - - /** - * Construct random column coefficient names. - * @return a string array containing random column coefficient names. 
- */ - private String[] findExpandedRandColNames() { - String[] randExpandedColNames = new String[ArrayUtils.sum(_randC)]; - int numRandCols = _randC.length; - String[] randColNames = new String[numRandCols]; - int offset = 0; - for (int index=0; index < numRandCols; index++) { - int randomColIndex = _parms._random_columns[index]; - String[] domains = _parms.train().vec(randomColIndex).domain(); - int domainLen = domains.length; - randColNames[index] = _parms.train().name(randomColIndex); - for (int insideInd = 0; insideInd < domainLen; insideInd++) { - randExpandedColNames[offset+insideInd] = randColNames[index]+"_"+insideInd; - } - offset += domainLen; - } - return randExpandedColNames; - } - - public double restrictMag(double val) { - if (val < 0.0001) - return 0.1; - else - return val; - } - - /** - * This method performs a simple copying of user provided initial values to parameters - * - beta, ubeta, phi, tau, psi, init_sig_u - */ - public void copyUserInitialValues(int fixedEffectSize, int randomEffectSize, double[] beta, - double[] ubeta, double[] phi, double hlcorrection, double[] psi, - Frame randWeights, String[] randCoeffNames) { - int off = 0; // offset into startval - int lengthLimit = fixedEffectSize; - int totalstartvallen = fixedEffectSize+randomEffectSize+_randC.length+1; - assert _parms._startval.length==totalstartvallen:"Expected startval length: "+totalstartvallen+", Actual" + - " startval length: "+_parms._startval.length; // ensure startval contains enough initialization param - for (int fixedInd=off; fixedInd < lengthLimit; fixedInd++) { - beta[fixedInd] = _parms._startval[fixedInd]; - } - off += fixedEffectSize; - lengthLimit += randomEffectSize; - for (int randomInd = off; randomInd < lengthLimit; randomInd++) { - ubeta[randomInd-off] = _parms._startval[randomInd]; - } - off += randomEffectSize; - lengthLimit += _randC.length; - int sig_u_off = 0; - for (int siguInd=off; siguInd < lengthLimit; siguInd++) { - double init_sig_u = _parms._startval[siguInd]; - for (int index=0; index < _randC[siguInd-off]; index++) - phi[index+sig_u_off] = init_sig_u; - sig_u_off += _randC[siguInd-off]; - } - double tau = _parms._startval[lengthLimit]; - if (tau < 0.0001 || ArrayUtils.minValue(phi) < 0.0001) // Following R thresholds - error("init_sig_u, init_sig_e", "unacceptable initial values supplied for variance" + - " parameter or dispersion parameter of the random effects. They need to exceed 0.0001."); - _state.setHGLMComputationState(beta, ubeta, psi, phi, hlcorrection, tau, randWeights, randCoeffNames); - } - - /** - * This method will quickly generate GLM model with proper parameter settings during the HGLM building process. - * This is an internal H2O function. Do not call. It will change with no notice. 
- * - * @param standardize - * @param family - * @param link - * @param trainKey - * @param responseColName - * @param weightColumns - * @param ignored_columns - * @param computePValue - * @return - */ - private GLMModel runGLMModel(boolean standardize, Family family, Link link, Key trainKey, - String responseColName, String weightColumns, String[] ignored_columns, - boolean computePValue) { - GLMParameters tempParams = new GLMParameters(); - tempParams._train = trainKey; - tempParams._family = family; - tempParams._link = link; - tempParams._lambda = new double[]{0}; - tempParams._standardize = standardize; - tempParams._response_column = responseColName; - tempParams._ignored_columns = ignored_columns; - tempParams._weights_column = weightColumns; - tempParams._compute_p_values = computePValue; - tempParams._useDispersion1 = computePValue; - GLMModel model = new GLM(tempParams).trainModel().get(); - return model; - } // copy from scoring_history back to _sc or _lsc private void restoreScoringHistoryFromCheckpoint() { @@ -1607,12 +1404,12 @@ private void restoreScoringHistoryFromCheckpoint() { "deviance_test", "alpha"} : new String[]{"iteration", "timestamp", "negative_log_likelihood", "objective", "sum(etai-eta0)^2", "convergence"}; - int num2Copy = _parms._HGLM || _parms._lambda_search ? colHeaders2Restore.length : colHeaders2Restore.length-2; + int num2Copy = _parms._lambda_search ? colHeaders2Restore.length : colHeaders2Restore.length-2; int[] colHeadersIndex = grabHeaderIndex(scoringHistory, num2Copy, colHeaders2Restore); if (_parms._lambda_search) _lambdaSearchScoringHistory.restoreFromCheckpoint(scoringHistory, colHeadersIndex); else - _scoringHistory.restoreFromCheckpoint(scoringHistory, colHeadersIndex, _parms._HGLM); + _scoringHistory.restoreFromCheckpoint(scoringHistory, colHeadersIndex); } static int[] grabHeaderIndex(TwoDimTable sHist, int numHeaders, String[] colHeadersUseful) { @@ -1688,13 +1485,6 @@ private void doCleanup() { try { if (_parms._lambda_search && _parms._is_cv_model) Scope.untrack(removeLater(_dinfo.getWeightsVec()._key)); - if (_parms._HGLM) { - Key[] vecKeys = _toRemove; - for (int index = 0; index < vecKeys.length; index++) { - Vec tempVec = DKV.getGet(vecKeys[index]); - tempVec.remove(); - } - } } catch (Exception e) { Log.err("Error while cleaning up GLM " + _result); Log.err(e); @@ -1712,7 +1502,7 @@ private double[] constraintGLM_solve(GramGrad gram) { if (!_parms._intercept) throw H2O.unimpl(); ArrayList ignoredCols = new ArrayList<>(); double[] xy = gram._xy.clone(); - Cholesky chol = ((_state._iter == 0) ? gram.qrCholesky(ignoredCols, GamUtils.copy2DArray(gram._gram), _parms._standardize) : gram.cholesky(null, gram._gram)); + Cholesky chol = ((_state._iter == 0) ? gram.qrCholesky(ignoredCols, copy2DArray(gram._gram), _parms._standardize) : gram.cholesky(null, gram._gram)); if (!chol.isSPD()) throw new NonSPDMatrixException(); if (!ignoredCols.isEmpty()) { int[] collinearCols = ignoredCols.stream().mapToInt(x -> x).toArray(); @@ -1818,313 +1608,7 @@ private void fitCOD_multinomial(Solver s) { _state.updateState(beta, _state.gslvr().getGradient(beta)); // only calculate _gradient here when needed } } - - public Frame makeZeroOrOneFrame(long rowNumber, int colNumber, int val, String[] columnNames) { - Vec tempVec = val == 0 ? Vec.makeZero(rowNumber) : Vec.makeOne(rowNumber); - Frame madeFrame = val == 0 ? 
new Frame(tempVec.makeZeros(colNumber)) : new Frame(tempVec.makeOnes(colNumber)); - if (columnNames != null) { - if (columnNames.length == colNumber) - madeFrame.setNames(columnNames); - else - throw new IllegalArgumentException("Column names length and number of columns in Frame differ."); - } - cleanupHGLMMemory(null, null, new Vec[]{tempVec}, null); - return madeFrame; - } - - /** - * This method will estimate beta, ubeta as described in step 2 of HGLM fitting algorithm. Details can be found in - * the appendix. - * - * @param randCatLevels - * @param totRandCatLevels - * @param returnFrame: will contain the calculation of dev, hv which will be used in step 3 and later. - * @param dinfoWCol - * @param wCol - * @param etaOColIndexReturnFrame - * @param dinfoResponseColID - * @param sumEtaInfo - * @param augXZ - * @param cholR - * @return - */ - private double[] fitCoeffs(int[] randCatLevels, int totRandCatLevels, Frame returnFrame, int[] dinfoWCol, int[] wCol, - int etaOColIndexReturnFrame, int dinfoResponseColID, double[] sumEtaInfo, Frame augXZ, - double[][] cholR) { - // qMatrix is used to store the Q matrix from QR decomposition of augXZ. It is used in the loop here. - Frame qMatrix = makeZeroOrOneFrame(_dinfo._adaptedFrame.numRows() + ArrayUtils.sum(_randC), - _state.beta().length + _state.ubeta().length, 0, null); - Frame augZW = makeZeroOrOneFrame(augXZ.numRows(), 1, 0, new String[]{"AugZ*W"}); - int betaLength = _state.beta().length; - int ubetaLength = _state.ubeta().length; - CalculateW4Data calculateW4Data; - double[] start_delta = MemoryManager.malloc8d(betaLength + ubetaLength); - int iteration = 0; - // store eta at the beginning and move it from _dinfo response columns to returnFrame for metrics calculation - new CopyPartsOfFrame(_dinfo._adaptedFrame, new int[]{etaOColIndexReturnFrame}, - new int[]{_dinfo.responseChunkId(dinfoResponseColID)}, _dinfo._adaptedFrame.numRows()).doAll(returnFrame); - new CalculateW4Rand(_job, _parms, randCatLevels, _state.get_psi(), _state.get_phi(), - _state.ubeta()).doAll(_state.get_priorw_wpsi()); - new RandColAddW2AugXZ(_job, _randC, _state.get_priorw_wpsi(), 1, _dinfo._adaptedFrame.numRows(), - augXZ.numCols() - ArrayUtils.sum(_randC), augXZ.numCols()).doAll(augXZ); - do { // start loop GLM.MME loop in R - start_delta = calculate_all_beta(start_delta, augXZ, augZW, totRandCatLevels, cholR); // calculate new beta, ubeta - new CopyPartsOfFrame(augXZ, null, null, qMatrix.numRows()).doAll(qMatrix); // copy Q matrix from augXZ to qMatrix - _state.set_beta_HGLM(start_delta, 0, betaLength, true); // save new fixed/random coefficients to _state - _state.set_ubeta_HGLM(start_delta, betaLength, ubetaLength); - iteration++; // update iteration count - // generate weight for data part and AugXZ with the new weights - calculateW4Data = calculateNewWAugXZ(augXZ, _randC); - } while (progressHGLMGLMMME(calculateW4Data._sumEtaDiffSq, calculateW4Data._sumEtaSq, iteration, true, - null, null, null, null, null, null, null, null)); - if (iteration > _parms._max_iterations) - Log.debug(LogMsg("HGLM GLM.MME did not converge in " + iteration + " iterations.")); - ReturnGLMMMERunInfoData glmRunInfoData = new ReturnGLMMMERunInfoData(_job, _dinfo, qMatrix, dinfoWCol, - _parms).doAll(returnFrame); - ReturnGLMMMERunInfoRandCols glmRunInfo = new ReturnGLMMMERunInfoRandCols(_job, _dinfo, _state.get_priorw_wpsi(), - qMatrix, wCol, _parms, _state.get_psi(), _state.ubeta(), - ArrayUtils.cumsum(randCatLevels)).doAll(returnFrame); - glmRunInfo._sumDev += 
glmRunInfoData._sumDev; - new GenerateResid(_job, - 1.0 / Math.sqrt(glmRunInfo._sumDev / (returnFrame.numRows() - (betaLength + ubetaLength))), - 1, 4, _dinfo._adaptedFrame.numRows()).doAll(returnFrame); // generate resid - sumEtaInfo[0] = glmRunInfoData._sumEtaDiffSq; - sumEtaInfo[1] = glmRunInfoData._sumEtaSq; - cleanupHGLMMemory(null, new Frame[]{augZW, qMatrix}, null, null); - return sumEtaInfo; - } - - public CalculateW4Data calculateNewWAugXZ(Frame augXZ, int[] randCatLevels) { - // generate weight for data part - CalculateW4Data calculateW4Data = new CalculateW4Data(_job, _dinfo, _parms, randCatLevels, _state.beta(), _state.ubeta(), - _state.get_psi(), _state.get_phi(), _state.get_tau(), - _state.get_correction_HL()).doAll(_dinfo._adaptedFrame); - // generate weight for lower part of AugXZ - new CalculateW4Rand(_job, _parms, randCatLevels, _state.get_psi(), _state.get_phi(), - _state.ubeta()).doAll(_state.get_priorw_wpsi()); - // generate AugXZ as [X | Z] * wdata for top half and bottom half as [0 | I]*wpsi - new DataAddW2AugXZ(_job, _dinfo, _randC).doAll(augXZ); - new RandColAddW2AugXZ(_job, _randC, _state.get_priorw_wpsi(), 1, _dinfo._adaptedFrame.numRows(), augXZ.numCols() - ArrayUtils.sum(_randC), augXZ.numCols()).doAll(augXZ); - return calculateW4Data; - } - - public void copyOver(double[][] cholR, double[][] cholRcopy) { - int dim1 = cholR.length; - int dim2 = cholR[0].length; - for (int index = 0; index < dim1; index++) - System.arraycopy(cholR[index], 0, cholRcopy[index], 0, dim2); - } - - /*** - * This method will estimate beta and ubeta using QR decomposition. Refer to HGLM documentation appendix step 2. - * @param start_delta - * @param augXZ - * @param augZW - * @param totRandCatLevels - * @param cholRcopy - * @return - */ - public double[] calculate_all_beta(double[] start_delta, Frame augXZ, Frame augZW, int totRandCatLevels, - double[][] cholRcopy) { - // perform QR decomposition on augXZ and store R as a double[][] array, Q back in augXZ - DataInfo augXZInfo = new DataInfo(augXZ, null, true, DataInfo.TransformType.NONE, - true, false, false); - DKV.put(augXZInfo._key, augXZInfo); - // QR decomposition of augXZ, Q stored in augXZ, cholR stores transpose(R). - double[][] cholR = ArrayUtils.transpose(LinearAlgebraUtils.computeQInPlace(_job._key, augXZInfo)); - copyOver(cholR, cholRcopy); // copy over R matrix, it is lower triangle, used in model metrics calculation later. 
- Frame qTransposed = DMatrix.transpose(augXZ); // transpose Q (stored in Q) and store in qTransposed - // generate frame augZW, it is one column only - new CalculateAugZWData(_job, _dinfo, _dinfo.responseChunkId(1)).doAll(augZW); - new CalculateAugZWRandCols(_job, _state.get_priorw_wpsi(), 1, - _dinfo._adaptedFrame.numRows()).doAll(augZW); - double[][] augZWTransposed = new double[1][]; - augZWTransposed[0] = FrameUtils.asDoubles(augZW.vec(0)); // store augZW as array - // generate transpose(Q)*AugZxW and put the result into an array - DataInfo qTinfo = new DataInfo(qTransposed, null, true, DataInfo.TransformType.NONE, - true, false, false); - DKV.put(qTinfo._key, qTinfo); - Frame qTAugZW = (new BMulTask(_job._key, qTinfo, augZWTransposed).doAll(augZWTransposed.length, T_NUM, - qTinfo._adaptedFrame)).outputFrame(Key.make("Q*Augz*W"), null, null); - double[] qtaugzw = new FrameUtils.Vec2ArryTsk((int) qTAugZW.numRows()).doAll(qTAugZW.vec(0)).res; - // backward solve to get new coefficients for fixed and random columns - start_delta = LinearAlgebraUtils.backwardSolve(cholR, qtaugzw, start_delta); - cleanupHGLMMemory(new DataInfo[]{qTinfo, augXZInfo}, new Frame[]{qTransposed, qTAugZW}, null, - new Key[]{qTinfo._key, augXZInfo._key}); - return start_delta; // return newly estimated coefficients - } - - /*** - * Method to clean up memories like Frames, DataInfos and Vectors after usage. - * - * @param tempdInfo - * @param tempFrames - * @param tempVectors - */ - private void cleanupHGLMMemory(DataInfo[] tempdInfo, Frame[] tempFrames, Vec[] tempVectors, Key[] dkvKeys) { - if (tempdInfo != null) { - for (int index = 0; index < tempdInfo.length; index++) - if (tempdInfo[index] != null) - tempdInfo[index].remove(); - } - if (tempFrames != null) { - for (int index = 0; index < tempFrames.length; index++) - if (tempFrames[index] != null) - tempFrames[index].delete(); - } - if (tempVectors != null) { - for (int index = 0; index < tempVectors.length; index++) - if (tempVectors[index] != null) - tempVectors[index].remove(); - } - if (dkvKeys != null) { - for (int index = 0; index < dkvKeys.length; index++) { - if (dkvKeys[index] != null) - DKV.remove(dkvKeys[index]); - } - } - } - - private void fitHGLM() { - // figure out random columns categorical levels - int numRandCols = _parms._random_columns.length; - int totRandCatLevels = ArrayUtils.sum(_randC); - int[] cumSumRandLevels = ArrayUtils.cumsum(_randC); - // glmmmeReturnFrame is pre-allocated and stays alive for the whole HGLM model building process. Hence, instead - // of allocating and de-allocating it, I kept it around. It has _nobs+number of expanded random columns in rows. - // eta.i = Xi*beta, etao contains the old eta.i - Frame glmmmeReturnFrame = makeZeroOrOneFrame(totRandCatLevels + _dinfo._adaptedFrame.numRows(), - 6, 0, new String[]{"AugZ", "hv", "dev", "eta.i", "resid", "etao"}); - // hvDataOnly stores the shortened hv column in glmmmeReturnFrame - Frame hvDataOnly = makeZeroOrOneFrame(_dinfo._adaptedFrame.numRows(), 1, 0, new String[]{"hv_data"}); - // store column indices that contains HGLM info: response, zi, etaOld, prior_weight or response, zi, etaOld - int[] dinfoWCol = _dinfo._weights ? 
new int[]{_dinfo.responseChunkId(0), _dinfo.responseChunkId(2), - _dinfo.responseChunkId(3), _dinfo.weightChunkId()} : new int[]{_dinfo.responseChunkId(0), - _dinfo.responseChunkId(2), _dinfo.responseChunkId(3)}; - int[] wCol = new int[]{0, 2}; // in w_prior_wpsi, col 0 is prior weight, col 2 is zmi - int[] devHvColIdx = new int[]{2, 1}; // refer to column indices of dev, hv from glmmmeReturnFrame - double[] sumEtaInfo = new double[2]; // first element is sum(eta.i-etao)^2, second element: sum(eta.i^2) - long numDataRows = _dinfo._adaptedFrame.numRows(); - double[][] VC2 = new double[numRandCols][2];// store estimates and standard error of the random effects in the dispersion model - double[] VC1 = new double[2]; // store estimates and standard error of the fixed predictor in the dispersion model - int iter = 0; - // augXZ stores Ta*W as described in HGLM documentation, it is used throughout the whole fitting process - Frame augXZ = makeZeroOrOneFrame(_dinfo._adaptedFrame.numRows() + ArrayUtils.sum(_randC), - _state.beta().length + _state.ubeta().length, 0, null); - // generate weights for [X | Z] part of AugXZ and stored it in _dinfo - // generate weight for data part - CalculateW4Data calculateW4Data = new CalculateW4Data(_job, _dinfo, _parms, _randC, _state.beta(), _state.ubeta(), - _state.get_psi(), _state.get_phi(), _state.get_tau(), - _state.get_correction_HL()).doAll(_dinfo._adaptedFrame); - // generate AugXZ as [X | Z] * wdata for top half and bottom half as [0 | I]*wpsi - new DataAddW2AugXZ(_job, _dinfo, _randC).doAll(augXZ); - sumEtaInfo[0] = calculateW4Data._sumEtaDiffSq; - sumEtaInfo[1] = calculateW4Data._sumEtaSq; - _state.set_sumEtaSquareConvergence(sumEtaInfo); - GLMModel fixedModel; - GLMModel[] randModels = new GLMModel[numRandCols]; - updateProgress(null, null, null, null, null, null, - Double.NaN, Double.NaN, false, null, null); - double[][] cholR = new double[augXZ.numCols()][augXZ.numCols()]; // store R from QR decomposition of augXZ - - do { - // step 2, estimate beta, ubeta from HGLM documentation - if (_state._iter > 0) { - new CalculateW4Data(_job, _dinfo, _parms, _randC, _state.beta(), _state.ubeta(), - _state.get_psi(), _state.get_phi(), _state.get_tau(), - _state.get_correction_HL()).doAll(_dinfo._adaptedFrame); - new DataAddW2AugXZ(_job, _dinfo, _randC).doAll(augXZ); - } - sumEtaInfo = fitCoeffs(_randC, totRandCatLevels, glmmmeReturnFrame, dinfoWCol, wCol, - 5, 3, sumEtaInfo, augXZ, cholR); - // step 3, estimate init_sig_e - fixedModel = fitDataDispersion(glmmmeReturnFrame, devHvColIdx, VC1); - // step 4, estimate init_sig_u - estimateRandomCoeffCh(glmmmeReturnFrame, devHvColIdx, cumSumRandLevels, numRandCols, numDataRows, randModels, VC2); - } while (progressHGLMGLMMME(sumEtaInfo[0], sumEtaInfo[1], iter, false, fixedModel, randModels, - glmmmeReturnFrame, hvDataOnly, VC1, VC2, cholR, augXZ)); - scoreAndUpdateModelHGLM(fixedModel, randModels, glmmmeReturnFrame, hvDataOnly, VC1, VC2, sumEtaInfo[0], - sumEtaInfo[1], cholR, augXZ, true); - fixedModel.remove(); - cleanupHGLMMemory(null, new Frame[]{glmmmeReturnFrame, hvDataOnly, augXZ, _state.get_priorw_wpsi()}, null, null); - } - - /*** - * This method will estimate the characteristics of the random coefficients for each random column. 
- */ - private void estimateRandomCoeffCh(Frame returnFrame, int[] devHvColIdx, int[] cumSumRandLevels, int numRandCols, - long numDataRows, GLMModel[] gRandModels, double[][] VC2) { - long dev2UseStart = numDataRows; - int startIndex = 0; - double[] phi = _state.get_phi(); - for (int colIndex = 0; colIndex < numRandCols; colIndex++) { - Frame constXYWeight = new Frame(makeZeroOrOneFrame(cumSumRandLevels[colIndex], 3, 1, - new String[]{"response", "X", "weights"})); - DKV.put(constXYWeight._key, constXYWeight); - gRandModels[colIndex] = buildGammaGLM(returnFrame, constXYWeight, devHvColIdx, dev2UseStart, - (long) cumSumRandLevels[colIndex], true); // generate frame to build glm model - // update phi value with glmRand fitted value - double sigma2u = Math.exp(gRandModels[colIndex].coefficients().get("Intercept")); - double newPhi = sigma2u; - for (int index = startIndex; index < cumSumRandLevels[colIndex]; index++) - phi[index] = newPhi; - _state.set_phi(phi); - startIndex = cumSumRandLevels[colIndex]; - // set phi with new value in _state - dev2UseStart += cumSumRandLevels[colIndex]; - assignEstStdErr(gRandModels[colIndex], VC2[colIndex]); - cleanupHGLMMemory(null, new Frame[]{constXYWeight}, null, new Key[]{constXYWeight._key}); - gRandModels[colIndex].remove(); - } - } - - /*** - * This method estimates the init_sig_e by building a gamma GLM with response - * @param returnFrame - * @param devHvColIdx - * @param VC1 - * @return - */ - public GLMModel fitDataDispersion(Frame returnFrame, int[] devHvColIdx, double[] VC1) { - // constXYWeight stores response, weights for Steps 4, 5 of the fitting algorithm. - Frame constXYWeight = new Frame(makeZeroOrOneFrame(_dinfo._adaptedFrame.numRows(), 3, 1, - new String[]{"response", "X", "weights"})); - DKV.put(constXYWeight._key, constXYWeight); - GLMModel gfixed = buildGammaGLM(returnFrame, constXYWeight, devHvColIdx, 0, - _dinfo._adaptedFrame.numRows(), true); // build a gamma GLM - double sigma2e = Math.exp(gfixed.coefficients().get("Intercept")); // extra dispersion parameter - _state.set_tau(sigma2e); - assignEstStdErr(gfixed, VC1); - cleanupHGLMMemory(null, new Frame[]{constXYWeight}, null, new Key[]{constXYWeight._key}); - gfixed.remove(); - return gfixed; // return gamma GLM model - } - - private void assignEstStdErr(GLMModel glm, double[] VC) { - double[] stdErr = glm._output.stdErr(); - VC[0] = glm.coefficients().get("Intercept"); - VC[1] = stdErr[0]; - } - - /** - * This method will generate a training frame according to HGLM doc, build a gamma GLM model with dispersion - * parameter set to 1 if enabled and calcluate the p-value if enabled. It will return the GLM model. 
- */ - public GLMModel buildGammaGLM(Frame returnFrame, Frame constXYWeight, int[] devHvColIdx, long startRowIndex, - long numRows, boolean computePValues) { - // generate training frame constXYWeight from returnFrame - new ExtractFrameFromSourceWithProcess(returnFrame, devHvColIdx, startRowIndex, numRows).doAll(constXYWeight); - DKV.put(constXYWeight._key, constXYWeight); - boolean originalPValues = _parms._compute_p_values; // enable p value computation if needed - boolean originalDispersion = _parms._useDispersion1; - _parms._compute_p_values = computePValues; - _parms._useDispersion1 = computePValues; // set dispersion to 1 if needed - GLMModel g11 = runGLMModel(_parms._standardize, Family.gamma, Link.log, constXYWeight._key, "response", - "weights", null, computePValues); // generate gamma GLM model - _parms._compute_p_values = originalPValues; - _parms._useDispersion1 = originalDispersion; - return g11; - } + private void fitIRLSM_multinomial(Solver s) { assert _dinfo._responses == 3 : "IRLSM for multinomial needs extra information encoded in additional reponses, expected 3 response vecs, got " + _dinfo._responses; @@ -3580,7 +3064,7 @@ private void fitLBFGS() { _state._u = l1Solver._u; _state.updateState(beta, gslvr.getGradient(beta)); } else { - if (!_parms._lambda_search && _state._iter == 0 && !_parms._HGLM) + if (!_parms._lambda_search && _state._iter == 0) updateProgress(false); Result r = lbfgs.solve(gslvr, beta, _state.ginfo(), new ProgressMonitor() { @Override @@ -3746,41 +3230,37 @@ private void fitCOD() { private void fitModel() { Solver solver = (_parms._solver == Solver.AUTO) ? defaultSolver() : _parms._solver; - if (_parms._HGLM) { - fitHGLM(); - } else { - switch (solver) { - case COORDINATE_DESCENT: // fall through to IRLSM - case IRLSM: - if (multinomial.equals(_parms._family)) - fitIRLSM_multinomial(solver); - else if (ordinal.equals(_parms._family)) - fitIRLSM_ordinal_default(solver); - else if (gaussian.equals(_parms._family) && Link.identity.equals(_parms._link) && _parms._linear_constraints == null) - fitLSM(solver); // not constrained GLM - else { - if (_parms._dispersion_parameter_method.equals(ml)) - fitIRLSMML(solver); - else if (_parms._linear_constraints == null) - fitIRLSM(solver); - else - fitIRLSMCS(); // constrained GLM IRLSM - } - break; - case GRADIENT_DESCENT_LH: - case GRADIENT_DESCENT_SQERR: - if (ordinal.equals(_parms._family)) - fitIRLSM_ordinal_default(solver); - break; - case L_BFGS: - fitLBFGS(); - break; - case COORDINATE_DESCENT_NAIVE: - fitCOD(); - break; - default: - throw H2O.unimpl(); - } + switch (solver) { + case COORDINATE_DESCENT: // fall through to IRLSM + case IRLSM: + if (multinomial.equals(_parms._family)) + fitIRLSM_multinomial(solver); + else if (ordinal.equals(_parms._family)) + fitIRLSM_ordinal_default(solver); + else if (gaussian.equals(_parms._family) && Link.identity.equals(_parms._link) && _parms._linear_constraints == null) + fitLSM(solver); // not constrained GLM + else { + if (_parms._dispersion_parameter_method.equals(ml)) + fitIRLSMML(solver); + else if (_parms._linear_constraints == null) + fitIRLSM(solver); + else + fitIRLSMCS(); // constrained GLM IRLSM + } + break; + case GRADIENT_DESCENT_LH: + case GRADIENT_DESCENT_SQERR: + if (ordinal.equals(_parms._family)) + fitIRLSM_ordinal_default(solver); + break; + case L_BFGS: + fitLBFGS(); + break; + case COORDINATE_DESCENT_NAIVE: + fitCOD(); + break; + default: + throw H2O.unimpl(); } // Make sure if we set dispersion for Tweedie p and phi estimation even without 
calculating p values @@ -3842,7 +3322,7 @@ else if (_parms._linear_constraints == null) double[][] inv = chol.getInv(); if (_parms._influence != null) { _cholInvInfluence = new double[inv.length][inv.length]; - GamUtils.copy2DArray(inv, _cholInvInfluence); + copy2DArray(inv, _cholInvInfluence); ArrayUtils.mult(_cholInvInfluence, _parms._obj_reg); g.mul(1.0/_parms._obj_reg); } @@ -3863,234 +3343,7 @@ else if (_parms._linear_constraints == null) private long timeSinceLastScoring() { return System.currentTimeMillis() - _lastScore; } - - /*** - * performm score and update for HGLM models. However, here, there is no scoring, only metrics update. - * @param fixedModel - * @param randModels - * @param glmmmeReturns - */ - private void scoreAndUpdateModelHGLM(GLMModel fixedModel, GLMModel[] randModels, Frame glmmmeReturns, - Frame hvDataOnly, double[] VC1, double[][] VC2, double sumDiff2, - double convergence, double[][] cholR, Frame augXZ, boolean compute_hlik) { - Log.info(LogMsg("Scoring after " + timeSinceLastScoring() + "ms")); - long t1 = System.currentTimeMillis(); - Frame train = DKV.getGet(_parms._train); // need to keep this frame to get scoring metrics back - String[] domain = new String[]{"HGLM_" + _parms._family.toString() + "_" + _parms._rand_family.toString()}; - ModelMetrics.MetricBuilder mb = _model.makeMetricBuilder(domain); - ModelMetricsHGLM.MetricBuilderHGLM mmHGLMBd = (ModelMetricsHGLM.MetricBuilderHGLM) (((GLMMetricBuilder) mb)._metricBuilder); - updateSimpleHGLMMetrics(fixedModel, randModels, VC1, VC2, mmHGLMBd, sumDiff2, convergence); - calBad(glmmmeReturns, hvDataOnly, mmHGLMBd); - calseFeseRedfReFe(cholR, mmHGLMBd, augXZ); - if (_parms._calc_like && compute_hlik) { // computation/memory intensive, only calculated it if needed - calHlikStuff(mmHGLMBd, glmmmeReturns, augXZ); - _state.set_likelihoodInfo(mmHGLMBd._hlik, mmHGLMBd._pvh, mmHGLMBd._pbvh, mmHGLMBd._caic); - } - mb.makeModelMetrics(_model, train, _dinfo._adaptedFrame, null); // add generated metric to DKV - scorePostProcessing(train, t1); - } - - private void calHlikStuff(ModelMetricsHGLM.MetricBuilderHGLM mmHGLMBd, Frame glmmmeReturns, Frame augXZ) { - calculateNewWAugXZ(augXZ, _randC); - double cond_hlik = calculatecAIC(mmHGLMBd, glmmmeReturns); - Frame hlikH = formHMatrix(augXZ); - mmHGLMBd._hlik = cond_hlik; - mmHGLMBd._pvh = cond_hlik - 0.5 * calcLogDeterminant(hlikH); // logdet(H/(2pi)) is correct - Frame hlikA = formAMatrix(hlikH, augXZ); - mmHGLMBd._pbvh = cond_hlik - 0.5 * calcLogDeterminant(hlikA); - cleanupHGLMMemory(null, new Frame[]{hlikH, hlikA}, null, null); - } - - private double calcLogDeterminant(Frame frame) { // calculate the log determinant for frame/(2*Math.PI) - SVDModel.SVDParameters parms = new SVDModel.SVDParameters(); - parms._train = frame._key; - parms._transform = DataInfo.TransformType.NONE; - parms._svd_method = SVDParameters.Method.GramSVD; - parms._save_v_frame = false; - parms._nv = frame.numCols(); - SVDModel model = new SVD(parms).trainModel().get(); - double[] singular_values = model._output._d; - double cond = ArrayUtils.minValue(singular_values) / ArrayUtils.maxValue(singular_values); - if (cond < 0.00000001) // value copied from R - warn("pbvh", "The Hessian used for computing pbvh is ill-conditioned."); - double sumLog = 0.0; - double log2Pi = Math.log(2 * Math.PI); - for (int index = 0; index < parms._nv; index++) - sumLog += Math.log(singular_values[index]) - log2Pi; - model.delete(); // clean up model before proceeding. 
- return sumLog; - } - - private Frame formAMatrix(Frame hlikH, Frame augXZ) { - Frame dataFrame = getXW1(augXZ); - DataInfo hlikAInfo = new DataInfo(dataFrame, null, true, DataInfo.TransformType.NONE, - true, false, false); - Gram.GramTask dgram = new Gram.GramTask(_job._key, hlikAInfo, false, false).doAll(hlikAInfo._adaptedFrame); - Frame leftUp = new ArrayUtils().frame(dgram._gram.getXX()); - Frame augzW1 = getaugZW1(augXZ); - Frame tX = DMatrix.transpose(dataFrame); - int expandedRandColNum = ArrayUtils.sum(_randC); - tX.add(new Frame(makeZeroOrOneFrame(tX.numRows(), expandedRandColNum, 0, null))); - DKV.put(augzW1._key, augzW1); - new LinearAlgebraUtils.BMulTaskMatrices(augzW1).doAll(tX); - Frame tXW1z = tX.subframe(tX.numCols() - expandedRandColNum, tX.numCols()); - leftUp.add(tXW1z); - Frame leftDown = DMatrix.transpose(tXW1z); - leftDown._key = Key.make(); - leftDown.add(hlikH); - leftDown.setNames(leftUp.names()); - DKV.put(leftDown._key, leftDown); - DKV.put(leftUp._key, leftUp); - String tree = String.format("(rbind %s %s)", leftUp._key, leftDown._key); - Val val = Rapids.exec(tree); - Frame amatrix = val.getFrame(); - amatrix._key = Key.make(); - DKV.put(amatrix._key, amatrix); - cleanupHGLMMemory(new DataInfo[]{hlikAInfo}, new Frame[]{dataFrame, leftUp, augzW1, tX, tXW1z, leftDown}, null, null); - return amatrix; - } - - private Frame getXW1(Frame augXZ) { - int numDataCols = augXZ.numCols() - ArrayUtils.sum(_randC); - int[] colIndices = new int[numDataCols]; - for (int index = 0; index < numDataCols; index++) - colIndices[index] = index; - Frame dataFrame = new Frame(makeZeroOrOneFrame(_nobs, numDataCols, 0, null)); - new CopyPartsOfFrame(augXZ, colIndices, colIndices, _nobs).doAll(dataFrame); - return dataFrame; - } - - - private Frame formHMatrix(Frame augXZ) { - Frame augZW1 = getaugZW1(augXZ); - DataInfo augZW1Info = new DataInfo(augZW1, null, true, DataInfo.TransformType.NONE, - true, false, false); - Gram.GramTask dgram = new Gram.GramTask(_job._key, augZW1Info, false, false).doAll(augZW1Info._adaptedFrame); - Frame wranddata = _state.get_priorw_wpsi(); - double[][] W2 = null; - W2 = ArrayUtils.transpose(new ArrayUtils.FrameToArray(1, 1, wranddata.numRows(), W2).doAll(wranddata).getArray()); - ArrayUtils.mult(W2[0], W2[0]); // generate W2 square - dgram._gram.addDiag(W2[0]); - cleanupHGLMMemory(new DataInfo[]{augZW1Info}, new Frame[]{augZW1, wranddata}, null, null); - return new ArrayUtils().frame(dgram._gram.getXX()); - } - - private Frame getaugZW1(Frame augXZ) { - int numRandExpandedCols = ArrayUtils.sum(_randC); - int randIndexStart = augXZ.numCols() - numRandExpandedCols; - Frame augZW1 = new Frame(makeZeroOrOneFrame(_nobs, numRandExpandedCols, 0, null)); - int[] colIndices = new int[numRandExpandedCols]; - int colNum = augXZ.numCols(); - for (int index = randIndexStart; index < colNum; index++) { - colIndices[index - randIndexStart] = index; - } - new CopyPartsOfFrame(augXZ, null, colIndices, _nobs).doAll(augZW1); - return augZW1; - } - - private double calculatecAIC(ModelMetricsHGLM.MetricBuilderHGLM mmHGLMBd, Frame glmmmeReturns) { - Frame hv_dev_pw = new Frame(makeZeroOrOneFrame(_nobs, 2, 0, new String[]{"hv", "dev"})); - new CopyPartsOfFrame(glmmmeReturns, new int[]{0, 1}, new int[]{1, 2}, _nobs).doAll(hv_dev_pw); - if (_dinfo._weights) - hv_dev_pw.add("weights", _dinfo._adaptedFrame.vec(_dinfo.weightChunkId())); // frame could have 2 or 3 columns - HelpercAIC calcAIC = new HelpercAIC(_dinfo._weights, mmHGLMBd._varfix).doAll(hv_dev_pw); - double constance = 
-0.5 * calcAIC._constT; - double cond_hlik = constance - 0.5 * calcAIC._devOphi; - mmHGLMBd._caic = -2 * cond_hlik + 2 * calcAIC._p; - // add contribution from lfv and sum(log(abs(du_dv))) which is zero for Gaussian - double[] lfvals = lfv_du_dv(_parms._rand_family, _parms._rand_link, _state.get_phi(), _state.ubeta()); - cond_hlik += lfvals[0] + lfvals[1]; - hv_dev_pw.remove(); - return cond_hlik; - } - - public double[] lfv_du_dv(Family[] family, Link[] link, double[] phi, double[] u) { - double[] vals = new double[2]; // first element stores lfv, second for du_dv - int numRandCols = _randC.length; - for (int k = 0; k < numRandCols; k++) { // go over each random column - Family tfamily = family[k]; - Link tlink = tfamily.defaultLink; - if (!(link == null)) - tlink = link[k]; - GLMWeightsFun glmfun = new GLMWeightsFun(tfamily, tlink, 0, 0, 0, 1, false); - int colLength = _randC[k]; - for (int col = 0; col < colLength; col++) { - int index = k * colLength + col; - if (Family.gaussian.equals(tfamily)) { // only implementation now - if (Link.identity.equals(tlink)) { - vals[1] += Math.log(Math.abs(glmfun.linkInvDeriv(glmfun.link(u[index])))); - vals[0] -= Math.log(Math.sqrt(2 * Math.PI)) + Math.log(Math.sqrt(phi[index])) + u[index] * u[index] / (2 * phi[index]); - } - } - } - } - return vals; - } - - /*** - * This method will calculate the HGLM metrics seFe, seRe, dfReFe - * @param cholR - */ - private void calseFeseRedfReFe(double[][] cholR, ModelMetricsHGLM.MetricBuilderHGLM mmHGLMBd, Frame augXZ) { - double[][] RTRInv = LinearAlgebraUtils.chol2Inv(cholR); // should be the transpose but care about the diagonal. - double[] seFeRe = LinearAlgebraUtils.sqrtDiag(RTRInv); - int sefelen = _state.beta().length; - int serelen = _state.ubeta().length; - if (mmHGLMBd._sefe == null) - mmHGLMBd._sefe = new double[sefelen]; - System.arraycopy(seFeRe, 0, mmHGLMBd._sefe, 0, sefelen); - if (mmHGLMBd._sere == null) - mmHGLMBd._sere = new double[serelen]; - System.arraycopy(seFeRe, sefelen, mmHGLMBd._sere, 0, serelen); - Frame augZ = new Frame(makeZeroOrOneFrame(_nobs, augXZ.numCols(), 0, - null)); - new CopyPartsOfFrame(augXZ, null, null, _nobs).doAll(augZ); - DataInfo augzInfo = new DataInfo(augZ, null, true, DataInfo.TransformType.NONE, - true, false, false); - Gram.GramTask dgram = new Gram.GramTask(_job._key, augzInfo, false, false).doAll(augzInfo._adaptedFrame); - double[][] gramMatrix = dgram._gram.getXX(); - double pd = ArrayUtils.sum(ArrayUtils.diagArray(LinearAlgebraUtils.matrixMultiply(RTRInv, gramMatrix))); - mmHGLMBd._dfrefe = Math.round(_nobs - pd); - cleanupHGLMMemory(new DataInfo[]{augzInfo}, new Frame[]{augZ}, null, null); - } - - private void calBad(Frame glmmeReturns, Frame hvFrameOnly, ModelMetricsHGLM.MetricBuilderHGLM mmHGLMBd) { - new CopyPartsOfFrame(glmmeReturns, new int[]{0}, new int[]{1}, _nobs).doAll(hvFrameOnly); - Vec vec = hvFrameOnly.vec(0); - double sigma6 = vec.mean() + 6 * vec.sigma(); - double maxVec = vec.max(); - if (maxVec > sigma6) { - mmHGLMBd._bad = (new FindMaxIndex(0, maxVec).doAll(hvFrameOnly))._maxIndex; - } else - mmHGLMBd._bad = -1; - } - - private void updateSimpleHGLMMetrics(GLMModel fixedModel, GLMModel[] randModels, double[] VC1, double[][] VC2, - ModelMetricsHGLM.MetricBuilderHGLM mmHGLMBd, double sumDiff2, double convergence) { - mmHGLMBd.updateCoeffs(_state.beta(), _state.ubeta()); // update coefficients - mmHGLMBd.updateSummVC(VC1, VC2, _randC); // update summVC1 and summVC2 - mmHGLMBd._varfix = Math.exp(fixedModel.coefficients().get("Intercept")); // 
update sigmas for coefficients - int randColNum = mmHGLMBd._randc.length; - if (mmHGLMBd._varranef == null) - mmHGLMBd._varranef = new double[randColNum]; - boolean converged = true; - double sumSigma2u = 0; - for (int index = 0; index < randColNum; index++) { - mmHGLMBd._varranef[index] = Math.exp(randModels[index].coefficients().get("Intercept")); - sumSigma2u += mmHGLMBd._varranef[index]; - } - for (int index = 0; index < randColNum; index++) { - if ((mmHGLMBd._varranef[index] / (sumSigma2u + mmHGLMBd._varfix)) > 0.9999) { // 0.9999 from R - converged = false; - break; - } - } - mmHGLMBd._converge = converged && (_state._iter < _parms._max_iterations); - mmHGLMBd._sumetadiffsquare = sumDiff2; - mmHGLMBd._convergence = convergence; - mmHGLMBd._iterations = _state._iter; - mmHGLMBd._nobs = _nobs; - } + private void scoreAndUpdateModel() { // compute full validation on train and test @@ -4160,17 +3413,12 @@ private void scorePostProcessing(Frame train, long t1) { } if (_parms._lambda_search) _model._output._scoring_history = _lambdaSearchScoringHistory.to2dTable(); - else if (_parms._HGLM) - _model._output._scoring_history = _scoringHistory.to2dTableHGLM(); else _model._output._scoring_history = _scoringHistory.to2dTable(_parms, _xval_deviances_generate_SH, _xval_sd_generate_SH); _model.update(_job._key); - if (_parms._HGLM) - _model.generateSummaryHGLM(_parms._train, _state._iter); - else - _model.generateSummary(_parms._train, _state._iter); + _model.generateSummary(_parms._train, _state._iter); _lastScore = System.currentTimeMillis(); long scoringTime = System.currentTimeMillis() - t1; _scoringInterval = Math.max(_scoringInterval, 20 * scoringTime); // at most 5% overhead for scoring @@ -4220,14 +3468,12 @@ protected Submodel computeSubmodel(int i, double lambda, double nullDevTrain, do else _model.addSubmodel(i, sm = new Submodel(lambda, _state.alpha(), getNullBeta(), _state._iter, nullDevTrain, nullDevValid, _betaInfo.totalBetaLength(), null, false)); - } else { // this is also the path for HGLM model + } else { if (continueFromPreviousSubmodel) { sm = _model._output._submodels[i]; } else { sm = new Submodel(lambda, _state.alpha(), _state.beta(), _state._iter, -1, -1, _betaInfo.totalBetaLength(), _state.zValues(), _state.dispersionEstimated());// restart from last run - if (_parms._HGLM) // add random coefficients for random effects/columns - sm.ubeta = Arrays.copyOf(_state.ubeta(), _state.ubeta().length); _model.addSubmodel(i, sm); } if (_insideCVCheck && _parms._generate_scoring_history && !Solver.L_BFGS.equals(_parms._solver) && @@ -4248,10 +3494,9 @@ protected Submodel computeSubmodel(int i, double lambda, double nullDevTrain, do addGLMVec(vecs, true, _dinfo); } } - if (!_parms._HGLM) { // only perform this when HGLM is not used. 
- if (!_checkPointFirstIter) - _state.setLambda(lambda); - } + + if (!_checkPointFirstIter) + _state.setLambda(lambda); checkMemoryFootPrint(_state.activeData()); do { @@ -4263,31 +3508,25 @@ protected Submodel computeSubmodel(int i, double lambda, double nullDevTrain, do fitModel(); } while (!_state.checkKKTs()); Log.info(LogMsg("solution has " + ArrayUtils.countNonzeros(_state.beta()) + " nonzeros")); - if (_parms._HGLM) { - sm = new Submodel(lambda, _state.alpha(), _state.beta(), _state._iter, nullDevTrain, nullDevValid, - _betaInfo.totalBetaLength(), _state.zValues(), _state.dispersionEstimated()); - sm.ubeta = Arrays.copyOf(_state.ubeta(), _state.ubeta().length); - _model.updateSubmodel(i, sm); - } else { - double trainDev = _state.deviance() / _nobs; - double validDev = Double.NaN; // calculated from validation dataset below if present - if (_validDinfo != null) { // calculate deviance for validation set and save as testDev - if (ordinal.equals(_parms._family)) - validDev = new GLMResDevTaskOrdinal(_job._key, _validDinfo, _dinfo.denormalizeBeta(_state.beta()), _nclass).doAll(_validDinfo._adaptedFrame).avgDev(); - else - validDev = multinomial.equals(_parms._family) - ? new GLMResDevTaskMultinomial(_job._key, _validDinfo, _dinfo.denormalizeBeta(_state.beta()), _nclass).doAll(_validDinfo._adaptedFrame).avgDev() - : new GLMResDevTask(_job._key, _validDinfo, _parms, _dinfo.denormalizeBeta(_state.beta())).doAll(_validDinfo._adaptedFrame).avgDev(); - } - Log.info(LogMsg("train deviance = " + trainDev + ", valid deviance = " + validDev)); - double xvalDev = ((_xval_deviances == null) || (_xval_deviances.length <= i)) ? -1 : _xval_deviances[i]; - double xvalDevSE = ((_xval_sd == null) || (_xval_deviances.length <= i)) ? -1 : _xval_sd[i]; - if (_parms._lambda_search) - _lambdaSearchScoringHistory.addLambdaScore(_state._iter, ArrayUtils.countNonzeros(_state.beta()), - _state.lambda(), trainDev, validDev, xvalDev, xvalDevSE, _state.alpha()); // add to scoring history - _model.updateSubmodel(i, sm = new Submodel(_state.lambda(), _state.alpha(), _state.beta(), _state._iter, - trainDev, validDev, _betaInfo.totalBetaLength(), _state.zValues(), _state.dispersionEstimated())); + double trainDev = _state.deviance() / _nobs; + double validDev = Double.NaN; // calculated from validation dataset below if present + if (_validDinfo != null) { // calculate deviance for validation set and save as testDev + if (ordinal.equals(_parms._family)) + validDev = new GLMResDevTaskOrdinal(_job._key, _validDinfo, _dinfo.denormalizeBeta(_state.beta()), _nclass).doAll(_validDinfo._adaptedFrame).avgDev(); + else + validDev = multinomial.equals(_parms._family) + ? new GLMResDevTaskMultinomial(_job._key, _validDinfo, _dinfo.denormalizeBeta(_state.beta()), _nclass).doAll(_validDinfo._adaptedFrame).avgDev() + : new GLMResDevTask(_job._key, _validDinfo, _parms, _dinfo.denormalizeBeta(_state.beta())).doAll(_validDinfo._adaptedFrame).avgDev(); } + Log.info(LogMsg("train deviance = " + trainDev + ", valid deviance = " + validDev)); + double xvalDev = ((_xval_deviances == null) || (_xval_deviances.length <= i)) ? -1 : _xval_deviances[i]; + double xvalDevSE = ((_xval_sd == null) || (_xval_deviances.length <= i)) ? 
-1 : _xval_sd[i]; + if (_parms._lambda_search) + _lambdaSearchScoringHistory.addLambdaScore(_state._iter, ArrayUtils.countNonzeros(_state.beta()), + _state.lambda(), trainDev, validDev, xvalDev, xvalDevSE, _state.alpha()); // add to scoring history + _model.updateSubmodel(i, sm = new Submodel(_state.lambda(), _state.alpha(), _state.beta(), _state._iter, + trainDev, validDev, _betaInfo.totalBetaLength(), _state.zValues(), _state.dispersionEstimated())); + } return sm; } @@ -4377,13 +3616,8 @@ private void doCompute() { Vec[] vecs = genGLMVectors(_dinfo, getNullBeta()); addGLMVec(vecs, false, _dinfo); } - - if (_parms._HGLM) { // add w, augZ, etaOld and random columns to response for easy access inside _dinfo - addWdataZiEtaOld2Response(); - } else { // only need these for non HGLM - _ginfoStart = GLMUtils.copyGInfo(_state.ginfo()); - _betaDiffStart = _state.getBetaDiff(); - } + _ginfoStart = GLMUtils.copyGInfo(_state.ginfo()); + _betaDiffStart = _state.getBetaDiff(); double oldDevTrain = nullDevTrain; double oldDevTest = nullDevValid; @@ -4396,7 +3630,7 @@ private void doCompute() { _model._output._submodels = null; // null out submodel only for single alpha/lambda values } - if (!_parms._lambda_search & !_parms._HGLM) + if (!_parms._lambda_search) updateProgress(false); // alpha, lambda search loop @@ -4412,14 +3646,14 @@ private void doCompute() { _model._output._lambda_array_size = _parms._lambda.length; for (int alphaInd = alphaStart; alphaInd < _parms._alpha.length; alphaInd++) { _state.setAlpha(_parms._alpha[alphaInd]); // loop through the alphas - if ((!_parms._HGLM) && (alphaInd > 0) && !_checkPointFirstIter) // no need for cold start during the first iteration + if ((alphaInd > 0) && !_checkPointFirstIter) // no need for cold start during the first iteration coldStart(devHistoryTrain, devHistoryTest); // reset beta, lambda, currGram for (int i = lambdaStart; i < _parms._lambda.length; ++i) { // for lambda search, can quit before it is done if (_job.stop_requested() || (timeout() && _model._output._submodels.length > 0)) break; //need at least one submodel on timeout to avoid issues. if (_parms._max_iterations != -1 && _state._iter >= _parms._max_iterations) break; // iterations accumulate across all lambda/alpha values when coldstart = false - if ((!_parms._HGLM && (_parms._cold_start || (!_parms._lambda_search && _parms._cold_start))) && (i > 0) + if ((_parms._cold_start || (!_parms._lambda_search && _parms._cold_start)) && (i > 0) && !_checkPointFirstIter) // default: cold_start for non lambda_search coldStart(devHistoryTrain, devHistoryTest); Submodel sm = computeSubmodel(submodelCount, _parms._lambda[i], nullDevTrain, nullDevValid); @@ -4488,12 +3722,10 @@ private void doCompute() { _model.setVcov(_vcov); _model.update(_job._key); } - if (!_parms._HGLM) { // no need to do for HGLM - _model._finalScoring = true; // enables likelihood calculation while scoring - scoreAndUpdateModel(); - _model._finalScoring = false; // avoid calculating likelihood in case of further updates - } - + _model._finalScoring = true; // enables likelihood calculation while scoring + scoreAndUpdateModel(); + _model._finalScoring = false; // avoid calculating likelihood in case of further updates + if (dfbetas.equals(_parms._influence)) genRID(); @@ -4632,32 +3864,6 @@ private void checkCoeffsBounds() { throw new H2OFailException("\n"+errorMessage.toString()); } - /*** - * Internal H2O method. Do not use. 
- * This method will add three more columns to _dinfo.response for easy access later - * - column 1: wdata, calculated weight for fixed columns/effects - * - column 2: zi, intermediate values - * - column 3: eta = X*beta, intermediate values - */ - public void addWdataZiEtaOld2Response() { // attach wdata, zi, eta to response for HGLM - int moreColnum = 3 + _parms._random_columns.length; - Vec[] vecs = _dinfo._adaptedFrame.anyVec().makeZeros(moreColnum); - String[] colNames = new String[moreColnum]; - colNames[0] = "wData"; // store weight w for data rows only - colNames[1] = "zi"; - colNames[2] = "etaOld"; - int[] randColIndices = _parms._random_columns; - for (int index = 3; index < moreColnum; index++) { - colNames[index] = _parms.train().name(index - 3); - vecs[index] = _parms.train().vec(randColIndices[index - 3]).makeCopy(); - } - _dinfo.addResponse(colNames, vecs); - for (int index = 0; index < moreColnum; index++) { - Scope.untrack(vecs[index]._key); - removeLater(vecs[index]._key); - } - } - @Override public void onCompletion(CountedCompleter caller) { doCleanup(); @@ -4695,20 +3901,6 @@ public boolean progress(double[] beta, GradientInfo ginfo) { } } - public boolean progressHGLMGLMMME(double sumDiff2, double sumeta2, int iteration, boolean atGLMMME, GLMModel - fixedModel, GLMModel[] randModels, Frame glmmmeReturns, Frame hvDataOnly, double[] VC1, double[][] VC2, - double[][] cholR, Frame augZ) { - boolean converged = !_earlyStopEnabled && (sumDiff2 < (_parms._objective_epsilon * sumeta2)); - if (atGLMMME) { - _state._iterHGLM_GLMMME++; - } else { - _state._iter++; - updateProgress(fixedModel, randModels, glmmmeReturns, hvDataOnly, VC1, VC2, sumDiff2, - sumDiff2 / sumeta2, true, cholR, augZ); - } - return !stop_requested() && !converged && (iteration < _parms._max_iterations) && !_earlyStop; - } - public boolean progress(double[] beta, double likelihood) { _state._iter++; _state.updateState(beta, likelihood); @@ -4723,19 +3915,6 @@ public boolean progress(double[] beta, double likelihood) { } private transient long _scoringInterval = SCORING_INTERVAL_MSEC; - - protected void updateProgress(GLMModel fixedModel, GLMModel[] randModels, Frame glmmmeReturns, Frame hvDataOnly, - double[] VC1, double[][] VC2, double sumDiff2, double convergence, boolean canScore, - double[][] cholR, Frame augXZ) { - _scoringHistory.addIterationScore(_state._iter, _state._sumEtaSquareConvergence); - if (canScore && (_parms._score_each_iteration || timeSinceLastScoring() > _scoringInterval || - ((_parms._score_iteration_interval > 0) && ((_state._iter % _parms._score_iteration_interval) == 0)))) { - _model.update(_state.expandBeta(_state.beta()), _state.ubeta(), -1, -1, _state._iter); - scoreAndUpdateModelHGLM(fixedModel, randModels, glmmmeReturns, hvDataOnly, VC1, VC2, sumDiff2, convergence, - cholR, augXZ, false); - _earlyStop = _earlyStopEnabled && updateEarlyStop(); - } - } // update user visible progress protected void updateProgress(boolean canScore) { assert !_parms._lambda_search || _parms._generate_scoring_history; diff --git a/h2o-algos/src/main/java/hex/glm/GLMMetricBuilder.java b/h2o-algos/src/main/java/hex/glm/GLMMetricBuilder.java index 32e9b0855eb7..6e207e9353c5 100644 --- a/h2o-algos/src/main/java/hex/glm/GLMMetricBuilder.java +++ b/h2o-algos/src/main/java/hex/glm/GLMMetricBuilder.java @@ -5,7 +5,6 @@ import hex.ModelMetricsBinomial.MetricBuilderBinomial; import hex.ModelMetricsBinomialGLM.ModelMetricsMultinomialGLM; import hex.ModelMetricsBinomialGLM.ModelMetricsOrdinalGLM; -import 
hex.ModelMetricsHGLM.MetricBuilderHGLM; import hex.ModelMetricsMultinomial.MetricBuilderMultinomial; import hex.ModelMetricsOrdinal.MetricBuilderOrdinal; import hex.ModelMetricsRegression.MetricBuilderRegression; @@ -18,7 +17,7 @@ import water.util.ArrayUtils; import water.util.MathUtils; -;import java.util.Arrays; +import java.util.Arrays; /** * Class for GLMValidation. @@ -51,28 +50,24 @@ public GLMMetricBuilder(String[] domain, double [] ymu, GLMWeightsFun glmf, int : Arrays.asList(Family.multinomial, Family.gaussian, Family.binomial, Family.quasibinomial, Family.fractionalbinomial, Family.poisson, Family.negativebinomial, Family.gamma, Family.tweedie) .contains(_glmf._family); - if(_computeMetrics) { - if (domain!=null && domain.length==1 && domain[0].contains("HGLM")) { - _metricBuilder = new MetricBuilderHGLM(domain); - } else { - switch (_glmf._family) { - case binomial: - case quasibinomial: - case fractionalbinomial: - _metricBuilder = new MetricBuilderBinomial(domain); - break; - case multinomial: - _metricBuilder = new MetricBuilderMultinomial(domain.length, domain, aucType); - ((MetricBuilderMultinomial) _metricBuilder)._priorDistribution = ymu; - break; - case ordinal: - _metricBuilder = new MetricBuilderOrdinal(domain.length, domain); - ((MetricBuilderOrdinal) _metricBuilder)._priorDistribution = ymu; - break; - default: - _metricBuilder = new MetricBuilderRegression(); - break; - } + if (_computeMetrics) { + switch (_glmf._family) { + case binomial: + case quasibinomial: + case fractionalbinomial: + _metricBuilder = new MetricBuilderBinomial(domain); + break; + case multinomial: + _metricBuilder = new MetricBuilderMultinomial(domain.length, domain, aucType); + ((MetricBuilderMultinomial) _metricBuilder)._priorDistribution = ymu; + break; + case ordinal: + _metricBuilder = new MetricBuilderOrdinal(domain.length, domain); + ((MetricBuilderOrdinal) _metricBuilder)._priorDistribution = ymu; + break; + default: + _metricBuilder = new MetricBuilderRegression(); + break; } } } @@ -222,45 +217,34 @@ protected void computeAIC(GLMModel gm) { } } - @Override public ModelMetrics makeModelMetrics(Model m, Frame f, Frame adaptedFrame, Frame preds) { + @Override + public ModelMetrics makeModelMetrics(Model m, Frame f, Frame adaptedFrame, Frame preds) { GLMModel gm = (GLMModel) m; - if (!gm._parms._HGLM) - computeAIC(gm); + computeAIC(gm); ModelMetrics metrics = _metricBuilder.makeModelMetrics(gm, f, null, null); - if (gm._parms._HGLM) { // HGLM - ModelMetricsHGLM.MetricBuilderHGLM metricsBDHGLM = (ModelMetricsHGLM.MetricBuilderHGLM) _metricBuilder; - metrics = new ModelMetricsHGLMGaussianGaussian(m, f, metricsBDHGLM._nobs, 0, - ((ModelMetricsHGLM) metrics)._domain, 0, - metrics._custom_metric, metricsBDHGLM._sefe, metricsBDHGLM._sere, metricsBDHGLM._varfix, - metricsBDHGLM._varranef, metricsBDHGLM._converge,metricsBDHGLM._dfrefe, metricsBDHGLM._summvc1, - metricsBDHGLM._summvc2,metricsBDHGLM._hlik, metricsBDHGLM._pvh, metricsBDHGLM._pbvh, metricsBDHGLM._caic, - metricsBDHGLM._bad, metricsBDHGLM._sumetadiffsquare, metricsBDHGLM._convergence, metricsBDHGLM._randc, - metricsBDHGLM._fixef, metricsBDHGLM._ranef, metricsBDHGLM._iterations); - } else { - if (_glmf._family == Family.binomial || _glmf._family == Family.quasibinomial || - _glmf._family == Family.fractionalbinomial) { - ModelMetricsBinomial metricsBinommial = (ModelMetricsBinomial) metrics; - GainsLift gl = null; - if (preds != null) { - Vec resp = f.vec(m._parms._response_column); - Vec weights = f.vec(m._parms._weights_column); - 
if (resp != null && Family.fractionalbinomial != _glmf._family) { // don't calculate for frac binomial - gl = new GainsLift(preds.lastVec(), resp, weights); - gl._groups = m._parms._gainslift_bins; - gl.exec(m._output._job); - } + if (_glmf._family == Family.binomial || _glmf._family == Family.quasibinomial || + _glmf._family == Family.fractionalbinomial) { + ModelMetricsBinomial metricsBinommial = (ModelMetricsBinomial) metrics; + GainsLift gl = null; + if (preds != null) { + Vec resp = f.vec(m._parms._response_column); + Vec weights = f.vec(m._parms._weights_column); + if (resp != null && Family.fractionalbinomial != _glmf._family) { // don't calculate for frac binomial + gl = new GainsLift(preds.lastVec(), resp, weights); + gl._groups = m._parms._gainslift_bins; + gl.exec(m._output._job); } - metrics = new ModelMetricsBinomialGLM(m, f, metrics._nobs, metrics._MSE, _domain, metricsBinommial._sigma, metricsBinommial._auc, metricsBinommial._logloss, residualDeviance(), null_devince, _aic, nullDOF(), resDOF(), gl, _customMetric, _log_likelihood); - } else if (_glmf._family == Family.multinomial) { - ModelMetricsMultinomial metricsMultinomial = (ModelMetricsMultinomial) metrics; - metrics = new ModelMetricsMultinomialGLM(m, f, metricsMultinomial._nobs, metricsMultinomial._MSE, metricsMultinomial._domain, metricsMultinomial._sigma, metricsMultinomial._cm, metricsMultinomial._hit_ratios, metricsMultinomial._logloss, residualDeviance(), null_devince, _aic, nullDOF(), resDOF(), metricsMultinomial._auc, _customMetric, _log_likelihood); - } else if (_glmf._family == Family.ordinal) { // ordinal should have a different resDOF() - ModelMetricsOrdinal metricsOrdinal = (ModelMetricsOrdinal) metrics; - metrics = new ModelMetricsOrdinalGLM(m, f, metricsOrdinal._nobs, metricsOrdinal._MSE, metricsOrdinal._domain, metricsOrdinal._sigma, metricsOrdinal._cm, metricsOrdinal._hit_ratios, metricsOrdinal._logloss, residualDeviance(), null_devince, _aic, nullDOF(), resDOF(), _customMetric, _log_likelihood); - } else { - ModelMetricsRegression metricsRegression = (ModelMetricsRegression) metrics; - metrics = new ModelMetricsRegressionGLM(m, f, metricsRegression._nobs, metricsRegression._MSE, metricsRegression._sigma, metricsRegression._mean_absolute_error, metricsRegression._root_mean_squared_log_error, residualDeviance(), residualDeviance() / _wcount, null_devince, _aic, nullDOF(), resDOF(), _customMetric, _log_likelihood); } + metrics = new ModelMetricsBinomialGLM(m, f, metrics._nobs, metrics._MSE, _domain, metricsBinommial._sigma, metricsBinommial._auc, metricsBinommial._logloss, residualDeviance(), null_devince, _aic, nullDOF(), resDOF(), gl, _customMetric, _log_likelihood); + } else if (_glmf._family == Family.multinomial) { + ModelMetricsMultinomial metricsMultinomial = (ModelMetricsMultinomial) metrics; + metrics = new ModelMetricsMultinomialGLM(m, f, metricsMultinomial._nobs, metricsMultinomial._MSE, metricsMultinomial._domain, metricsMultinomial._sigma, metricsMultinomial._cm, metricsMultinomial._hit_ratios, metricsMultinomial._logloss, residualDeviance(), null_devince, _aic, nullDOF(), resDOF(), metricsMultinomial._auc, _customMetric, _log_likelihood); + } else if (_glmf._family == Family.ordinal) { // ordinal should have a different resDOF() + ModelMetricsOrdinal metricsOrdinal = (ModelMetricsOrdinal) metrics; + metrics = new ModelMetricsOrdinalGLM(m, f, metricsOrdinal._nobs, metricsOrdinal._MSE, metricsOrdinal._domain, metricsOrdinal._sigma, metricsOrdinal._cm, metricsOrdinal._hit_ratios, 
metricsOrdinal._logloss, residualDeviance(), null_devince, _aic, nullDOF(), resDOF(), _customMetric, _log_likelihood); + } else { + ModelMetricsRegression metricsRegression = (ModelMetricsRegression) metrics; + metrics = new ModelMetricsRegressionGLM(m, f, metricsRegression._nobs, metricsRegression._MSE, metricsRegression._sigma, metricsRegression._mean_absolute_error, metricsRegression._root_mean_squared_log_error, residualDeviance(), residualDeviance() / _wcount, null_devince, _aic, nullDOF(), resDOF(), _customMetric, _log_likelihood); } return gm.addModelMetrics(metrics); // Update the metrics in-place with the GLM version, do DKV.put } diff --git a/h2o-algos/src/main/java/hex/glm/GLMModel.java b/h2o-algos/src/main/java/hex/glm/GLMModel.java index 53f9339d7bfa..744372d0356b 100755 --- a/h2o-algos/src/main/java/hex/glm/GLMModel.java +++ b/h2o-algos/src/main/java/hex/glm/GLMModel.java @@ -368,11 +368,7 @@ private int rank(double [] ds) { if(domain == null && (_parms._family == Family.binomial || _parms._family == Family.quasibinomial || _parms._family == Family.fractionalbinomial)) domain = binomialClassNames; - if (_parms._HGLM) { - String[] domaint = new String[]{"HGLM_" + _parms._family.toString() + "_" + _parms._rand_family[0].toString()}; - return new GLMMetricBuilder(domaint, null, null, 0, true, false, MultinomialAucType.NONE); - } else - return new GLMMetricBuilder(domain, _ymu, new GLMWeightsFun(_parms), _output.bestSubmodel().rank(), true, _parms._intercept, _parms._auc_type); + return new GLMMetricBuilder(domain, _ymu, new GLMWeightsFun(_parms), _output.bestSubmodel().rank(), true, _parms._intercept, _parms._auc_type); } protected double [] beta_internal(){ @@ -448,7 +444,6 @@ public void update(double [] beta, double[] ubeta, double devianceTrain, double Submodel sm = new Submodel(_output._submodels[id].lambda_value,_output._submodels[id].alpha_value,beta,iter, devianceTrain, devianceTest, _output._totalBetaLength, _output._submodels[id].zValues, _output._submodels[id].dispersionEstimated); - sm.ubeta = Arrays.copyOf(ubeta, ubeta.length); _output._submodels[id] = sm; _output.setSubmodelIdx(id, _parms); } @@ -477,9 +472,7 @@ public enum Constraints {EqualTo, LessThanEqualTo}; public boolean _standardize = true; public boolean _useDispersion1 = false; // internal use only, not for users public Family _family; - public Family[] _rand_family; // for HGLM public Link _link; - public Link[] _rand_link; // for HGLM public Solver _solver = Solver.AUTO; public double _tweedie_variance_power; public double _tweedie_link_power; @@ -488,7 +481,7 @@ public enum Constraints {EqualTo, LessThanEqualTo}; public double _invTheta; public double [] _alpha; public double [] _lambda; - public double[] _startval; // for HGLM, initialize fixed and random coefficients (init_u), init_sig_u, init_sig_e + public double[] _startval; // initialize GLM coefficients public boolean _calc_like; public int[] _random_columns; public int _score_iteration_interval = -1; @@ -496,7 +489,6 @@ public enum Constraints {EqualTo, LessThanEqualTo}; public Serializable _missing_values_handling = MissingValuesHandling.MeanImputation; public double _prior = -1; public boolean _lambda_search = false; - public boolean _HGLM = false; // true to enable HGLM public boolean _cold_start = false; // start GLM model from scratch if true public int _nlambdas = -1; public boolean _non_negative = false; @@ -635,39 +627,7 @@ public void validate(GLM glm) { if (_lambda_search) glm.error("ordinal regression", "Ordinal regression do not 
support lambda search."); } - if (_HGLM) { // check correct parameter settings for HGLM - if (_random_columns==null) - throw new IllegalArgumentException("Need to specify the random component columns for HGLM."); - if (!(_random_columns.length == 1)) - throw new IllegalArgumentException("HGLM only supports ONE random component for now."); - if (!(_rand_family==null) && !(_rand_family.length==_random_columns.length)) - throw new IllegalArgumentException("HGLM _rand_family: must have the same length as random_columns."); - if (!(_rand_link==null) && !(_rand_link.length==_random_columns.length)) - throw new IllegalArgumentException("HGLM _rand_link: must have the same length as random_columns."); - if (!_family.equals(Family.gaussian)) - throw new IllegalArgumentException("HGLM only supports Gaussian distributions for now."); - if (!(_rand_family==null)) { - for (Family fam : _rand_family) { - if (!fam.equals(Family.gaussian)) - throw new IllegalArgumentException("HGLM only supports Gaussian distributions for now."); - } - } - if (!(_rand_link==null)) { - for (Link lin : _rand_link) { - if (!lin.equals(Link.identity) && !lin.equals(Link.family_default)) - throw new IllegalArgumentException("HGLM only supports identity link functions for now."); - } - } - if (!(_link.equals(Link.family_default)) && !(_link.equals(Link.identity))) - throw new IllegalArgumentException("HGLM only supports identity link functions for now."); - if (_lambda_search) - throw new IllegalArgumentException("HGLM does not allow lambda search. Set it to False/FALSE/false to disable it."); - if (_nfolds > 1) - throw new IllegalArgumentException("HGLM does not allow cross-validation."); - if (_valid != null) - throw new IllegalArgumentException("HGLM does not allow validation."); - _glmType = GLMType.hglm; - } + if(_link != Link.family_default) { // check we have compatible link switch (_family) { case AUTO: @@ -924,32 +884,6 @@ public final double linkDeriv(double x) { // note: compute an inverse of what R throw H2O.unimpl(); } } - - public final double randLinkInv(double x, int index) { - switch(_rand_link[index]) { -// case multinomial: // should not be used - case identity: - return x; - case ologlog: - return 1.0-Math.exp(-1.0*Math.exp(x)); - case oprobit: - return _dprobit.cumulativeProbability(x); - case ologit: - case logit: - return 1.0 / (Math.exp(-x) + 1.0); - case log: - return Math.exp(x); - case inverse: - double xx = (x < 0) ? 
Math.min(-1e-5, x) : Math.max(1e-5, x); - return 1.0 / xx; - case tweedie: - return _tweedie_link_power == 0 - ?Math.max(2e-16,Math.exp(x)) - :Math.pow(x, 1/ _tweedie_link_power); - default: - throw new RuntimeException("unexpected link function id " + this); - } - } public final double linkInv(double x) { switch(_link) { @@ -1426,8 +1360,6 @@ public static class Submodel extends Iced { public final double devianceValid; public final int [] idxs; public final double [] beta; - public double[] ubeta; // store HGLM random coefficients - public double _trainTheta; public double[] zValues; public boolean dispersionEstimated; @@ -1549,8 +1481,6 @@ public static class GLMOutput extends Model.Output { DataInfo _dinfo; double[] _ymu; public String[] _coefficient_names; - String[] _random_coefficient_names; // for HGLM - String[] _random_column_names; public long _training_time_ms; public TwoDimTable _variable_importances; public VarImp _varimp; // should contain the same content as standardized coefficients @@ -1583,7 +1513,6 @@ public double lambda_selected(){ } final int _totalBetaLength; double[] _global_beta; - double[] _ubeta; // HGLM: random coefficients private double[] _zvalues; double[] _variable_inflation_factors; String[] _vif_predictor_names; // predictor names corresponding to the variableInflationFactors @@ -1705,9 +1634,6 @@ public String[] multiClassCoeffNames() { } return multinomialNames; } - - public String[] randomcoefficientNames() { return _random_coefficient_names; } - public double[] ubeta() { return _ubeta; } // GLM is always supervised public boolean isSupervised() { return true; } @@ -1793,10 +1719,6 @@ public GLMOutput(GLM glm) { setNames(names, glm._dinfo._adaptedFrame.typesStr()); _domains = domains; _coefficient_names = Arrays.copyOf(cnames, cnames.length + 1); - if (glm._parms._HGLM) { - _random_coefficient_names = Arrays.copyOf(glm._randCoeffNames, glm._randCoeffNames.length); - _random_column_names = Arrays.copyOf(glm._randomColNames, glm._randomColNames.length); - } _coefficient_names[_coefficient_names.length-1] = "Intercept"; _nclasses = glm.nclasses(); _totalBetaLength = glm._betaInfo.totalBetaLength(); @@ -1907,9 +1829,6 @@ public void setSubmodelIdx(int l, GLMParameters parms){ _best_lambda_idx = l; // kept to ensure backward compatibility _selected_alpha_idx = indexOf(_submodels[l].alpha_value, parms._alpha); _selected_lambda_idx = indexOf(_submodels[l].lambda_value, parms._lambda); - - if (_random_coefficient_names != null) - _ubeta = Arrays.copyOf(_submodels[l].ubeta, _submodels[l].ubeta.length); if(_multinomial || _ordinal) { _global_beta_multinomial = getNormBetaMultinomial(l); for(int i = 0; i < _global_beta_multinomial.length; ++i) @@ -2060,63 +1979,6 @@ else if (_output.bestSubmodel().alpha_value == 1) return _output._model_summary; } - /*** - * This one is for HGLM - * @param train - * @param iter - * @return - */ - public TwoDimTable generateSummaryHGLM(Key train, int iter){ - String[] names = new String[]{"Family", "Link", "Number of Predictors Total", "Number of Active Predictors", "Number of Iterations", "Training Frame"}; - String[] types = new String[]{"string", "string", "int", "int", "int", "string"}; - String[] formats = new String[]{"%s", "%s", "%d", "%d", "%d", "%s"}; - int numRand = _parms._rand_family.length; - String[] rand_family_links = new String[numRand * 2]; - for (int index = 0; index < numRand; index++) { - int tindex = index * 2; - rand_family_links[tindex] = "rand_family_for_column_" + index; - rand_family_links[tindex + 1] = 
"rand_link_for_column_" + index; - } - int totListLen = _parms._rand_family.length * 2 + names.length; - String[] tnames = new String[totListLen]; - String[] ttypes = new String[totListLen]; - String[] tformats = new String[totListLen]; - System.arraycopy(names, 0, tnames, 0, 2); // copy family, link - System.arraycopy(types, 0, ttypes, 0, 2); - System.arraycopy(formats, 0, tformats, 0, 2); - int numCopy = 2 * numRand; - for (int index = 0; index < numCopy; index++) { // insert random family/link info - tnames[index + 2] = rand_family_links[index]; - ttypes[index + 2] = "string"; - tformats[index + 2] = "%s"; - } - int offset = 2 + numCopy; - int copyLength = names.length - 2; - System.arraycopy(names, 2, tnames, offset, copyLength); // copy remaining of original names - System.arraycopy(types, 2, ttypes, offset, copyLength); - System.arraycopy(formats, 2, tformats, offset, copyLength); - _output._model_summary = new TwoDimTable("HGLM Model", "summary", new String[]{""}, - tnames, ttypes, tformats, ""); - int tableColIndex = 0; - _output._model_summary.set(0, tableColIndex++, _parms._family.toString()); - _output._model_summary.set(0, tableColIndex++, _parms._link.toString()); - int numFamily = _parms._rand_family.length; - for (int index = 0; index < numFamily; index++) { - _output._model_summary.set(0, tableColIndex++, _parms._rand_family[index].name()); - if (_parms._rand_link == null) - _output._model_summary.set(0, tableColIndex++, _parms._rand_family[index].defaultLink.name()); - else - _output._model_summary.set(0, tableColIndex++, _parms._rand_link[index].name()); - } - int intercept = _parms._intercept ? 1 : 0; - _output._model_summary.set(0, tableColIndex++, beta().length - 1); - _output._model_summary.set(0, tableColIndex++, Integer.toString(_output.rank() - intercept)); - _output._model_summary.set(0, tableColIndex++, Integer.valueOf(iter)); - _output._model_summary.set(0, tableColIndex, train.toString()); - return _output._model_summary; - } - - @Override public long checksum_impl(){ if(_parms._train == null) return 0; return super.checksum_impl(); @@ -2393,13 +2255,13 @@ protected ModelMetrics.MetricBuilder scoreMetrics(Frame adaptFrm) { @Override public boolean haveMojo() { - if (!_parms._HGLM && _parms.interactionSpec() == null) return super.haveMojo(); + if (_parms.interactionSpec() == null) return super.haveMojo(); else return false; } @Override public boolean havePojo() { - if (!_parms._HGLM && _parms.interactionSpec() == null && _parms._offset_column == null) return super.havePojo(); + if (_parms.interactionSpec() == null && _parms._offset_column == null) return super.havePojo(); else return false; } diff --git a/h2o-algos/src/main/java/hex/glm/GLMTask.java b/h2o-algos/src/main/java/hex/glm/GLMTask.java index 4449014c186d..4c34f96969a5 100644 --- a/h2o-algos/src/main/java/hex/glm/GLMTask.java +++ b/h2o-algos/src/main/java/hex/glm/GLMTask.java @@ -13,7 +13,6 @@ import water.H2O.H2OCountedCompleter; import water.fvec.C0DChunk; import water.fvec.Chunk; -import water.fvec.Frame; import water.util.ArrayUtils; import water.util.FrameUtils; import water.util.MathUtils; @@ -23,7 +22,6 @@ import static hex.glm.GLMModel.GLMParameters.DispersionMethod.deviance; import static hex.glm.GLMModel.GLMParameters.Family.gaussian; -import static hex.glm.GLMTask.DataAddW2AugXZ.getCorrectChunk; import static hex.glm.GLMUtils.updateGradGam; import static hex.glm.GLMUtils.updateGradGamMultinomial; import static org.apache.commons.math3.special.Gamma.*; @@ -40,8 +38,7 @@ */ public abstract 
class GLMTask { final static double EPS=1e-10; - final static double ZEROEQUAL = 1e-8; - final static double ONEEQUAL = 1-1e-8; + static class NullDevTask extends MRTask { double _nullDev; final double [] _ymu; @@ -1740,1160 +1737,6 @@ public void reduce(GLMCoordinateDescentTask git){ // adding contribution of all } */ - - public static class RandColAddW2AugXZ extends MRTask { - public int[] _cumRandCatLevels; // cumulative sum of random column categorical levels - public int _randNumColStart; - public long _randRowStart; // index into absolute row number start in Frame _AugXZ - Job _job; - Frame _prior_weights_psi; // first column is prior weight, second column is wpsi, third is zmi - public int _totAugxzColNumber; - public int[] _weightID; - - public RandColAddW2AugXZ(Job job, int[] randCatLevels, Frame prior_weights_psi, int wpsiID, long randRowStart, - int randNumColStart, int augXZColNum) { - _job = job; - _prior_weights_psi = prior_weights_psi; - _weightID = new int[]{wpsiID}; - _cumRandCatLevels = ArrayUtils.cumsum(randCatLevels); - _randRowStart = randRowStart; - _randNumColStart = randNumColStart; - _totAugxzColNumber = augXZColNum; - } - - /*** - * Given colIndex to the expanded random columns, this method will calculate which random column that colIndex - * belongs to. - * @param cumrandCatLevels - * @param colIndex - * @return - */ - public static int findRandColIndex(int[] cumrandCatLevels, long colIndex) { - int len = cumrandCatLevels.length; - for (int index = 0; index < len; index++) { - if (colIndex < cumrandCatLevels[index]) - return index; - } - return (len-1); - } - - @Override - public void map(Chunk[] chunks) { // chunks will be AugXZ - long chkStartIdx = chunks[0].start(); // absolute row number of AugXZ chunk - // Note here, we are working on the lower rows of augXZ related to 0 | Iq. 
- if ((chkStartIdx+chunks[0].len()) >= _randRowStart) { // only start working if we are looking at correct chunk - // need to figure out which chunk of priorWeightsWpsi to take and where the row start should be as well - Chunk[] priorWeightsWpsi = new Chunk[1]; - int chkRowStart = (int) (_randRowStart-chkStartIdx); // relative start in AugXZ - chkRowStart = chkRowStart > 0?chkRowStart:0; // whole chunk of AugXZ used to calculate lower part of AugXZ - long chkWeightRowStart = chkRowStart+chkStartIdx-_randRowStart; // first row of absolute row index of weight - int[] weightChunkInfo = getCorrectChunk(_prior_weights_psi, 0, chkWeightRowStart, priorWeightsWpsi, - _weightID, null); // start from 0 to total randColExpanded - int chkWeightRelRow = weightChunkInfo[2]; - int psiColumnIndex = (int)priorWeightsWpsi[0].start()+_randNumColStart+chkWeightRelRow;// start of column for random Columns - for (int index = chkRowStart; index < chunks[0]._len; index++) { // go throw each row of AugXZ - if (chkWeightRelRow >= weightChunkInfo[1]) { // need to grab a new weight chunk - weightChunkInfo = getCorrectChunk(_prior_weights_psi, weightChunkInfo[0]+1, - chkWeightRelRow+priorWeightsWpsi[0].start(), priorWeightsWpsi, _weightID, weightChunkInfo); - chkWeightRelRow = weightChunkInfo[2]; - } - double wpsi = priorWeightsWpsi[0].atd(chkWeightRelRow); - for (int colIndex=0; colIndex < psiColumnIndex; colIndex++) { - chunks[colIndex].set(index, 0.0); // zero out columns to left of psiColumnIndex - } - chunks[psiColumnIndex].set(index, wpsi); // update weight to AugXZ - psiColumnIndex++; - for (int colIndex=psiColumnIndex; colIndex < _totAugxzColNumber; colIndex++) - chunks[colIndex].set(index, 0.0); // zero out columns to right of psiColumnIndex - chkWeightRelRow++; - } - } - } - } - - /*** - * - * This class calculates wpsi, zmi for frame _prior_weights_psi. - */ - public static class CalculateW4Rand extends MRTask { - GLMParameters _parms; - public int[] _cumRandCatLevels; // cumulative sum of random column categorical levels - public double[] _psi; - public double[] _phi; - public int _numRandCol; // number of random columns specified by user - Job _job; - double[] _vi; // store random column coefficients - - public CalculateW4Rand(Job job, GLMParameters params, int[] randCatLevels, double[] psi, double[] phi, - double[] vi) { - _job = job; - _parms = params; - _numRandCol = _parms._random_columns.length; // number of random columns specified by user - _cumRandCatLevels = ArrayUtils.cumsum(randCatLevels); - _psi = psi; - _phi = phi; - _vi = vi; - } - - /*** - * Given colIndex to the expanded random columns, this method will calculate which random column that colIndex - * belongs to. 
- * @param cumrandCatLevels - * @param colIndex - * @return - */ - public static int findRandColIndex(int[] cumrandCatLevels, long colIndex) { - int len = cumrandCatLevels.length; - for (int index = 0; index < len; index++) { - if (colIndex < cumrandCatLevels[index]) - return index; - } - return (len-1); - } - - @Override - public void map(Chunk[] chunks) { // chunks will be wpsi_frame - GLMWeightsFun[] glmfunRand = ReturnGLMMMERunInfo.getRandGLMFuns(null, _numRandCol, _parms); - double temp, ui, zmi, wpsi; - for (int index = 0; index < chunks[0]._len; index++) { // go throw each row of AugXZ - int randIndex = findRandColIndex(_cumRandCatLevels, index); - temp = glmfunRand[randIndex].linkInvDeriv(_phi[index]); // du_dv - ui = glmfunRand[randIndex].linkInv(_vi[index]); - zmi = _vi[index]+(_psi[index]-ui)/temp; - chunks[2].set(index, zmi); - wpsi = chunks[0].atd(index) * temp * temp / (glmfunRand[randIndex].variance(_psi[index]) * _phi[index]); - chunks[1].set(index, Math.sqrt(wpsi)); // update weight frame with new weight - } - } - - } - - public static class GenerateResid extends MRTask { - public Job _job; - double _oneOverSqrtSumDevONMP; - int _hvColIdx; - int _residColIdx; - long _numDataRows; - - public GenerateResid(Job job, double oneOverSqrtSumDevONMP, int hvColIdx, int residColIdx, long numDataRows) { - _job = job; - _oneOverSqrtSumDevONMP = oneOverSqrtSumDevONMP; - _hvColIdx = hvColIdx; - _residColIdx = residColIdx; - _numDataRows = numDataRows; - } - - @Override - public void map(Chunk[] chunks) { // chunk contains infos from GLMMME run - long chkStartRowIdx = chunks[0].start(); - int chkRowNumber = chunks[0].len(); - for (int rowIndex=0; rowIndex { - public double _sumEtaDiffSq; - public double _sumEtaSq; - public int[] _etaOetaN; - - public CalculateEtaInfo(int[] etaOldetaNew) { - _sumEtaDiffSq = 0; - _sumEtaSq = 0; - _etaOetaN = etaOldetaNew; - } - - @Override - public void map(Chunk[] chunks) { - _sumEtaDiffSq = 0; - _sumEtaSq = 0; - int chkLen = chunks[0].len(); - for (int rowIndex=0; rowIndex < chkLen; rowIndex++) { - double tempetaN = chunks[_etaOetaN[1]].atd(rowIndex); // grab etaNew value - double tempetaDiff = chunks[_etaOetaN[0]].atd(rowIndex)-tempetaN; - _sumEtaSq += tempetaN*tempetaN; - _sumEtaDiffSq += tempetaDiff*tempetaDiff; - } - } - - @Override - public void reduce(CalculateEtaInfo other){ - this._sumEtaDiffSq += other._sumEtaDiffSq; - this._sumEtaSq += other._sumEtaSq; - } - } - - - public static class ExtractFrameFromSourceWithProcess extends MRTask { - public Frame _sourceFrame; - int[] _devhvColIdx; - long _startRowIndex; // matches 0 row of dest chunk - long _lengthToCopy; - - public ExtractFrameFromSourceWithProcess(Frame sourceFrame, int[] devHvColIdx, long startRowIndex, long lengthCopy) { - _sourceFrame = sourceFrame; - _devhvColIdx = devHvColIdx; - _startRowIndex = startRowIndex; - _lengthToCopy = lengthCopy; - } - - @Override - public void map(Chunk[] chunks) { - long startChkIdx = chunks[0].start(); // absolute row index of chunks to copy to. 
- int chkLen = chunks[0].len(); - long sourceChkIdx = _startRowIndex+startChkIdx; // absolute source chunk row index - Chunk[] sourceChunks = new Chunk[_devhvColIdx.length]; - int[] fetchedChunkInfo = getCorrectChunk(_sourceFrame, 0, sourceChkIdx, sourceChunks, _devhvColIdx, - null); - int fetchedRelRowIndex = fetchedChunkInfo[2]; - for (int rowIndex=0; rowIndex < chkLen; rowIndex++) { - if (rowIndex+startChkIdx >= _lengthToCopy) - break; - if (fetchedRelRowIndex >= fetchedChunkInfo[1]) { - fetchedChunkInfo = getCorrectChunk(_sourceFrame, fetchedChunkInfo[0]+1, - fetchedRelRowIndex+sourceChunks[0].start(), sourceChunks, _devhvColIdx, fetchedChunkInfo); - fetchedRelRowIndex = fetchedChunkInfo[2]; - } - double temp = 1.0-sourceChunks[1].atd(fetchedRelRowIndex); - chunks[0].set(rowIndex, sourceChunks[0].atd(fetchedRelRowIndex)/temp); // set response - chunks[2].set(rowIndex, temp/2); // set weight - fetchedRelRowIndex++; - } - } - } - - /*** - * This class will copy columns from a source frame to columns in the destination frame - * - */ - public static class CopyPartsOfFrame extends MRTask { - public Frame _sourceFrame; - public int[] _destColIndices; - public int[] _sourceColIndices; - public long _nrowsToCopy; - - public CopyPartsOfFrame(Frame fr, int[] destFrameColID, int[] sourceFrameColID, long numRows) { - _sourceFrame = fr; - if (sourceFrameColID==null) { - int numCols = fr.numCols(); - _sourceColIndices = new int[numCols]; - for (int index=0; index < numCols; index++) - _sourceColIndices[index] = index; - } else - _sourceColIndices = sourceFrameColID; - - if (destFrameColID == null) { - int numCols = _sourceColIndices.length; - _destColIndices = new int[numCols]; - for (int index=0; index < numCols; index++) - _destColIndices[index]=index; - } else - _destColIndices = destFrameColID; - - assert _destColIndices.length==_sourceColIndices.length; - _nrowsToCopy = numRows; - } - - @Override - public void map(Chunk[] chunks) { // chunk contains infos from GLMMME run - int colLen = _sourceColIndices.length; - long chkStartIdx = chunks[0].start(); // first row of destination frame - Chunk[] sourceChunks = new Chunk[colLen]; // just fetch the needed columns from the source - long lastRowIndex = chkStartIdx + chunks[0].len(); - if (chkStartIdx < _nrowsToCopy) { // only copy chunk when there are enough source chunks - int rowLen = lastRowIndex > _nrowsToCopy ? 
((int) (_nrowsToCopy - chkStartIdx)) : chunks[0].len(); - int[] fetchedChkInfo = getCorrectChunk(_sourceFrame, 0, chkStartIdx, sourceChunks, _sourceColIndices, null); - int fetchedChkRelRow = fetchedChkInfo[2]; - for (int rowIndex = 0; rowIndex < rowLen; rowIndex++) { - if (fetchedChkRelRow >= fetchedChkInfo[1]) { // need new chunk - fetchedChkInfo = getCorrectChunk(_sourceFrame, fetchedChkInfo[0] + 1, - fetchedChkRelRow + sourceChunks[0].start(), sourceChunks, _sourceColIndices, fetchedChkInfo); - fetchedChkRelRow = fetchedChkInfo[2]; - } - for (int colIndex = 0; colIndex < colLen; colIndex++) { - chunks[_destColIndices[colIndex]].set(rowIndex, sourceChunks[colIndex].atd(fetchedChkRelRow)); - } - fetchedChkRelRow++; - } - } - } - } - - public static class ReturnGLMMMERunInfo extends MRTask { - public DataInfo _dinfo; - public Frame _w_prior_wpsi; - public Frame _qMatrix; - Job _job; - double _sumDev; - double _sumEtaDiffSq; - double _sumEtaSq; - public int _totalaugXZCol; - public int[] _dinfoWCol; // columns to load from dinfo to calculate z and dev - public int[] _wpriorwpsiCol; // columns to load from _w_prior_wpsi to calculate z and dev - public long _numDataRow; - public int _maxdinfoCol; // number of columns to load from dinfo._adaptedFrame - GLMParameters _parms; - public double[] _psi; - public double[] _ubeta; - public int[] _cumRandCatLevels; // cumulative sum of random column categorical levels - public int _numRandCol; - - public ReturnGLMMMERunInfo(Job job, DataInfo datainfo, Frame wpriorwpsi, Frame qMatrix, int[] dinfoWCol, int[] wCol, - GLMParameters params, double[] psi, double[] ubeta, int[] cumRandCatLevels) { - _job = job; - _dinfo = datainfo; - _w_prior_wpsi = wpriorwpsi; - _qMatrix= qMatrix; - _sumDev = 0; - _sumEtaDiffSq = 0; - _sumEtaSq = 0; - _totalaugXZCol = qMatrix.numCols(); - _dinfoWCol = dinfoWCol; - _wpriorwpsiCol = wCol; - _numDataRow = _dinfo._adaptedFrame.numRows(); - _maxdinfoCol = _dinfo._weights?4:3; - _parms = params; - _psi = psi; - _ubeta = ubeta; - _cumRandCatLevels = cumRandCatLevels; - _numRandCol = cumRandCatLevels.length; - } - - public static GLMWeightsFun[] getRandGLMFuns(GLMWeightsFun[] randGLMs, int numRandFuncs, GLMParameters params) { - if (randGLMs == null) - randGLMs = new GLMWeightsFun[numRandFuncs]; - for (int index=0; index < numRandFuncs; index++) { - Link randlink; - if (params._rand_link==null) - randlink = params._rand_family[index].defaultLink; - else - randlink = params._rand_link[index]; - randGLMs[index] = new GLMWeightsFun(params._rand_family[index], randlink, - params._tweedie_variance_power, params._tweedie_link_power, 0, params._init_dispersion_parameter, false); - } - return randGLMs; - } - - @Override - public void reduce(ReturnGLMMMERunInfo other){ - this._sumEtaDiffSq += other._sumEtaDiffSq; - this._sumEtaSq += other._sumEtaSq; - } - - @Override - public void map(Chunk[] chunks) { // chunk contains infos from GLMMME run - GLMWeightsFun glmfun=null; - GLMWeightsFun[] glmfunRand = null; - long chkStartRowIdx = chunks[0].start(); // first row number of chunk - int chkRowNumber = chunks[0].len(); - Chunk[] chunksqMatrix = new Chunk[_totalaugXZCol]; // fetch chunk from AugXZ in order to calculate hv, Augz - int[] qMatrixInfo = getCorrectChunk(_qMatrix, 0, chkStartRowIdx, chunksqMatrix, null, null); - Chunk[] chunks4ZDev = new Chunk[4]; // potentially load response, zi, etai, weightID - int[] zdevChunkInfo = new int[3]; - boolean usingWpsi; - long rowOffset = chkStartRowIdx-_numDataRow; - if (chkStartRowIdx >= _numDataRow) { 
// load _w_prior_wpsi chunks, get wprior, zmi - usingWpsi = true; - zdevChunkInfo = getCorrectChunk(_w_prior_wpsi, 0, rowOffset, chunks4ZDev, - _wpriorwpsiCol, zdevChunkInfo); - glmfunRand = getRandGLMFuns(glmfunRand, _numRandCol, _parms); - } else { // load from dinfo: response, zi, etai and maybe weightID for prior_weight - usingWpsi = false; - glmfun = new GLMWeightsFun(_parms._family, _parms._link, _parms._tweedie_variance_power, - _parms._tweedie_link_power, 0, _parms._init_dispersion_parameter, false); - zdevChunkInfo = getCorrectChunk(_dinfo._adaptedFrame, 0, chkStartRowIdx, chunks4ZDev, _dinfoWCol, - zdevChunkInfo); - } - for (int rowIndex=0; rowIndex < chkRowNumber; rowIndex++) { // correct chunks are loaded for now - int zdevAbsRelRowNumber = usingWpsi?(int)(rowIndex+rowOffset):rowIndex+zdevChunkInfo[2]; // offset into zdevChunks - if (!usingWpsi && (zdevAbsRelRowNumber >= zdevChunkInfo[1])) { // running out of rows with dinfo - long rowAbsIndex = rowIndex+chkStartRowIdx; - if (rowAbsIndex>=_numDataRow) { // load from wprior_wpsi - usingWpsi=true; - zdevChunkInfo = getCorrectChunk(_w_prior_wpsi, 0, rowAbsIndex-_numDataRow, - chunks4ZDev, _wpriorwpsiCol, zdevChunkInfo); - if (glmfunRand==null) { // generate glmfunRand[] for the first time only - glmfunRand = getRandGLMFuns(glmfunRand, _numRandCol, _parms); - } - } else { // still load from dinfo - zdevChunkInfo = getCorrectChunk(_dinfo._adaptedFrame, 0, - rowAbsIndex, chunks4ZDev, _dinfoWCol, zdevChunkInfo); - if (glmfun==null) - glmfun = new GLMWeightsFun(_parms._family, _parms._link, _parms._tweedie_variance_power, - _parms._tweedie_link_power, 0, _parms._init_dispersion_parameter, false); - } - zdevAbsRelRowNumber = usingWpsi?(int)(rowIndex+rowOffset):rowIndex+zdevChunkInfo[2]; - } else if (usingWpsi && (zdevAbsRelRowNumber-zdevChunkInfo[0]) >= zdevChunkInfo[1]) { // load from wprior_wpsi - zdevChunkInfo = getCorrectChunk(_w_prior_wpsi, 0, zdevAbsRelRowNumber, chunks4ZDev, _wpriorwpsiCol, - zdevChunkInfo); - if (glmfunRand==null) { // generate glmfunRand[] for the first time only - glmfunRand = getRandGLMFuns(glmfunRand, _numRandCol, _parms); - } - zdevAbsRelRowNumber = (int)(rowIndex+rowOffset); - } - _sumDev += calDev(usingWpsi, _cumRandCatLevels, zdevAbsRelRowNumber, chunks4ZDev, chunks, rowIndex, glmfun, glmfunRand, _psi, _ubeta); - setHv(chunksqMatrix, chunks[1], rowIndex, qMatrixInfo[2]++); // get hv from augXZ only - if (qMatrixInfo[2] > qMatrixInfo[1]) { // need to load in new chunk - qMatrixInfo = getCorrectChunk(_qMatrix, 1+qMatrixInfo[0], chkStartRowIdx, chunksqMatrix, null, qMatrixInfo); - } - } - } - - public static void setHv(Chunk[] qmat, Chunk hv, int relRowIndex, int qmatRelRowIndex) { - int numCol = qmat.length; - double rowSum = 0; - for (int colIndex=0; colIndex < numCol; colIndex++) { - double temp = qmat[colIndex].atd(qmatRelRowIndex); - rowSum += temp*temp; - } - hv.set(relRowIndex, rowSum > ONEEQUAL?ONEEQUAL:rowSum); - } - - public double calDev(boolean usingWpsi, int[] _cumRandCatLevels, int zdevAbsRelRowNumber, Chunk[] chunks4ZDev, - Chunk[] chunks, int rowIndex, GLMWeightsFun glmfun, - GLMWeightsFun[] glmfunRand, double[] psi, double[] ubeta) { - if (usingWpsi) { - int randIndex = RandColAddW2AugXZ.findRandColIndex(_cumRandCatLevels, zdevAbsRelRowNumber); - return setZDevEta(chunks4ZDev, chunks, rowIndex, (int) (zdevAbsRelRowNumber-chunks[0].start()), - (int) zdevAbsRelRowNumber, glmfunRand[randIndex], psi, ubeta); - } else { // get z, dev, eta from dinfo - return setZDevEta(chunks4ZDev, chunks, 
rowIndex, zdevAbsRelRowNumber, glmfun); - } - } - - public static double setZDevEta(Chunk[] wpsiChunks, Chunk[] destChunk, int relRowIndex, int wpsiRowIndex, - int abswpsiRowIndex, GLMWeightsFun glmfuns, double[] psi, double[] ubeta) { - destChunk[0].set(relRowIndex, wpsiChunks[1].atd(wpsiRowIndex)); // set Z value - double temp = psi[abswpsiRowIndex]-ubeta[abswpsiRowIndex]; - double devVal=wpsiChunks[0].atd(wpsiRowIndex)*temp*temp; - destChunk[2].set(relRowIndex, devVal < ZEROEQUAL?ZEROEQUAL:devVal); - return devVal; - } - - public double setZDevEta(Chunk[] dinfoChunks, Chunk[] destChunk, int relRowIndex, int dinfoRowIndex, - GLMWeightsFun glmfun) { - destChunk[0].set(relRowIndex, dinfoChunks[1].atd(dinfoRowIndex)); // set AugZ value - double eta = dinfoChunks[2].atd(dinfoRowIndex); - destChunk[3].set(relRowIndex, eta); // set new eta value - double temp2 = eta-destChunk[5].atd(relRowIndex); - _sumEtaDiffSq += temp2*temp2; - _sumEtaSq += eta*eta; - double temp = dinfoChunks[0].atd(dinfoRowIndex)-glmfun.linkInv(eta); - destChunk[4].set(relRowIndex, temp); // set resid = (y-mu.i) - double prior_weight = dinfoChunks[3]==null?1:dinfoChunks[3].atd(dinfoRowIndex); - double devVal = prior_weight*temp*temp; - destChunk[2].set(relRowIndex, devVal < ZEROEQUAL?ZEROEQUAL:devVal); - return devVal; - } - } - - public static class ReturnGLMMMERunInfoRandCols extends MRTask { - public DataInfo _dinfo; - public Frame _w_prior_wpsi; - public Frame _qMatrix; - Job _job; - double _sumDev; - public int _totalqMatrixCols; - public int[] _wpriorwpsiCol; // columns to load from _w_prior_wpsi to calculate z and dev - public long _numDataRow; - GLMParameters _parms; - public double[] _psi; - public double[] _ubeta; - public int[] _cumRandCatLevels; // cumulative sum of random column categorical levels - public int _numRandCol; - - public ReturnGLMMMERunInfoRandCols(Job job, DataInfo datainfo, Frame wpriorwpsi, Frame qmatrix, int[] wCol, - GLMParameters params, double[] psi, double[] ubeta, int[] cumRandCatLevels) { - _job = job; - _w_prior_wpsi = wpriorwpsi; - _qMatrix = qmatrix; - _sumDev = 0; - _totalqMatrixCols = qmatrix.numCols(); - _wpriorwpsiCol = wCol; - _numDataRow = datainfo._adaptedFrame.numRows(); - _parms = params; - _psi = psi; - _ubeta = ubeta; - _cumRandCatLevels = cumRandCatLevels; - _numRandCol = cumRandCatLevels.length; - } - - @Override - public void map(Chunk[] chunks) { // chunk contains infos from GLMMME run - GLMWeightsFun[] glmfunRand = null; - long chkStartRowIdx = chunks[0].start(); // first row number of chunk - long maxChkRowIdx = chunks[0].start() + chunks[0].len(); - if (chkStartRowIdx >= _numDataRow || _numDataRow < maxChkRowIdx) { // only update random column part - int chkRowNumber = chunks[0].len(); - chkStartRowIdx = chkStartRowIdx >= _numDataRow?chkStartRowIdx:_numDataRow; // absolute row start index - Chunk[] chunksqMatrix = new Chunk[_totalqMatrixCols]; // fetch chunk from qMatrix in order to calculate hv, Augz - int[] qMatrixInfo = getCorrectChunk(_qMatrix, 0, chkStartRowIdx, chunksqMatrix, null, null); - Chunk[] chunks4ZDev = new Chunk[4]; // potentially load response, zi, etai, weightID - int[] zdevChunkInfo = new int[3]; - - long rowOffset = chkStartRowIdx - _numDataRow; - zdevChunkInfo = getCorrectChunk(_w_prior_wpsi, 0, rowOffset, chunks4ZDev, - _wpriorwpsiCol, zdevChunkInfo); - int zdevRelRowNumber = zdevChunkInfo[2]; - int qMatrixRelRow = qMatrixInfo[2]; - glmfunRand = getRandGLMFuns(glmfunRand, _numRandCol, _parms); - int chunkRowStart = 
(int)(chkStartRowIdx-chunks[0].start()); - for (int rowIndex = chunkRowStart; rowIndex < chkRowNumber; rowIndex++) { // correct chunks are loaded for now - if (zdevRelRowNumber >= zdevChunkInfo[1]) { - zdevChunkInfo = getCorrectChunk(_w_prior_wpsi, zdevChunkInfo[0]+1, - zdevRelRowNumber+chunks4ZDev[0].start(), chunks4ZDev, _wpriorwpsiCol, - zdevChunkInfo); - zdevRelRowNumber = zdevChunkInfo[2]; - if (glmfunRand == null) { // generate glmfunRand[] for the first time only - glmfunRand = getRandGLMFuns(glmfunRand, _numRandCol, _parms); - } - } - if (qMatrixRelRow >= qMatrixInfo[1]) { - qMatrixInfo = getCorrectChunk(_qMatrix, qMatrixInfo[0]+1, qMatrixRelRow+chunksqMatrix[0].start(), - chunksqMatrix, null, qMatrixInfo); - qMatrixRelRow=qMatrixInfo[2]; - } - int randIndex = RandColAddW2AugXZ.findRandColIndex(_cumRandCatLevels, zdevRelRowNumber); - _sumDev += setZDevEta(chunks4ZDev, chunks, rowIndex, zdevRelRowNumber, - (int) (zdevRelRowNumber+chunks4ZDev[0].start()), _psi, _ubeta); - setHv(chunksqMatrix, chunks[1], rowIndex, qMatrixRelRow); // get hv from augXZ only - qMatrixRelRow++; - zdevRelRowNumber++; - } - } - } - - - public static GLMWeightsFun[] getRandGLMFuns(GLMWeightsFun[] randGLMs, int numRandFuncs, GLMParameters params) { - if (randGLMs == null) - randGLMs = new GLMWeightsFun[numRandFuncs]; - for (int index=0; index < numRandFuncs; index++) { - Link randlink; - if (params._rand_link==null) - randlink = params._rand_family[index].defaultLink; - else - randlink = params._rand_link[index]; - randGLMs[index] = new GLMWeightsFun(params._rand_family[index], randlink, - params._tweedie_variance_power, params._tweedie_link_power, 0, params._init_dispersion_parameter, false); - } - return randGLMs; - } - - @Override - public void reduce(ReturnGLMMMERunInfoRandCols other){ - this._sumDev += other._sumDev; - } - - - public static void setHv(Chunk[] qmat, Chunk hv, int relRowIndex, int qmatRelRowIndex) { - int numCol = qmat.length; - double rowSum = 0; - for (int colIndex=0; colIndex < numCol; colIndex++) { - double temp = qmat[colIndex].atd(qmatRelRowIndex); - rowSum += temp*temp; - } - hv.set(relRowIndex, rowSum > ONEEQUAL?ONEEQUAL:rowSum); - } - - public static double setZDevEta(Chunk[] wpsiChunks, Chunk[] destChunk, int relRowIndex, int wpsiRowIndex, - int abswpsiRowIndex, double[] psi, double[] ubeta) { - destChunk[0].set(relRowIndex, wpsiChunks[1].atd(wpsiRowIndex)); // set Z value - double temp = psi[abswpsiRowIndex]-ubeta[abswpsiRowIndex]; - double devVal=wpsiChunks[0].atd(wpsiRowIndex)*temp*temp; - destChunk[2].set(relRowIndex, devVal < ZEROEQUAL?ZEROEQUAL:devVal); - return devVal; - } - } - - /*** - * fill in the returnFrame from the data portion only - */ - public static class ReturnGLMMMERunInfoData extends MRTask { - public DataInfo _dinfo; - public Frame _qMatrix; - Job _job; - double _sumDev; - double _sumEtaDiffSq; - double _sumEtaSq; - public int _totalaugXZCol; - public int[] _dinfoWCol; // columns to load from dinfo to calculate z and dev - public long _numDataRow; - GLMParameters _parms; - - public ReturnGLMMMERunInfoData(Job job, DataInfo datainfo, Frame qMatrix, int[] dinfoWCol, - GLMParameters params) { - _job = job; - _dinfo = datainfo; - _qMatrix = qMatrix; - _sumDev = 0; - _sumEtaDiffSq = 0; - _sumEtaSq = 0; - _totalaugXZCol = qMatrix.numCols(); - _dinfoWCol = dinfoWCol; - _numDataRow = _dinfo._adaptedFrame.numRows(); - _parms = params; - } - - @Override - public void map(Chunk[] chunks) { // chunk contains infos from GLMMME run - long chkStartRowIdx = 
chunks[0].start(); // first row number of chunk - if (chkStartRowIdx < _numDataRow) { // only look at chunks corresponding to the data rows - GLMWeightsFun glmfun = null; - - long maxRowIndex = chkStartRowIdx+chunks[0].len(); - int chkRowNumber = maxRowIndex>=_numDataRow?chunks[0].len()-(int)(maxRowIndex-_numDataRow):chunks[0].len(); // number of row to consider - Chunk[] chunksqMatrix = new Chunk[_totalaugXZCol]; // fetch chunk from qMatrix in order to calculate hv, Augz - int[] qMatrixInfo = getCorrectChunk(_qMatrix, 0, chkStartRowIdx, chunksqMatrix, null, null); - int qMatrixRelRow = qMatrixInfo[2]; - Chunk[] chunks4ZDev = new Chunk[4]; // potentially load response, zi, etai, weightID - int[] zdevChunkInfo = new int[3]; - - glmfun = new GLMWeightsFun(_parms._family, _parms._link, _parms._tweedie_variance_power, - _parms._tweedie_link_power, 0, _parms._init_dispersion_parameter, false); - zdevChunkInfo = getCorrectChunk(_dinfo._adaptedFrame, 0, chkStartRowIdx, chunks4ZDev, _dinfoWCol, - zdevChunkInfo); - int zdevAbsRelRowNumber = zdevChunkInfo[2]; - for (int rowIndex = 0; rowIndex < chkRowNumber; rowIndex++) { // correct chunks are loaded for now - if (zdevAbsRelRowNumber >= zdevChunkInfo[1]) { // exceeds chunk limit, grab next one - zdevChunkInfo = getCorrectChunk(_dinfo._adaptedFrame, zdevChunkInfo[0] + 1, - zdevAbsRelRowNumber + chunks4ZDev[0].start(), chunks4ZDev, _dinfoWCol, zdevChunkInfo); - zdevAbsRelRowNumber = zdevChunkInfo[2]; - } - if (qMatrixRelRow > qMatrixInfo[1]) { // need to load in new chunk - qMatrixInfo = getCorrectChunk(_qMatrix, 1 + qMatrixInfo[0], qMatrixRelRow+chunksqMatrix[0].start(), chunksqMatrix, null, qMatrixInfo); - qMatrixRelRow = qMatrixInfo[2]; - } - if (glmfun == null) - glmfun = new GLMWeightsFun(_parms._family, _parms._link, _parms._tweedie_variance_power, - _parms._tweedie_link_power, 0, _parms._init_dispersion_parameter, false); - _sumDev += setZDevEta(chunks4ZDev, chunks, rowIndex, zdevAbsRelRowNumber, glmfun); - setHv(chunksqMatrix, chunks[1], rowIndex, qMatrixRelRow); // get hv from qmatrix only - qMatrixRelRow++; - zdevAbsRelRowNumber++; - } - } - } - - @Override - public void reduce(ReturnGLMMMERunInfoData other){ - this._sumEtaDiffSq += other._sumEtaDiffSq; - this._sumEtaSq += other._sumEtaSq; - this._sumDev += other._sumDev; - } - - public static void setHv(Chunk[] qmat, Chunk hv, int relRowIndex, int qmatRelRowIndex) { - int numCol = qmat.length; - double rowSum = 0; - for (int colIndex=0; colIndex < numCol; colIndex++) { - double temp = qmat[colIndex].atd(qmatRelRowIndex); - rowSum += temp*temp; - } - hv.set(relRowIndex, rowSum > ONEEQUAL?ONEEQUAL:rowSum); - } - - public double setZDevEta(Chunk[] dinfoChunks, Chunk[] destChunk, int relRowIndex, int dinfoRowIndex, - GLMWeightsFun glmfun) { - destChunk[0].set(relRowIndex, dinfoChunks[1].atd(dinfoRowIndex)); // set AugZ value - double eta = dinfoChunks[2].atd(dinfoRowIndex); - destChunk[3].set(relRowIndex, eta); // set new eta value - double temp2 = eta-destChunk[5].atd(relRowIndex); - _sumEtaDiffSq += temp2*temp2; - _sumEtaSq += eta*eta; - double temp = dinfoChunks[0].atd(dinfoRowIndex)-glmfun.linkInv(eta); - destChunk[4].set(relRowIndex, temp); // set resid = (y-mu.i) - double prior_weight = dinfoChunks[3]==null?1:dinfoChunks[3].atd(dinfoRowIndex); - double devVal = prior_weight*temp*temp; - destChunk[2].set(relRowIndex, devVal < ZEROEQUAL?ZEROEQUAL:devVal); - return devVal; - } - } - public static class ExpandRandomColumns extends MRTask { - Job _job; - int[] _randomColIndices; - int[] 
_randomColLevels; - int _numRandCols; - int _startRandomExpandedColumn; - public ExpandRandomColumns(Job job, int[] randomColIndices, int[] randomColLevels, int startExpandedCol) { - _job = job; - _randomColIndices = randomColIndices; - _randomColLevels = randomColLevels; - _startRandomExpandedColumn = startExpandedCol; - _numRandCols = randomColIndices.length; - } - - @Override - public void map(Chunk[] chunks) { - int chunkRowLen = chunks[0].len(); - int columnOffset = _startRandomExpandedColumn; - for (int colIndex = 0; colIndex < _numRandCols; colIndex++) { // expand each random column for each row - for (int rowIndex = 0; rowIndex < chunkRowLen; rowIndex++) { - int randColVal = ((int) chunks[_randomColIndices[colIndex]].atd(rowIndex)) + columnOffset; - chunks[randColVal].set(rowIndex, 1); - } - columnOffset += _randomColLevels[colIndex]; - } - } - } - - // generate AugZ*W as a double array - public static class CalculateAugZWData extends MRTask { - public DataInfo _dinfo; // contains X and Z in response - public long _numDataRows; - Job _job; - public int[] _dinfoWCol; - - public CalculateAugZWData(Job job, DataInfo dInfo, int dinfoRespColStart) { // pass it norm mul and norm sup - in the weights already done. norm - _job = job; - _dinfo = dInfo; - _numDataRows = _dinfo._adaptedFrame.numRows(); // number of data rows - _dinfoWCol = new int[]{dinfoRespColStart, dinfoRespColStart+1}; - } - - @Override - public void map(Chunk[] chunks) { // chunks from AugZ - long chkStartIdx = chunks[0].start();// first row number of chunks - if (chkStartIdx < _numDataRows) { // only deal with data portion - long lastChkIdx = chkStartIdx+chunks[0]._len; - int chunkLen = lastChkIdx < _numDataRows?chunks[0]._len:(int)(_numDataRows-chkStartIdx); - Chunk[] augzwChunks = new Chunk[2]; // loaded from _dinfo or _prior_weight_psi - int[] extraChkInfo = new int[3]; - extraChkInfo = getCorrectChunk(_dinfo._adaptedFrame, 0, chkStartIdx, augzwChunks, - _dinfoWCol, extraChkInfo); - - int extraRelRow = extraChkInfo[2]; - for (int rowIndex = 0; rowIndex < chunkLen; rowIndex++) { - if (extraRelRow >= extraChkInfo[1]) { // need to load new chunk - long chkAbsRowNumber = rowIndex + chkStartIdx; - extraChkInfo = getCorrectChunk(_dinfo._adaptedFrame, extraChkInfo[0], chkAbsRowNumber, augzwChunks, - _dinfoWCol, extraChkInfo); - - extraRelRow = extraChkInfo[2]; - } - chunks[0].set(rowIndex, augzwChunks[0].atd(extraRelRow) * augzwChunks[1].atd(extraRelRow)); - extraRelRow++; - } - } - } - } - - // generate AugZ*W as a double array - public static class CalculateAugZWRandCols extends MRTask { - public long _numDataRows; - Job _job; - Frame _prior_weight_psi; // contains prior_weight, wpsi, zmi for random effects/columns - public int[] _weightWCol; - - public CalculateAugZWRandCols(Job job, Frame prior_weight_psi, int weightColStart, - long numDataRows) { // pass it norm mul and norm sup - in the weights already done. 
norm - _job = job; - _prior_weight_psi = prior_weight_psi; - _numDataRows = numDataRows; // number of data rows - _weightWCol = new int[]{weightColStart, weightColStart+1}; - } - - @Override - public void map(Chunk[] chunks) { // chunks from AugZW - long chkStartIdx = chunks[0].start();// first row number of chunks - long chkEndIdx = chkStartIdx+chunks[0]._len; - if (chkStartIdx > _numDataRows || chkEndIdx > _numDataRows) { - int chunkLen = chunks[0]._len; - int chunkStartRow = chkStartIdx > _numDataRows?0:(int)(_numDataRows-chkStartIdx); - Chunk[] augzwChunks = new Chunk[2]; // loaded from _dinfo or _prior_weight_psi - int[] extraChkInfo = new int[3]; - extraChkInfo = getCorrectChunk(_prior_weight_psi, 0, - chunkStartRow+chkStartIdx - _numDataRows, augzwChunks, _weightWCol, extraChkInfo); - int extraRelRow = extraChkInfo[2]; - for (int rowIndex = chunkStartRow; rowIndex < chunkLen; rowIndex++) { - if (extraRelRow >= extraChkInfo[1]) { // need to load new chunk - extraChkInfo = getCorrectChunk(_prior_weight_psi, extraChkInfo[0]+1, - extraRelRow+augzwChunks[0].start(), augzwChunks, _weightWCol, extraChkInfo); - extraRelRow = extraChkInfo[2]; - } - chunks[0].set(rowIndex, augzwChunks[0].atd(extraRelRow) * augzwChunks[1].atd(extraRelRow)); - extraRelRow++; - } - } - } - } - - // generate AugZ*W as a double array - public static class CalculateAugZW extends MRTask { - GLMParameters _parms; - public DataInfo _dinfo; // contains X and Z in response - public int[] _random_columnsID; - public int _augZID; - public int _dataColNumber; - public int _randColNumber; - public int _numColStart; - public int _numRandCol; - public long _numDataRows; - Job _job; - Frame _prior_weight_psi; // contains prior_weight, wpsi, zmi for random effects/columns - public int[] _dinfoWCol; - public int[] _weightWCol; - - public CalculateAugZW(Job job, DataInfo dInfo, GLMParameters params, Frame prior_weight_psi, int randCatLevels, - int dinfoRespColStart, int weightColStart) { // pass it norm mul and norm sup - in the weights already done. norm - _job = job; - _dinfo = dInfo; - _parms = params; - _prior_weight_psi = prior_weight_psi; - _augZID = _dinfo.responseChunkId(2); // 0: response, 1: wdata, 2: zi - _dataColNumber = _dinfo.fullN()+1; // add 1 for intercept at the beginning - _numColStart = _dinfo._cats==0?0:_dinfo._catOffsets[_dinfo._cats]; - _numRandCol = _parms._random_columns.length; - _random_columnsID = new int[_numRandCol]; - System.arraycopy(_parms._random_columns, 0, _random_columnsID, 0, _numRandCol); - _randColNumber = randCatLevels; // total number of random columns expanded - _numDataRows = _dinfo._adaptedFrame.numRows(); // number of data rows - _dinfoWCol = new int[]{dinfoRespColStart, dinfoRespColStart+1}; - _weightWCol = new int[]{weightColStart, weightColStart+1}; - } - - @Override - public void map(Chunk[] chunks) { // chunks from AugZ - long chkStartIdx = chunks[0].start();// first row number of chunks - int chunkLen = chunks[0]._len; - Chunk[] augzwChunks = new Chunk[2]; // loaded from _dinfo or _prior_weight_psi - int[] extraChkInfo = new int[3]; - if (chkStartIdx < _numDataRows) { // grab wdata and zi from _dinfo._adaptedFrame. 
- extraChkInfo = getCorrectChunk(_dinfo._adaptedFrame, 0, chkStartIdx, augzwChunks, - _dinfoWCol, extraChkInfo); - } else { // grab weight from _prior_weight_psi - extraChkInfo = getCorrectChunk(_prior_weight_psi, 0, chkStartIdx-_numDataRows, augzwChunks, - _weightWCol, extraChkInfo); - } - int extraRelRow = extraChkInfo[2]; - for (int rowIndex=0; rowIndex < chunkLen; rowIndex++) { - if (extraRelRow >= extraChkInfo[1]) { // need to load new chunk - long chkAbsRowNumber = rowIndex+chkStartIdx; - if (chkAbsRowNumber < _numDataRows) { // need to load from dinfo - extraChkInfo = getCorrectChunk(_dinfo._adaptedFrame, extraChkInfo[0], chkAbsRowNumber, augzwChunks, - _dinfoWCol, extraChkInfo); - } else { // need to load from w_prior_psi - extraChkInfo = getCorrectChunk(_prior_weight_psi, extraChkInfo[0], chkAbsRowNumber-_numDataRows, - augzwChunks, _weightWCol, extraChkInfo); - } - extraRelRow = extraChkInfo[2]; - } - chunks[0].set(rowIndex, augzwChunks[0].atd(extraRelRow)*augzwChunks[1].atd(extraRelRow)); - extraRelRow++; - } - } - } - - public static class HelpercAIC extends MRTask { - final double TWOPI = 2*Math.PI; // constant to be used for calculation - final double _logOneO2pisd = -Math.log(Math.sqrt(TWOPI)); - public double _p; // stores sum(hv) - public double _devOphi; // store sum(dev/glm.phi - public double _constT; // store *sum(log(2*pi*glm.phi); - boolean _weightPresent; // indicate if we have prior-weight - final double _varFix; - - public HelpercAIC(boolean weightP, double varFix) { - _weightPresent = weightP; - _varFix = varFix; - } - - @Override - public void map(Chunk[] chunks) { - _p = 0; - _devOphi = 0; - _constT = 0; - int chunkLen = chunks[0].len(); - - for (int rowIndex=0; rowIndex < chunkLen; rowIndex++) { - double weight = _weightPresent?chunks[2].atd(rowIndex):1; - double glm_phi = _varFix/weight; - _constT += Math.log(TWOPI*glm_phi); - _p += chunks[0].atd(rowIndex); - _devOphi += chunks[1].atd(rowIndex)/glm_phi; - } - } - - @Override public void reduce(HelpercAIC other) { - _p += other._p; - _constT += other._constT; - _devOphi += other._devOphi; - } - } - - /*** - * This class given the weights generated and stored in _dinfo and wpsi, will multiply the weights to AugXZ and - * store them in AugXZ. - */ - public static class CalculateW4Data extends MRTask { - GLMParameters _parms; - public DataInfo _dinfo; - public int _prior_weightID; // column ID of prior-weights for data rows - public int _wdataID; // column ID to store weight info for data rows - public int _offsetID; // column ID for offets - public int[] _random_columnsID; // column ID where random column values are stored - public int[] _randCatLevels; // categorical levels for random columns - public int _augZID; // column ID where zi is stored - public int _etaOldID; // column ID where old eta.i value is stored - public int _dataColNumber; // fixed column number - public int _numColStart; // numerical fixed column index start - public double[] _beta; // store fixed coefficients - public double[] _ubeta; // store random coefficients - public double[] _psi; - public double[] _phi; - public double _tau; - public int _numRandCol; // number of random effects/columns - Job _job; - public double _sumEtaDiffSq; // store sum of (eta.i-eta.old)^2 - public double _sumEtaSq; // store sum(eta.i^2) - public double _HL_correction; // correction to Hierarchy likelihood, determined by distribution used. 
- - public CalculateW4Data(Job job, DataInfo dInfo, GLMParameters params, int[] randCatLevels, - double[] beta, double[] ubeta, double[] psi, double[] phi, double tau, double hlCorrection) { // pass it norm mul and norm sup - in the weights already done. norm - _job = job; - _dinfo = dInfo; - _parms = params; - _prior_weightID = _dinfo._weights?_dinfo.weightChunkId():-1; - _augZID = _dinfo.responseChunkId(2); // 0: response, 1: wdata, 2: zi, 3: etaOld - _wdataID = _dinfo.responseChunkId(1); - _etaOldID = _dinfo.responseChunkId(3); - _offsetID = _dinfo._offset?_dinfo.offsetChunkId():-1; - _dataColNumber = _dinfo.fullN()+1; // add 1 for intercept at the beginning - _numColStart = _dinfo.numCats()==0?0:_dinfo._catOffsets[_dinfo._cats]; - _numRandCol = _parms._random_columns.length; - _random_columnsID = _parms._random_columns; - _randCatLevels = randCatLevels; - _beta = beta; - _ubeta = ubeta; - _psi = psi; - _phi = phi; - _tau = tau; - _HL_correction=hlCorrection; - _sumEtaDiffSq=0; - _sumEtaSq=0; - } - - @Override - public void map(Chunk[] chunks) { // chunks from _dinfo._adaptedFrame - GLMWeightsFun glmfun = new GLMWeightsFun(_parms._family, _parms._link, _parms._tweedie_variance_power, - _parms._tweedie_link_power, 0,_parms._init_dispersion_parameter, false); - Row row = _dinfo.newDenseRow(); // one row of fixed effects/columns - double eta, mu, temp, zi, wdata; - for (int i = 0; i < chunks[0]._len; ++i) { // going over all the rows in the chunk of _dinfo._adaptedFrame - _dinfo.extractDenseRow(chunks, i, row); - if (!row.isBad() && row.weight != 0) { - eta = row.innerProduct(_beta) + row.offset; - for (int index=0; index < _numRandCol; index++) { - eta += _ubeta[(int)row.response(4+index)]; - } - if (Double.isNaN(eta)) - throw H2O.fail("GLM.MME diverged! Try different starting values."); - double etaDiff = eta - row.response(3); - chunks[_etaOldID].set(i, eta); // save current eta as etaOld for next round - _sumEtaDiffSq += etaDiff * etaDiff; - _sumEtaSq += eta * eta; - mu = glmfun.linkInv(eta); - temp = glmfun.linkInvDeriv(mu); - zi = eta - row.offset + (row.response(0) - mu) / temp - _HL_correction; - chunks[_augZID].set(i, zi); - wdata = row.weight * temp * temp / (glmfun.variance(mu) * _tau); - chunks[_wdataID].set(i, Math.sqrt(wdata)); // set the new weight back to _dinfo. - } - } - } - - @Override - public void reduce(CalculateW4Data other){ - this._sumEtaDiffSq += other._sumEtaDiffSq; - this._sumEtaSq += other._sumEtaSq; - } - - /*** - * This method will calculate wdata and store it in dinfo response columns. In addition, it will calculate - * sum(eta.i-eta.o)^2, sum(eta.i^2). It will return sqrt(wdata). We use the same method from R to calculate - * wdata. - * - * @param glmfun - * @param beta - * @param ubeta - * @param tau - * @param row - * @param chunks: chunks from dinfo - * @param rowIndex - * @return - */ -/* public double getWeights(GLMWeightsFun glmfun, double[] beta, double[] ubeta, double tau, Row row, Chunk[] chunks, - int rowIndex) { - double eta = row.innerProduct(beta) + row.offset; - for (int index=0; index < _numRandCol; index++) { - eta += ubeta[(int)row.response(4+index)]; - } - if (Double.isNaN(eta)) - throw H2O.fail("GLM.MME diverged! 
Try different starting values."); - double etaDiff = eta - row.response(3); - chunks[_etaOldID].set(rowIndex, eta); // save current eta as etaOld for next round - _sumEtaDiffSq += etaDiff * etaDiff; - _sumEtaSq += eta * eta; - double mu = glmfun.linkInv(eta); - double temp = glmfun.linkInvDeriv(mu); - double zi = eta - row.offset + (row.response(0) - mu) / temp - _HL_correction; - chunks[_augZID].set(rowIndex, zi); - double wdata = row.weight * temp * temp / (glmfun.variance(mu) * tau); - return Math.sqrt(wdata); - }*/ - } - - /*** - * This class will update the frame AugXZ which contains Ta*sqrt(W inverse) from documentation: - * - * - multiply the generated weight value to Ta and store in AugXZ; - */ - public static class DataAddW2AugXZ extends MRTask { - public DataInfo _dinfo; - public int _wdataID; // column ID to store weight info for data rows - public int[] _randCatLevels; // categorical levels for random columns - public int _dataColNumber; // fixed column number - public int _numColStart; // numerical fixed column index start - public int _numRandCol; // number of random effects/columns - Job _job; - public long _dataRows; - - public DataAddW2AugXZ(Job job, DataInfo dInfo, int[] randCatLevels) { // pass it norm mul and norm sup - in the weights already done. norm - _job = job; - _dinfo = dInfo; - _wdataID = 1; - _dataColNumber = _dinfo.fullN()+1; // add 1 for intercept at the beginning - _numColStart = _dinfo._cats==0?0:_dinfo._catOffsets[_dinfo._cats]; - _numRandCol = randCatLevels.length; - _randCatLevels = randCatLevels; - _dataRows = _dinfo._adaptedFrame.numRows(); - } - - @Override - public void map(Chunk[] chunks) { // chunks from augXZ but only takes care of data part - long chkStartIdx = chunks[0].start(); // first row number of chunks of augXZ - if (chkStartIdx < _dataRows) { // only process data of augXZ upto dataRow - int numColAugXZ = chunks.length; - Chunk[] dinfoChunks = new Chunk[_dinfo._adaptedFrame.numCols()]; - double[] processedRow = new double[chunks.length]; // store one row of AugXZ: wdata*(intercept, x, z (expanded random columns)) - int[] dinfoChunkInfo = getCorrectChunk(_dinfo._adaptedFrame, 0, chkStartIdx, dinfoChunks, null, null); - int dinfoChunkRelRow = dinfoChunkInfo[2]; - int chunkLen = chkStartIdx + chunks[0]._len >= _dataRows ? 
(int) (_dataRows - chkStartIdx) : chunks[0]._len; - Row row = _dinfo.newDenseRow(); // one row of fixed effects/columns - double wdata; - for (int index = 0; index < chunkLen; index++) { - if (dinfoChunkRelRow >= dinfoChunkInfo[1]) { - dinfoChunkInfo = getCorrectChunk(_dinfo._adaptedFrame, dinfoChunkInfo[0]+1, - dinfoChunkRelRow+dinfoChunks[0].start(), dinfoChunks, null, null); - dinfoChunkRelRow = dinfoChunkInfo[2]; - } - _dinfo.extractDenseRow(dinfoChunks, dinfoChunkRelRow, row); - Arrays.fill(processedRow, 0.0); - wdata = row.response[_wdataID]; - row.scalarProduct(wdata, processedRow, _numColStart); // generate wdata*X - int offset = _dataColNumber; - for (int randColIndex = 0; randColIndex < _numRandCol; randColIndex++) { // generate x*Z - int processRowIdx = offset + (int) row.response[4 + randColIndex]; // 0: response, 1: weight, 2: zi, 3: etai, 4 or more: z - processedRow[processRowIdx] = wdata; // save wdata as - offset += _randCatLevels[randColIndex]; // write to next random column value - } - for (int colIndex = 0; colIndex < numColAugXZ; colIndex++) { // assign the rows to the AugXZ - chunks[colIndex].set(index, processedRow[colIndex]); // set w*X for intercept, data, random columns - } - dinfoChunkRelRow++; - - } - } - } - - /*** - * Given the chkIdx, this method will fetch the chunks with columns specified in vecIdx - * @param augXZ: Frame frome which chunks are fetched - * @param chkIdx: chunk index to fetch - * @param chks: store fetched chunks - * @param vecIdx: null, fetch all columns, else, contains columns of interest to fetch - */ - public static void getAllChunks(Frame augXZ, int chkIdx, Chunk[] chks, int[] vecIdx) { - if (vecIdx==null) { // copy all vectors of the chunk - int chkLen = chks.length; - for (int chkIndex =1 ; chkIndex < chkLen; chkIndex++) - chks[chkIndex] = augXZ.vec(chkIndex).chunkForChunkIdx(chkIdx); - } else { - int veclen = vecIdx.length; - for (int index=1; index < veclen; index++) - chks[index] = augXZ.vec(vecIdx[index]).chunkForChunkIdx(chkIdx); - } - } - - /*** - * Given the absolute row index of interest, this method will find the chunk index of augXZ that contains the - * absolute row index - * - * @param augXZ: Frame where chunks will be fetched - * @param chkIdx: chunk index to check if it contains the absolute row index of interest - * @param currentRowAbs: absolute row index of interest - * @param chks: chunks to stored fetched chunk - * @param vecIdx: column indices to fetch. If null, fetch all columns - * @return - */ - public static int getOneSingleChunk(Frame augXZ, int chkIdx, long currentRowAbs, Chunk[] chks, int[] vecIdx) { - chkIdx = chkIdx>=augXZ.vec(0).nChunks()?0:chkIdx; - if (vecIdx==null) { // copy all vectors of the chunk - // fetch one vector and check if it contains the correct rows - chks[0] = augXZ.vec(0).chunkForChunkIdx(chkIdx); - } else { - chks[0] = augXZ.vec(vecIdx[0]).chunkForChunkIdx(chkIdx); - } - // find correct row offset into chunk. - long strow = chks[0].start(); - long endrow = chks[0].len()+strow; - if ((currentRowAbs >= strow) && (currentRowAbs< endrow)) - return -1; - else if (currentRowAbs < strow) - return (chkIdx-1); - else - return (chkIdx+1); - } - - /** - * This method, given the absolute row index of interest, will grab the correct chunk from augXZ containing the - * same absolute row index of interest. The chunks of augXZ will be stored in chks. 
In addition, an integer - * array will be returned that contain the following information about the fetched chunk of augXZ: - * - index 0: chunk index; - * - index 1: number of rows of fetched chunk; - * - index 2: relative starting index of fetched chunk that will correspond to the absolute row index of interest - * passed to this method. - * - * @param augXZ: Frame from which chunks will be grabbed - * @param chkIdx: starting chunk index to looking at - * @param currentRowAbs: absolute row index of first row of interest - * @param chks: stored fetched chunk - * @param vecIdx: null if all columns should be fetched. Else, contains the columns to be fetched - * @param returnInfo: information about fetched chunk - * @return - */ - public static int[] getCorrectChunk(Frame augXZ, int chkIdx, long currentRowAbs, Chunk[] chks, int[] vecIdx, - int[] returnInfo) { - assert currentRowAbs < augXZ.numRows(); - int currentIdx = chkIdx >= augXZ.vec(0).nChunks()?0:chkIdx; - while (currentIdx >= 0) { // currentIdx will be -1 if found the correct chunk - currentIdx = getOneSingleChunk(augXZ, currentIdx, currentRowAbs, chks, vecIdx); // find chunk that contains currentRowAbs - } - getAllChunks(augXZ, chks[0].cidx(),chks, vecIdx); // fetched the chunks of augXZ to chks - if (returnInfo == null) { - returnInfo = new int[3]; - } - returnInfo[0] = chks[0].cidx(); // chunk index of fetched chunks - returnInfo[1] = chks[0].len(); // number of rows in fetched chunks - returnInfo[2] = (int) (currentRowAbs-chks[0].start()); // relative row start of first row of fetched chunk - return returnInfo; - } - } - public static class GLMCoordinateDescentTaskSeqNaive extends MRTask { public double [] _normMulold; public double [] _normSubold; diff --git a/h2o-algos/src/main/java/hex/hglm/ComputationStateHGLM.java b/h2o-algos/src/main/java/hex/hglm/ComputationStateHGLM.java new file mode 100644 index 000000000000..e86727130cb1 --- /dev/null +++ b/h2o-algos/src/main/java/hex/hglm/ComputationStateHGLM.java @@ -0,0 +1,150 @@ +package hex.hglm; + +import Jama.Matrix; +import hex.DataInfo; +import water.Job; +import water.util.ArrayUtils; +import water.util.Log; + +import java.util.Random; +import static hex.hglm.HGLMUtils.*; +import static water.util.ArrayUtils.copy2DArray; +import static water.util.ArrayUtils.gaussianVector; + +public class ComputationStateHGLM { + /*** + * the doc = document attached to https://github.com/h2oai/h2o-3/issues/8487, title HGLM_H2O_Implementation.pdf + * I will be referring to the doc and different parts of it to explain my implementation. 
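+ *
+ * In outline (based on the fields and initialization routine that follow), this class holds the mutable state of
+ * the EM iterations: the current fixed coefficients (beta), the per-level-2-unit random coefficients (ubeta), the
+ * T matrix and the residual variance estimate, together with the coefficient names, level-2 unit names and counts
+ * used for reporting. Initial values are taken from the user-supplied parameters when given and generated randomly
+ * otherwise (see initComputationStateHGLM below).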
+ */ + final int _numFixedCoeffs; // fixed coefficient length including inactive predictors + final int _numRandomCoeffs; // random coefficient length including inactive predictors + public final HGLMModel.HGLMParameters _parms; + int _iter; + private double[] _beta; // fixed, if standardized, normalized coefficients, else, non-normalized coefficients + private double[][] _ubeta; // random , if standardized, normalized coefficients, else non-normalized coefficients + private double[][] _T; // positive definite matrix, size random coefficient length by random coefficient length + final DataInfo _dinfo; + private final Job _job; + double _tauEVarE10 = 0; // variance estimate of random noise calculated from equation 10 of the doc + double _tauEVarE17 = 0; // variance estimate of random noise calculated from equation 17 of the doc + String[] _fixedCofficientNames; // include intercept if enabled + String[] _randomCoefficientNames; // include intercept only if random effect is in intercept + String[] _level2UnitNames; // enum levels of group column + final int _numLevel2Unit; + final int _level2UnitIndex; + final int _nobs; + + public ComputationStateHGLM(Job job, HGLMModel.HGLMParameters parms, DataInfo dinfo, HGLMTask.ComputationEngineTask engTask, int iter) { + _job = job; + _parms = parms; + _dinfo = dinfo; + _iter = iter; + _fixedCofficientNames = engTask._fixedCoeffNames; + _level2UnitNames = engTask._level2UnitNames; + _randomCoefficientNames = engTask._randomCoeffNames; + _level2UnitIndex = engTask._level2UnitIndex; + initComputationStateHGLM(engTask); + _numFixedCoeffs = _beta.length; + _numRandomCoeffs = _ubeta[0].length; + _numLevel2Unit = _ubeta.length; + _nobs = engTask._nobs; + } + + /** + * set initial values for: + * 1. initial fixed coefficients from user or assigned by us; + * 2. initial random coefficients from user or randomly assigned; + * 3. sigma square; + * 4. 
T matrix value + */ + void initComputationStateHGLM(HGLMTask.ComputationEngineTask engineTask) { + int numRandomCoeff = _randomCoefficientNames.length; + int numFixCoeff = _fixedCofficientNames.length; + // need to initialize the coefficients, fixed and random + if (_parms._seed == -1) // set the seed if not set by user + _parms._seed = new Random().nextLong(); + Log.info("Random seed: "+_parms._seed); + + Random random = new Random(_parms._seed); + if (_parms._tau_e_var_init > 0.0) + _tauEVarE10 = _parms._tau_e_var_init; + else + _tauEVarE10 = Math.abs(random.nextGaussian()); + + _T = new double[numRandomCoeff][numRandomCoeff]; + if (_parms._initial_t_matrix != null) { + grabInitValuesFromFrame(_parms._initial_t_matrix, _T); + double[][] transposeT = ArrayUtils.transpose(_T); + if (!equal2DArrays(_T, transposeT, 1e-6)) + throw new IllegalArgumentException("initial_t_matrix must be symmetric but is not!"); + // make sure matrix is semi positive definite + Matrix tMat = new Matrix(_T); + if ((_parms._max_iterations > 0) && !tMat.chol().isSPD()) // only check this when we actually build the model + throw new IllegalArgumentException("initial_t_matrix must be positive semi definite but is not!"); + } else { + if (_parms._tau_u_var_init > 0.0) { + _tauEVarE10 = _parms._tau_u_var_init; + } else { + _tauEVarE10 = Math.abs(random.nextGaussian()); + } + setDiagValues(_T, _tauEVarE10); + } + + _ubeta = new double[engineTask._numLevel2Units][engineTask._numRandomCoeffs]; + if ( null != _parms._initial_random_effects) { // read in initial random values + grabInitValuesFromFrame(_parms._initial_random_effects, _ubeta); + } else { // randomly generating random initial values + gaussianVector(random, _ubeta, _level2UnitNames.length, numRandomCoeff); + ArrayUtils.mult(_ubeta, Math.sqrt(_T[0][0])); + } + // copy over initial fixed coefficient values + if (null != _parms._initial_fixed_effects) { + if (_parms._initial_fixed_effects.length != numFixCoeff) + throw new IllegalArgumentException("initial_fixed_effects must be an double[] array of size "+numFixCoeff); + + _beta = _parms._initial_fixed_effects; + } else { + _beta = new double[numFixCoeff]; + _beta[_beta.length-1] = _parms.train().vec(_parms._response_column).mean(); + } + } + + public double[] getBeta() { return _beta; } + public double[][] getUbeta() { return _ubeta; } + public double getTauUVar() { return _tauEVarE10; } + public double getTauEVarE10() { return _tauEVarE10; } + public String[] getFixedCofficientNames() { return _fixedCofficientNames; } + public String[] getRandomCoefficientNames() { return _randomCoefficientNames; } + public String[] getGroupColumnNames() { return _level2UnitNames; } + public double[][] getT() { return _T; } + public int getNumFixedCoeffs() { return _numFixedCoeffs; } + public int getNumRandomCoeffs() { return _numRandomCoeffs; } + public int getNumLevel2Units() { return _numLevel2Unit; } + public int getLevel2UnitIndex() { return _level2UnitIndex; } + public void setBeta(double[] beta) { + System.arraycopy(beta, 0, _beta, 0, beta.length); + } + public void setUbeta(double[][] ubeta) { + copy2DArray(ubeta, _ubeta); + } + public void setT(double[][] tmat) { + copy2DArray(tmat, _T); + } + public void setTauEVarE10(double tEVar) { + _tauEVarE10 = tEVar; + } + + public static class ComputationStateSimple { + final public double[] _beta; + final public double[][] _ubeta; + final public double[][] _tmat; + final public double _tauEVar; + + public ComputationStateSimple(double[] beta, double[][] ubeta, double[][] tmat, 
double tauEVar) { + _beta = beta; + _ubeta = ubeta; + _tmat = tmat; + _tauEVar = tauEVar; + } + } +} diff --git a/h2o-algos/src/main/java/hex/hglm/HGLM.java b/h2o-algos/src/main/java/hex/hglm/HGLM.java new file mode 100644 index 000000000000..19237e128283 --- /dev/null +++ b/h2o-algos/src/main/java/hex/hglm/HGLM.java @@ -0,0 +1,364 @@ +package hex.hglm; + +import hex.*; +import org.joda.time.format.DateTimeFormat; +import org.joda.time.format.DateTimeFormatter; +import water.H2O; +import water.Job; +import water.Key; +import water.exceptions.H2OModelBuilderIllegalArgumentException; +import water.fvec.Frame; +import water.udf.CFuncRef; +import water.util.Log; +import water.util.TwoDimTable; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; + +import static hex.glm.GLMModel.GLMParameters.Family.gaussian; +import static hex.glm.GLMModel.GLMParameters.MissingValuesHandling.*; +import static hex.hglm.HGLMModel.HGLMParameters.Method.EM; +import static hex.hglm.HGLMUtils.*; +import static hex.hglm.MetricBuilderHGLM.calHGLMLlg; +import static water.util.ArrayUtils.*; + +public class HGLM extends ModelBuilder { + /*** + * the doc = document attached to https://github.com/h2oai/h2o-3/issues/8487, title HGLM_H2O_Implementation.pdf + * I will be referring to the doc and different parts of it to explain my implementation. + */ + long _startTime; // model building start time; + private transient ComputationStateHGLM _state; + private static final DateTimeFormatter DATE_TIME_FORMATTER = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss"); + + @Override + public ModelCategory[] can_build() { + return new ModelCategory[]{ModelCategory.Regression}; + } + + @Override + public boolean isSupervised() { + return true; + } + + @Override + public BuilderVisibility builderVisibility() { + return BuilderVisibility.Experimental; + } + + @Override + public boolean havePojo() { + return false; + } + + @Override + public boolean haveMojo() { + return false; + } + + public HGLM(boolean startup_once) { + super(new HGLMModel.HGLMParameters(), startup_once); + } + + protected HGLM(HGLMModel.HGLMParameters parms) { + super(parms); + init(false); + } + + public HGLM(HGLMModel.HGLMParameters parms, Key key) { + super(parms, key); + init(false); + } + + @Override + protected ModelBuilder.Driver trainModelImpl() { + return new HGLMDriver(); + } + + static class ScoringHistory { + private ArrayList _scoringIters = new ArrayList<>(); + private ArrayList _scoringTimes = new ArrayList<>(); + private ArrayList _logLikelihood = new ArrayList<>(); + private ArrayList _tauEVar = new ArrayList<>(); + + public ArrayList getScoringIters() { return _scoringIters;} + + public void addIterationScore(int iter, double loglikelihood, double tauEVar) { + _scoringIters.add(iter); + _scoringTimes.add(System.currentTimeMillis()); + _logLikelihood.add(loglikelihood); + _tauEVar.add(tauEVar); + } + + public TwoDimTable to2dTable() { + String[] cnames = new String[]{"timestamp", "number_of_iterations", "loglikelihood", "noise_variance"}; + String[] ctypes = new String[]{"string", "int", "double", "double"}; + String[] cformats = new String[]{"%s", "%d", "%.5f", "%.5f"}; + int tableSize = _scoringIters.size(); + TwoDimTable res = new TwoDimTable("Scoring History", "", + new String[tableSize], cnames, ctypes, cformats, ""); + int col = 0; + for (int i=0; i 0 || _parms._fold_column != null) + error("nfolds or _fold_coumn", " cross validation is not supported in HGLM right now."); + + if 
(null != _parms._family && !gaussian.equals(_parms._family))
+      error("family", " only the Gaussian family is supported now");
+
+    if (null != _parms._method && !EM.equals(_parms._method))
+      error("method", " only EM (expectation maximization) is supported for now.");
+
+    if (null != _parms._missing_values_handling &&
+            PlugValues == _parms._missing_values_handling && _parms._plug_values == null)
+      error("PlugValues", " if specified, must provide a frame with plug values in plug_values.");
+
+    if (_parms._tau_u_var_init < 0)
+      error("tau_u_var_init", "if set, must be > 0.0.");
+
+    if (_parms._tau_e_var_init < 0)
+      error("tau_e_var_init", "if set, must be > 0.0.");
+
+    if (_parms._seed == 0)
+      error("seed", "cannot be set to zero.");
+
+    if (_parms._em_epsilon < 0)
+      error("em_epsilon", "if specified, must be >= 0.0.");
+
+    if (_parms._score_iteration_interval <= 0)
+      error("score_iteration_interval", "if specified, must be >= 1.");
+
+    super.init(expensive);
+    if (error_count() > 0)
+      throw H2OModelBuilderIllegalArgumentException.makeFromBuilder(HGLM.this);
+    if (expensive) {
+      if (_parms._max_iterations == 0) {
+        warn("max_iterations", "for HGLM, must be >= 1 (or -1 for unlimited or default setting) " +
+                "to obtain a proper model. Setting it to 0 will only return the correct coefficient names and an empty" + " model.");
+        warn("_max_iterations", H2O.technote(2, "for HGLM, if specified, must be >= 1 or == -1."));
+      }
+
+      if (_parms._max_iterations == -1)
+        _parms._max_iterations = 1000;
+
+      Frame trainFrame = train();
+      List<String> columnNames = Arrays.stream(trainFrame.names()).collect(Collectors.toList());
+      if (_parms._group_column == null) {
+        error("group_column", " column used to generate level 2 units is missing");
+      } else {
+        if (!columnNames.contains(_parms._group_column))
+          error("group_column", " is not found in the training frame.");
+        else if (!trainFrame.vec(_parms._group_column).isCategorical())
+          error("group_column", " should be a categorical column.");
+      }
+
+      if (_parms._random_columns == null && !_parms._random_intercept) {
+        error("random_columns", " should not be null if random_intercept is false. You must " +
+                "specify predictors in random_columns or set random_intercept to true.");
+      }
+      if (_parms._random_columns != null) {
+        boolean goodRandomColumns = (Arrays.stream(_parms._random_columns).filter(x -> columnNames.contains(x)).count()
+                == _parms._random_columns.length);
+        if (!goodRandomColumns)
+          error("random_columns", " can only contain columns in the training frame.");
+      }
+
+      if (_parms._gen_syn_data) {
+        _parms._max_iterations = 0;
+        if (_parms._tau_e_var_init <= 0)
+          error("tau_e_var_init", "If gen_syn_data is true, tau_e_var_init must be > 0.");
+      }
+    }
+  }
+
+  private class HGLMDriver extends Driver {
+    DataInfo _dinfo = null;
+
+    @Override
+    public void computeImpl() {
+      _startTime = System.currentTimeMillis();
+      init(true);
+      if (error_count() > 0)
+        throw H2OModelBuilderIllegalArgumentException.makeFromBuilder(HGLM.this);
+
+      _job.update(0, "Initializing HGLM model training");
+      HGLMModel model = null;
+      ScoringHistory scTrain = new ScoringHistory();
+      ScoringHistory scValid = _parms._valid == null ? null : new ScoringHistory();
+      try {
+        /***
+         * Need to do the following things:
+         * 1. Generate all the various coefficient names;
+         * 2. Initialize the coefficient values (fixed and random);
+         * 3. Set modelOutput fields.
+         */
+        // _dinfo._adaptedFrame will contain group_column. Check and make sure clients will pass that along as well.
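+        // A sketch of what the DataInfo built below encodes (based on the arguments passed): predictors and
+        // response are left untransformed (TransformType.NONE), rows are skipped or imputed according to
+        // missing_values_handling, and weight/offset/fold columns are carried along when present.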
+ _dinfo = new DataInfo(_train.clone(), null, 1, _parms._use_all_factor_levels, + DataInfo.TransformType.NONE, DataInfo.TransformType.NONE, + _parms.missingValuesHandling() == Skip, + _parms.missingValuesHandling() == MeanImputation + || _parms.missingValuesHandling() == PlugValues, + _parms.makeImputer(), false, hasWeightCol(), hasOffsetCol(), hasFoldCol(), null); + + model = new HGLMModel(dest(), _parms, new HGLMModel.HGLMModelOutput(HGLM.this, _dinfo)); + model.write_lock(_job); + _job.update(1, "Starting to build HGLM model..."); + if (EM == _parms._method) + fitEM(model, _job, scTrain, scValid); + model._output.setModelOutputFields(_state); // must be called before calling scoring + scoreAndUpdateModel(model, true, scTrain); + model._output._model_summary = generateSummary(model._output); + model._output._start_time = _startTime; + model._output._training_time_ms = System.currentTimeMillis() - _startTime; + model._output._scoring_history = scTrain.to2dTable(); + if (valid() != null) { + scoreAndUpdateModel(model, false, scValid); + if (scValid._scoringIters.size() > 0) + model._output._scoring_history_valid = scValid.to2dTable(); + } + } finally { + model.update(_job); + model.unlock(_job); + } + } + + private TwoDimTable generateSummary(HGLMModel.HGLMModelOutput modelOutput) { + String[] names = new String[]{"number_of_iterations", "loglikelihood", "noise_variance"}; + String[] types = new String[]{"int", "double", "double"}; + String[] formats = new String[]{"%d", "%.5f", "%.5f"}; + TwoDimTable summary = new TwoDimTable("HGLM Model", "summary", new String[]{""}, names, types, formats, ""); + summary.set(0, 0, modelOutput._iterations); + summary.set(0, 1, modelOutput._log_likelihood); + summary.set(0, 2, modelOutput._tau_e_var); + return summary; + } + + private long timeSinceLastScoring(long startTime) { return System.currentTimeMillis() - startTime; } + + private void scoreAndUpdateModel(HGLMModel model, boolean forTraining, ScoringHistory sc) { + Log.info("Scoring after " + timeSinceLastScoring(_startTime) + "ms at iteration "+model._output._iterations); + long tcurrent = System.currentTimeMillis(); + if (forTraining) { + model.score(_parms.train(), null, CFuncRef.from(_parms._custom_metric_func)).delete(); + ModelMetricsRegressionHGLM mtrain = (ModelMetricsRegressionHGLM) ModelMetrics.getFromDKV(model, _parms.train()); + model._output._training_metrics = mtrain; + model._output._training_time_ms = tcurrent - model._output._start_time; + if (null != mtrain) { + model._output._log_likelihood = mtrain._log_likelihood; + model._output._icc = mtrain._icc.clone(); + sc.addIterationScore(_state._iter, model._output._log_likelihood, mtrain._var_residual); + } + } else { + Log.info("Scoring on validation dataset."); + model.score(_parms.valid(), null, CFuncRef.from(_parms._custom_metric_func)).delete(); + ModelMetricsRegressionHGLM mvalid = (ModelMetricsRegressionHGLM) ModelMetrics.getFromDKV(model, _parms.valid()); + if (null != mvalid) { + model._output._validation_metrics = mvalid; + model._output._log_likelihood_valid = ((ModelMetricsRegressionHGLM) model._output._validation_metrics).llg(); + sc.addIterationScore(_state._iter, model._output._log_likelihood_valid, model._output._tau_e_var); + } + } + } + + /** + * Build HGLM model using EM (Expectation Maximization) described in section II of the doc. 
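+     * In outline, each pass of the loop below performs:
+     *   E step : form T inverse and, for every level 2 unit j, the inverse of Cj from ArjTArj, the current
+     *            residual variance and T inverse, then solve for the new random coefficients ubeta;
+     *   M step : update the fixed coefficients beta from the pre-computed cross products and the new ubeta,
+     *            re-estimate the T matrix, and refresh the residual variance from equation 10 (ResidualLLHTask);
+     *   check  : progress(...) compares the relative changes of beta, ubeta, T and the residual variance with
+     *            em_epsilon and stops when all are small enough, max_iterations is reached, or the job is cancelled.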
+   */
+  void fitEM(HGLMModel model, Job job, ScoringHistory scTrain, ScoringHistory scValid) {
+    int iteration = 0;
+    // form the fixed arrays and matrices whose values do not change over the iterations
+    HGLMTask.ComputationEngineTask engineTask = new HGLMTask.ComputationEngineTask(job, _parms, _dinfo);
+    engineTask.doAll(_dinfo._adaptedFrame);
+    model._output.setModelOutput(engineTask);
+    if (_parms._showFixedMatVecs)
+      model._output.setModelOutputFixMatVec(engineTask);
+    _state = new ComputationStateHGLM(_job, _parms, _dinfo, engineTask, iteration);
+    try {
+      if (_parms._max_iterations > 0) {
+        // grab the current values of the fixed beta, tauEVar and the T matrix
+        double[] beta = _state.getBeta().clone();
+        double[][] ubeta;
+        double tauEVarE10 = _state.getTauEVarE10();
+        double[][] tMat = copy2DArray(_state.getT());
+        double[][][] cjInv;
+        double[][] tMatInv;
+
+        while (true) {
+          iteration++;
+          // E step: estimate the random effect coefficients ubeta; needs Cj inverse for each level 2 unit
+          tMatInv = generateTInverse(tMat);
+          cjInv = generateCJInverse(engineTask._ArjTArj, tauEVarE10, tMatInv); // for each level 2 unit
+          ubeta = estimateNewRandomEffects(cjInv, engineTask._ArjTYj, engineTask._ArjTAfj, beta); // new random coefficients
+          // M step
+          beta = estimateFixedCoeff(engineTask._AfTAftInv, engineTask._AfjTYjSum, engineTask._AfjTArj, ubeta); // new fixed coefficients
+          tMat = estimateNewtMat(ubeta, tauEVarE10, cjInv, engineTask._oneOverJ); // new estimate of the T matrix
+          HGLMTask.ResidualLLHTask rLlhE10 = new HGLMTask.ResidualLLHTask(_job, _parms, _dinfo, ubeta, beta, engineTask);
+          rLlhE10.doAll(_dinfo._adaptedFrame);
+          tauEVarE10 = rLlhE10._residualSquare * engineTask._oneOverN; // new residual variance, from equation 10 of the doc
+          // check to make sure the determinant of V is positive, see section II.V of the doc
+          if (!checkPositiveG(engineTask._numLevel2Units, tMat))
+            Log.info("HGLM model building is stopped because matrix G in section II.V of the doc is no longer PSD.");
+          // check if the stopping conditions are satisfied
+          if (!progress(beta, ubeta, tMat, tauEVarE10, scTrain, scValid, model, rLlhE10))
+            return;
+        }
+      }
+    } catch(Exception ex) { // catches singular matrices encountered during the log-likelihood calculation
+      if (iteration > 1) // some coefficients are valid, just return
+        return;
+      else
+        throw new RuntimeException(ex); // bad matrix from the start, no model is built.
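To summarize the loop above in the notation used throughout this patch: for level 2 unit j, A_fj and A_rj are the fixed- and random-effect design matrices, y_j the responses, beta the fixed coefficients, u_j the random coefficients, T their covariance matrix, sigma_e^2 the residual variance (tauEVar), J the number of level 2 units and n the number of observations. The cross products A_fj^T A_fj, A_fj^T y_j, A_rj^T A_rj, A_rj^T y_j and A_fj^T A_rj are accumulated once by ComputationEngineTask, so each EM sweep only revisits the data through ResidualLLHTask. One sweep, as implemented by the helper calls above, is roughly:

E step:
$$C_j^{-1} = \big(A_{rj}^T A_{rj} + \sigma_e^2\, T^{-1}\big)^{-1}, \qquad u_j = C_j^{-1}\big(A_{rj}^T y_j - A_{rj}^T A_{fj}\,\beta\big)$$

M step:
$$\beta = \Big(\sum_j A_{fj}^T A_{fj}\Big)^{-1}\Big(\sum_j A_{fj}^T y_j - \sum_j A_{fj}^T A_{rj}\, u_j\Big), \qquad T \approx \frac{1}{J}\sum_j \big(u_j u_j^T + \sigma_e^2\, C_j^{-1}\big), \qquad \sigma_e^2 = \frac{1}{n}\sum_j \big\lVert y_j - A_{fj}\beta - A_{rj} u_j\big\rVert^2$$

The beta, u_j and sigma_e^2 updates are read directly off estimateNewRandomEffects, estimateFixedCoeff and ResidualLLHTask; the T update is shown in its usual EM form and is only an approximation of what estimateNewtMat computes, since that helper (and the exact expression in the doc) is not reproduced in this section.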
+    }
+  }
+
+  public boolean progress(double[] beta, double[][] ubeta, double[][] tmat, double tauEVarE10, ScoringHistory scTrain,
+                          ScoringHistory scValid, HGLMModel model, HGLMTask.ResidualLLHTask rLlh) {
+    _state._iter++;
+    if (_state._iter >= _parms._max_iterations || stop_requested())
+      return false;
+    double[] betaDiff = new double[beta.length];
+    minus(betaDiff, beta, _state.getBeta());
+    double maxBetaDiff = maxMag(betaDiff) / maxMag(beta);
+    double[][] tmatDiff = new double[tmat.length][tmat[0].length];
+    minus(tmatDiff, tmat, _state.getT());
+    double maxTmatDiff = maxMag(tmatDiff) / maxMag(tmat);
+    double[][] ubetaDiff = new double[ubeta.length][ubeta[0].length];
+    minus(ubetaDiff, ubeta, _state.getUbeta());
+    double maxUBetaDiff = maxMag(ubetaDiff) / maxMag(ubeta);
+    double tauEVarDiff = Math.abs(tauEVarE10 - _state.getTauEVarE10()) / tauEVarE10;
+    boolean converged = ((maxBetaDiff <= _parms._em_epsilon) && (maxTmatDiff <= _parms._em_epsilon) && (maxUBetaDiff
+            <= _parms._em_epsilon) && (tauEVarDiff <= _parms._em_epsilon));
+    if (!converged) { // not converged yet, update values in _state and keep iterating
+      _state.setBeta(beta);
+      _state.setUbeta(ubeta);
+      _state.setT(tmat);
+      _state.setTauEVarE10(tauEVarE10);
+      if (_parms._score_each_iteration || ((_state._iter % _parms._score_iteration_interval) == 0)) { // score every score_iteration_interval iterations
+        model._output.setModelOutputFields(_state);
+        scoreAndUpdateModel(model, true, scTrain); // perform scoring and update the scoring history
+        if (_parms.valid() != null)
+          scoreAndUpdateModel(model, false, scValid);
+      } else {
+        // calculate the log-likelihood with the current parameter settings
+        double logLikelihood = calHGLMLlg(_state._nobs, tmat, tauEVarE10, model._output._arjtarj, rLlh._sse_fixed,
+                rLlh._yMinusXTimesZ);
+        scTrain.addIterationScore(_state._iter, logLikelihood, tauEVarE10);
+      }
+    }
+    return !converged;
+  }
+  }
+}
diff --git a/h2o-algos/src/main/java/hex/hglm/HGLMModel.java b/h2o-algos/src/main/java/hex/hglm/HGLMModel.java
new file mode 100644
index 000000000000..dd89ea0ed32d
--- /dev/null
+++ b/h2o-algos/src/main/java/hex/hglm/HGLMModel.java
@@ -0,0 +1,302 @@
+package hex.hglm;
+
+import hex.DataInfo;
+import hex.Model;
+import hex.ModelCategory;
+import hex.ModelMetrics;
+import hex.deeplearning.DeepLearningModel;
+import hex.glm.GLM;
+import hex.glm.GLMModel;
+import water.*;
+import water.fvec.Frame;
+import water.fvec.Vec;
+import water.udf.CFuncRef;
+import water.util.TwoDimTable;
+
+import java.io.Serializable;
+import java.util.Arrays;
+
+import static hex.glm.GLMModel.GLMParameters.Family.gaussian;
+import static hex.hglm.HGLMModel.HGLMParameters.Method.EM;
+import static hex.hglm.HGLMUtils.*;
+import static water.util.ArrayUtils.copy2DArray;
+
+public class HGLMModel extends Model<HGLMModel, HGLMModel.HGLMParameters, HGLMModel.HGLMModelOutput> {
+  /**
+   * the doc = document attached to https://github.com/h2oai/h2o-3/issues/8487, titled HGLM_H2O_Implementation.pdf
+   * I will be referring to the doc and different parts of it to explain my implementation.
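Two quantities drive the progress() bookkeeping above. First, the stopping rule: with Delta denoting the change of a quantity between consecutive EM sweeps, U the matrix of all u_j, and max| . | the max-magnitude (L-infinity) norm computed by maxMag, the iteration is declared converged when

$$\max\!\left(\frac{\max|\Delta\beta|}{\max|\beta|},\; \frac{\max|\Delta U|}{\max|U|},\; \frac{\max|\Delta T|}{\max|T|},\; \frac{|\Delta\sigma_e^2|}{\sigma_e^2}\right) \le \epsilon_{\mathrm{EM}},$$

where \(\epsilon_{\mathrm{EM}}\) is em_epsilon. Second, on non-scoring iterations the scoring history records the marginal Gaussian log-likelihood computed by calHGLMLlg (defined in MetricBuilderHGLM later in this patch); writing \(r_j = Z_j^T (y_j - X_j\beta)\) for the quantity accumulated as yMinusXTimesZ, that method evaluates

$$\ell = -\tfrac{1}{2}\Big\{ n\log(2\pi) + \tfrac{1}{\sigma_e^2}\lVert y - X\beta\rVert^2 + \sum_j \Big[\log\big(\sigma_e^2\,\lvert T\rvert\,\big\lvert T^{-1} + \tfrac{1}{\sigma_e^2} Z_j^T Z_j\big\rvert\big) - \tfrac{1}{\sigma_e^4}\, r_j^T \big(T^{-1} + \tfrac{1}{\sigma_e^2} Z_j^T Z_j\big)^{-1} r_j \Big]\Big\}.$$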
+ */ + public HGLMModel(Key selfKey, HGLMParameters parms, HGLMModelOutput output) { + super(selfKey, parms, output); + } + + @Override + public ModelMetrics.MetricBuilder makeMetricBuilder(String[] domain) { + return new MetricBuilderHGLM(domain, true, true, _parms._random_intercept, _output); + } + + @Override + public String[] makeScoringNames() { + return new String[]{"predict"}; + } + + @Override + protected double[] score0(double[] data, double[] preds) { + throw new UnsupportedOperationException("HGLMModel.score0 should never be called"); + } + + @Override + protected PredictScoreResult predictScoreImpl(Frame fr, Frame adaptFrm, String destination_key, Job j, + boolean computeMetrics, CFuncRef customMetricFunc) { + String[] predictNames = makeScoringNames(); + String[][] domains = new String[predictNames.length][]; + boolean forTraining = _parms.train().getKey().equals(fr.getKey()); + HGLMScore gs = makeScoringTask(adaptFrm, true, j, computeMetrics && !_parms._gen_syn_data); + gs.doAll(predictNames.length, Vec.T_NUM, gs._dinfo._adaptedFrame); + MetricBuilderHGLM mb = null; + Frame rawFrame = null; + if (gs._computeMetrics) { // only calculate log-likelihood, mse and other metrics if _computeMetrics + mb = gs._mb; + if (forTraining) { + _output._yMinusXTimesZ = gs._yMinusXTimesZ; + _output._yMinusFixPredSquare = mb._yMinusFixPredSquare; + } else { // store for all frames other than the training frame + _output._yMinusXTimesZValid = gs._yMinusXTimesZ; + _output._yMinusFixPredSquareValid = mb._yMinusFixPredSquare; + } + rawFrame = gs.outputFrame(); + } + domains[0] = gs._predDomains; + Frame outputFrame = gs.outputFrame(Key.make(destination_key), predictNames, domains); + return new PredictScoreResult(mb, rawFrame, outputFrame); + } + + private HGLMScore makeScoringTask(Frame adaptFrm, boolean makePredictions, Job j, boolean computeMetrics) { + int responseId = adaptFrm.find(_output.responseName()); + if(responseId > -1 && adaptFrm.vec(responseId).isBad()) { // remove inserted invalid response + adaptFrm = new Frame(adaptFrm.names(),adaptFrm.vecs()); + adaptFrm.remove(responseId); + } + final boolean detectedComputeMetrics = computeMetrics && (adaptFrm.vec(_output.responseName()) != null && !adaptFrm.vec(_output.responseName()).isBad()); + String [] domain = _output.nclasses()<=1 ? null : (!detectedComputeMetrics ? 
_output._domains[_output._domains.length-1] : adaptFrm.lastVec().domain()); + return new HGLMScore(j, this, _output._dinfo.scoringInfo(_output._names, adaptFrm), domain, computeMetrics, makePredictions); + } + + public static class HGLMParameters extends Model.Parameters { + public long _seed = -1; + public GLMModel.GLMParameters.Family _family; + public int _max_iterations = -1; + public double[] _initial_fixed_effects; // initial values of fixed coefficients + public Key _initial_random_effects; // frame key that contains the initial starting values of random coefficient effects + public Key _initial_t_matrix; // frame key taht contains the initial starting values of T matrix + public double _tau_u_var_init = 0; // initial random coefficient effects variance estimate, set by user + public double _tau_e_var_init = 0; // initial random noise variance estimate, set by user + public GLMModel.GLMParameters.Family _random_family = gaussian; + public String[] _random_columns; // store predictors that have random components in the coefficients + public Method _method; + public double _em_epsilon = 1e-3; + public boolean _random_intercept = true; + public String _group_column; + public Serializable _missing_values_handling = GLMModel.GLMParameters.MissingValuesHandling.MeanImputation; + public Key _plug_values = null; + public boolean _use_all_factor_levels = false; + public boolean _showFixedMatVecs = false; // internal parameter, if true, will show AfjTY, ArjTY, ArjTArj, AfjTAfj, AfjTArj + public int _score_iteration_interval = 5; + public boolean _score_each_iteration = false; + public boolean _gen_syn_data = false; + + @Override + public String algoName() { + return "HGLM"; + } + + @Override + public String fullName() { + return "Hierarchical Generalized Linear Model"; + } + + @Override + public String javaName() { + return HGLMModel.class.getName(); + } + + @Override + public long progressUnits() { + return 1; + } + + public enum Method {EM}; // EM: expectation maximization + + public HGLMParameters() { + super(); + _family = gaussian; + _method = EM; + } + + public GLMModel.GLMParameters.MissingValuesHandling missingValuesHandling() { + if (_missing_values_handling instanceof GLMModel.GLMParameters.MissingValuesHandling) + return (GLMModel.GLMParameters.MissingValuesHandling) _missing_values_handling; + assert _missing_values_handling instanceof DeepLearningModel.DeepLearningParameters.MissingValuesHandling; + switch ((DeepLearningModel.DeepLearningParameters.MissingValuesHandling) _missing_values_handling) { + case MeanImputation: + return GLMModel.GLMParameters.MissingValuesHandling.MeanImputation; + case Skip: + return GLMModel.GLMParameters.MissingValuesHandling.Skip; + default: + throw new IllegalStateException("Unsupported missing values handling value: " + _missing_values_handling); + } + } + + public boolean imputeMissing() { + return missingValuesHandling() == GLMModel.GLMParameters.MissingValuesHandling.MeanImputation || + missingValuesHandling() == GLMModel.GLMParameters.MissingValuesHandling.PlugValues; + } + + public DataInfo.Imputer makeImputer() { + if (missingValuesHandling() == GLMModel.GLMParameters.MissingValuesHandling.PlugValues) { + if (_plug_values == null || _plug_values.get() == null) { + throw new IllegalStateException("Plug values frame needs to be specified when Missing Value Handling = PlugValues."); + } + return new GLM.PlugValuesImputer(_plug_values.get()); + } else { // mean/mode imputation and skip (even skip needs an imputer right now! 
PUBDEV-6809) + return new DataInfo.MeanImputer(); + } + } + } + + public static class HGLMModelOutput extends Model.Output { + public DataInfo _dinfo; + final GLMModel.GLMParameters.Family _family; + final GLMModel.GLMParameters.Family _random_family; + public String[] _fixed_coefficient_names; // include intercept only if _parms._intercept is true + public String[] _random_coefficient_names; // include intercept only if _parms._random_intercept = true + public String[] _group_column_names; + public long _training_time_ms; + public double[] _beta; // fixed coefficients + public double[][] _ubeta; // random coefficients + public double[][] _tmat; // calculated with non-standardize random effects coefficients + double _tauUVar; + public double _tau_e_var; + // test parameters + public double[][] _afjtyj; + public double[][] _arjtyj; + public double[][][] _afjtafj; + public double[][][] _arjtarj; + public double[][][] _afjtarj; + public double[][] _yMinusXTimesZ; // generate during training + public double[][] _yMinusXTimesZValid; // store same value for frames other than training frame + public int _num_fixed_coeffs; + public int _num_random_coeffs; + int[] _randomCatIndices; + int[] _randomNumIndices; + int[] _randomCatArrayStartIndices; + int _predStartIndexRandom; + boolean _randomSlopeToo; + int[] _fixedCatIndices; + int _numLevel2Units; + int _level2UnitIndex; // store column index of level 2 predictor column + int _predStartIndexFixed; + public double[] _icc; + public double _log_likelihood; + public double _log_likelihood_valid; // store for frames other than training + public int _iterations; + public int _nobs; + public int _nobs_valid; + public double _yMinusFixPredSquare; + public double _yMinusFixPredSquareValid; + public TwoDimTable _scoring_history_valid; + + /** + * For debugging only. Copy over the generated fixed matrices to model._output. 
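To illustrate the PlugValues branch of makeImputer above (a sketch only: the predictor names x1 and x2 are hypothetical, and TestFrameBuilder is the test utility used by the new HGLM tests rather than a client-facing API): the plug-values frame is a single row whose columns carry the value to impute for each predictor.

    // single-row frame holding the value to impute for each predictor (hypothetical columns x1, x2)
    Frame plugValues = new TestFrameBuilder()
            .withColNames("x1", "x2")
            .withVecTypes(T_NUM, T_NUM)
            .withDataForCol(0, new double[]{0.5})   // value imputed for missing x1
            .withDataForCol(1, new double[]{1.0})   // value imputed for missing x2
            .build();
    parms._missing_values_handling = GLMModel.GLMParameters.MissingValuesHandling.PlugValues;
    parms._plug_values = plugValues._key;           // otherwise init() reports the PlugValues error shown earlier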
+ */ + public void setModelOutputFixMatVec(HGLMTask.ComputationEngineTask comp) { + _afjtyj = copy2DArray(comp._AfjTYj); + _arjtyj = copy2DArray(comp._ArjTYj); + _afjtafj = copy3DArray(comp._AfjTAfj); + _afjtarj = copy3DArray(comp._AfjTArj); + _nobs = comp._nobs; + } + + public void setModelOutput(HGLMTask.ComputationEngineTask comp) { + _randomCatIndices = comp._randomCatIndices; + _randomNumIndices = comp._randomNumIndices; + _randomCatArrayStartIndices = comp._randomCatArrayStartIndices; + _predStartIndexRandom = comp._predStartIndexRandom; + _randomSlopeToo = !(comp._numRandomCoeffs == 1 && comp._parms._random_intercept); + _fixedCatIndices = comp._fixedCatIndices; + _predStartIndexFixed = comp._predStartIndexFixed; + _arjtarj = copy3DArray(comp._ArjTArj); + _log_likelihood = Double.NEGATIVE_INFINITY; + } + + public HGLMModelOutput(HGLM b, DataInfo dinfo) { + super(b, dinfo._adaptedFrame); + _dinfo = dinfo; + _domains = dinfo._adaptedFrame.domains(); + _family = b._parms._family; + _random_family = b._parms._random_family; + } + + public void setModelOutputFields(ComputationStateHGLM state) { + _fixed_coefficient_names = state.getFixedCofficientNames(); + _random_coefficient_names = state.getRandomCoefficientNames(); + _group_column_names = state.getGroupColumnNames(); + _tauUVar = state.getTauUVar(); + _tau_e_var = state.getTauEVarE10(); + _tmat = state.getT(); + _num_fixed_coeffs = state.getNumFixedCoeffs(); + _num_random_coeffs = state.getNumRandomCoeffs(); + _numLevel2Units = state.getNumLevel2Units(); + _level2UnitIndex = state.getLevel2UnitIndex(); + _nobs = state._nobs; + _beta = state.getBeta(); + _ubeta = state.getUbeta(); + _num_random_coeffs = _ubeta[0].length; + _iterations = state._iter; + } + + @Override + public int nclasses() { // only support Gaussian now + return 1; + } + + @Override public ModelCategory getModelCategory() { + return ModelCategory.Regression; + } + } + + @Override + protected Futures remove_impl(Futures fs, boolean cascade) { + super.remove_impl(fs, cascade); + return fs; + } + + @Override + protected AutoBuffer writeAll_impl(AutoBuffer ab) { + return super.writeAll_impl(ab); + } + + @Override + protected Keyed readAll_impl(AutoBuffer ab, Futures fs) { + return super.readAll_impl(ab, fs); + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append(super.toString()); + sb.append(" loglikelihood: "+this._output._log_likelihood); + sb.append(" fixed effect coefficients: "+ Arrays.toString(this._output._beta)); + int numLevel2 = this._output._ubeta.length; + for (int index=0; index { + // the doc = document attached to https://github.com/h2oai/h2o-3/issues/8487, title HGLM_H2O_Implementation.pdf + // I will be referring to the doc and different parts of it to explain my implementation. 
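Once training finishes, the fields filled in by setModelOutputFields above are the model's main programmatic surface. A small sketch of reading them back (field names exactly as declared in HGLMModelOutput; the model variable is the hypothetical one from the earlier parameter sketch):

    HGLMModel.HGLMModelOutput out = model._output;
    double[] fixedCoefs = out._beta;              // aligned with out._fixed_coefficient_names
    double[][] randomCoefs = out._ubeta;          // one row per level 2 unit, aligned with out._group_column_names
    double[][] tMat = out._tmat;                  // random-effect covariance matrix T
    double residualVariance = out._tau_e_var;     // sigma_e^2
    double logLikelihood = out._log_likelihood;
    double[] icc = out._icc;                      // filled from the training metrics during scoring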
+ DataInfo _dinfo; + double[] _beta; // non-standardized coefficients + double[][] _ubeta; // non-standardized coefficients + final Job _job; + boolean _computeMetrics; + boolean _makePredictions; + final HGLMModel _model; + MetricBuilderHGLM _mb; + String[] _predDomains; + int _nclass; + HGLMModel.HGLMParameters _parms; + int _level2UnitIndex; + int[] _fixedCatIndices; + int _numLevel2Units; + int _predStartIndexFixed; + int[] _randomCatIndices; + int[] _randomNumIndices; + int[] _randomCatArrayStartIndices; + int _predStartIndexRandom; + final boolean _randomSlopeToo; + final boolean _randomIntercept; // true if present + public double[][] _yMinusXTimesZ; // use non-normalized coefficients + double[][] _tmat; + Random randomObj; + final double _noiseStd; + + public HGLMScore(final Job j, final HGLMModel model, DataInfo dinfo, final String[] respDomain, + final boolean computeMetrics, final boolean makePredictions) { + _job = j; + _model = model; + _dinfo = dinfo; + _computeMetrics = computeMetrics; // can be true only if the response column is available and calcualte loglikelihood + _makePredictions = makePredictions; + _beta = model._output._beta; // non-standardized/non-normalized coefficients + _ubeta = model._output._ubeta; // non-standardized/non-normalized coefficients + _predDomains = respDomain; + _nclass = model._output.nclasses(); + _parms = model._parms; + _level2UnitIndex = model._output._level2UnitIndex; + _fixedCatIndices = model._output._fixedCatIndices; + _numLevel2Units = model._output._numLevel2Units; + _predStartIndexFixed = model._output._predStartIndexFixed; + _randomCatIndices = model._output._randomCatIndices; + _randomNumIndices = model._output._randomNumIndices; + _randomCatArrayStartIndices = model._output._randomCatArrayStartIndices; + _predStartIndexRandom = model._output._predStartIndexRandom; + _randomSlopeToo = model._output._randomSlopeToo; + _randomIntercept = _parms._random_intercept; + _tmat = model._output._tmat; // generated from non-standardized random coefficients + randomObj = new Random(_parms._seed); + _noiseStd = Math.sqrt(_parms._tau_e_var_init); // not affected by standardization/normalization + } + + @Override + public void map(Chunk[] chks, NewChunk[] nc) { + if (isCancelled() || (_job != null && _job.stop_requested())) return; + float[] response = null; // store response column value if exists + int numPredValues = _nclass <= 1 ? 1 : _nclass + 1; + double[] predictVals = MemoryManager.malloc8d(numPredValues); + double[] xji = MemoryManager.malloc8d(_model._output._beta.length); + double[] zji = MemoryManager.malloc8d(_model._output._ubeta[0].length); + if (_computeMetrics) { + _mb = (MetricBuilderHGLM) _model.makeMetricBuilder(_predDomains); + response = new float[1]; + _yMinusXTimesZ = new double[_numLevel2Units][zji.length]; + } + DataInfo.Row r = _dinfo.newDenseRow(); + + if (_computeMetrics && (r.response == null || r.response.length == 0)) + throw new IllegalArgumentException("computeMetrics can only be set to true if the response column exists in" + + " dataset passed to prediction function."); + int chkLen = chks[0].len(); + int level2Index; + for (int rid = 0; rid < chkLen; rid++) { + _dinfo.extractDenseRow(chks, rid, r); + level2Index = _parms._use_all_factor_levels ? 
r.binIds[_level2UnitIndex] - _dinfo._catOffsets[_level2UnitIndex] : + (int) chks[_level2UnitIndex].at8(rid); + processRow(r, predictVals, nc, numPredValues, xji, zji, level2Index); + if (_computeMetrics && !r.response_bad) { // calculate metrics + response[0] = (float) r.response[0]; + _mb.perRow(predictVals, response, r.weight, r.offset, xji, zji, _yMinusXTimesZ, level2Index, _model); + } + } + } + + @Override + public void reduce(HGLMScore other) { + if (_mb != null) + _mb.reduce(other._mb); + if (_computeMetrics) + ArrayUtils.add(_yMinusXTimesZ, other._yMinusXTimesZ); + } + + private void processRow(DataInfo.Row r, double[] ps, NewChunk[] preds, int numPredCols, double[] xji, double[] zji, + int level2Index) { + if (r.predictors_bad) { + Arrays.fill(ps, Double.NaN); + return; + } else if (r.weight == 0) { + Arrays.fill(ps, 0.0); + return; + } + ps = scoreRow(r, ps, xji, zji, level2Index); // weight is not zero and response is valid + if (_makePredictions) + for (int predCol = 0; predCol < numPredCols; predCol++) { // write prediction to NewChunk + preds[predCol].addNum(ps[predCol]); + } + } + + /** + * only processing gaussian for now. + */ + public double[] scoreRow(DataInfo.Row r, double[] preds, double[] xji, double[] zji, int level2Index) { + fillInFixedRowValues(r, xji, _parms, _fixedCatIndices, _level2UnitIndex, + _numLevel2Units, _predStartIndexFixed, _dinfo); + fillInRandomRowValues(r, zji, _parms, _randomCatIndices, _randomNumIndices, _randomCatArrayStartIndices, + _predStartIndexRandom, _dinfo, _randomSlopeToo, _randomIntercept); + preds[0] = innerProduct(xji, _beta) + innerProduct(zji, _ubeta[level2Index]) + r.offset; + preds[0] = _parms._gen_syn_data ? preds[0]+randomObj.nextGaussian()*_noiseStd : preds[0]; + return preds; + } +} diff --git a/h2o-algos/src/main/java/hex/hglm/HGLMTask.java b/h2o-algos/src/main/java/hex/hglm/HGLMTask.java new file mode 100644 index 000000000000..a26d03c10a3e --- /dev/null +++ b/h2o-algos/src/main/java/hex/hglm/HGLMTask.java @@ -0,0 +1,439 @@ +package hex.hglm; + +import Jama.Matrix; +import hex.DataInfo; +import water.Job; +import water.MRTask; +import water.MemoryManager; +import water.fvec.Chunk; +import water.util.ArrayUtils; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; + +import static hex.hglm.HGLMUtils.fillZTTimesZ; +import static water.util.ArrayUtils.*; + +public abstract class HGLMTask { + // the doc = document attached to https://github.com/h2oai/h2o-3/issues/8487, title HGLM_H2O_Implementation.pdf + // I will be referring to the doc and different parts of it to explain my implementation. + + /*** + * This class will calculate the residual Yj-Afj*beta-Arj*ubetaj for level 2 unit j. It implements step 2 of + * section II.VIII of the doc. 
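For the Gaussian case, scoreRow above and the ResidualLLHTask defined next evaluate the same linear predictor. With x_i the fixed-effect row built by fillInFixedRowValues, z_i the random-effect row built by fillInRandomRowValues, and j(i) the level 2 unit of row i,

$$\hat y_i = x_i^T\beta + z_i^T u_{j(i)} + \mathrm{offset}_i,$$

with an extra noise draw from \(N(0, \texttt{tau\_e\_var\_init})\) added when gen_syn_data is true, which is why the parameter validation earlier requires tau_e_var_init > 0 in that mode. Over all retained rows, ResidualLLHTask accumulates, with \(r_i = y_i - x_i^T\beta - \mathrm{offset}_i\),

$$\texttt{sse\_fixed} = \sum_i r_i^2, \qquad \texttt{residualSquare} = \sum_i \big(r_i - z_i^T u_{j(i)}\big)^2, \qquad \texttt{yMinusXTimesZ}[j] = \sum_{i \in j} r_i\, z_i,$$

and the EM driver converts residualSquare into the equation-10 residual variance estimate \(\hat\sigma_e^2 = \texttt{residualSquare} / n\).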
+ */ + public static class ResidualLLHTask extends MRTask { + final public double[][] _ubeta; + final public double[] _beta; // new fixed coefficients calculated + final HGLMModel.HGLMParameters _parms; + final DataInfo _dinfo; + double _residualSquare; + double[] _residualSquareLevel2; + final int[] _fixedCatIndices; + final int _level2UnitIndex; + final int _numLevel2Units; + final int _predStartIndexFixed; + final int[] _randomCatIndices; + final int[] _randomNumIndices; + final int[] _randomCatArrayStartIndices; + final int _predStartIndexRandom; + final int _numFixedCoeffs; + final int _numRandomCoeffs; + double[][] _yMinusXTimesZ; // standarized if parms._standardize=true and vice versa + double _sse_fixed; + Job _job; + final boolean _randomSlopeToo; + + public ResidualLLHTask(Job job, HGLMModel.HGLMParameters parms, DataInfo dataInfo, double[][] ubeta, + double[] beta, ComputationEngineTask computeEngine) { + _parms = parms; + _dinfo = dataInfo; + _ubeta = ubeta; + _beta = beta; + _job = job; + _fixedCatIndices = computeEngine._fixedCatIndices; + _level2UnitIndex = computeEngine._level2UnitIndex; + _numLevel2Units = computeEngine._numLevel2Units; + _predStartIndexFixed = computeEngine._predStartIndexFixed; + _randomCatIndices = computeEngine._randomCatIndices; + _randomNumIndices = computeEngine._randomNumIndices; + _randomCatArrayStartIndices = computeEngine._randomCatArrayStartIndices; + _predStartIndexRandom = computeEngine._predStartIndexRandom; + _numFixedCoeffs = computeEngine._numFixedCoeffs; + _numRandomCoeffs = computeEngine._numRandomCoeffs; + _randomSlopeToo = _parms._random_columns != null && _parms._random_columns.length > 0; + } + + @Override + public void map(Chunk[] chks) { + if(_job != null && _job.stop_requested()) return; + _residualSquare = 0.0; + _residualSquareLevel2 = new double[_numLevel2Units]; + double[] xji = MemoryManager.malloc8d(_numFixedCoeffs); + double[] zji = MemoryManager.malloc8d(_numRandomCoeffs); + int chkLen = chks[0].len(); + _yMinusXTimesZ = new double[_numLevel2Units][_numRandomCoeffs]; + int level2Index; + double residual, y, residualSquare; + double residualFixed; + DataInfo.Row r = _dinfo.newDenseRow(); + for (int rowInd = 0; rowInd < chkLen; rowInd++) { + _dinfo.extractDenseRow(chks, rowInd, r); + if (!r.isBad() && !(r.weight == 0)) { + y = r.response(0); + level2Index = _parms._use_all_factor_levels ? 
r.binIds[_level2UnitIndex] - _dinfo._catOffsets[_level2UnitIndex] : + (int) chks[_level2UnitIndex].at8(rowInd); + ComputationEngineTask.fillInFixedRowValues(r, xji, _parms, _fixedCatIndices, _level2UnitIndex, _numLevel2Units, + _predStartIndexFixed, _dinfo); // read in predictors for fixed coefficient effects + ComputationEngineTask.fillInRandomRowValues(r, zji, _parms, _randomCatIndices, _randomNumIndices, + _randomCatArrayStartIndices, _predStartIndexRandom, _dinfo, _randomSlopeToo, _parms._random_intercept); // read in random coefficient effects + residualFixed = y - innerProduct(xji, _beta) - r.offset; + _sse_fixed += residualFixed * residualFixed; + residual = residualFixed - innerProduct(zji, _ubeta[level2Index]); + residualSquare = residual*residual; + _residualSquare += residualSquare; + _residualSquareLevel2[level2Index] += residualSquare; + add(_yMinusXTimesZ[level2Index], mult(zji, residualFixed)); + } + } + } + + @Override + public void reduce(ResidualLLHTask otherTask) { + add(_residualSquareLevel2, otherTask._residualSquareLevel2); + _residualSquare += otherTask._residualSquare; + add(_yMinusXTimesZ, otherTask._yMinusXTimesZ); + _sse_fixed += otherTask._sse_fixed; + } + } + + /*** + * This class will pre-calculate arrays (double[]) or matrices (double[][]) that will be used in later calculations + * that are part of the CDSS described in equation 11 of the doc. + * + */ + public static class ComputationEngineTask extends MRTask { + double _YjTYjSum; // calculate sum of transpose(Yj)*Yj across all level 2 units + public double[][] _AfjTYj; // calculate transpose(Afj)*Yj for each level 2 unit, Y + public double[][] _ArjTYj; + public double[][][] _AfjTAfj; // equivalent to transpose(Xj)*Xj for each j + public double[][][] _ArjTArj; // equivalent to tranpose(Zj)*Zj for each j + public double[][][] _AfjTArj; + public double[][][] _ArjTAfj; + public double[][] _AfTAftInv; + public double[] _AfTAftInvAfjTYj; // vectors are represented in row array. Need to transpose it if used as Matrix + public double[] _AfjTYjSum; + double _oneOverJ; + double _oneOverN; + int _numFixedCoeffs; + int _numRandomCoeffs; + String[] _fixedCoeffNames; + String[] _randomCoeffNames; + String[] _level2UnitNames; + int _numLevel2Units; + final HGLMModel.HGLMParameters _parms; + int _nobs; + double _weightedSum; + final DataInfo _dinfo; + int _level2UnitIndex; + int[] _randomPredXInterceptIndices; + int[] _randomCatIndices; + int[] _randomNumIndices; + int[] _randomCatArrayStartIndices; // starting index of random cat predictors + int[] _fixedPredXInterceptIndices; + int[] _fixedCatIndices; + int[] _fixedNumIndices; + String[] _fixedPredNames; + String[] _randomPredNames; + int _predStartIndexFixed; + int _predStartIndexRandom; + Job _job; + final boolean _randomSlopeToo; + double[][] _zTTimesZ; + + public ComputationEngineTask(Job job, HGLMModel.HGLMParameters parms, DataInfo dinfo) { + _parms = parms; + _dinfo = dinfo; + _job = job; + _randomSlopeToo = _parms._random_columns != null && _parms._random_columns.length > 0; + extractNamesNIndices(); + } + + void setPredXInterceptIndices(List predictorNames) { + boolean randomColsExist = _parms._random_columns != null; + _randomPredXInterceptIndices = randomColsExist ? 
new int[_parms._random_columns.length] : null; + List fixedPredNames = new ArrayList<>(); + List randomPredNames = new ArrayList<>(); + + List randomCatPredList = new ArrayList<>(); + List randomNumPredList = new ArrayList<>(); + _fixedPredXInterceptIndices = new int[predictorNames.size() - 1]; + List fixedCatPredList = new ArrayList<>(); + List fixedNumPredList = new ArrayList<>(); + if (randomColsExist) { + for (int index = 0; index < _randomPredXInterceptIndices.length; index++) { + _randomPredXInterceptIndices[index] = predictorNames.indexOf(_parms._random_columns[index]); + if (_randomPredXInterceptIndices[index] < _dinfo._cats) + randomCatPredList.add(_randomPredXInterceptIndices[index]); + else + randomNumPredList.add(_randomPredXInterceptIndices[index]); + randomPredNames.add(predictorNames.get(_randomPredXInterceptIndices[index])); + } + } + if (randomCatPredList.size() > 0) { + _randomCatIndices = randomCatPredList.stream().mapToInt(x -> x).toArray(); + Arrays.sort(_randomCatIndices); + List randomCatLevels = Arrays.stream(_randomCatIndices).map(x -> _dinfo._adaptedFrame.vec(x).domain().length).boxed().collect(Collectors.toList()); + randomCatLevels.add(0, _parms._use_all_factor_levels ? 0 : 1); + int[] randomCatArrayStartIndices = randomCatLevels.stream().map(x -> _parms._use_all_factor_levels ? x : (x - 1)).mapToInt(x -> x).toArray(); + _randomCatArrayStartIndices = ArrayUtils.cumsum(randomCatArrayStartIndices); + } + if (randomNumPredList.size() > 0) { + _randomNumIndices = randomNumPredList.stream().mapToInt(x -> x).toArray(); + Arrays.sort(_randomNumIndices); + } + for (int index = 0; index < _fixedPredXInterceptIndices.length; index++) { + String predName = predictorNames.get(index); + if (!predName.equals(_parms._group_column)) { + if (index < _dinfo._cats) + fixedCatPredList.add(index); + else + fixedNumPredList.add(index); + fixedPredNames.add(predName); + } + } + if (fixedCatPredList.size() > 0) { + _fixedCatIndices = fixedCatPredList.stream().mapToInt(x -> x).toArray(); + Arrays.sort(_fixedCatIndices); + } + if (fixedNumPredList.size() > 0) { + _fixedNumIndices = fixedNumPredList.stream().mapToInt(x -> x).toArray(); + Arrays.sort(_fixedNumIndices); + } + + _fixedPredNames = fixedPredNames.stream().toArray(String[]::new); + _randomPredNames = randomPredNames.stream().toArray(String[]::new); + _predStartIndexFixed = fixedCatPredList.size() == 0 ? 0 : (_parms._use_all_factor_levels ? + Arrays.stream(_fixedCatIndices).map(x -> _dinfo._adaptedFrame.vec(x).domain().length).sum() : + Arrays.stream(_fixedCatIndices).map(x -> (_dinfo._adaptedFrame.vec(x).domain().length - 1)).sum()); + _predStartIndexRandom = randomCatPredList.size() == 0 ? 0 : (_parms._use_all_factor_levels ? 
+ Arrays.stream(_randomCatIndices).map(x -> _dinfo._adaptedFrame.vec(x).domain().length).sum() : + Arrays.stream(_randomCatIndices).map(x -> (_dinfo._adaptedFrame.vec(x).domain().length - 1)).sum()); + } + + void extractNamesNIndices() { + List predictorNames = Arrays.stream(_dinfo._adaptedFrame.names()).collect(Collectors.toList()); + _level2UnitIndex = predictorNames.indexOf(_parms._group_column); + + // assign coefficient names for fixed, random and group column + List allCoeffNames = Arrays.stream(_dinfo.coefNames()).collect(Collectors.toList()); + String groupCoeffStarts = _parms._group_column + "."; + _level2UnitNames = Arrays.stream(_dinfo._adaptedFrame.vec(_level2UnitIndex).domain()).map(x -> groupCoeffStarts + x).toArray(String[]::new); + List groupCoeffNames = Arrays.stream(_level2UnitNames).collect(Collectors.toList()); + + // fixed Coefficients are all coefficient names excluding group_column + List fixedCoeffNames = allCoeffNames.stream().filter(x -> !groupCoeffNames.contains(x)).collect(Collectors.toList()); + fixedCoeffNames.add("intercept"); + _fixedCoeffNames = fixedCoeffNames.stream().toArray(String[]::new); + List randomPredictorNames = new ArrayList<>(); + if (_randomSlopeToo) { + // random coefficients names + int[] randomColumnsIndicesSorted = Arrays.stream(_parms._random_columns).mapToInt(x -> predictorNames.indexOf(x)).toArray(); + Arrays.sort(randomColumnsIndicesSorted); + _parms._random_columns = Arrays.stream(randomColumnsIndicesSorted).mapToObj(x -> predictorNames.get(x)).toArray(String[]::new); + for (String coefName : _parms._random_columns) { + String startCoef = coefName + "."; + randomPredictorNames.addAll(allCoeffNames.stream().filter(x -> x.startsWith(startCoef) || x.equals(coefName)).collect(Collectors.toList())); + } + } + if (_parms._random_intercept) + randomPredictorNames.add("intercept"); + + _randomCoeffNames = randomPredictorNames.stream().toArray(String[]::new); + _numLevel2Units = _level2UnitNames.length; + _numFixedCoeffs = _fixedCoeffNames.length; + _numRandomCoeffs = _randomCoeffNames.length; + setPredXInterceptIndices(predictorNames); + } + + @Override + public void map(Chunk[] chks) { + if(_job != null && _job.stop_requested()) return; + initializeArraysVar(); + double y; + double[] xji = MemoryManager.malloc8d(_numFixedCoeffs); + double[] zji = MemoryManager.malloc8d(_numRandomCoeffs); + int level2Index; + int chkLen = chks[0].len(); + DataInfo.Row r = _dinfo.newDenseRow(); + for (int rowInd = 0; rowInd < chkLen; rowInd++) { + _dinfo.extractDenseRow(chks, rowInd, r); + if (!r.isBad() && !(r.weight == 0)) { + y = r.response(0); + _YjTYjSum += y * y; + _nobs++; + _weightedSum += r.weight; + level2Index = _parms._use_all_factor_levels ? r.binIds[_level2UnitIndex] - _dinfo._catOffsets[_level2UnitIndex] : + (int) chks[_level2UnitIndex].at8(rowInd); + fillInFixedRowValues(r, xji, _parms, _fixedCatIndices, _level2UnitIndex, _numLevel2Units, + _predStartIndexFixed, _dinfo); // read in predictors for fixed coefficient effects + fillInRandomRowValues(r, zji, _parms, _randomCatIndices, _randomNumIndices, _randomCatArrayStartIndices, + _predStartIndexRandom, _dinfo, _randomSlopeToo, _parms._random_intercept); // read in random coefficient effects + formFixedMatricesVectors(level2Index, xji, y, _AfjTYj, _AfjTAfj); // form _AfjTYj, _AfjTAfj + formFixedMatricesVectors(level2Index, zji, y, _ArjTYj, _ArjTArj); // form ArjTYj, _ArjTArj + outerProductCum(_AfjTArj[level2Index], xji, zji); // form AfjTArj + } + } + } + + /** + * It does two things: + * a. 
form output product of one row of data set (matMat[level2Ind]) + * b. form product of one row of data and response y. + */ + void formFixedMatricesVectors(int level2Ind, double[] xji, double y, double[][] matVec, double[][][] matMat) { + outputProductSymCum(matMat[level2Ind], xji); + multCum(xji, matVec[level2Ind], y); + } + + static void fillInRandomRowValues(DataInfo.Row r, double[] zji, HGLMModel.HGLMParameters parms, + int[] randomCatIndices, int[] randomNumIndices, int[] randomCatArrayStartIndices, + int predStartIndexRandom, DataInfo dinfo, boolean randomSlopeToo, boolean randomIntercept) { + // read in predictors for random coefficient effects + Arrays.fill(zji, 0.0); + int catPredInd; + int startEnumInd = 0; + int catVal; + if (randomSlopeToo) { + if (randomCatIndices != null) { + for (int catInd = 0; catInd < randomCatIndices.length; catInd++) { + catPredInd = randomCatIndices[catInd]; + catVal = r.binIds[catPredInd]; + if (!parms._use_all_factor_levels) { + RowInfo rowInfo = grabCatIndexVal(r, startEnumInd, catPredInd, dinfo); + catVal = rowInfo._catVal; + startEnumInd = rowInfo._rowEnumInd; + } + if (catVal >= 0) + zji[catVal - dinfo._catOffsets[catPredInd] + randomCatArrayStartIndices[catInd]] = 1; + } + } + + if (randomNumIndices != null) + for (int numInd = 0; numInd < randomNumIndices.length; numInd++) + zji[numInd + predStartIndexRandom] = r.numVals[randomNumIndices[numInd] - dinfo._cats]; + } + + if (randomIntercept) + zji[zji.length - 1] = 1.0; + } + + public static void fillInFixedRowValues(DataInfo.Row r, double[] xji, HGLMModel.HGLMParameters parms, int[] fixedCatIndices, + int level2UnitIndex, int numLevel2Units, int predStartIndexFixed, DataInfo dinfo) { + // read in predictors for fixed coefficient effects + Arrays.fill(xji, 0.0); + int startEnumInd = 0; + int catPredInd; + int catVal; + if (r.nBins > 1) { // will always have at least one enum column + for (int catInd = 0; catInd < fixedCatIndices.length; catInd++) { + catPredInd = fixedCatIndices[catInd]; + catVal = r.binIds[catPredInd]; + if (!parms._use_all_factor_levels) { + RowInfo rowInfo = grabCatIndexVal(r, startEnumInd, catPredInd, dinfo); + catVal = rowInfo._catVal; + startEnumInd = rowInfo._rowEnumInd; + } + if (catVal > -1) { + if (catPredInd < level2UnitIndex) { + xji[catVal] = 1; + } else if (catPredInd > level2UnitIndex) { + xji[catVal - (parms._use_all_factor_levels ? 
numLevel2Units : (numLevel2Units - 1))] = 1; + } + } + } + } + for (int numInd = 0; numInd < r.nNums; numInd++) { + xji[numInd + predStartIndexFixed] = r.numVals[numInd]; + } + xji[xji.length - 1] = 1.0; // for intercept + } + + public static RowInfo grabCatIndexVal(DataInfo.Row r, int startEnumInd, int enumIndexOfInterest, DataInfo dinfo) { + int startInd = startEnumInd; + for (int index = startEnumInd; index < r.nBins; index++) { + if (dinfo._catOffsets[enumIndexOfInterest] <= r.binIds[index] && r.binIds[index] < dinfo._catOffsets[enumIndexOfInterest + 1]) + return new RowInfo(index, r.binIds[index]); + + if (r.binIds[index] >= dinfo._catOffsets[enumIndexOfInterest + 1]) + return new RowInfo(index, -1); + startInd = index; + } + return new RowInfo(startInd, -1); + } + + static class RowInfo { + int _rowEnumInd; + int _catVal; + + public RowInfo(int rowEnumInd, int catVal) { + _rowEnumInd = rowEnumInd; + _catVal = catVal; + } + } + + void initializeArraysVar() { + _YjTYjSum = 0; + _nobs = 0; + _weightedSum = 0.0; + _AfjTYj = MemoryManager.malloc8d(_numLevel2Units, _numFixedCoeffs); + _ArjTYj = MemoryManager.malloc8d(_numLevel2Units, _numRandomCoeffs); + _AfjTAfj = MemoryManager.malloc8d(_numLevel2Units, _numFixedCoeffs, _numFixedCoeffs); + _ArjTArj = MemoryManager.malloc8d(_numLevel2Units, _numRandomCoeffs, _numRandomCoeffs); + _AfjTArj = MemoryManager.malloc8d(_numLevel2Units, _numFixedCoeffs, _numRandomCoeffs); + } + + @Override + public void reduce(ComputationEngineTask otherTask) { + _YjTYjSum += otherTask._YjTYjSum; + _nobs += otherTask._nobs; + _weightedSum += otherTask._weightedSum; + add(_AfjTYj, otherTask._AfjTYj); + add(_ArjTYj, otherTask._ArjTYj); + add(_AfjTAfj, otherTask._AfjTAfj); + add(_ArjTArj, otherTask._ArjTArj); + add(_AfjTArj, otherTask._AfjTArj); + } + + @Override + public void postGlobal() { + _ArjTAfj = new double[_numLevel2Units][][]; + _AfjTYjSum = MemoryManager.malloc8d(_numFixedCoeffs); + _AfTAftInvAfjTYj = MemoryManager.malloc8d(_numFixedCoeffs); + + _oneOverJ = 1.0 / _numLevel2Units; + _oneOverN = 1.0 / _nobs; + + double[][] sumAfjAfj = MemoryManager.malloc8d(_numFixedCoeffs, _numFixedCoeffs); + sumAfjAfjAfjTYj(_AfjTAfj, _AfjTYj, sumAfjAfj, _AfjTYjSum); + for (int index = 0; index < _numLevel2Units; index++) + _ArjTAfj[index] = new Matrix(_AfjTArj[index]).transpose().getArray(); + + _zTTimesZ = fillZTTimesZ(_ArjTArj); + if (_parms._max_iterations > 0) { // only proceed if max_iterations is not zero + _AfTAftInv = (new Matrix(sumAfjAfj)).inverse().getArray(); + matrixVectorMult(_AfTAftInvAfjTYj, _AfTAftInv, _AfjTYjSum); + } + } + + public static void sumAfjAfjAfjTYj(double[][][] afjTAfj, double[][] afjTYj, double[][] sumAfjAfj, double[] sumAfjTYj) { + int numLevel2 = afjTAfj.length; + for (int index=0; index threshold) + return false; + } + return true; + } + + public static double[][] generateTInverse(double[][] tMat) { + Matrix tMatrix = new Matrix(tMat); + return tMatrix.inverse().getArray(); + } + public static double[][][] generateCJInverse(double[][][] arjTArj, double tauEVar, double[][] tMatInv) { + int numLevel2Unit = arjTArj.length; + double[][][] cJInverse = new double[numLevel2Unit][][]; + int arjTArjSize = arjTArj[0].length; + double[][] tempResult = new double[arjTArjSize][arjTArjSize]; + double[][] sigmaTimestMatInv = new double[arjTArjSize][arjTArjSize]; + mult(tMatInv, sigmaTimestMatInv, tauEVar); + for (int index = 0; index < numLevel2Unit; index++) { + add(tempResult, arjTArj[index], sigmaTimestMatInv); + cJInverse[index] = new 
Matrix(tempResult).inverse().getArray(); + } + return cJInverse; + } + + /** + * Note that the term ArjTYj and ArjTAfj are fixed and won't change. They are stored in engineTask + */ + public static double[][] estimateNewRandomEffects(double[][][] cjInv, double[][] ArjTYj, double[][][] ArjTAfj, double[] beta) { + int numLevel2Unit = cjInv.length; + int numRandCoef = cjInv[0].length; + double[][] ubeta = new double[numLevel2Unit][numRandCoef]; + double[] arjTafjbeta = new double[numRandCoef]; + double[] result = new double[numRandCoef]; + for (int index=0; index < numLevel2Unit; index++) { + matrixVectorMult(arjTafjbeta, ArjTAfj[index], beta); // ArjTAfj*betaUtil + minus(result, ArjTYj[index], arjTafjbeta); // (ArjTYj-ArjTAfj*beta) + matrixVectorMult(ubeta[index], cjInv[index], result); + Arrays.fill(arjTafjbeta, 0.0); + } + return ubeta; + } + + public static double[] estimateFixedCoeff(double[][] AfjTAfjSumInv, double[] AfjTYjSum, double[][][] AfjTArj, double[][] ubeta) { + int numLevel2 = ubeta.length; + int numFixedCoeffs = AfjTAfjSumInv.length; + double[] betaFixed = new double[numFixedCoeffs]; + double[] AfjTArjTimesBrj = new double[numFixedCoeffs]; + for (int index=0; index= 0; + } + + public static double[][] generateNewTmat(double[][] ubeta) { + int numIndex2 = ubeta.length; + double oneOverJ = 1.0/numIndex2; + int numRandCoeff = ubeta[0].length; + double[][] newTmat = new double[numRandCoeff][numRandCoeff]; + for (int index=0; index { + // the doc = document attached to https://github.com/h2oai/h2o-3/issues/8487, title HGLM_H2O_Implementation.pdf + // I will be referring to the doc and different parts of it to explain my implementation. + public static final double LOG_2PI = Math.log(2*Math.PI); + ModelMetrics.MetricBuilder _metricBuilder; // point to generic model metric classes + final boolean _intercept; + final boolean _random_intercept; + final boolean _computeMetrics; + public double[] _beta; + public double[][] _ubeta; + public double[][] _tmat; + public double _yMinusFixPredSquare; + public double _sse; + public int _nobs; + + public MetricBuilderHGLM(String[] domain, boolean computeMetrics, boolean intercept, boolean random_intercept, + HGLMModel.HGLMModelOutput output) { + super(domain == null ? 
0 : domain.length, domain); + _intercept = intercept; + _computeMetrics = computeMetrics; + _random_intercept = random_intercept; + _metricBuilder = new ModelMetricsRegression.MetricBuilderRegression(); // everything else goes back regression + _beta = output._beta; + _ubeta = output._ubeta; + _tmat = output._tmat; + } + + public double[] perRow(double[] ds, float[] yact, double weight, double offset, double[] xji, double[] zji, + double[][] yMinusXTimesZ, int level2Index, Model m) { + if (weight == 0) return ds; + _metricBuilder.perRow(ds, yact, weight, offset, m); + add2(yact[0], ds[0], weight, xji, zji, yMinusXTimesZ, level2Index, offset); + return ds; + } + + private void add2(double yresp, double predictedVal, double weight, double[] input, double[] randomInput, + double[][] yMinusXTimesZ, int level2Index, double offset) { + double temp = yresp- ArrayUtils.innerProduct(_beta, input)-offset; + _yMinusFixPredSquare += temp*temp; + ArrayUtils.add(yMinusXTimesZ[level2Index], ArrayUtils.mult(randomInput, temp)); + _nobs++; + temp = yresp-predictedVal; + _sse += temp*temp; + } + + @Override + public void reduce(MetricBuilderHGLM other) { + _metricBuilder.reduce(other._metricBuilder); + _yMinusFixPredSquare += other._yMinusFixPredSquare; + _sse += other._sse; + _nobs += other._nobs; + } + + @Override + public double[] perRow(double[] ds, float[] yact, Model m) { + return ds; + } + + @Override + public ModelMetrics makeModelMetrics(Model m, Frame f, Frame adaptedFrame, Frame preds) { + HGLMModel hglmM = (HGLMModel) m; + ModelMetrics mm = _metricBuilder.makeModelMetrics(hglmM, f, null, null); + ModelMetricsRegression metricsRegression = (ModelMetricsRegression) mm; + boolean forTraining = m._parms.train().getKey().equals(f.getKey()); + double[][] tmat = hglmM._output._tmat; // already set with non-standardized random coefficients + + if (forTraining) { + double loglikelihood = calHGLMLlg(metricsRegression._nobs, tmat, hglmM._output._tau_e_var, hglmM._output._arjtarj, + this._yMinusFixPredSquare, hglmM._output._yMinusXTimesZ); + mm = new ModelMetricsRegressionHGLM(m, f, metricsRegression._nobs, this.weightedSigma(), loglikelihood, + this._customMetric, hglmM._output._iterations, hglmM._output._beta, hglmM._output._ubeta, + tmat, hglmM._output._tau_e_var, metricsRegression._MSE, this._yMinusFixPredSquare / metricsRegression._nobs, + metricsRegression.mae(), metricsRegression._root_mean_squared_log_error, + metricsRegression._mean_residual_deviance, metricsRegression.aic()); + } else { + List colNames = Arrays.asList(f.names()); + boolean hasWeights = hglmM._parms._weights_column != null && colNames.contains(hglmM._parms._weights_column); + boolean hasOffsets = hglmM._parms._offset_column != null && colNames.contains(hglmM._parms._offset_column); + DataInfo dinfo = new DataInfo(adaptedFrame, null, 1, hglmM._parms._use_all_factor_levels, + DataInfo.TransformType.NONE, DataInfo.TransformType.NONE, + hglmM._parms.missingValuesHandling() == Skip, + hglmM._parms.missingValuesHandling() == MeanImputation + || hglmM._parms.missingValuesHandling() == PlugValues, + hglmM._parms.makeImputer(), false, hasWeights, hasOffsets, false, null); + HGLMTask.ComputationEngineTask engineTask = new HGLMTask.ComputationEngineTask(null, hglmM._parms, dinfo); + engineTask.doAll(dinfo._adaptedFrame); + double loglikelihood = calHGLMLlg(engineTask._nobs, tmat, hglmM._output._tau_e_var, engineTask._ArjTArj, + this._yMinusFixPredSquare, hglmM._output._yMinusXTimesZValid); + mm = new ModelMetricsRegressionHGLM(m, f, 
metricsRegression._nobs, this.weightedSigma(), loglikelihood, + this._customMetric, hglmM._output._iterations, hglmM._output._beta, hglmM._output._ubeta, tmat, + hglmM._output._tau_e_var,metricsRegression._MSE, this._yMinusFixPredSquare /metricsRegression._nobs, + metricsRegression.mae(), metricsRegression._root_mean_squared_log_error, + metricsRegression._mean_residual_deviance, metricsRegression.aic()); + hglmM._output._nobs_valid = engineTask._nobs; + } + + if (m != null) + m.addModelMetrics(mm); + return mm; + } + + /** + * + * This method calculates the log-likelihood as described in section II.V of the doc. + */ + public static double calHGLMLlg(long nobs, double[][] tmat, double varResidual, double[][][] zjTTimesZj, + double yMinsXFixSquared, double[][] yMinusXFixTimesZ) { + int numLevel2 = zjTTimesZj.length; + double[][] tmatInv = new Matrix(tmat).inverse().getArray(); + double tmatDeterminant = new Matrix(tmat).det(); + double oneOVar = 1.0 / varResidual; + double oneOVarSq = oneOVar * oneOVar; + double llg = nobs * LOG_2PI + oneOVar * yMinsXFixSquared; + double[][] invTPlusZjTZ; + Matrix yMinusXjFixed; + Matrix yjMinusXjFixed; + for (int ind2 = 0; ind2 < numLevel2; ind2++) { + invTPlusZjTZ = calInvTPZjTZ(tmatInv, zjTTimesZj[ind2], oneOVar); + llg += Math.log(varResidual * new Matrix(invTPlusZjTZ).det() * tmatDeterminant); + yMinusXjFixed = new Matrix(new double[][]{yMinusXFixTimesZ[ind2]}); + yjMinusXjFixed = yMinusXjFixed.times(new Matrix(invTPlusZjTZ).inverse().times(yMinusXjFixed.transpose())); + llg -= oneOVarSq * yjMinusXjFixed.getArray()[0][0]; + } + return -0.5 * llg; + } + + public static double[][] calInvTPZjTZ(double[][] tmatInv, double[][] zjTTimesZj, double oneOVar) { + return new Matrix(tmatInv).plus(new Matrix(zjTTimesZj).times(oneOVar)).getArray(); + } +} diff --git a/h2o-algos/src/main/java/hex/schemas/GLMModelV3.java b/h2o-algos/src/main/java/hex/schemas/GLMModelV3.java index 4d8d79663980..5316478ac820 100644 --- a/h2o-algos/src/main/java/hex/schemas/GLMModelV3.java +++ b/h2o-algos/src/main/java/hex/schemas/GLMModelV3.java @@ -26,9 +26,6 @@ public static final class GLMModelOutputV3 extends ModelOutputSchemaV3 0 and <=1, used by negative binomial @@ -62,10 +60,8 @@ public static final class GLMParametersV3 extends ModelParametersSchemaV3 { + public static final class HGLMModelOutputV3 extends ModelOutputSchemaV3 { + // the doc == document described our HGLM implementation attached to issue: https://github.com/h2oai/h2o-3/issues/8487 + @API(help="Table of Fixed Coefficients") + TwoDimTableV3 coefficients_table; + + @API(help="Table of Random Coefficients") + TwoDimTableV3 random_coefficients_table; + + @API(help="Table of Scoring History for Validation Dataset") + TwoDimTableV3 scoring_history_valid; + + @API(help="Fixed Effects Coefficient Names") + public String[] coefficient_names; // include intercept only if _parms._intercept is true + + @API(help="Random Effects Coefficient Names") + public String[] random_coefficient_names; // include intercept only if _parms._random_intercept = true + + @API(help="Level 2 Indice Names") + public String[] group_column_names; + + @API(help="Fixed Effects Coefficients") + public double[] beta; // fixed coefficients + + @API(help="Random Effects Coefficients") + public double[][] ubeta; // random coefficients + + @API(help="Covariance Matrix for Random Effects (= Tj in section II.I of the doc") + public double[][] tmat; + + @API(help="Ratio of each random effect variance and (sum of all random effect variances plus the 
residual noise" + + " variance).") + double[] icc; + + @API(help="Residual noise variance") + double residual_variance; + + @API(help="Mean residual error with fixed effect coefficients only") + double mean_residual_fixed; + + @API(help="Mean residual error with fixed effect coefficients only") + double mean_residual_fixed_valid; + + @Override + public HGLMModelOutputV3 fillFromImpl(HGLMModel.HGLMModelOutput impl) { + super.fillFromImpl(impl); + coefficient_names = impl._fixed_coefficient_names; + random_coefficient_names = impl._random_coefficient_names; + group_column_names = impl._group_column_names; + beta = impl._beta; + ubeta = impl._ubeta; + coefficients_table = new TwoDimTableV3(); + coefficients_table.fillFromImpl(generateCoeffTable("fixed effect oefficients", + "HGLM fixed effect coefficients", beta, coefficient_names)); + random_coefficients_table = new TwoDimTableV3(); + random_coefficients_table.fillFromImpl(generate2DCoeffTable("random effect coefficients", + "HGLM random effect coefficients", ubeta, random_coefficient_names, impl._group_column_names)); + icc = impl._icc; + residual_variance = impl._tau_e_var; + mean_residual_fixed = impl._yMinusFixPredSquare /impl._nobs; + if (impl._nobs_valid > 0) + mean_residual_fixed_valid = impl._yMinusFixPredSquareValid /impl._nobs_valid; + return this; + } + } + + public static TwoDimTable generateCoeffTable(String title1, String title2, double[] coeffs, String[] coeffNames) { + String[] colnames = new String[] {"coefficients"}; + String[] colFormats = new String[] {"%.5f"}; + String[] colTypes = new String[] {"double"}; + TwoDimTable tdt = new TwoDimTable(title1, title2, coeffNames, colnames, colTypes, colFormats, "names"); + int tableLen = coeffs.length; + for (int index=0; index { + + public static final class HGLMParametersV3 extends ModelParametersSchemaV3 { + public static final String[] fields = new String[] { + "model_id", + "training_frame", + "validation_frame", + "response_column", + "ignored_columns", + "ignore_const_cols", + "offset_column", + "weights_column", + "max_runtime_secs", + "custom_metric_func", + "score_each_iteration", + "score_iteration_interval", + "seed", + "missing_values_handling", + "plug_values", + "family", + "rand_family", + "max_iterations", + "initial_fixed_effects", + "initial_random_effects", + "initial_t_matrix", + "tau_u_var_init", + "tau_e_var_init", + "random_columns", + "method", + "em_epsilon", + "random_intercept", + "group_column", + "gen_syn_data" + }; + + @API(help = "Perform scoring for every score_iteration_interval iterations.", level = Level.secondary) + public int score_iteration_interval; + + @API(help = "Seed for pseudo random number generator (if applicable).", gridable = true) + public long seed; + + @API(help = "Handling of missing values. Either MeanImputation, Skip or PlugValues.", + values = { "MeanImputation", "Skip", "PlugValues"}, level = API.Level.expert, + direction=API.Direction.INOUT, gridable = true) + public GLMParameters.MissingValuesHandling missing_values_handling; + + @API(help = "Plug Values (a single row frame containing values that will be used to impute missing values of the" + + " training/validation frame, use with conjunction missing_values_handling = PlugValues).", + direction = API.Direction.INPUT) + public KeyV3.FrameKeyV3 plug_values; + + // Input fields + @API(help = "Family. Only gaussian is supported now.", + values = {"gaussian"}, level = Level.critical) + public GLMParameters.Family family; + + @API(help = "Set distribution of random effects. 
Only Gaussian is implemented now.", + values = {"gaussian"}, level = Level.critical) + public GLMParameters.Family rand_family; + + @API(help = "Maximum number of iterations. Value should >=1. A value of 0 is only set when only the model " + + "coefficient names and model coefficient dimensions are needed.", level = Level.secondary) + public int max_iterations; + + @API(level = API.Level.expert, direction = API.Direction.INOUT, gridable=true, + help = "An array that contains initial values of the fixed effects coefficient.") + public double[] initial_fixed_effects; + + @API(level = API.Level.expert, direction = API.Direction.INOUT, gridable=true, + help = "A H2OFrame id that contains initial values of the random effects coefficient. The row names should" + + "be the random coefficient names. If you are not sure what the random coefficient names are," + + " build HGLM model with max_iterations = 0 and checkout the model output field " + + "random_coefficient_names. The number of rows of this frame should be the number of level 2" + + " units. Again, to figure this out, build HGLM model with max_iterations=0 and check out " + + "the model output field group_column_names. The number of rows should equal the length of the" + + "group_column_names.") + public KeyV3.FrameKeyV3 initial_random_effects; + + @API(level = API.Level.expert, direction = API.Direction.INOUT, gridable=true, + help = "A H2OFrame id that contains initial values of the T matrix. It should be a positive symmetric matrix.") + public KeyV3.FrameKeyV3 initial_t_matrix; + + @API(help = "Initial variance of random coefficient effects. If set, should provide a value > 0.0. If not set, " + + "will be randomly set in the model building process." + , level = Level.expert, gridable = true) + public double tau_u_var_init; + + @API(help = "Initial variance of random noise. If set, should provide a value > 0.0. If not set, will be randomly" + + " set in the model building process." + , level = Level.expert, gridable = true) + public double tau_e_var_init; + + @API(help = "Random columns indices for HGLM.", gridable=true) + public String[] random_columns; + + @API(help = "We only implemented EM as a method to obtain the fixed, random coefficients and the various variances.", + values = {"EM"}, level = Level.critical) + public HGLMModel.HGLMParameters.Method method; + + @API(help = "Converge if beta/ubeta/tmat/tauEVar changes less (using L-infinity norm) than em esilon. ONLY applies to EM method." 
+ , level = Level.expert) + public double em_epsilon; + + @API(help="If true, will allow random component to the GLM coefficients.", direction=Direction.INPUT, gridable=true) + public boolean random_intercept; + + @API(help="Group column is the column that is categorical and used to generate the groups in HGLM", gridable=true) + public String group_column; + + @API(help="If true, add gaussian noise with variance specified in parms._tau_e_var_init.", + direction=Direction.INPUT, gridable=true) + public boolean gen_syn_data; + } +} diff --git a/h2o-algos/src/main/java/hex/schemas/ModelSelectionV3.java b/h2o-algos/src/main/java/hex/schemas/ModelSelectionV3.java index 38184ce88bc3..3dd7f5455ef7 100644 --- a/h2o-algos/src/main/java/hex/schemas/ModelSelectionV3.java +++ b/h2o-algos/src/main/java/hex/schemas/ModelSelectionV3.java @@ -48,7 +48,7 @@ public static final class ModelSelectionParametersV3 extends ModelParametersSche "objective_epsilon", "beta_epsilon", "gradient_epsilon", - "startval", // initial starting values for fixed and randomized coefficients, double array + "startval", // initial starting values for coefficients, double array "prior", "cold_start", // if true, will start GLM model from initial values and conditions "lambda_min_ratio", @@ -186,11 +186,11 @@ public static final class ModelSelectionParametersV3 extends ModelParametersSche "inverse", "tweedie", "ologit"}) //"oprobit", "ologlog": will be supported. public GLMModel.GLMParameters.Link link; - @API(help = "double array to initialize fixed and random coefficients for HGLM, coefficients for GLM.", + @API(help = "Double array to initialize coefficients for GLM.", gridable=true) public double[] startval; - @API(help = "if true, will return likelihood function value for HGLM.") // not gridable + @API(help = "If true, will return likelihood function value for GLM.") // not gridable public boolean calc_like; @API(level = API.Level.critical, direction = API.Direction.INOUT, diff --git a/h2o-algos/src/main/resources/META-INF/services/water.api.Schema b/h2o-algos/src/main/resources/META-INF/services/water.api.Schema index 18c3ea3dc972..fcc18aa1585e 100644 --- a/h2o-algos/src/main/resources/META-INF/services/water.api.Schema +++ b/h2o-algos/src/main/resources/META-INF/services/water.api.Schema @@ -114,6 +114,10 @@ hex.schemas.UpliftDRFModelV3 hex.schemas.UpliftDRFModelV3$UpliftDRFModelOutputV3 hex.schemas.UpliftDRFV3 hex.schemas.UpliftDRFV3$UpliftDRFParametersV3 +hex.schemas.HGLMModelV3 +hex.schemas.HGLMModelV3$HGLMModelOutputV3 +hex.schemas.HGLMV3 +hex.schemas.HGLMV3$HGLMParametersV3 hex.schemas.AdaBoostModelV3 hex.schemas.AdaBoostModelV3$AdaBoostModelOutputV3 hex.schemas.AdaBoostV3 diff --git a/h2o-algos/src/test/java/hex/generic/GenericModelTest.java b/h2o-algos/src/test/java/hex/generic/GenericModelTest.java index cc310ad7a5c4..12c44698485c 100644 --- a/h2o-algos/src/test/java/hex/generic/GenericModelTest.java +++ b/h2o-algos/src/test/java/hex/generic/GenericModelTest.java @@ -707,7 +707,6 @@ public void downloadable_mojo_glm() throws IOException { parms._train = trainingFrame._key; parms._distribution = AUTO; parms._response_column = trainingFrame._names[1]; - parms._rand_family = new GLMModel.GLMParameters.Family[] {GLMModel.GLMParameters.Family.AUTO}; GLM job = new GLM(parms); final GLMModel originalModel = job.trainModel().get(); diff --git a/h2o-algos/src/test/java/hex/glm/GLMBasicTestHGLM.java b/h2o-algos/src/test/java/hex/glm/GLMBasicTestHGLM.java deleted file mode 100644 index b2252c9ee97e..000000000000 --- 
a/h2o-algos/src/test/java/hex/glm/GLMBasicTestHGLM.java +++ /dev/null @@ -1,94 +0,0 @@ -package hex.glm; - -import hex.ModelMetricsHGLMGaussianGaussian; -import hex.glm.GLMModel.GLMParameters; -import hex.glm.GLMModel.GLMParameters.Family; -import org.junit.BeforeClass; -import org.junit.Test; -import water.DKV; -import water.Scope; -import water.TestUtil; -import water.fvec.Frame; - -import java.io.IOException; - -import static org.junit.Assert.assertEquals; - -/** - * Created by tomasnykodym on 6/4/15. - */ -public class GLMBasicTestHGLM extends TestUtil { - - @BeforeClass - public static void setup() throws IOException { - stall_till_cloudsize(1); - } - - @Test - public void testSemiconductor(){ - try { - Scope.enter(); - Frame fr = parseTestFile("smalldata/glm_test/semiconductor.csv"); - fr.replace(0, fr.vec(0).toCategoricalVec()).remove(); - DKV.put(fr); - Scope.track(fr); - GLMParameters parms = new GLMParameters(); - parms._train = fr._key; - parms._response_column = "y"; - parms._ignored_columns = new String[]{"x2","x4","Device"}; - parms._ignore_const_cols = true; - parms._family = Family.gaussian; - parms._link = GLMParameters.Link.identity; - parms._HGLM=true; - parms._rand_family = new Family[] {Family.gaussian}; - parms._rand_link = new GLMParameters.Link[] {GLMParameters.Link.identity}; - parms._random_columns = new int[]{0}; - parms._calc_like = true; - - // just make sure it runs - GLMModel model = new GLM(parms).trainModel().get(); - Scope.track_generic(model); - ModelMetricsHGLMGaussianGaussian mmetrics = (ModelMetricsHGLMGaussianGaussian) model._output._training_metrics; - Scope.track_generic(mmetrics); - assertEquals(363.6833, mmetrics._hlik, 1e-4); - System.out.println("**************** testSemiconductor test completed. ****************"); - } finally { - Scope.exit(); - } - } - - @Test - public void testMultiChunkData(){ - try { - Scope.enter(); - Frame fr = parseTestFile("smalldata/glm_test/HGLM_5KRows_100Z.csv"); - fr.replace(0, fr.vec(0).toCategoricalVec()).remove(); - fr.replace(1, fr.vec(1).toCategoricalVec()).remove(); - fr.replace(2, fr.vec(2).toCategoricalVec()).remove(); - fr.replace(3, fr.vec(3).toCategoricalVec()).remove(); - DKV.put(fr); - Scope.track(fr); - GLMParameters parms = new GLMParameters(); - parms._train = fr._key; - parms._response_column = "response"; - parms._ignored_columns = new String[]{"Z"}; - parms._ignore_const_cols = true; - parms._family = Family.gaussian; - parms._link = GLMParameters.Link.identity; - parms._HGLM=true; - parms._rand_family = new Family[] {Family.gaussian}; - parms._rand_link = new GLMParameters.Link[] {GLMParameters.Link.identity}; - parms._random_columns = new int[]{0}; - parms._calc_like = true; - - // just make sure it runs - GLMModel model = new GLM(parms).trainModel().get(); - Scope.track_generic(model); - ModelMetricsHGLMGaussianGaussian mmetrics = (ModelMetricsHGLMGaussianGaussian) model._output._training_metrics; - Scope.track_generic(mmetrics); - assertEquals(-23643.3076231, mmetrics._hlik, 1e-4); - } finally { - Scope.exit(); - } - } -} diff --git a/h2o-algos/src/test/java/hex/glm/GLMCheckpointTest.java b/h2o-algos/src/test/java/hex/glm/GLMCheckpointTest.java index 07407bca4472..149e5f1da6ba 100644 --- a/h2o-algos/src/test/java/hex/glm/GLMCheckpointTest.java +++ b/h2o-algos/src/test/java/hex/glm/GLMCheckpointTest.java @@ -133,12 +133,12 @@ private int[] restoreScoringHistoryFromCheckpoint(TwoDimTable scoringHistory, GL "deviance_test", "alpha"} : new String[]{"iteration", "timestamp", 
"negative_log_likelihood", "objective", "sum(etai-eta0)^2", "convergence"}; - int num2Copy = parms._HGLM || parms._lambda_search ? colHeaders2Restore.length : colHeaders2Restore.length-2; + int num2Copy = parms._lambda_search ? colHeaders2Restore.length : colHeaders2Restore.length-2; int[] colHeadersIndex = grabHeaderIndex(scoringHistory, num2Copy, colHeaders2Restore); if (parms._lambda_search) lscHistory.restoreFromCheckpoint(scoringHistory, colHeadersIndex); else - scHistory.restoreFromCheckpoint(scoringHistory, colHeadersIndex, parms._HGLM); + scHistory.restoreFromCheckpoint(scoringHistory, colHeadersIndex); return colHeadersIndex; } diff --git a/h2o-algos/src/test/java/hex/glm/GLMConstrainedTest.java b/h2o-algos/src/test/java/hex/glm/GLMConstrainedTest.java index a4ae87ea0b03..c21d39da36d5 100644 --- a/h2o-algos/src/test/java/hex/glm/GLMConstrainedTest.java +++ b/h2o-algos/src/test/java/hex/glm/GLMConstrainedTest.java @@ -1,7 +1,6 @@ package hex.glm; import Jama.Matrix; -import hex.gam.MatrixFrameUtils.GamUtils; import org.junit.After; import org.junit.Before; import org.junit.Test; @@ -31,6 +30,7 @@ import static org.junit.Assert.assertTrue; import static water.fvec.Vec.T_NUM; import static water.fvec.Vec.T_STR; +import static water.util.ArrayUtils.copy2DArray; @RunWith(H2ORunner.class) @CloudSize(1) @@ -917,7 +917,7 @@ public void testFindDropZeroColumns() { Matrix initMat = Matrix.random(11, 11); double[][] doubleValsOrig = (initMat.plus(initMat.transpose())).getArray(); double[][] doubleVals = new double[doubleValsOrig.length][doubleValsOrig.length]; - GamUtils.copy2DArray(doubleValsOrig, doubleVals); + copy2DArray(doubleValsOrig, doubleVals); // no zero columns int[] numZeroCol = findZeroCols(doubleVals); assertTrue("number of zero columns is zero in this case but is not.", numZeroCol.length==0); diff --git a/h2o-algos/src/test/java/hex/hglm/HGLMBasicTest.java b/h2o-algos/src/test/java/hex/hglm/HGLMBasicTest.java new file mode 100644 index 000000000000..396ba4005781 --- /dev/null +++ b/h2o-algos/src/test/java/hex/hglm/HGLMBasicTest.java @@ -0,0 +1,905 @@ +package hex.hglm; + +import Jama.Matrix; +import hex.SplitFrame; +import hex.glm.GLMModel; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import water.DKV; +import water.Key; +import water.Scope; +import water.TestUtil; +import water.fvec.Frame; +import water.fvec.TestFrameBuilder; +import water.runner.CloudSize; +import water.runner.H2ORunner; +import water.util.ArrayUtils; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Random; + +import static org.junit.Assert.assertEquals; +import static water.fvec.Vec.T_CAT; +import static water.fvec.Vec.T_NUM; +import static water.util.ArrayUtils.innerProduct; +import static water.util.ArrayUtils.outerProduct; + +@RunWith(H2ORunner.class) +@CloudSize(1) +public class HGLMBasicTest extends TestUtil { + public static final double TOL = 1e-4; + Frame _simpleFrame1; + Frame _simpleFrameSortEnum1; + Frame _simpleFrameSortEnum2; + Frame _simpleFrameSortEnum3; + Frame _simpleFrameSortEnum4; + Frame _simpleFrameSortEnum5; + Frame _simpleFrameSortEnum6; + + @Before + public void setup() { + Scope.enter(); + _simpleFrame1 = Scope.track(getSimpleFrame1()); + _simpleFrameSortEnum1 = new Frame(_simpleFrame1.sort(new int[]{0})); + DKV.put(_simpleFrameSortEnum1); + Scope.track(_simpleFrameSortEnum1); + _simpleFrameSortEnum2 = new Frame(_simpleFrame1.sort(new int[]{1})); + 
Scope.track(_simpleFrameSortEnum2); + DKV.put(_simpleFrameSortEnum2); + _simpleFrameSortEnum3 = new Frame(_simpleFrame1.sort(new int[]{3})); + Scope.track(_simpleFrameSortEnum3); + DKV.put(_simpleFrameSortEnum3); + _simpleFrameSortEnum4 = new Frame(_simpleFrame1.sort(new int[]{4})); + Scope.track(_simpleFrameSortEnum4); + DKV.put(_simpleFrameSortEnum4); + _simpleFrameSortEnum5 = new Frame(_simpleFrame1.sort(new int[]{5})); + Scope.track(_simpleFrameSortEnum5); + DKV.put(_simpleFrameSortEnum5); + _simpleFrameSortEnum6 = new Frame(_simpleFrame1.sort(new int[]{6})); + Scope.track(_simpleFrameSortEnum6); + DKV.put(_simpleFrameSortEnum6); + } + + public Frame getSimpleFrame1() { + Frame frame1 = new TestFrameBuilder() + .withColNames("enum1", "enum2", "enum3", "enum4", "enum5", "enum6", "num1", "num2", "num3", "response") + .withVecTypes(T_CAT, T_CAT, T_CAT, T_CAT, T_CAT, T_CAT, T_NUM, T_NUM, T_NUM, T_NUM) + .withDataForCol(0, new String[]{"4","6","1","5","0","3","2","4","6","1","5","0","3","2","2","4","5", + "0","3","1","6"}) + .withDataForCol(1, new String[]{"4","2","1","3","5","0","5","4","3","0","1","2","4","0","1","5","2", + "3","5","4","1"}) + .withDataForCol(2, new String[]{"2","4","1","3","0","3","2","1","4","0","3","4","1","2","0","2","1", + "3","4","0","4"}) + .withDataForCol(3, new String[]{"1","0","3","2","2","3","0","1","1","2","3","0","3","2","1","0","1", + "3","2","0","0"}) + .withDataForCol(4, new String[]{"1","2","0","0","2","1","2","0","1","1","0","2","1","0","2","2","1", + "0","2","1","0"}) + .withDataForCol(5, new String[]{"0","1","0","1","0","1","0","0","1","0","1","0","1","0","1","0","1", + "0","1","0","1"}) + .withDataForCol(6, new double[]{1.8927, 0.7133, 0.08293, 0.6011, 0.2649, 0.8661, 0.8842, 0.63299, + 0.4035, 0.8388, 0.8383, 0.0594, 0.6184, 0.5409, 0.4051, 0.6057, 0.8923, 0.5943, 0.0418, 0.6039, 0.5505}) + .withDataForCol(7, new double[]{0.8234, 1.7421, 0.6429, 0.0266, 0.1297, 0.14113, 0.9964, 0.2733, + 0.2033, 0.202, 0.5686, 0.6647, 0.348, 0.2829, 0.3381, 0.1031, 0.0311, 0.6848, 0.4419, 0.1148, 0.4001}) + .withDataForCol(8, new double[]{0.70475, 0.3551, 1.4599, 0.3418, 0.8728, 0.74046, 0.8455, 0.7969, + 0.78093, 0.39793, 0.73438, 0.8195, 0.556, 0.1135, 0.0814, 0.1734, 0.1343, 0.4957, 0.3189, 0.7773, 0.1559}) + .withDataForCol(9, new double[]{0.3489, 0.4978, 0.1525, 1.9239, 0.8210, 0.4121, 0.0462, 0.4824, + 0.6821, 0.7671, 0.8811, 0.8045, 0.65, 0.4112, 0.972, 0.112, 0.6828, 0.237, 0.541, 0.6329, 0.4035}) + .build(); + return frame1; + } + + @After + public void teardown() { + Scope.exit(); + } + + // The next set of tests will check and make sure we generate the correct fixed matrices and vectors and they are: + // AfjTAfj, AfjTYj, AfjTrj, ArjTArj, ArjTYj + + // In this test, we choose the enum pred with the highest number of enum levels to be the cluster group + @Test + public void testLevel2enum1() { + Scope.enter(); + try { + HGLMModel.HGLMParameters params = new HGLMModel.HGLMParameters(); + params._train = _simpleFrame1._key; + params._response_column = "response"; + params._group_column = "enum1"; + params._use_all_factor_levels = true; + params._random_columns = new String[]{"enum6", "enum4", "enum2", "num1", "num3"}; + params._showFixedMatVecs = true; + params._max_iterations = 0; + HGLMModel model = new HGLM(params).trainModel().get(); + Scope.track_generic(model); + checkWithManualResults1(model, params._response_column, _simpleFrame1, params._group_column, + new String[]{"enum2", "enum3", "enum4", "enum5", "enum6"}, new String[]{"num1", "num2", "num3"}, + 
new String[]{"enum2", "enum4", "enum6"}, new String[]{"num1", "num3"}); + } finally { + Scope.exit(); + } + } + + @Test + public void testLevel2enum1NoIntercept() { + Scope.enter(); + try { + HGLMModel.HGLMParameters params = new HGLMModel.HGLMParameters(); + params._train = _simpleFrame1._key; + params._response_column = "response"; + params._group_column = "enum1"; + params._use_all_factor_levels = true; + params._random_columns = new String[]{"enum6", "enum4", "enum2", "num1", "num3"}; + params._showFixedMatVecs = true; + params._max_iterations = 0; + params._random_intercept = true; + HGLMModel model = new HGLM(params).trainModel().get(); + Scope.track_generic(model); + checkWithManualResults1(model, params._response_column, _simpleFrame1, params._group_column, + new String[]{"enum2", "enum3", "enum4", "enum5", "enum6"}, new String[]{"num1", "num2", "num3"}, + new String[]{"enum2", "enum4", "enum6"}, new String[]{"num1", "num3"}); + } finally { + Scope.exit(); + } + } + + @Test + public void testLevel2enum1V2() { + Scope.enter(); + try { + HGLMModel.HGLMParameters params = new HGLMModel.HGLMParameters(); + params._train = _simpleFrame1._key; + params._response_column = "response"; + params._group_column = "enum1"; + params._use_all_factor_levels = false; + params._random_columns = new String[]{"enum6", "enum4", "enum2", "num1", "num3"}; + params._showFixedMatVecs = true; + params._max_iterations = 0; + HGLMModel model = new HGLM(params).trainModel().get(); + Scope.track_generic(model); + checkWithManualResults1(model, params._response_column, _simpleFrame1, params._group_column, + new String[]{"enum2", "enum3", "enum4", "enum5", "enum6"}, new String[]{"num1", "num2", "num3"}, + new String[]{"enum2", "enum4", "enum6"}, new String[]{"num1", "num3"}); + } finally { + Scope.exit(); + } + } + + // In this test, we choose the enum2 to be the cluster group + @Test + public void testLevel2enum2() { + Scope.enter(); + try { + HGLMModel.HGLMParameters params = new HGLMModel.HGLMParameters(); + params._train = _simpleFrame1._key; + params._response_column = "response"; + params._group_column = "enum2"; + params._use_all_factor_levels = true; + params._random_columns = new String[]{"enum1", "enum4", "enum6", "num3", "num2"}; + params._showFixedMatVecs = true; + params._max_iterations = 0; + HGLMModel model = new HGLM(params).trainModel().get(); + Scope.track_generic(model); + checkWithManualResults1(model, params._response_column, _simpleFrame1, params._group_column, + new String[]{"enum1", "enum3", "enum4", "enum5", "enum6"}, new String[]{"num1", "num2", "num3"}, + new String[]{"enum1", "enum4", "enum6"}, new String[]{"num2", "num3"}); + } finally { + Scope.exit(); + } + } + + @Test + public void testLevel2enum2V2() { + Scope.enter(); + try { + HGLMModel.HGLMParameters params = new HGLMModel.HGLMParameters(); + params._train = _simpleFrame1._key; + params._response_column = "response"; + params._group_column = "enum2"; + params._use_all_factor_levels = false; + params._random_columns = new String[]{"enum1", "enum4", "enum6", "num3", "num2"}; + params._showFixedMatVecs = true; + params._max_iterations = 0; + HGLMModel model = new HGLM(params).trainModel().get(); + Scope.track_generic(model); + checkWithManualResults1(model, params._response_column, _simpleFrame1, params._group_column, + new String[]{"enum1", "enum3", "enum4", "enum5", "enum6"}, new String[]{"num1", "num2", "num3"}, + new String[]{"enum1", "enum4", "enum6"}, new String[]{"num2", "num3"}); + } finally { + Scope.exit(); + } + } + + 
@Test + public void testLevel2enum2V2NoRandomIntercept() { + Scope.enter(); + try { + HGLMModel.HGLMParameters params = new HGLMModel.HGLMParameters(); + params._train = _simpleFrame1._key; + params._response_column = "response"; + params._group_column = "enum2"; + params._use_all_factor_levels = false; + params._random_columns = new String[]{"enum1", "enum4", "enum6", "num3", "num2"}; + params._showFixedMatVecs = true; + params._max_iterations = 0; + params._random_intercept = false; + HGLMModel model = new HGLM(params).trainModel().get(); + Scope.track_generic(model); + checkWithManualResults1(model, params._response_column, _simpleFrame1, params._group_column, + new String[]{"enum1", "enum3", "enum4", "enum5", "enum6"}, new String[]{"num1", "num2", "num3"}, + new String[]{"enum1", "enum4", "enum6"}, new String[]{"num2", "num3"}); + } finally { + Scope.exit(); + } + } + + // In this test, we choose the enum3 to be the cluster group + @Test + public void testLevel2enum3() { + Scope.enter(); + try { + HGLMModel.HGLMParameters params = new HGLMModel.HGLMParameters(); + params._train = _simpleFrame1._key; + params._response_column = "response"; + params._group_column = "enum3"; + params._use_all_factor_levels = true; + params._random_columns = new String[]{"enum4", "enum1", "num1", "num2"}; + params._showFixedMatVecs = true; + params._max_iterations = 0; + HGLMModel model = new HGLM(params).trainModel().get(); + Scope.track_generic(model); + checkWithManualResults1(model, params._response_column, _simpleFrame1, params._group_column, + new String[]{"enum1", "enum2", "enum4", "enum5", "enum6"}, new String[]{"num1", "num2", "num3"}, + new String[]{"enum1", "enum4"}, new String[]{"num1", "num2"}); + } finally { + Scope.exit(); + } + } + + @Test + public void testLevel2enum3V2() { + Scope.enter(); + try { + HGLMModel.HGLMParameters params = new HGLMModel.HGLMParameters(); + params._train = _simpleFrame1._key; + params._response_column = "response"; + params._group_column = "enum3"; + params._use_all_factor_levels = false; + params._random_columns = new String[]{"enum4", "enum1", "num1", "num2"}; + params._showFixedMatVecs = true; + params._max_iterations = 0; + HGLMModel model = new HGLM(params).trainModel().get(); + Scope.track_generic(model); + checkWithManualResults1(model, params._response_column, _simpleFrame1, params._group_column, + new String[]{"enum1", "enum2", "enum4", "enum5", "enum6"}, new String[]{"num1", "num2", "num3"}, + new String[]{"enum1", "enum4"}, new String[]{"num1", "num2"}); + } finally { + Scope.exit(); + } + } + + // In this test, we choose the enum4 to be the cluster group + @Test + public void testLevel2enum4() { + Scope.enter(); + try { + HGLMModel.HGLMParameters params = new HGLMModel.HGLMParameters(); + params._train = _simpleFrame1._key; + params._response_column = "response"; + params._group_column = "enum4"; + params._use_all_factor_levels = true; + params._random_columns = new String[]{"enum3", "enum1", "enum2", "enum6", "num1", "num2"}; + params._showFixedMatVecs = true; + params._max_iterations = 0; + HGLMModel model = new HGLM(params).trainModel().get(); + Scope.track_generic(model); + checkWithManualResults1(model, params._response_column, _simpleFrame1, params._group_column, + new String[]{"enum1", "enum2", "enum3", "enum5", "enum6"}, new String[]{"num1", "num2", "num3"}, + new String[]{"enum1", "enum2", "enum3", "enum6"}, new String[]{"num1", "num2"}); + } finally { + Scope.exit(); + } + } + + @Test + public void testLevel2enum4V2() { + Scope.enter(); + 
try { + HGLMModel.HGLMParameters params = new HGLMModel.HGLMParameters(); + params._train = _simpleFrame1._key; + params._response_column = "response"; + params._group_column = "enum4"; + params._use_all_factor_levels = false; + params._random_columns = new String[]{"enum3", "enum1", "enum2", "enum6", "num1", "num2"}; + params._showFixedMatVecs = true; + params._max_iterations = 0; + HGLMModel model = new HGLM(params).trainModel().get(); + Scope.track_generic(model); + checkWithManualResults1(model, params._response_column, _simpleFrame1, params._group_column, + new String[]{"enum1", "enum2", "enum3", "enum5", "enum6"}, new String[]{"num1", "num2", "num3"}, + new String[]{"enum1", "enum2", "enum3", "enum6"}, new String[]{"num1", "num2"}); + } finally { + Scope.exit(); + } + } + + @Test + public void testLevel2enum5() { + Scope.enter(); + try { + HGLMModel.HGLMParameters params = new HGLMModel.HGLMParameters(); + params._train = _simpleFrame1._key; + params._response_column = "response"; + params._group_column = "enum5"; + params._use_all_factor_levels = true; + params._random_columns = new String[]{"enum3", "enum1", "enum2", "enum6", "num1", "num3"}; + params._showFixedMatVecs = true; + params._max_iterations = 0; + HGLMModel model = new HGLM(params).trainModel().get(); + Scope.track_generic(model); + checkWithManualResults1(model, params._response_column, _simpleFrame1, params._group_column, + new String[]{"enum1", "enum2", "enum3", "enum4", "enum6"}, new String[]{"num1", "num2", "num3"}, + new String[]{"enum1", "enum2", "enum3", "enum6"}, new String[]{"num1", "num3"}); + } finally { + Scope.exit(); + } + } + + @Test + public void testLevel2enum5V2() { + Scope.enter(); + try { + HGLMModel.HGLMParameters params = new HGLMModel.HGLMParameters(); + params._train = _simpleFrame1._key; + params._response_column = "response"; + params._group_column = "enum5"; + params._use_all_factor_levels = false; + params._random_columns = new String[]{"enum3", "enum1", "enum2", "enum6", "num1", "num3"}; + params._showFixedMatVecs = true; + params._max_iterations = 0; + HGLMModel model = new HGLM(params).trainModel().get(); + Scope.track_generic(model); + checkWithManualResults1(model, params._response_column, _simpleFrame1, params._group_column, + new String[]{"enum1", "enum2", "enum3", "enum4", "enum6"}, new String[]{"num1", "num2", "num3"}, + new String[]{"enum1", "enum2", "enum3", "enum6"}, new String[]{"num1", "num3"}); + } finally { + Scope.exit(); + } + } + + @Test + public void testLevel2enum6() { + Scope.enter(); + try { + HGLMModel.HGLMParameters params = new HGLMModel.HGLMParameters(); + params._train = _simpleFrame1._key; + params._response_column = "response"; + params._group_column = "enum6"; + params._use_all_factor_levels = true; + params._random_columns = new String[]{"enum3", "enum1", "enum2", "enum5", "num1", "num3"}; + params._showFixedMatVecs = true; + params._max_iterations = 0; + HGLMModel model = new HGLM(params).trainModel().get(); + Scope.track_generic(model); + checkWithManualResults1(model, params._response_column, _simpleFrame1, params._group_column, + new String[]{"enum1", "enum2", "enum3", "enum4", "enum5"}, new String[]{"num1", "num2", "num3"}, + new String[]{"enum1", "enum2", "enum3", "enum5"}, new String[]{"num1", "num3"}); + } finally { + Scope.exit(); + } + } + + @Test + public void testLevel2enum6V2() { + Scope.enter(); + try { + HGLMModel.HGLMParameters params = new HGLMModel.HGLMParameters(); + params._train = _simpleFrame1._key; + params._response_column = "response"; 
+ params._group_column = "enum6"; + params._max_iterations = 0; + params._use_all_factor_levels = false; + params._random_columns = new String[]{"enum3", "enum1", "enum2", "enum5", "num1", "num3"}; + params._showFixedMatVecs = true; + HGLMModel model = new HGLM(params).trainModel().get(); + Scope.track_generic(model); + checkWithManualResults1(model, params._response_column, _simpleFrame1, params._group_column, + new String[]{"enum1", "enum2", "enum3", "enum4", "enum5"}, new String[]{"num1", "num2", "num3"}, + new String[]{"enum1", "enum2", "enum3", "enum5"}, new String[]{"num1", "num3"}); + } finally { + Scope.exit(); + } + } + + // todo: add check _yMinusXTimesZ and _yMinusfixPredSquare + public void checkWithManualResults1(HGLMModel model, String response, Frame fr, String level2Name, + String[] enumFixed, String[] numFixed, String[] enumRandom, String[] numRandom) { + double[] fixedRowValues; + double[] randomRowValues; + int numLevel2Vals = fr.vec(level2Name).domain().length; + double[][][] afjTAfj = new double[numLevel2Vals][][]; + double[][][] arjTArj = new double[numLevel2Vals][][]; + double[][][] afjTArj = new double[numLevel2Vals][][]; + double[][] afjTYj = new double[numLevel2Vals][]; + double[][] arjTYj = new double[numLevel2Vals][]; + double yMinusfixPredSquare = 0; + double[][] yMinusXTimesZ = new double[numLevel2Vals][model._output._yMinusXTimesZ[0].length]; + double[] beta = model._output._beta; + + int numRow = (int) fr.numRows(); + double responseVal; + int unit2Level; + double fixEffect; + double respMinusFix; + + for (int rowInd = 0; rowInd < numRow; rowInd++) { + fixedRowValues = grabRow2Arrays(enumFixed, numFixed, true, rowInd, fr, model._parms._use_all_factor_levels); + randomRowValues = grabRow2Arrays(enumRandom, numRandom, model._parms._random_intercept, rowInd, fr, model._parms._use_all_factor_levels); + responseVal = fr.vec(response).at(rowInd); + unit2Level = (int) fr.vec(level2Name).at(rowInd); + // calculate the various matrices and vectors + formMatrix(afjTAfj, unit2Level, fixedRowValues, fixedRowValues); // calculate afjTAfj + formMatrix(arjTArj, unit2Level, randomRowValues, randomRowValues); // calculate arjTArj + formMatrix(afjTArj, unit2Level, fixedRowValues, randomRowValues); // calculate afjTArj + formVector(afjTYj, unit2Level, fixedRowValues, responseVal); // calculate afjTYj + formVector(arjTYj, unit2Level, randomRowValues, responseVal); // calculate arjTYj + fixEffect = innerProduct(fixedRowValues, beta); + respMinusFix = responseVal - fixEffect; + yMinusfixPredSquare += respMinusFix*respMinusFix; + ArrayUtils.add(yMinusXTimesZ[unit2Level], ArrayUtils.mult(randomRowValues, respMinusFix)); + } + + // make sure manually generated matrices/vectors and those from model._output are the same + checkDoubleArrays(model._output._afjtyj, afjTYj, TOL); + checkDoubleArrays(model._output._arjtyj, arjTYj, TOL); + check3DArrays(model._output._afjtafj, afjTAfj, TOL); + check3DArrays(model._output._afjtarj, afjTArj, TOL); + check3DArrays(model._output._arjtarj, arjTArj, TOL); + checkDoubleArrays(model._output._yMinusXTimesZ, yMinusXTimesZ, TOL); + assertEquals(model._output._yMinusFixPredSquare, yMinusfixPredSquare, TOL); + } + + public void formVector(double[][] matrix, int level2Unit, double[] vector, double response) { + int len = vector.length; + + if (matrix[level2Unit] == null) + matrix[level2Unit] = new double[len]; + + for (int ind=0; ind < len; ind++) + matrix[level2Unit][ind] += vector[ind]*response; + + } + + public void formZTTimesZ(double[][] zTTimesZ, 
int unit2LevelS, double[] randomRowValuesS, double[][] result) { + int numRandVal = randomRowValuesS.length; + outerProduct(result, randomRowValuesS, randomRowValuesS); + int rowIndexStart = unit2LevelS*numRandVal; + int colIndexStart = unit2LevelS*numRandVal; + for (int index=0; index rowValues = new ArrayList<>(); + int catVal; + for (String enumName : enumPredNames) { + Double[] enumVal = new Double[useAllFactorLevels ? fr.vec(enumName).domain().length : (fr.vec(enumName).domain().length-1)]; + Arrays.fill(enumVal, 0.0); + catVal = (int) fr.vec(enumName).at(rowInd); + if (useAllFactorLevels && catVal >= 0) + enumVal[catVal] = 1.0; + if (!useAllFactorLevels && catVal > 0) + enumVal[(catVal-1)] = 1.0; + rowValues.addAll(Arrays.asList(enumVal)); + } + for (String numName:numPredNames) { + double val = fr.vec(numName).at(rowInd); + rowValues.add(val); + } + + if (hasIntercept) + rowValues.add(1.0); + return rowValues.stream().mapToDouble(Double::doubleValue).toArray(); + } + + // when we specify random columns in different permutation, the fixed matrices and vectors generated should be the + // same. + @Test + public void testMatVecFormation() { + Scope.enter(); + try { + HGLMModel.HGLMParameters params = new HGLMModel.HGLMParameters(); + params._train = _simpleFrame1._key; + params._response_column = "response"; + params._group_column = "enum1"; + params._use_all_factor_levels = true; + params._random_columns = new String[]{"enum2", "enum3", "num1", "num2"}; + params._showFixedMatVecs = true; + params._max_iterations = 0; + HGLMModel model = new HGLM(params).trainModel().get(); + Scope.track_generic(model); + + params._random_columns = new String[]{"num2", "num1", "enum3", "enum2"}; + HGLMModel model2 = new HGLM(params).trainModel().get(); + Scope.track_generic(model2); + checkDoubleArrays(model._output._afjtyj, model2._output._afjtyj, TOL); + checkDoubleArrays(model._output._arjtyj, model2._output._arjtyj, TOL); + check3DArrays(model._output._afjtafj, model2._output._afjtafj, TOL); + check3DArrays(model._output._afjtarj, model2._output._afjtarj, TOL); + check3DArrays(model._output._arjtarj, model2._output._arjtarj, TOL); + } finally { + Scope.exit(); + } + } + + public void checkCorrectInitValue(HGLMModel model, double[] initBetas, Frame ubetaFrame, Frame tMat, double sigmaEpsilon) { + // check fixed coefficient initialization + checkArrays(initBetas, model._output._beta, TOL); + // check random coefficient initialization + double[][] ubetaInit = new double[(int) ubetaFrame.numRows()][(int) ubetaFrame.numCols()]; + final ArrayUtils.FrameToArray f2a = new ArrayUtils.FrameToArray(0, ubetaInit[0].length-1, + ubetaInit.length, ubetaInit); + ubetaInit = f2a.doAll(ubetaFrame).getArray(); + checkDoubleArrays(ubetaInit, model._output._ubeta, TOL); + // check T matrix initialization + double[][] tMatInit = new double[tMat.numCols()][tMat.numCols()]; + final ArrayUtils.FrameToArray f2a2 = new ArrayUtils.FrameToArray(0, tMat.numCols()-1, tMatInit.length, tMatInit); + tMatInit = f2a2.doAll(tMat).getArray(); + checkDoubleArrays(tMatInit, model._output._tmat, TOL); + // check sigma epsilon initializaiton + assertEquals(sigmaEpsilon, model._output._tau_e_var, TOL); + } + + /** + * Here I am testing a different way to set the T matrix + */ + @Test + public void testSetInitT() { + Scope.enter(); + try { + Frame prostate = parseAndTrackTestFile("smalldata/prostate/prostate.csv"); + prostate.replace(3, prostate.vec(3).toCategoricalVec()).remove(); + prostate.replace(4, 
prostate.vec(4).toCategoricalVec()).remove(); + prostate.replace(5, prostate.vec(5).toCategoricalVec()).remove(); + DKV.put(prostate); + double sigmaU = 0.09847638; + HGLMModel.HGLMParameters params = new HGLMModel.HGLMParameters(); + params._train = prostate._key; + params._response_column = "VOL"; + params._ignored_columns = new String[]{"ID"}; + params._group_column = "RACE"; + params._use_all_factor_levels = true; + params._random_columns = new String[]{"GLEASON", "DPROS", "DCAPS"}; + params._tau_u_var_init = sigmaU; + params._max_iterations = 0; + HGLMModel model = new HGLM(params).trainModel().get(); + Scope.track_generic(model); + checkCorrectTMat(model, sigmaU); + } finally { + Scope.exit(); + } + } + + public void checkCorrectTMat(HGLMModel model, double sigmaU) { + double[][] correctTMat = new double[model._output._tmat.length][model._output._tmat.length]; + for (int ind=0; ind val) { // may not want to check all rows for large dataset + // grab xval and zval + fillDataRows(fr, index, coeffNames, rCoeffNames, xvals, zvals); + level2Val = (int) fr.vec(level2Col).at(index); + // produce estimated response from fixed effect + estimatedY = innerProduct(beta, xvals) + innerProduct(ubeta[level2Val], zvals); + // compare our answer with generated answer executed in parallel + assertEquals(estimatedY, predFrame.vec(0).at(index), TOL); + } + } + } + + public void fillDataRows(Frame fr, int rowInd, String[] coefNames, String[] rCoeffNames, double[] xvals, + double[] zvals) { + Arrays.fill(xvals, 0.0); + int interceptInd = xvals.length-1; + xvals[interceptInd] = 1.0; + Arrays.fill(zvals, 0.0); + if (zvals.length > rCoeffNames.length || "intercept".equals(rCoeffNames[rCoeffNames.length-1])) + zvals[zvals.length-1] = 1.0; + for (int index=0; index>> import h2o + >>> from h2o.estimators import H2OHGLMEstimator + >>> h2o.init() + >>> prostate_path = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv" + >>> prostate = h2o.import_file(path=prostate_path) + >>> prostate["CAPSULE"] = prostate["CAPSULE"].asfactor() + >>> hglm_model = H2OHGLMEstimator(random_columns = ["AGE"], group_column = "RACE") + >>> hglm_model.train(x=["AGE","RACE","DPROS"], y="CAPSULE", training_frame=prostate) + """ +) + + diff --git a/h2o-bindings/bin/gen_R.py b/h2o-bindings/bin/gen_R.py index fdb0150d4d28..e83e6134b6c4 100644 --- a/h2o-bindings/bin/gen_R.py +++ b/h2o-bindings/bin/gen_R.py @@ -272,6 +272,7 @@ def algo_to_modelname(algo): if algo == "modelselection": return "Model Selection" if algo == "infogram": return "Infogram" if algo == "adaboost": return "AdaBoost Model" + if algo == "hglm": return "HGLM Model" return algo @@ -350,6 +351,7 @@ def main(): if name == "pca": module = "prcomp" if name == "modelselection": module = "modelSelection" if name == "adaboost": module = "adaBoost" + if name == "hglm": module = "hglm" bi.vprint("Generating model: " + name) bi.write_to_file("%s.R" % file_name, gen_module(mb, name, module)) diff --git a/h2o-bindings/bin/gen_python.py b/h2o-bindings/bin/gen_python.py index 6fa044f77d0e..ccee92bd1384 100755 --- a/h2o-bindings/bin/gen_python.py +++ b/h2o-bindings/bin/gen_python.py @@ -352,6 +352,7 @@ def algo_to_classname(algo): if algo == "modelselection": return "H2OModelSelectionEstimator" if algo == "isotonicregression": return "H2OIsotonicRegressionEstimator" if algo == "adaboost": return "H2OAdaBoostEstimator" + if algo == "hglm": return "H2OHGLMEstimator" return "H2O" + algo.capitalize() + "Estimator" diff --git a/h2o-core/src/main/java/hex/ModelMetricHGLMGaussianGaussianGeneric.java
b/h2o-core/src/main/java/hex/ModelMetricHGLMGaussianGaussianGeneric.java deleted file mode 100644 index 1e70e1c73be2..000000000000 --- a/h2o-core/src/main/java/hex/ModelMetricHGLMGaussianGaussianGeneric.java +++ /dev/null @@ -1,16 +0,0 @@ -package hex; - -import water.fvec.Frame; - -public class ModelMetricHGLMGaussianGaussianGeneric extends ModelMetricsHGLMGeneric { - public ModelMetricHGLMGaussianGaussianGeneric(Model model, Frame frame, long nobs, double mse, String[] domain, - double sigma, CustomMetric customMetric, double[] sefe, double[] sere, - double varfix, double[] varranef, boolean converge, double dfrefe, - double[] summvc1, double[][] summvc2, double hlik, double pvh, - double pbvh, double cAIC, long bad, double sumEtaDiffSq, - double convergence, int[] randC, double[] fixef, double[] ranef, - int iteration) { - super(model, frame, nobs, mse, domain, sigma, customMetric, sefe, sere, varfix, varranef, converge, dfrefe, - summvc1, summvc2, hlik, pvh, pbvh, cAIC, bad, sumEtaDiffSq, convergence, randC, fixef, ranef, iteration); - } -} diff --git a/h2o-core/src/main/java/hex/ModelMetricsHGLM.java b/h2o-core/src/main/java/hex/ModelMetricsHGLM.java deleted file mode 100644 index f96f5a31596d..000000000000 --- a/h2o-core/src/main/java/hex/ModelMetricsHGLM.java +++ /dev/null @@ -1,158 +0,0 @@ -package hex; - -import water.exceptions.H2OIllegalArgumentException; -import water.fvec.Frame; - -import java.util.Arrays; - -public class ModelMetricsHGLM extends ModelMetricsSupervised { - public final double[] _sefe; // standard error of fixed predictors/effects - public final double[] _sere; // standard error of random effects - public final double[] _fixef; // fixed coefficients - public final double[] _ranef; // random coefficients - public final int[] _randc; // column indices of random columns - public final double _varfix; // dispersion parameter of the mean model (residual variance for LMM) - public final double[] _varranef; // dispersion parameter of the random effects (variance of random effects for GLMM) - public final boolean _converge; // true if model has converged - public final double _dfrefe; // deviance degrees of freedom for mean part of the model - public final double[] _summvc1; // estimates, standard errors of the linear predictor in the dispersion model - public final double[][] _summvc2;// estimates, standard errors of the linear predictor for dispersion parameter of random effects - public final double _hlik; // log h-likelihood - public final double _pvh; // adjusted profile log-likelihood profiled over random effects - public final double _pbvh; // adjusted profile log-likelihood profiled over fixed and random effects - public final double _caic; // conditional AIC - public final long _bad; // index of the most influential observation - public final double _sumetadiffsquare; // sum(etai-eta0)^2 - public final double _convergence; // sum(etai-eta0)^2/sum(etai)^2 - public final int _iterations; - - public ModelMetricsHGLM(Model model, Frame frame, long nobs, double mse, String[] domain, double sigma, - CustomMetric customMetric, double[] sefe, double[] sere, double varfix, double[] varranef, - boolean converge, double dfrefe, double[] summvc1, double[][] summvc2, double hlik, - double pvh, double pbvh, double cAIC, long bad, double sumEtaDiffSq, double convergence, - int[] randC, double[] fixef, double[] ranef, int iter) { - super(model, frame, nobs, mse, domain, sigma, customMetric); - _sefe = sefe; - _sere = sere; - _varfix = varfix; - _varranef = varranef; - 
_converge = converge; - _dfrefe = dfrefe; - _summvc1 = summvc1; - _summvc2 = summvc2; - _hlik = hlik; - _pvh = pvh; - _pbvh = pbvh; - _caic = cAIC; - _bad = bad; - _sumetadiffsquare = sumEtaDiffSq; - _convergence = convergence; - _randc = randC; - _fixef = fixef; - _ranef = ranef; - _iterations = iter; - } - - public static ModelMetricsHGLM getFromDKV(Model model, Frame frame) { - ModelMetrics mm = ModelMetrics.getFromDKV(model, frame); - if( !(mm instanceof ModelMetricsHGLM) ) - throw new H2OIllegalArgumentException("Expected to find a HGLM ModelMetrics for model: " + model._key.toString() - + " and frame: " + frame._key.toString(), "Expected to find a ModelMetricsHGLM for model: " + - model._key.toString() + " and frame: " + frame._key.toString() + " but found a: " + (mm == null ? null : mm.getClass())); - return (ModelMetricsHGLM) mm; - } - - @Override - public String toString() { - StringBuilder sb = new StringBuilder(); - sb.append(super.toString()); - sb.append(" standard error of fixed predictors effects: "+Arrays.toString(_sefe)); - sb.append(" standard error of random effects: "+Arrays.toString(_sere)); - sb.append(" dispersion parameter of the mean model (residual variance for LMM): "+ _varfix); - sb.append(" dispersion parameter of the random effects (variance of random effects for GLMM): "+ _varranef); - if (_converge) - sb.append(" HGLM has converged."); - else - sb.append(" HGLM has failed to converge."); - sb.append(" deviance degrees of freedom for mean part of the model: "+ _dfrefe); - sb.append(" estimates, standard errors of the linear predictor in the dispersion model: "+Arrays.toString(_summvc1)); - sb.append(" estimates, standard errors of the linear predictor for dispersion parameter of random effects: "+ - Arrays.toString(_summvc2)); - sb.append(" log h-likelihood: "+_hlik); - sb.append(" adjusted profile log-likelihood profiled over random effects: "+_pvh); - sb.append(" adjusted profile log-likelihood profiled over fixed and random effects: "+_pbvh); - sb.append(" conditional AIC: "+_caic); - sb.append(" index of the most influential observation: "+_bad); - sb.append(" sum(etai-eta0)^2: "+ _sumetadiffsquare); - sb.append("convergence (sum(etai-eta0)^2/sum(etai)^2): "+_convergence); - return sb.toString(); - } - - public static class MetricBuilderHGLM> extends MetricBuilderSupervised { - public double[] _sefe; // standard error of fixed predictors/effects - public double[] _sere; // standard error of random effects - public double _varfix; // dispersion parameter of the mean model (residual variance for LMM) - public double[] _varranef; // dispersion parameter of the random effects (variance of random effects for GLMM) - public boolean _converge; // true if model has converged - public double _dfrefe; // deviance degrees of freedom for mean part of the model - public double[] _summvc1; // estimates, standard errors of the linear predictor in the dispersion model - public double[][] _summvc2;// estimates, standard errors of the linear predictor for dispersion parameter of random effects - public double _hlik; // log h-likelihood - public double _pvh; // adjusted profile log-likelihood profiled over random effects - public double _pbvh; // adjusted profile log-likelihood profiled over fixed and random effects - public double _caic; // conditional AIC - public long _bad; // index of the most influential observation - public double _sumetadiffsquare; // sum(etai-eta0)^2 - public double _convergence; // sum(etai-eta0)^2/sum(etai)^2 - public double[] _fixef; - public 
double[] _ranef; - public int[] _randc; - public int _iterations; // number of iterations - public long _nobs; - - public MetricBuilderHGLM(String[] domain) { - super(0,domain); - } - - public void updateCoeffs(double[] fixedCoeffs, double[] randCoeffs) { - int fixfLen = fixedCoeffs.length; - if (_fixef ==null) - _fixef = new double[fixfLen]; - System.arraycopy(fixedCoeffs, 0, _fixef, 0, fixfLen); - - int randLen = randCoeffs.length; - if (_ranef == null) - _ranef = new double[randLen]; - System.arraycopy(randCoeffs, 0, _ranef, 0, randLen); - - } - - public void updateSummVC(double[] VC1, double[][] VC2, int[] randc) { - if (_summvc1 ==null) - _summvc1 = new double[2]; - System.arraycopy(VC1, 0, _summvc1, 0, 2); - - if (_summvc2 == null) { - _randc = randc; - _summvc2 = new double[randc.length][2]; - } - - for (int index = 0; index < randc.length; index++) - System.arraycopy(VC2[index], 0, _summvc2[index], 0, 2); - } - - @Override - public double[] perRow(double[] ds, float[] yact, Model m) { - return new double[0]; - } - - @Override - public ModelMetrics makeModelMetrics(Model m, Frame f, Frame adaptedFrame, Frame preds) { - ModelMetricsHGLM mm = new ModelMetricsHGLM(m, f, _nobs, 0, _domain, 0, _customMetric, _sefe, _sere, - _varfix, _varranef, _converge, _dfrefe, _summvc1, _summvc2, _hlik, _pvh, _pbvh, _caic, _bad, - _sumetadiffsquare, _convergence, _randc, _fixef, _ranef, _iterations); - if (m!=null) m.addModelMetrics(mm); - return mm; - } - } -} diff --git a/h2o-core/src/main/java/hex/ModelMetricsHGLMGaussianGaussian.java b/h2o-core/src/main/java/hex/ModelMetricsHGLMGaussianGaussian.java deleted file mode 100644 index 905c323dea8d..000000000000 --- a/h2o-core/src/main/java/hex/ModelMetricsHGLMGaussianGaussian.java +++ /dev/null @@ -1,58 +0,0 @@ -package hex; - -import water.fvec.Frame; -import java.util.Arrays; - -public class ModelMetricsHGLMGaussianGaussian extends ModelMetricsHGLM implements GLMMetrics { - public ModelMetricsHGLMGaussianGaussian(Model model, Frame frame, long nobs, double mse, String[] domain, double sigma, - CustomMetric customMetric, double[] sefe, double[] sere, double varfix, double[] varranef, - boolean converge, double dfrefe, double[] summvc1, double[][] summvc2, double hlik, - double pvh, double pbvh, double cAIC, long bad, double sumEtaDiffSq, - double convergence, int[] RandC, double[] fixef, double[] ranef, int iteration) { - super(model, frame, nobs, mse, domain, sigma, customMetric, sefe, sere, varfix, varranef, converge, dfrefe, - summvc1, summvc2, hlik, pvh, pbvh, cAIC, bad, sumEtaDiffSq, convergence, RandC, fixef, ranef, iteration); - } - - @Override - public double residual_deviance() { - return Double.NaN; - } - - @Override - public double null_deviance() { - return Double.NaN; - } - - @Override - public long residual_degrees_of_freedom() { return 0;} - - @Override - public long null_degrees_of_freedom() { - return 0; - } - - @Override - public String toString() { - StringBuilder sb = new StringBuilder(); - sb.append(super.toString()); - return sb.toString(); - } - - @Override - public boolean equals(Object o) { - if (!(o instanceof ModelMetricsHGLMGaussianGaussian)) - return false; - ModelMetricsHGLMGaussianGaussian mm = (ModelMetricsHGLMGaussianGaussian)o; - boolean isEquals = Arrays.equals(_sefe, mm._sefe) && Arrays.equals(_sere, mm._sere)&& _varfix ==mm._varfix && - _varranef ==mm._varranef &&_converge==mm._converge && Arrays.equals(_summvc1, mm._summvc1) && _hlik==mm._hlik - && _pvh==mm._pvh && _pbvh==mm._pbvh && _caic==mm._caic && 
_bad==mm._bad && - _sumetadiffsquare ==mm._sumetadiffsquare && _convergence==mm._convergence; - if (!isEquals) - return false; - for (int k = 0; k < _summvc2.length; k++) { - if (!Arrays.equals(_summvc2[k], mm._summvc2[k])) - return false; - } - return true; - } -} diff --git a/h2o-core/src/main/java/hex/ModelMetricsHGLMGeneric.java b/h2o-core/src/main/java/hex/ModelMetricsHGLMGeneric.java deleted file mode 100644 index 95c1b575b44c..000000000000 --- a/h2o-core/src/main/java/hex/ModelMetricsHGLMGeneric.java +++ /dev/null @@ -1,14 +0,0 @@ -package hex; - - import water.fvec.Frame; - -public class ModelMetricsHGLMGeneric extends ModelMetricsHGLM { - public ModelMetricsHGLMGeneric(Model model, Frame frame, long nobs, double mse, String[] domain, double sigma, - CustomMetric customMetric, double[] sefe, double[] sere, double varfix, double[] varranef, - boolean converge, double dfrefe, double[] summvc1, double[][] summvc2, double hlik, - double pvh, double pbvh, double cAIC, long bad, double sumEtaDiffSq, double convergence, - int[] randC, double[] fixef, double[] ranef, int iteration) { - super(model, frame, nobs, mse, domain, sigma, customMetric, sefe, sere, varfix, varranef, converge, dfrefe, - summvc1, summvc2, hlik, pvh, pbvh, cAIC, bad, sumEtaDiffSq, convergence, randC, fixef, ranef, iteration); - } -} diff --git a/h2o-core/src/main/java/hex/ModelMetricsRegressionHGLM.java b/h2o-core/src/main/java/hex/ModelMetricsRegressionHGLM.java new file mode 100644 index 000000000000..0a055d83e694 --- /dev/null +++ b/h2o-core/src/main/java/hex/ModelMetricsRegressionHGLM.java @@ -0,0 +1,118 @@ +package hex; + +import Jama.Matrix; +import water.exceptions.H2OIllegalArgumentException; +import water.fvec.Frame; +import water.util.ArrayUtils; + +import java.util.Arrays; + +import static water.util.ArrayUtils.*; + +public class ModelMetricsRegressionHGLM extends ModelMetricsRegression { + // the doc = document attached to https://github.com/h2oai/h2o-3/issues/8487, title HGLM_H2O_Implementation.pdf + // I will be referring to the doc and different parts of it to explain my implementation. + public static final double LOG_2PI = Math.log(2*Math.PI); + public final double[] _beta; // fixed coefficients + public final double[][] _ubeta; // random coefficients + public final double[] _icc; + public final int _iterations; + public final double[][] _tmat; + public final double _var_residual; // variance of residual error + public final double _log_likelihood; // llg from reference [2] of the doc + public final double _mse_fixed; // mse of with fixed effect only + + public ModelMetricsRegressionHGLM(Model model, Frame frame, long nobs, double sigma, double loglikelihood, + CustomMetric customMetric, int iter, double[] beta, double[][] ubeta, + double[][] tmat, double varResidual, double mse, double mse_fixed, double mae, + double rmlse, double meanResidualDeviance, double aic) { + super(model, frame, nobs, mse, sigma, mae, rmlse, meanResidualDeviance, customMetric, loglikelihood, aic); + _beta = beta; + _ubeta = ubeta; + _iterations = iter; + _tmat = tmat; + _var_residual = varResidual; + _icc = calICC(tmat, varResidual); + _log_likelihood = loglikelihood; + _mse_fixed = mse_fixed; + + } + + + /*** + * + * This method calculates the log-likelihood as described in section II.VI of the doc. Please keep this method + * even though nobody is calling it. 
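+ * + * A compact restatement, read directly off the implementation below rather than quoted from the doc: this returns + * llg = -0.5 * [ nobs*log(2*pi) + log( var_e * det(G^-1 + Z'Z/var_e) * det(G) ) + (1/var_e)*||y - X*beta||^2 + * - (1/var_e^2) * v'*(G^-1 + Z'Z/var_e)^-1*v ], where v = Z'(y - X*beta), var_e is the residual variance and + * G = blockdiag(T, ..., T) holds one copy of tmat per level-2 unit (see expandMat).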
+ */ + public static double calHGLMllg2(long nobs, double[][] tmat, double varResidual, double[][] zTTimesZ, + double yMinsXFixSqure, double[][] yMinusXFixTimesZ) { + double llg = nobs*LOG_2PI; + double oneOVar = 1.0/varResidual; + double oneOVarSq = oneOVar*oneOVar; + double[][] gMat = expandMat(tmat, yMinusXFixTimesZ.length); + double[][] tInvPlusZTT = calInnverV(gMat, zTTimesZ, oneOVar); + llg += Math.log(varResidual * new Matrix(tInvPlusZTT).det() * new Matrix(gMat).det()); + double[] yMinusXFixTimesZVec = flattenArray(yMinusXFixTimesZ); + Matrix yMinusXFixTimesZMat = new Matrix(new double[][] {yMinusXFixTimesZVec}).transpose(); + llg += oneOVar*yMinsXFixSqure - + yMinusXFixTimesZMat.transpose().times(new Matrix(tInvPlusZTT).inverse()).times(yMinusXFixTimesZMat).times(oneOVarSq).getArray()[0][0]; + return -0.5*llg; + } + + /** + * See the doc section II.V, calculates G inverse + transpose(Z)*Z/var_e. + */ + public static double[][] calInnverV(double[][] gmat, double[][] zTTimesZ, double oneOVar) { + try { + double[][] gmatInv = new Matrix(gmat).inverse().getArray(); + double[][] tempzTTimesZ = copy2DArray(zTTimesZ); + ArrayUtils.mult(tempzTTimesZ, oneOVar); + ArrayUtils.add(gmatInv, tempzTTimesZ); + return gmatInv; + } catch(Exception ex) { + throw new RuntimeException("Tmat matrix is singular."); + } + } + + public static ModelMetricsRegressionHGLM getFromDKV(Model model, Frame frame) { + ModelMetrics mm = ModelMetrics.getFromDKV(model, frame); + if (!(mm instanceof ModelMetricsRegressionHGLM)) + throw new H2OIllegalArgumentException("Expected to find a HGLM ModelMetrics for model: " + model._key.toString() + + " and frame: " + frame._key.toString(), "Expected to find a ModelMetricsHGLM for model: " + + model._key.toString() + " and frame: " + frame._key.toString() + " but found a: " + (mm == null ? 
null : mm.getClass())); + return (ModelMetricsRegressionHGLM) mm; + } + + public static double[] calICC(double[][] tmat, double varResidual) { + int numLevel2 = tmat.length; + double[] icc = new double[numLevel2]; + double denom = varResidual; + denom += new Matrix(tmat).trace(); // sum of diagonal + double oOverDenom = 1.0/denom; + for (int index=0; index> -extends ModelMetricsHGLMGenericV3 { - - @Override - public S fillFromImpl(ModelMetricHGLMGaussianGaussianGeneric modelMetrics) { - super.fillFromImpl(modelMetrics); - return (S) this; - } -} diff --git a/h2o-core/src/main/java/water/api/schemas3/ModelMetricsHGLMGaussianGaussianV3.java b/h2o-core/src/main/java/water/api/schemas3/ModelMetricsHGLMGaussianGaussianV3.java deleted file mode 100644 index 950d23bbb3bb..000000000000 --- a/h2o-core/src/main/java/water/api/schemas3/ModelMetricsHGLMGaussianGaussianV3.java +++ /dev/null @@ -1,11 +0,0 @@ -package water.api.schemas3; - -import hex.ModelMetricsHGLMGaussianGaussian; - -public class ModelMetricsHGLMGaussianGaussianV3 extends ModelMetricsHGLMV3 { -@Override -public ModelMetricsHGLMGaussianGaussianV3 fillFromImpl(ModelMetricsHGLMGaussianGaussian modelMetrics) { - super.fillFromImpl(modelMetrics); - return this; - } -} diff --git a/h2o-core/src/main/java/water/api/schemas3/ModelMetricsHGLMGenericV3.java b/h2o-core/src/main/java/water/api/schemas3/ModelMetricsHGLMGenericV3.java deleted file mode 100644 index 37709abdaf76..000000000000 --- a/h2o-core/src/main/java/water/api/schemas3/ModelMetricsHGLMGenericV3.java +++ /dev/null @@ -1,12 +0,0 @@ -package water.api.schemas3; - -import hex.ModelMetricsHGLMGeneric; - -public class ModelMetricsHGLMGenericV3> extends ModelMetricsHGLMV3 { - -@Override - public S fillFromImpl(ModelMetricsHGLMGeneric modelMetric) { - super.fillFromImpl(modelMetric); - return (S) this; - } -} diff --git a/h2o-core/src/main/java/water/api/schemas3/ModelMetricsHGLMV3.java b/h2o-core/src/main/java/water/api/schemas3/ModelMetricsHGLMV3.java deleted file mode 100644 index c35ad37502e6..000000000000 --- a/h2o-core/src/main/java/water/api/schemas3/ModelMetricsHGLMV3.java +++ /dev/null @@ -1,96 +0,0 @@ -package water.api.schemas3; - -import hex.ModelMetricsHGLM; -import water.api.API; - -public class ModelMetricsHGLMV3> - extends ModelMetricsBaseV3 { - @API(help="standard error of fixed predictors/effects", direction=API.Direction.OUTPUT) - public double[] sefe; // standard error of fixed predictors/effects - - @API(help="standard error of random effects", direction=API.Direction.OUTPUT) - public double[] sere; // standard error of random effects - - @API(help="dispersion parameter of the mean model (residual variance for LMM)", direction=API.Direction.OUTPUT) - public double varfix; // dispersion parameter of the mean model (residual variance for LMM) - - @API(help="dispersion parameter of the random effects (variance of random effects for GLMM", direction=API.Direction.OUTPUT) - public double[] varranef; // dispersion parameter of the random effects (variance of random effects for GLMM) - - @API(help="fixed coefficient)", direction=API.Direction.OUTPUT) - public double[] fixef; // dispersion parameter of the mean model (residual variance for LMM) - - @API(help="random coefficients", direction=API.Direction.OUTPUT) - public double[] ranef; // dispersion parameter of the random effects (variance of random effects for GLMM) - - @API(help="true if model has converged", direction=API.Direction.OUTPUT) - public boolean converge; // true if model has converged - - @API(help="number 
of random columns", direction=API.Direction.OUTPUT) - public int[] randc; // indices of random columns - - @API(help="deviance degrees of freedom for mean part of the model", direction=API.Direction.OUTPUT) - public double dfrefe; // deviance degrees of freedom for mean part of the model - - @API(help="estimates, standard errors of the linear predictor in the dispersion model", direction=API.Direction.OUTPUT) - public double[] summvc1; // estimates, standard errors of the linear predictor in the dispersion model - - @API(help="estimates, standard errors of the linear predictor for dispersion parameter of random effects", direction=API.Direction.OUTPUT) - public double[][] summvc2;// estimates, standard errors of the linear predictor for dispersion parameter of random effects - - @API(help="log h-likelihood", direction=API.Direction.OUTPUT) - public double hlik; // log h-likelihood - - @API(help="adjusted profile log-likelihood profiled over random effects", direction=API.Direction.OUTPUT) - public double pvh; // adjusted profile log-likelihood profiled over random effects - - @API(help="adjusted profile log-likelihood profiled over fixed and random effects", direction=API.Direction.OUTPUT) - public double pbvh; // adjusted profile log-likelihood profiled over fixed and random effects - - @API(help="conditional AIC", direction=API.Direction.OUTPUT) - public double caic; // conditional AIC - - @API(help="index of the most influential observation", direction=API.Direction.OUTPUT) - public long bad; // index of the most influential observation - - @API(help="sum(etai-eta0)^2 where etai is current eta and eta0 is the previous one", direction=API.Direction.OUTPUT) - public double sumetadiffsquare; // sum(etai-eta0)^2 - - @API(help="sum(etai-eta0)^2/sum(etai)^2 ", direction=API.Direction.OUTPUT) - public double convergence; // sum(etai-eta0)^2/sum(etai)^2 - - @Override - public S fillFromImpl(ModelMetricsHGLM modelMetrics) { - super.fillFromImpl(modelMetrics); - hlik = modelMetrics._hlik; - pvh = modelMetrics._pvh; - pbvh = modelMetrics._pbvh; - caic = modelMetrics._caic; - bad = modelMetrics._bad; - sumetadiffsquare =modelMetrics._sumetadiffsquare; - convergence = modelMetrics._convergence; - randc = modelMetrics._randc; - fixef = modelMetrics._fixef; - ranef = modelMetrics._ranef; - varfix = modelMetrics._varfix; - varranef = modelMetrics._varranef; - converge = modelMetrics._converge; - dfrefe = modelMetrics._dfrefe; - sefe = new double[modelMetrics._sefe.length]; - System.arraycopy(modelMetrics._sefe, 0, sefe, 0, sefe.length); - sere = new double[modelMetrics._sere.length]; - System.arraycopy(modelMetrics._sere, 0, sere, 0, sere.length); - varranef = new double[modelMetrics._varranef.length]; - System.arraycopy(modelMetrics._varranef, 0, varranef, 0, varranef.length); - summvc1 = new double[modelMetrics._summvc1.length]; - System.arraycopy(modelMetrics._summvc1, 0, summvc1, 0, summvc1.length); - int numRandCol = randc.length; - summvc2 = new double[numRandCol][]; - for (int index=0; index < numRandCol; index++) { - int l = modelMetrics._summvc2[index].length; - summvc2[index] = new double[l]; - System.arraycopy(modelMetrics._summvc2[index], 0, summvc2[index], 0, l); - } - return (S) this; - } -} diff --git a/h2o-core/src/main/java/water/api/schemas3/ModelMetricsRegressionHGLMGenericV3.java b/h2o-core/src/main/java/water/api/schemas3/ModelMetricsRegressionHGLMGenericV3.java new file mode 100644 index 000000000000..e0c099719cae --- /dev/null +++ 
b/h2o-core/src/main/java/water/api/schemas3/ModelMetricsRegressionHGLMGenericV3.java @@ -0,0 +1,21 @@ +package water.api.schemas3; + +import hex.ModelMetricsRegressionHGLMGeneric; + +public class ModelMetricsRegressionHGLMGenericV3<I extends ModelMetricsRegressionHGLMGeneric, S extends ModelMetricsRegressionHGLMGenericV3<I, S>> + extends ModelMetricsRegressionHGLMV3<I, S> { + + @Override + public S fillFromImpl(ModelMetricsRegressionHGLMGeneric modelMetrics) { + super.fillFromImpl(modelMetrics); + log_likelihood = modelMetrics._log_likelihood; + icc = modelMetrics._icc; + beta = modelMetrics._beta; + ubeta = modelMetrics._ubeta; + iterations = modelMetrics._iterations; + tmat = modelMetrics._tmat; + var_residual = modelMetrics._var_residual; + mse_fixed = modelMetrics._mse_fixed; + return (S) this; + } +} diff --git a/h2o-core/src/main/java/water/api/schemas3/ModelMetricsRegressionHGLMV3.java b/h2o-core/src/main/java/water/api/schemas3/ModelMetricsRegressionHGLMV3.java new file mode 100644 index 000000000000..163fbac01c47 --- /dev/null +++ b/h2o-core/src/main/java/water/api/schemas3/ModelMetricsRegressionHGLMV3.java @@ -0,0 +1,45 @@ +package water.api.schemas3; + +import hex.ModelMetricsRegressionHGLM; +import water.api.API; + +public class ModelMetricsRegressionHGLMV3<I extends ModelMetricsRegressionHGLM, S extends ModelMetricsRegressionHGLMV3<I, S>> + extends ModelMetricsBaseV3<I, S> { + @API(help="fixed coefficients", direction=API.Direction.OUTPUT) + public double[] beta; // fixed coefficients + + @API(help="random coefficients", direction=API.Direction.OUTPUT) + public double[][] ubeta; // random coefficients, one row per level-2 unit + + @API(help="log likelihood", direction=API.Direction.OUTPUT) + public double log_likelihood; // log-likelihood + + @API(help="intraclass correlation", direction=API.Direction.OUTPUT) + public double[] icc; + + @API(help="iterations taken to build model", direction=API.Direction.OUTPUT) + public int iterations; + + @API(help="covariance matrix of random effects", direction=API.Direction.OUTPUT) + public double[][] tmat; + + @API(help="variance of residual error", direction=API.Direction.OUTPUT) + public double var_residual; + + @API(help="mean square error of fixed effects only", direction=API.Direction.OUTPUT) + public double mse_fixed; + + @Override + public S fillFromImpl(ModelMetricsRegressionHGLM modelMetrics) { + super.fillFromImpl(modelMetrics); + log_likelihood = modelMetrics._log_likelihood; + icc = modelMetrics._icc; + beta = modelMetrics._beta; + ubeta = modelMetrics._ubeta; + iterations = modelMetrics._iterations; + tmat = modelMetrics._tmat; + var_residual = modelMetrics._var_residual; + mse_fixed = modelMetrics._mse_fixed; + return (S) this; + } +} diff --git a/h2o-core/src/main/java/water/util/ArrayUtils.java b/h2o-core/src/main/java/water/util/ArrayUtils.java index cc603198cf5e..dea7b10410f2 100644 --- a/h2o-core/src/main/java/water/util/ArrayUtils.java +++ b/h2o-core/src/main/java/water/util/ArrayUtils.java @@ -1,5 +1,6 @@ package water.util; +import Jama.Matrix; import water.*; import water.fvec.*; @@ -16,14 +17,91 @@ public class ArrayUtils { public static int[] cumsum(final int[] from) { int arryLen = from.length; int[] cumsumR = new int[arryLen]; - int result = 0; + int result=0; for (int index = 0; index < arryLen; index++) { - result += result+from[index]; + result += from[index]; cumsumR[index] = result; } return cumsumR; } + /*** + * Given an array with first dimension J and second dimension q, this function will flatten the 2-D array into a + * 1-D array of length J*q. It basically concatenates the rows of arr into one big 1-D array.
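+ * For example (illustrative values only): flattenArray(new double[][]{{1, 2}, {3, 4}, {5, 6}}) returns {1, 2, 3, 4, 5, 6}; + * the level-2 (row) index varies slowest and the coefficient (column) index varies fastest.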
+ */ + public static double[] flattenArray(double[][] arr) { + int numRandCoeff = arr[0].length; + int numLevel2 = arr.length; + int len = numRandCoeff * numLevel2; + double[] flatA = new double[len]; + int longIndex; + for (int index2 = 0; index2 < numLevel2; index2++) { + for (int coefInd = 0; coefInd < numRandCoeff; coefInd++) { + longIndex = index2*numRandCoeff+coefInd; + flatA[longIndex] = arr[index2][coefInd]; + } + } + return flatA; + } + + public static String[] flattenArray(String[][] arr) { + int numRandCoeff = arr[0].length; + int numLevel2 = arr.length; + int len = numRandCoeff * numLevel2; + String[] flatA = new String[len]; + int longIndex; + for (int index2 = 0; index2 < numLevel2; index2++) { + for (int coefInd = 0; coefInd < numRandCoeff; coefInd++) { + longIndex = index2*numRandCoeff+coefInd; + flatA[longIndex] = arr[index2][coefInd]; + } + } + return flatA; + } + + public static void copy2DArray(double[][] src_array, double[][] dest_array) { + int numRows = src_array.length; + for (int colIdx = 0; colIdx < numRows; colIdx++) { // save zMatrix for debugging purposes or later scoring on training dataset + System.arraycopy(src_array[colIdx], 0, dest_array[colIdx], 0, + src_array[colIdx].length); + } + } + + // copy a square array + public static double[][] copy2DArray(double[][] src_array) { + double[][] dest_array = MemoryManager.malloc8d(src_array.length, src_array[0].length); + copy2DArray(src_array, dest_array); + return dest_array; + } + + public static void copy2DArray(int[][] src_array, int[][] dest_array) { + int numRows = src_array.length; + for (int colIdx = 0; colIdx < numRows; colIdx++) { // save zMatrix for debugging purposes or later scoring on training dataset + System.arraycopy(src_array[colIdx], 0, dest_array[colIdx], 0, + src_array[colIdx].length); + } + } + + /*** + * This method will take a 2D array and expand it to be of size numLevel2*tmat.length. Basically, it will copy tmat + * into the diagonal block a bigger matrix of size numLevel2*tmat.length. + */ + public static double[][] expandMat(double[][] tmat, int numLevel2) { + int numRandomCoeff = tmat.length; + int qTimesJ = numRandomCoeff * numLevel2; + double[][] gMat = new double[qTimesJ][qTimesJ]; + int colInd = 0; + int rowInd = 0; + for (int ind2 = 0; ind2 < numLevel2; ind2++) { + for (int ind = 0; ind < numRandomCoeff; ind++) { + System.arraycopy(tmat[ind], 0, gMat[rowInd], colInd, numRandomCoeff); + rowInd++; + } + colInd += numRandomCoeff; + } + return gMat; + } + // Sum elements of an array public static long sum(final long[] from) { long result = 0; @@ -117,6 +195,28 @@ public static double[][] outerProduct(double[][] result, double[] x, double[] y) return result; } + public static double[][] outerProductCum(double[][] result, double[] x, double[] y) { + if (result == null) + result = new double[x.length][y.length]; + for(int i = 0; i < x.length; i++) { + for(int j = 0; j < y.length; j++) + result[i][j] += x[i] * y[j]; + } + return result; + } + + public static void outputProductSymCum(double[][] result, double[] x) { + if (result == null) + throw new IllegalArgumentException("result should have been a double[][] array of size x.length."); + int xLen = x.length; + for (int rInd = 0; rInd < xLen; rInd++) + for (int cInd=0; cInd <= rInd; cInd++) { + result[rInd][cInd] += x[rInd] * x[cInd]; + if (rInd != cInd) + result[cInd][rInd] = result[rInd][cInd]; + } + } + // return the sqrt of each element of the array. 
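A quick way to check what the new flattenArray and expandMat helpers above compute is the small NumPy sketch below; it is an illustrative mirror of the Java utilities (made-up sizes and values), not part of the patch itself.

import numpy as np

# flattenArray: a J x q matrix of per-group random coefficients becomes a length J*q vector, row by row.
J, q = 3, 2
arr = np.arange(J * q, dtype=float).reshape(J, q)
flat = arr.reshape(-1)                    # same row-major ordering as ArrayUtils.flattenArray

# expandMat: the q x q T matrix is copied onto the diagonal blocks of a (J*q) x (J*q) matrix.
tmat = np.array([[2.0, 0.5],
                 [0.5, 1.0]])
gmat = np.kron(np.eye(J), tmat)           # same block-diagonal layout as ArrayUtils.expandMat

assert flat.shape == (J * q,)
assert gmat.shape == (J * q, J * q)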
Will overwrite the original array in this case public static double[] sqrtArr(double [] x){ assert (x != null); @@ -307,11 +407,47 @@ public static double[] add(double[] a, double[] b, double [] c) { a[i] = b[i] + c[i]; return a; } + + /** + * Note that this add is cumulative, meaning if you have double matrices a, b, c and you do + * add(a, b) followed by add(a, c), you will get a+b+c. + */ public static double[][] add(double[][] a, double[][] b) { if (a == null) return b; for(int i = 0; i < a.length; i++ ) a[i] = add(a[i], b[i]); return a; } + + /** + * This add is not cumulative. It will simply return result = a+b. + */ + public static void add(double[][] result, double[][] a, double[][] b) { + if (result == null || result.length != a.length || result[0].length != a[0].length || a.length != b.length || + a[0].length != b[0].length) + throw new IllegalArgumentException("matrices must be of the same size."); + int numRow = a.length; + int numCol = a[0].length; + for (int rInd = 0; rInd < numRow; rInd++) + for (int cInd = 0; cInd < numCol; cInd++) + result[rInd][cInd] = a[rInd][cInd] + b[rInd][cInd]; + } + + public static void minus(double[] result, double[] a, double[] b){ + if (result == null || result.length != a.length || a.length != b.length) + throw new IllegalArgumentException("matrices must be of the same size."); + int numRow = a.length; + for (int rInd = 0; rInd < numRow; rInd++) + result[rInd] = a[rInd] - b[rInd]; + } + + public static void minus(double[][] result, double[][] a, double[][] b){ + if (result == null || result.length != a.length || a.length != b.length || result[0].length != a[0].length || a[0].length != b[0].length) + throw new IllegalArgumentException("matrices must be of the same size."); + int numRow = a.length; + for (int rInd = 0; rInd < numRow; rInd++) + minus(result[rInd], a[rInd], b[rInd]); + } + public static double[][][] add(double[][][] a, double[][][] b) { for(int i = 0; i < a.length; i++ ) a[i] = add(a[i],b[i]); return a; @@ -386,6 +522,21 @@ public static double[] mult(double[] source, double[] dest, double n) { dest[i]=source[i]*n; return dest; } + + public static void mult(double[][] source, double[][] dest, double n) { + if (dest != null && source.length == dest.length && source[0].length == dest[0].length) { + int numRow = source.length; + for (int i=0; i maxValue) + maxValue = maxRowValue; + } + maxValue += 20; + mat[maxValueRow][maxValueCol] = maxValue; + double maxValueFound = maxMag(mat); + assertTrue(Math.abs(maxValue - maxValueFound) < 1e-12); + } + + public void checkCorrectMaxMag(int arrayLength, int maxValueIndex) { + double randValue = genRandomMatrix(1, 1, 123)[0][0]; + double maxValue = Math.abs(randValue) + 10; + double[] arr = new double[arrayLength]; + Arrays.fill(arr, randValue); + arr[maxValueIndex] = maxValue; + double maxFound = maxMag(arr); + assertTrue(Math.abs(maxValue-maxFound) < 1e-12); + } + + @Test + public void testFlattenArray() { + checkFlattenArray(2, 1); + checkFlattenArray(10, 20); + checkFlattenArray(25, 3); + } + + public void checkFlattenArray(int numLevel2, int numRandomCoeffs) { + double[][] originalMat = genRandomMatrix(numLevel2, numRandomCoeffs, 123); + double[] flattenArr = flattenArray(originalMat); + int oneDArrayInd = 0; + for (int level2Ind = 0; level2Ind < numLevel2; level2Ind++) { + for (int coefInd = 0; coefInd < numRandomCoeffs; coefInd++) { + assertEquals(originalMat[level2Ind][coefInd], flattenArr[oneDArrayInd], 1e-6); + oneDArrayInd++; + } + } + } + + @Test + public void testExpandMat() { + 
checkExpandMat(2, 1); + checkExpandMat(10, 2); + checkExpandMat(5, 20); + checkExpandMat(13, 13); + } + + public void checkExpandMat(int numLevel2, int numRandomCoeff) { + double[][] tmat = genRandomMatrix(numRandomCoeff, numRandomCoeff, 123); + double[][] tmatBig = expandMat(tmat, numLevel2); + int bigRowInd; + int bigColInd; + int offset; + for (int ind = 0; ind < numLevel2; ind++) { + for (int ind2 = 0; ind2 < numRandomCoeff; ind2++) { + offset = ind*numRandomCoeff; + bigRowInd = offset + ind2; + for (int index = 0; index < numRandomCoeff; index++) { + bigColInd = offset+index; + assertEquals(tmatBig[bigRowInd][bigColInd], tmat[ind2][index], 1e-6); + } + } + } + } } diff --git a/h2o-genmodel/src/main/java/hex/ModelCategory.java b/h2o-genmodel/src/main/java/hex/ModelCategory.java index c0b1449467b8..8ed727d932b8 100644 --- a/h2o-genmodel/src/main/java/hex/ModelCategory.java +++ b/h2o-genmodel/src/main/java/hex/ModelCategory.java @@ -11,7 +11,6 @@ public enum ModelCategory { Multinomial, Ordinal, Regression, - HGLMRegression, Clustering, AutoEncoder, TargetEncoder, diff --git a/h2o-genmodel/src/main/java/hex/genmodel/GenModel.java b/h2o-genmodel/src/main/java/hex/genmodel/GenModel.java index 9e5de0db7dc1..50d689a623dd 100755 --- a/h2o-genmodel/src/main/java/hex/genmodel/GenModel.java +++ b/h2o-genmodel/src/main/java/hex/genmodel/GenModel.java @@ -177,7 +177,6 @@ public String[][] getOutputDomains() { outputDomains[0] = getDomainValues(getResponseIdx()); break; case Regression: - case HGLMRegression: case Clustering: case AutoEncoder: case TargetEncoder: diff --git a/h2o-py/docs/modeling.rst b/h2o-py/docs/modeling.rst index 9ceecb83b361..a2cc32b4f258 100644 --- a/h2o-py/docs/modeling.rst +++ b/h2o-py/docs/modeling.rst @@ -56,6 +56,12 @@ Supervised :show-inheritance: :members: +:mod:`H2OHGLMEstimator` +----------------------- +.. autoclass:: h2o.estimators.hglm.H2OHGLMEstimator + :show-inheritance: + :members: + :mod:`H2OInfogram` ------------------ .. 
autoclass:: h2o.estimators.infogram.H2OInfogram diff --git a/h2o-py/h2o/estimators/__init__.py b/h2o-py/h2o/estimators/__init__.py index 766e1678b950..838ab8cea7d5 100644 --- a/h2o-py/h2o/estimators/__init__.py +++ b/h2o-py/h2o/estimators/__init__.py @@ -21,6 +21,7 @@ from .generic import H2OGenericEstimator from .glm import H2OGeneralizedLinearEstimator from .glrm import H2OGeneralizedLowRankEstimator +from .hglm import H2OHGLMEstimator from .infogram import H2OInfogram from .isolation_forest import H2OIsolationForestEstimator from .isotonicregression import H2OIsotonicRegressionEstimator @@ -64,10 +65,10 @@ def create_estimator(algo, **params): "H2OAdaBoostEstimator", "H2OAggregatorEstimator", "H2OANOVAGLMEstimator", "H2OCoxProportionalHazardsEstimator", "H2ODecisionTreeEstimator", "H2OAutoEncoderEstimator", "H2ODeepLearningEstimator", "H2OEstimator", "H2OExtendedIsolationForestEstimator", "H2OGeneralizedAdditiveEstimator", "H2OGradientBoostingEstimator", - "H2OGenericEstimator", "H2OGeneralizedLinearEstimator", "H2OGeneralizedLowRankEstimator", "H2OInfogram", - "H2OIsolationForestEstimator", "H2OIsotonicRegressionEstimator", "H2OKMeansEstimator", "H2OModelSelectionEstimator", - "H2ONaiveBayesEstimator", "H2OPrincipalComponentAnalysisEstimator", "H2OSupportVectorMachineEstimator", - "H2ORandomForestEstimator", "H2ORuleFitEstimator", "H2OStackedEnsembleEstimator", - "H2OSingularValueDecompositionEstimator", "H2OTargetEncoderEstimator", "H2OUpliftRandomForestEstimator", - "H2OWord2vecEstimator", "H2OXGBoostEstimator" + "H2OGenericEstimator", "H2OGeneralizedLinearEstimator", "H2OGeneralizedLowRankEstimator", "H2OHGLMEstimator", + "H2OInfogram", "H2OIsolationForestEstimator", "H2OIsotonicRegressionEstimator", "H2OKMeansEstimator", + "H2OModelSelectionEstimator", "H2ONaiveBayesEstimator", "H2OPrincipalComponentAnalysisEstimator", + "H2OSupportVectorMachineEstimator", "H2ORandomForestEstimator", "H2ORuleFitEstimator", + "H2OStackedEnsembleEstimator", "H2OSingularValueDecompositionEstimator", "H2OTargetEncoderEstimator", + "H2OUpliftRandomForestEstimator", "H2OWord2vecEstimator", "H2OXGBoostEstimator" ) diff --git a/h2o-py/h2o/estimators/glm.py b/h2o-py/h2o/estimators/glm.py index 13ccdbb0a5c5..8326d85b8282 100644 --- a/h2o-py/h2o/estimators/glm.py +++ b/h2o-py/h2o/estimators/glm.py @@ -52,14 +52,12 @@ def __init__(self, fold_column=None, # type: Optional[str] response_column=None, # type: Optional[str] ignored_columns=None, # type: Optional[List[str]] - random_columns=None, # type: Optional[List[int]] ignore_const_cols=True, # type: bool score_each_iteration=False, # type: bool score_iteration_interval=-1, # type: int offset_column=None, # type: Optional[str] weights_column=None, # type: Optional[str] family="auto", # type: Literal["auto", "gaussian", "binomial", "fractionalbinomial", "quasibinomial", "ordinal", "multinomial", "poisson", "gamma", "tweedie", "negativebinomial"] - rand_family=None, # type: Optional[List[Literal["[gaussian]"]]] tweedie_variance_power=0.0, # type: float tweedie_link_power=1.0, # type: float theta=1e-10, # type: float @@ -83,10 +81,8 @@ def __init__(self, beta_epsilon=0.0001, # type: float gradient_epsilon=-1.0, # type: float link="family_default", # type: Literal["family_default", "identity", "logit", "log", "inverse", "tweedie", "ologit"] - rand_link=None, # type: Optional[List[Literal["[identity]", "[family_default]"]]] startval=None, # type: Optional[List[float]] calc_like=False, # type: bool - HGLM=False, # type: bool prior=-1.0, # type: float 
cold_start=False, # type: bool lambda_min_ratio=-1.0, # type: float @@ -169,9 +165,6 @@ def __init__(self, :param ignored_columns: Names of columns to ignore for training. Defaults to ``None``. :type ignored_columns: List[str], optional - :param random_columns: random columns indices for HGLM. - Defaults to ``None``. - :type random_columns: List[int], optional :param ignore_const_cols: Ignore constant columns. Defaults to ``True``. :type ignore_const_cols: bool @@ -199,10 +192,6 @@ def __init__(self, Defaults to ``"auto"``. :type family: Literal["auto", "gaussian", "binomial", "fractionalbinomial", "quasibinomial", "ordinal", "multinomial", "poisson", "gamma", "tweedie", "negativebinomial"] - :param rand_family: Random Component Family array. One for each random component. Only support gaussian for - now. - Defaults to ``None``. - :type rand_family: List[Literal["[gaussian]"]], optional :param tweedie_variance_power: Tweedie variance power Defaults to ``0.0``. :type tweedie_variance_power: float @@ -293,20 +282,13 @@ def __init__(self, :param link: Link function. Defaults to ``"family_default"``. :type link: Literal["family_default", "identity", "logit", "log", "inverse", "tweedie", "ologit"] - :param rand_link: Link function array for random component in HGLM. - Defaults to ``None``. - :type rand_link: List[Literal["[identity]", "[family_default]"]], optional - :param startval: double array to initialize fixed and random coefficients for HGLM, coefficients for GLM. If - standardize is true, the standardized coefficients should be used. Otherwise, use the regular - coefficients. + :param startval: double array to initialize coefficients for GLM. If standardize is true, the standardized + coefficients should be used. Otherwise, use the regular coefficients. Defaults to ``None``. :type startval: List[float], optional :param calc_like: if true, will return likelihood function value. Defaults to ``False``. :type calc_like: bool - :param HGLM: If set to true, will return HGLM model. Otherwise, normal GLM model will be returned. - Defaults to ``False``. - :type HGLM: bool :param prior: Prior probability for y==1. To be used only for logistic regression iff the data has been sampled and the mean of response does not reflect reality. Defaults to ``-1.0``. @@ -474,14 +456,12 @@ def __init__(self, self.fold_column = fold_column self.response_column = response_column self.ignored_columns = ignored_columns - self.random_columns = random_columns self.ignore_const_cols = ignore_const_cols self.score_each_iteration = score_each_iteration self.score_iteration_interval = score_iteration_interval self.offset_column = offset_column self.weights_column = weights_column self.family = family - self.rand_family = rand_family self.tweedie_variance_power = tweedie_variance_power self.tweedie_link_power = tweedie_link_power self.theta = theta @@ -505,10 +485,8 @@ def __init__(self, self.beta_epsilon = beta_epsilon self.gradient_epsilon = gradient_epsilon self.link = link - self.rand_link = rand_link self.startval = startval self.calc_like = calc_like - self.HGLM = HGLM self.prior = prior self.cold_start = cold_start self.lambda_min_ratio = lambda_min_ratio @@ -895,20 +873,6 @@ def ignored_columns(self, ignored_columns): assert_is_type(ignored_columns, None, [str]) self._parms["ignored_columns"] = ignored_columns - @property - def random_columns(self): - """ - random columns indices for HGLM. - - Type: ``List[int]``. 
- """ - return self._parms.get("random_columns") - - @random_columns.setter - def random_columns(self, random_columns): - assert_is_type(random_columns, None, [int]) - self._parms["random_columns"] = random_columns - @property def ignore_const_cols(self): """ @@ -1080,20 +1044,6 @@ def family(self, family): assert_is_type(family, None, Enum("auto", "gaussian", "binomial", "fractionalbinomial", "quasibinomial", "ordinal", "multinomial", "poisson", "gamma", "tweedie", "negativebinomial")) self._parms["family"] = family - @property - def rand_family(self): - """ - Random Component Family array. One for each random component. Only support gaussian for now. - - Type: ``List[Literal["[gaussian]"]]``. - """ - return self._parms.get("rand_family") - - @rand_family.setter - def rand_family(self, rand_family): - assert_is_type(rand_family, None, [Enum("[gaussian]")]) - self._parms["rand_family"] = rand_family - @property def tweedie_variance_power(self): """ @@ -1764,25 +1714,11 @@ def link(self, link): assert_is_type(link, None, Enum("family_default", "identity", "logit", "log", "inverse", "tweedie", "ologit")) self._parms["link"] = link - @property - def rand_link(self): - """ - Link function array for random component in HGLM. - - Type: ``List[Literal["[identity]", "[family_default]"]]``. - """ - return self._parms.get("rand_link") - - @rand_link.setter - def rand_link(self, rand_link): - assert_is_type(rand_link, None, [Enum("[identity]", "[family_default]")]) - self._parms["rand_link"] = rand_link - @property def startval(self): """ - double array to initialize fixed and random coefficients for HGLM, coefficients for GLM. If standardize is - true, the standardized coefficients should be used. Otherwise, use the regular coefficients. + double array to initialize coefficients for GLM. If standardize is true, the standardized coefficients should + be used. Otherwise, use the regular coefficients. Type: ``List[float]``. """ @@ -1807,20 +1743,6 @@ def calc_like(self, calc_like): assert_is_type(calc_like, None, bool) self._parms["calc_like"] = calc_like - @property - def HGLM(self): - """ - If set to true, will return HGLM model. Otherwise, normal GLM model will be returned. - - Type: ``bool``, defaults to ``False``. - """ - return self._parms.get("HGLM") - - @HGLM.setter - def HGLM(self, HGLM): - assert_is_type(HGLM, None, bool) - self._parms["HGLM"] = HGLM - @property def prior(self): """ diff --git a/h2o-py/h2o/estimators/hglm.py b/h2o-py/h2o/estimators/hglm.py new file mode 100644 index 000000000000..ec49122ced29 --- /dev/null +++ b/h2o-py/h2o/estimators/hglm.py @@ -0,0 +1,685 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +# +# This file is auto-generated by h2o-3/h2o-bindings/bin/gen_python.py +# Copyright 2016 H2O.ai; Apache License Version 2.0 (see LICENSE for details) +# + +from h2o.estimators.estimator_base import H2OEstimator +from h2o.exceptions import H2OValueError +from h2o.frame import H2OFrame +from h2o.utils.typechecks import assert_is_type, Enum, numeric + + +class H2OHGLMEstimator(H2OEstimator): + """ + Hierarchical Generalized Linear Model + + Fits a HGLM model with both the residual noise and random effect being modeled by Gaussian distribution. The fixed + effect coefficients are specified in parameter x, the random effect coefficients are specified in parameter + random_columns. The column specified in group_column will contain the level 2 index value and must be an enum column. 
+ """ + + algo = "hglm" + supervised_learning = True + + def __init__(self, + model_id=None, # type: Optional[Union[None, str, H2OEstimator]] + training_frame=None, # type: Optional[Union[None, str, H2OFrame]] + validation_frame=None, # type: Optional[Union[None, str, H2OFrame]] + response_column=None, # type: Optional[str] + ignored_columns=None, # type: Optional[List[str]] + ignore_const_cols=True, # type: bool + offset_column=None, # type: Optional[str] + weights_column=None, # type: Optional[str] + max_runtime_secs=0.0, # type: float + custom_metric_func=None, # type: Optional[str] + score_each_iteration=False, # type: bool + score_iteration_interval=5, # type: int + seed=-1, # type: int + missing_values_handling="mean_imputation", # type: Literal["mean_imputation", "skip", "plug_values"] + plug_values=None, # type: Optional[Union[None, str, H2OFrame]] + family="gaussian", # type: Literal["gaussian"] + rand_family=None, # type: Optional[Literal["gaussian"]] + max_iterations=-1, # type: int + initial_fixed_effects=None, # type: Optional[List[float]] + initial_random_effects=None, # type: Optional[Union[None, str, H2OFrame]] + initial_t_matrix=None, # type: Optional[Union[None, str, H2OFrame]] + tau_u_var_init=0.0, # type: float + tau_e_var_init=0.0, # type: float + random_columns=None, # type: Optional[List[str]] + method="em", # type: Literal["em"] + em_epsilon=0.001, # type: float + random_intercept=True, # type: bool + group_column=None, # type: Optional[str] + gen_syn_data=False, # type: bool + ): + """ + :param model_id: Destination id for this model; auto-generated if not specified. + Defaults to ``None``. + :type model_id: Union[None, str, H2OEstimator], optional + :param training_frame: Id of the training data frame. + Defaults to ``None``. + :type training_frame: Union[None, str, H2OFrame], optional + :param validation_frame: Id of the validation data frame. + Defaults to ``None``. + :type validation_frame: Union[None, str, H2OFrame], optional + :param response_column: Response variable column. + Defaults to ``None``. + :type response_column: str, optional + :param ignored_columns: Names of columns to ignore for training. + Defaults to ``None``. + :type ignored_columns: List[str], optional + :param ignore_const_cols: Ignore constant columns. + Defaults to ``True``. + :type ignore_const_cols: bool + :param offset_column: Offset column. This will be added to the combination of columns before applying the link + function. + Defaults to ``None``. + :type offset_column: str, optional + :param weights_column: Column with observation weights. Giving some observation a weight of zero is equivalent + to excluding it from the dataset; giving an observation a relative weight of 2 is equivalent to repeating + that row twice. Negative weights are not allowed. Note: Weights are per-row observation weights and do + not increase the size of the data frame. This is typically the number of times a row is repeated, but + non-integer values are supported as well. During training, rows with higher weights matter more, due to + the larger loss function pre-factor. If you set weight = 0 for a row, the returned prediction frame at + that row is zero and this is incorrect. To get an accurate prediction, remove all rows with weight == 0. + Defaults to ``None``. + :type weights_column: str, optional + :param max_runtime_secs: Maximum allowed runtime in seconds for model training. Use 0 to disable. + Defaults to ``0.0``. 
+ :type max_runtime_secs: float + :param custom_metric_func: Reference to custom evaluation function, format: `language:keyName=funcName` + Defaults to ``None``. + :type custom_metric_func: str, optional + :param score_each_iteration: Whether to score during each iteration of model training. + Defaults to ``False``. + :type score_each_iteration: bool + :param score_iteration_interval: Perform scoring for every score_iteration_interval iterations. + Defaults to ``5``. + :type score_iteration_interval: int + :param seed: Seed for pseudo random number generator (if applicable). + Defaults to ``-1``. + :type seed: int + :param missing_values_handling: Handling of missing values. Either MeanImputation, Skip or PlugValues. + Defaults to ``"mean_imputation"``. + :type missing_values_handling: Literal["mean_imputation", "skip", "plug_values"] + :param plug_values: Plug Values (a single row frame containing values that will be used to impute missing values + of the training/validation frame, use with conjunction missing_values_handling = PlugValues). + Defaults to ``None``. + :type plug_values: Union[None, str, H2OFrame], optional + :param family: Family. Only gaussian is supported now. + Defaults to ``"gaussian"``. + :type family: Literal["gaussian"] + :param rand_family: Set distribution of random effects. Only Gaussian is implemented now. + Defaults to ``None``. + :type rand_family: Literal["gaussian"], optional + :param max_iterations: Maximum number of iterations. Value should >=1. A value of 0 is only set when only the + model coefficient names and model coefficient dimensions are needed. + Defaults to ``-1``. + :type max_iterations: int + :param initial_fixed_effects: An array that contains initial values of the fixed effects coefficient. + Defaults to ``None``. + :type initial_fixed_effects: List[float], optional + :param initial_random_effects: A H2OFrame id that contains initial values of the random effects coefficient. + The row names shouldbe the random coefficient names. If you are not sure what the random coefficient + names are, build HGLM model with max_iterations = 0 and checkout the model output field + random_coefficient_names. The number of rows of this frame should be the number of level 2 units. + Again, to figure this out, build HGLM model with max_iterations=0 and check out the model output field + group_column_names. The number of rows should equal the length of thegroup_column_names. + Defaults to ``None``. + :type initial_random_effects: Union[None, str, H2OFrame], optional + :param initial_t_matrix: A H2OFrame id that contains initial values of the T matrix. It should be a positive + symmetric matrix. + Defaults to ``None``. + :type initial_t_matrix: Union[None, str, H2OFrame], optional + :param tau_u_var_init: Initial variance of random coefficient effects. If set, should provide a value > 0.0. + If not set, will be randomly set in the model building process. + Defaults to ``0.0``. + :type tau_u_var_init: float + :param tau_e_var_init: Initial variance of random noise. If set, should provide a value > 0.0. If not set, + will be randomly set in the model building process. + Defaults to ``0.0``. + :type tau_e_var_init: float + :param random_columns: Random columns indices for HGLM. + Defaults to ``None``. + :type random_columns: List[str], optional + :param method: We only implemented EM as a method to obtain the fixed, random coefficients and the various + variances. + Defaults to ``"em"``. 
+ :type method: Literal["em"] + :param em_epsilon: Converge if beta/ubeta/tmat/tauEVar changes less (using L-infinity norm) than em esilon. ONLY + applies to EM method. + Defaults to ``0.001``. + :type em_epsilon: float + :param random_intercept: If true, will allow random component to the GLM coefficients. + Defaults to ``True``. + :type random_intercept: bool + :param group_column: Group column is the column that is categorical and used to generate the groups in HGLM + Defaults to ``None``. + :type group_column: str, optional + :param gen_syn_data: If true, add gaussian noise with variance specified in parms._tau_e_var_init. + Defaults to ``False``. + :type gen_syn_data: bool + """ + super(H2OHGLMEstimator, self).__init__() + self._parms = {} + self._id = self._parms['model_id'] = model_id + self.training_frame = training_frame + self.validation_frame = validation_frame + self.response_column = response_column + self.ignored_columns = ignored_columns + self.ignore_const_cols = ignore_const_cols + self.offset_column = offset_column + self.weights_column = weights_column + self.max_runtime_secs = max_runtime_secs + self.custom_metric_func = custom_metric_func + self.score_each_iteration = score_each_iteration + self.score_iteration_interval = score_iteration_interval + self.seed = seed + self.missing_values_handling = missing_values_handling + self.plug_values = plug_values + self.family = family + self.rand_family = rand_family + self.max_iterations = max_iterations + self.initial_fixed_effects = initial_fixed_effects + self.initial_random_effects = initial_random_effects + self.initial_t_matrix = initial_t_matrix + self.tau_u_var_init = tau_u_var_init + self.tau_e_var_init = tau_e_var_init + self.random_columns = random_columns + self.method = method + self.em_epsilon = em_epsilon + self.random_intercept = random_intercept + self.group_column = group_column + self.gen_syn_data = gen_syn_data + + @property + def training_frame(self): + """ + Id of the training data frame. + + Type: ``Union[None, str, H2OFrame]``. + """ + return self._parms.get("training_frame") + + @training_frame.setter + def training_frame(self, training_frame): + self._parms["training_frame"] = H2OFrame._validate(training_frame, 'training_frame') + + @property + def validation_frame(self): + """ + Id of the validation data frame. + + Type: ``Union[None, str, H2OFrame]``. + """ + return self._parms.get("validation_frame") + + @validation_frame.setter + def validation_frame(self, validation_frame): + self._parms["validation_frame"] = H2OFrame._validate(validation_frame, 'validation_frame') + + @property + def response_column(self): + """ + Response variable column. + + Type: ``str``. + """ + return self._parms.get("response_column") + + @response_column.setter + def response_column(self, response_column): + assert_is_type(response_column, None, str) + self._parms["response_column"] = response_column + + @property + def ignored_columns(self): + """ + Names of columns to ignore for training. + + Type: ``List[str]``. + """ + return self._parms.get("ignored_columns") + + @ignored_columns.setter + def ignored_columns(self, ignored_columns): + assert_is_type(ignored_columns, None, [str]) + self._parms["ignored_columns"] = ignored_columns + + @property + def ignore_const_cols(self): + """ + Ignore constant columns. + + Type: ``bool``, defaults to ``True``. 
+ """ + return self._parms.get("ignore_const_cols") + + @ignore_const_cols.setter + def ignore_const_cols(self, ignore_const_cols): + assert_is_type(ignore_const_cols, None, bool) + self._parms["ignore_const_cols"] = ignore_const_cols + + @property + def offset_column(self): + """ + Offset column. This will be added to the combination of columns before applying the link function. + + Type: ``str``. + """ + return self._parms.get("offset_column") + + @offset_column.setter + def offset_column(self, offset_column): + assert_is_type(offset_column, None, str) + self._parms["offset_column"] = offset_column + + @property + def weights_column(self): + """ + Column with observation weights. Giving some observation a weight of zero is equivalent to excluding it from the + dataset; giving an observation a relative weight of 2 is equivalent to repeating that row twice. Negative + weights are not allowed. Note: Weights are per-row observation weights and do not increase the size of the data + frame. This is typically the number of times a row is repeated, but non-integer values are supported as well. + During training, rows with higher weights matter more, due to the larger loss function pre-factor. If you set + weight = 0 for a row, the returned prediction frame at that row is zero and this is incorrect. To get an + accurate prediction, remove all rows with weight == 0. + + Type: ``str``. + """ + return self._parms.get("weights_column") + + @weights_column.setter + def weights_column(self, weights_column): + assert_is_type(weights_column, None, str) + self._parms["weights_column"] = weights_column + + @property + def max_runtime_secs(self): + """ + Maximum allowed runtime in seconds for model training. Use 0 to disable. + + Type: ``float``, defaults to ``0.0``. + """ + return self._parms.get("max_runtime_secs") + + @max_runtime_secs.setter + def max_runtime_secs(self, max_runtime_secs): + assert_is_type(max_runtime_secs, None, numeric) + self._parms["max_runtime_secs"] = max_runtime_secs + + @property + def custom_metric_func(self): + """ + Reference to custom evaluation function, format: `language:keyName=funcName` + + Type: ``str``. + """ + return self._parms.get("custom_metric_func") + + @custom_metric_func.setter + def custom_metric_func(self, custom_metric_func): + assert_is_type(custom_metric_func, None, str) + self._parms["custom_metric_func"] = custom_metric_func + + @property + def score_each_iteration(self): + """ + Whether to score during each iteration of model training. + + Type: ``bool``, defaults to ``False``. + """ + return self._parms.get("score_each_iteration") + + @score_each_iteration.setter + def score_each_iteration(self, score_each_iteration): + assert_is_type(score_each_iteration, None, bool) + self._parms["score_each_iteration"] = score_each_iteration + + @property + def score_iteration_interval(self): + """ + Perform scoring for every score_iteration_interval iterations. + + Type: ``int``, defaults to ``5``. + """ + return self._parms.get("score_iteration_interval") + + @score_iteration_interval.setter + def score_iteration_interval(self, score_iteration_interval): + assert_is_type(score_iteration_interval, None, int) + self._parms["score_iteration_interval"] = score_iteration_interval + + @property + def seed(self): + """ + Seed for pseudo random number generator (if applicable). + + Type: ``int``, defaults to ``-1``. 
+ """ + return self._parms.get("seed") + + @seed.setter + def seed(self, seed): + assert_is_type(seed, None, int) + self._parms["seed"] = seed + + @property + def missing_values_handling(self): + """ + Handling of missing values. Either MeanImputation, Skip or PlugValues. + + Type: ``Literal["mean_imputation", "skip", "plug_values"]``, defaults to ``"mean_imputation"``. + """ + return self._parms.get("missing_values_handling") + + @missing_values_handling.setter + def missing_values_handling(self, missing_values_handling): + assert_is_type(missing_values_handling, None, Enum("mean_imputation", "skip", "plug_values")) + self._parms["missing_values_handling"] = missing_values_handling + + @property + def plug_values(self): + """ + Plug Values (a single row frame containing values that will be used to impute missing values of the + training/validation frame, use with conjunction missing_values_handling = PlugValues). + + Type: ``Union[None, str, H2OFrame]``. + """ + return self._parms.get("plug_values") + + @plug_values.setter + def plug_values(self, plug_values): + self._parms["plug_values"] = H2OFrame._validate(plug_values, 'plug_values') + + @property + def family(self): + """ + Family. Only gaussian is supported now. + + Type: ``Literal["gaussian"]``, defaults to ``"gaussian"``. + """ + return self._parms.get("family") + + @family.setter + def family(self, family): + assert_is_type(family, None, Enum("gaussian")) + self._parms["family"] = family + + @property + def rand_family(self): + """ + Set distribution of random effects. Only Gaussian is implemented now. + + Type: ``Literal["gaussian"]``. + """ + return self._parms.get("rand_family") + + @rand_family.setter + def rand_family(self, rand_family): + assert_is_type(rand_family, None, Enum("gaussian")) + self._parms["rand_family"] = rand_family + + @property + def max_iterations(self): + """ + Maximum number of iterations. Value should >=1. A value of 0 is only set when only the model coefficient names + and model coefficient dimensions are needed. + + Type: ``int``, defaults to ``-1``. + """ + return self._parms.get("max_iterations") + + @max_iterations.setter + def max_iterations(self, max_iterations): + assert_is_type(max_iterations, None, int) + self._parms["max_iterations"] = max_iterations + + @property + def initial_fixed_effects(self): + """ + An array that contains initial values of the fixed effects coefficient. + + Type: ``List[float]``. + """ + return self._parms.get("initial_fixed_effects") + + @initial_fixed_effects.setter + def initial_fixed_effects(self, initial_fixed_effects): + assert_is_type(initial_fixed_effects, None, [numeric]) + self._parms["initial_fixed_effects"] = initial_fixed_effects + + @property + def initial_random_effects(self): + """ + A H2OFrame id that contains initial values of the random effects coefficient. The row names shouldbe the random + coefficient names. If you are not sure what the random coefficient names are, build HGLM model with + max_iterations = 0 and checkout the model output field random_coefficient_names. The number of rows of this + frame should be the number of level 2 units. Again, to figure this out, build HGLM model with max_iterations=0 + and check out the model output field group_column_names. The number of rows should equal the length of + thegroup_column_names. + + Type: ``Union[None, str, H2OFrame]``. 
+ """ + return self._parms.get("initial_random_effects") + + @initial_random_effects.setter + def initial_random_effects(self, initial_random_effects): + self._parms["initial_random_effects"] = H2OFrame._validate(initial_random_effects, 'initial_random_effects') + + @property + def initial_t_matrix(self): + """ + A H2OFrame id that contains initial values of the T matrix. It should be a positive symmetric matrix. + + Type: ``Union[None, str, H2OFrame]``. + """ + return self._parms.get("initial_t_matrix") + + @initial_t_matrix.setter + def initial_t_matrix(self, initial_t_matrix): + self._parms["initial_t_matrix"] = H2OFrame._validate(initial_t_matrix, 'initial_t_matrix') + + @property + def tau_u_var_init(self): + """ + Initial variance of random coefficient effects. If set, should provide a value > 0.0. If not set, will be + randomly set in the model building process. + + Type: ``float``, defaults to ``0.0``. + """ + return self._parms.get("tau_u_var_init") + + @tau_u_var_init.setter + def tau_u_var_init(self, tau_u_var_init): + assert_is_type(tau_u_var_init, None, numeric) + self._parms["tau_u_var_init"] = tau_u_var_init + + @property + def tau_e_var_init(self): + """ + Initial variance of random noise. If set, should provide a value > 0.0. If not set, will be randomly set in + the model building process. + + Type: ``float``, defaults to ``0.0``. + """ + return self._parms.get("tau_e_var_init") + + @tau_e_var_init.setter + def tau_e_var_init(self, tau_e_var_init): + assert_is_type(tau_e_var_init, None, numeric) + self._parms["tau_e_var_init"] = tau_e_var_init + + @property + def random_columns(self): + """ + Random columns indices for HGLM. + + Type: ``List[str]``. + + :examples: + + >>> import h2o + >>> from h2o.estimators import H2OHGLMEstimator + >>> h2o.init() + >>> prostate_path <- system.file("extdata", "prostate.csv", package = "h2o") + >>> prostate <- h2o.uploadFile(path = prostate_path) + >>> prostate$CAPSULE <- as.factor(prostate$CAPSULE) + >>> hglm_model =H2OHGLMEstimator(random_columns = ["AGE"], group_column = "RACE") + >>> hglm_model.train(x=c("AGE","RACE","DPROS"), y="CAPSULE", training_frame=prostate) + """ + return self._parms.get("random_columns") + + @random_columns.setter + def random_columns(self, random_columns): + assert_is_type(random_columns, None, [str]) + self._parms["random_columns"] = random_columns + + @property + def method(self): + """ + We only implemented EM as a method to obtain the fixed, random coefficients and the various variances. + + Type: ``Literal["em"]``, defaults to ``"em"``. + """ + return self._parms.get("method") + + @method.setter + def method(self, method): + assert_is_type(method, None, Enum("em")) + self._parms["method"] = method + + @property + def em_epsilon(self): + """ + Converge if beta/ubeta/tmat/tauEVar changes less (using L-infinity norm) than em esilon. ONLY applies to EM + method. + + Type: ``float``, defaults to ``0.001``. + """ + return self._parms.get("em_epsilon") + + @em_epsilon.setter + def em_epsilon(self, em_epsilon): + assert_is_type(em_epsilon, None, numeric) + self._parms["em_epsilon"] = em_epsilon + + @property + def random_intercept(self): + """ + If true, will allow random component to the GLM coefficients. + + Type: ``bool``, defaults to ``True``. 
+ """ + return self._parms.get("random_intercept") + + @random_intercept.setter + def random_intercept(self, random_intercept): + assert_is_type(random_intercept, None, bool) + self._parms["random_intercept"] = random_intercept + + @property + def group_column(self): + """ + Group column is the column that is categorical and used to generate the groups in HGLM + + Type: ``str``. + """ + return self._parms.get("group_column") + + @group_column.setter + def group_column(self, group_column): + assert_is_type(group_column, None, str) + self._parms["group_column"] = group_column + + @property + def gen_syn_data(self): + """ + If true, add gaussian noise with variance specified in parms._tau_e_var_init. + + Type: ``bool``, defaults to ``False``. + """ + return self._parms.get("gen_syn_data") + + @gen_syn_data.setter + def gen_syn_data(self, gen_syn_data): + assert_is_type(gen_syn_data, None, bool) + self._parms["gen_syn_data"] = gen_syn_data + + + def level_2_names(self): + """ + Get the level 2 column values. + """ + return self._model_json["output"]["group_column_names"] + + def coefs_random_names(self): + """ + Get the random effect coefficient names including the intercept if applicable. + """ + return self._model_json["output"]["random_coefficient_names"] + + def coefs_random(self): + """ + Get the random coefficients of the model. + """ + level_2_names = self.level_2_names() + random_coefs = self._model_json["output"]["ubeta"] + return dict(zip(level_2_names, random_coefs)) + + def scoring_history_valid(self, as_data_frame=True): + """ + Retrieve Model Score History for validation data frame if present + + :returns: The validation score history as an H2OTwoDimTable or a Pandas DataFrame. + """ + model = self._model_json["output"] + if "scoring_history_valid" in model and model["scoring_history_valid"] is not None: + if as_data_frame: + return model["scoring_history_valid"].as_data_frame() + else: + return model["scoring_history_valid"] + print("No validation scoring history for this model") + + def matrix_T(self): + """ + retrieve the T matrix estimated for the random effects. The T matrix is the Tj matrix described in + section II.I of the doc. + + :return: The T matrix as a tuple of tuples. + """ + model = self._model_json["output"] + return model["tmat"] + + def residual_variance(self): + """ + retrieve the residual variance estimate from the model building process. + + :return: residual variance estiamte as a double + """ + model = self._model_json["output"] + return model["residual_variance"] + + def icc(self): + """ + retrieve the icc from the model building process. + + :return: icc as an array + """ + model = self._model_json["output"] + return model["icc"] + + def mean_residual_fixed(self, train = True): + """ + retrieve the mean residual error using the fixed effect coefficients only. + + :param train: boolean, if true return result from training frame, else return result from validation frame. + :return: mean residual error as a double. + """ + model = self._model_json["output"] + if train: + return model["mean_residual_fixed"] + else: + return model["mean_residual_fixed_valid"] diff --git a/h2o-py/h2o/estimators/model_selection.py b/h2o-py/h2o/estimators/model_selection.py index b3d48905bcf1..db44a746c8f6 100644 --- a/h2o-py/h2o/estimators/model_selection.py +++ b/h2o-py/h2o/estimators/model_selection.py @@ -227,7 +227,7 @@ def __init__(self, lambda_search is set to True, the conditional values above are 1E-8 and 1E-6 respectively. Defaults to ``-1.0``. 
:type gradient_epsilon: float - :param startval: double array to initialize fixed and random coefficients for HGLM, coefficients for GLM. + :param startval: Double array to initialize coefficients for GLM. Defaults to ``None``. :type startval: List[float], optional :param prior: Prior probability for y==1. To be used only for logistic regression iff the data has been sampled @@ -900,7 +900,7 @@ def gradient_epsilon(self, gradient_epsilon): @property def startval(self): """ - double array to initialize fixed and random coefficients for HGLM, coefficients for GLM. + Double array to initialize coefficients for GLM. Type: ``List[float]``. """ diff --git a/h2o-py/h2o/explanation/_explain.py b/h2o-py/h2o/explanation/_explain.py index e59ba69bed53..ad898244f79d 100644 --- a/h2o-py/h2o/explanation/_explain.py +++ b/h2o-py/h2o/explanation/_explain.py @@ -2541,8 +2541,6 @@ def learning_curve_plot( if model.actual_params["lambda_search"]: import h2o.two_dim_table allowed_timesteps = ["iteration"] - elif model.actual_params.get("HGLM"): - allowed_timesteps = ["iterations", "duration"] else: allowed_timesteps = ["iterations", "duration"] @@ -2592,7 +2590,6 @@ def learning_curve_plot( if ("deviance" == metric and model.algo in ["glm", "gam"] - and not model.actual_params.get("HGLM", False) and "deviance_train" in scoring_history.col_header): training_metric = "deviance_train" validation_metric = "deviance_test" diff --git a/h2o-py/h2o/model/extensions/scoring_history.py b/h2o-py/h2o/model/extensions/scoring_history.py index 055ca3d02f9a..2fc5b2af5c06 100644 --- a/h2o-py/h2o/model/extensions/scoring_history.py +++ b/h2o-py/h2o/model/extensions/scoring_history.py @@ -121,9 +121,6 @@ def scoring_history_plot(self, timestep, metric, server=False, save_plot_path=No allowed_metrics = ["deviance_train", "deviance_test", "deviance_xval"] # When provided with multiple alpha values, scoring history contains history of all... scoring_history = scoring_history[scoring_history["alpha"] == self._model_json["output"]["alpha_best"]] - elif self.actual_params.get("HGLM"): - allowed_timesteps = ["iterations", "duration"] - allowed_metrics = ["convergence", "sumetaieta02"] else: allowed_timesteps = ["iterations", "duration"] allowed_metrics = ["objective", "negative_log_likelihood"] diff --git a/h2o-py/h2o/model/metrics_base.py b/h2o-py/h2o/model/metrics_base.py index eccf94c105ff..ed66935a3db6 100644 --- a/h2o-py/h2o/model/metrics_base.py +++ b/h2o-py/h2o/model/metrics_base.py @@ -88,7 +88,7 @@ def _str_items(self, verbosity=None): m_supports_logloss = (m_is_binomial or m_is_multinomial or m_is_ordinal) and not m_is_uplift m_supports_mpce = (m_is_binomial or m_is_multinomial or m_is_ordinal) and not (m_is_glm or m_is_uplift) # GLM excluded? 
m_supports_mse = not (m_is_anomaly or m_is_clustering or m_is_uplift) - m_supports_r2 = m_is_regression and m_is_glm + m_supports_r2 = m_is_regression and m_is_glm and not(m_is_hglm) items = [ "{mtype}: {algo}".format(mtype=metric_type, algo=self._algo), @@ -103,7 +103,7 @@ def _str_items(self, verbosity=None): "MSE: {}".format(self.mse()), "RMSE: {}".format(self.rmse()), ]) - if m_is_regression: + if m_is_regression and not(m_is_hglm): items.extend([ "MAE: {}".format(self.mae()), "RMSLE: {}".format(self.rmsle()), @@ -125,35 +125,14 @@ def _str_items(self, verbosity=None): auc, aucpr = self.auc(), self.aucpr() if is_type(auc, numeric): items.append("AUC: {}".format(auc)) if is_type(aucpr, numeric): items.append("AUCPR: {}".format(aucpr)) - if m_is_glm: - if m_is_hglm and not m_is_generic: - items.extend([ - "Standard error of fixed columns: {}".format(self.hglm_metric("sefe")), - "Standard error of random columns: {}".format(self.hglm_metric("sere")), - "Coefficients for fixed columns: {}".format(self.hglm_metric("fixedf")), - "Coefficients for random columns: {}".format(self.hglm_metric("ranef")), - "Random column indices: {}".format(self.hglm_metric("randc")), - "Dispersion parameter of the mean model (residual variance for LMM): {}".format(self.hglm_metric("varfix")), - "Dispersion parameter of the random columns (variance of random columns): {}".format(self.hglm_metric("varranef")), - "Convergence reached for algorithm: {}".format(self.hglm_metric("converge")), - "Deviance degrees of freedom for mean part of the model: {}".format(self.hglm_metric("dfrefe")), - "Estimates and standard errors of the linear prediction in the dispersion model: {}".format(self.hglm_metric("summvc1")), - "Estimates and standard errors of the linear predictor for the dispersion parameter of the random columns: {}".format(self.hglm_metric("summvc2")), - "Index of most influential observation (-1 if none): {}".format(self.hglm_metric("bad")), - "H-likelihood: {}".format(self.hglm_metric("hlik")), - "Profile log-likelihood profiled over random columns: {}".format(self.hglm_metric("pvh")), - "Adjusted profile log-likelihood profiled over fixed and random effects: {}".format(self.hglm_metric("pbvh")), - "Conditional AIC: {}".format(self.hglm_metric("caic")), - ]) - else: - items.extend([ - "Null degrees of freedom: {}".format(self.null_degrees_of_freedom()), - "Residual degrees of freedom: {}".format(self.residual_degrees_of_freedom()), - "Null deviance: {}".format(self.null_deviance()), - "Residual deviance: {}".format(self.residual_deviance()), - ]) - if m_is_glm: + items.extend([ + "Null degrees of freedom: {}".format(self.null_degrees_of_freedom()), + "Residual degrees of freedom: {}".format(self.residual_degrees_of_freedom()), + "Null deviance: {}".format(self.null_deviance()), + "Residual deviance: {}".format(self.residual_deviance()), + ]) + if m_is_glm and not(m_is_hglm): if is_type(self.aic(), numeric) and not math.isnan(self.aic()) and self.aic() != 0: items.append("AIC: {}".format(self.aic())) if is_type(self.loglikelihood(), numeric) and not math.isnan(self.loglikelihood()) and self.loglikelihood() != 0: diff --git a/h2o-py/h2o/model/model_base.py b/h2o-py/h2o/model/model_base.py index 239c8817ed69..cff42abb03f1 100644 --- a/h2o-py/h2o/model/model_base.py +++ b/h2o-py/h2o/model/model_base.py @@ -548,7 +548,7 @@ def model_performance(self, test_data=None, train=False, valid=False, xval=False break return self._metrics_class_valid(raw_metrics, algo=self._model_json["algo"]) - def scoring_history(self): + 
def scoring_history(self, as_data_frame=True): """ Retrieve Model Score History. @@ -556,7 +556,10 @@ def scoring_history(self): """ model = self._model_json["output"] if "scoring_history" in model and model["scoring_history"] is not None: - return model["scoring_history"].as_data_frame() + if as_data_frame: + return model["scoring_history"].as_data_frame() + else: + return model["scoring_history"] print("No score history for this model") def negative_log_likelihood(self): @@ -836,12 +839,15 @@ def get_variable_inflation_factors(self): def coef_names(self): """ - Return the coefficient names of glm model + Return the coefficient names of a GLM model. For an HGLM model, all coefficient names are returned, including the + intercept. """ if self.algo == 'glm': coefs = self._model_json['output']['coefficient_names'] coefs.remove('Intercept') return coefs + if self.algo == "hglm": + return self._model_json['output']['coefficient_names'] def coef(self): """ diff --git a/h2o-py/h2o/sklearn/__init__.py b/h2o-py/h2o/sklearn/__init__.py index dd5157884a7c..f2088c0feac9 100644 --- a/h2o-py/h2o/sklearn/__init__.py +++ b/h2o-py/h2o/sklearn/__init__.py @@ -98,7 +98,8 @@ def _estimator_supports_predict_proba(cls): 'H2OModelSelectionEstimator', 'H2OPrincipalComponentAnalysisEstimator', 'H2OSingularValueDecompositionEstimator', - 'H2OTargetEncoderEstimator'] + 'H2OTargetEncoderEstimator', + 'H2OHGLMEstimator'] def _estimator_supports_score(cls): @@ -112,7 +113,8 @@ def _estimator_supports_score(cls): 'H2OModelSelectionEstimator', 'H2OPrincipalComponentAnalysisEstimator', 'H2OSingularValueDecompositionEstimator', - 'H2OTargetEncoderEstimator'] + 'H2OTargetEncoderEstimator', + 'H2OHGLMEstimator'] def _estimator_supports_transform(cls): @@ -201,6 +203,7 @@ def h2o_connection(**init_args): 'H2OANOVAGLMEstimator', # fully disabled as it does not support `predict` method. 'H2OModelSelectionEstimator', # fully disabled as it does no support `predict` method. 'H2OIsotonicRegressionEstimator', # specific in behavior (no classification, just "one feature" in regression) + 'H2OHGLMEstimator' ) _generic_only_estimators = ( # e.g.
unsupervised and misc estimators # 'H2OANOVAGLMEstimator', diff --git a/h2o-py/tests/pyunit_utils/__init__.py b/h2o-py/tests/pyunit_utils/__init__.py index a8415192911b..f08331d51c62 100644 --- a/h2o-py/tests/pyunit_utils/__init__.py +++ b/h2o-py/tests/pyunit_utils/__init__.py @@ -1,6 +1,6 @@ from .utilsPY import * from .utils_model_metrics import * from .utils_model_custom_distribution import * -from .utils_for_glm_tests import * +from .utils_for_glm_hglm_tests import * from .sklearn_multinomial_auc_method import roc_auc_score from .utils_parser_tests import * diff --git a/h2o-py/tests/pyunit_utils/utils_for_glm_tests.py b/h2o-py/tests/pyunit_utils/utils_for_glm_hglm_tests.py similarity index 81% rename from h2o-py/tests/pyunit_utils/utils_for_glm_tests.py rename to h2o-py/tests/pyunit_utils/utils_for_glm_hglm_tests.py index f993375e9578..d855cfc254d9 100644 --- a/h2o-py/tests/pyunit_utils/utils_for_glm_tests.py +++ b/h2o-py/tests/pyunit_utils/utils_for_glm_hglm_tests.py @@ -1,6 +1,7 @@ from h2o.estimators import H2OGeneralizedLinearEstimator as glm from h2o.exceptions import H2OValueError from h2o.grid.grid_search import H2OGridSearch +from tests import pyunit_utils def gen_constraint_glm_model(training_dataset, x, y, solver="AUTO", family="gaussian", linear_constraints=None, @@ -75,7 +76,7 @@ def grid_models_analysis(grid_models, hyper_parameters, metric="logloss", epsilo [best_equality_constraints, best_lessthan_constraints] = grab_constraint_values( base_constraints_table, cond_index, len(base_constraints_table.cell_values)) - base_iteration = find_glm_iterations(grid_models[0]) + base_iteration = find_model_iterations(grid_models[0]) num_models = len(grid_models) best_model_ind = 0 model_indices = [] @@ -93,13 +94,13 @@ def grid_models_analysis(grid_models, hyper_parameters, metric="logloss", epsilo # conditions used to choose the best model if (sum(equality_constraints_values) < sum(best_equality_constraints)) and (sum(lessthan_constraints_values) < sum(best_lessthan_constraints)): best_model_ind = ind - base_iteration = find_glm_iterations(curr_model) + base_iteration = find_model_iterations(curr_model) best_equality_constraints = equality_constraints_values best_lessthan_constraints = lessthan_constraints_values model_equality_constraints_values.append(equality_constraints_values) model_lessthan_constraints_values.append(lessthan_constraints_values) model_indices.append(ind) - iterations.append(find_glm_iterations(curr_model)) + iterations.append(find_model_iterations(curr_model)) print("Maximum iterations: {0} and it is from model index: {1}".format(base_iteration, best_model_ind)) print_model_hyperparameters(grid_models[best_model_ind], hyper_parameters) return grid_models[best_model_ind] @@ -137,7 +138,7 @@ def is_always_lower_than(original_tuple, new_tuple): "different".format(len(original_tuple), len(new_tuple)) return all(abs(orig) > abs(new) for orig, new in zip(original_tuple, new_tuple)) -def find_glm_iterations(glm_model): +def find_model_iterations(glm_model): """ Given a glm constrainted model, this method will obtain the number of iterations from the model summary. 
""" @@ -145,3 +146,40 @@ def find_glm_iterations(glm_model): lengths = len(cell_values) iteration_index = glm_model._model_json["output"]["model_summary"].col_header.index("number_of_iterations") return cell_values[lengths-1][iteration_index] + +def add_to_random_coef_dict(normalized_coefs, normalized_one_coefs, level2_val, random_coefs_names): + one_list = [] + for one_name in random_coefs_names: + one_list.append(normalized_one_coefs[one_name]) + normalized_coefs[level2_val] = one_list + +def extract_coef_dict(random_coeffs, level2_name, random_coefs_names): + random_coef_level2 = dict() + index = 0 + for cname in random_coefs_names: + random_coef_level2[cname] = random_coeffs[level2_name][index] + index = index+1 + return random_coef_level2 + +def compare_dicts_with_tupple(dict1, dict2, tolerance=1e-6): + keys = dict1.keys() + for cname in keys: + pyunit_utils.equal_two_arrays(dict1[cname], dict2[cname], tolerance = tolerance, throw_error=True) + +def compare_list_h2o_frame(one_list, h2oframe, col_name_start): + list_len = len(one_list) + for index in range(list_len): + assert col_name_start+h2oframe[index, 0] in one_list, "Value: {0} is not found in the list.".format(h2oframe[index, 0]) + +def check_icc_calculation(tmat, varEVar, icc, tolerance=1e-6): + t_size = len(icc) + varSum = varEVar + for ind in range(t_size): + varSum = varSum + tmat[ind][ind] + oneOVarSum = 1.0/varSum + + for ind in range(t_size): + one_icc = tmat[ind][ind]*oneOVarSum + assert abs(one_icc - icc[ind]) < tolerance, "Expected ICC value {0} for coef {1}, actual ICC value {2}. " \ + "They are not equal or close.".format(one_icc, icc[ind], ind) + diff --git a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_beta_equality_loose_lessthan_linear_constraints_binomial.py b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_beta_equality_loose_lessthan_linear_constraints_binomial.py index 569a4304268b..2946a34247f4 100644 --- a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_beta_equality_loose_lessthan_linear_constraints_binomial.py +++ b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_beta_equality_loose_lessthan_linear_constraints_binomial.py @@ -1,7 +1,7 @@ import h2o from h2o.estimators.glm import H2OGeneralizedLinearEstimator as glm from tests import pyunit_utils -from tests.pyunit_utils import utils_for_glm_tests +from tests.pyunit_utils import utils_for_glm_hglm_tests def test_constraints_binomial(): ''' @@ -161,7 +161,7 @@ def test_constraints_binomial(): constraint_alpha = [0.01] constraint_beta = [0.5, 0.9] constraint_c0 = [40] - h2o_glm_random_init = utils_for_glm_tests.constraint_glm_gridsearch(train, predictors, response, solver="IRLSM", + h2o_glm_random_init = utils_for_glm_hglm_tests.constraint_glm_gridsearch(train, predictors, response, solver="IRLSM", family="binomial", linear_constraints=linear_constraints2, beta_constraints=beta_constraints, @@ -174,12 +174,12 @@ def test_constraints_binomial(): return_best=False) init_random_logloss = h2o_glm_random_init.model_performance()._metric_json['logloss'] print("logloss with constraints and coefficients initialized random initial values: {0}, number of iterations" - " taken to build the model: {1}".format(init_random_logloss, utils_for_glm_tests.find_glm_iterations(h2o_glm_random_init))) + " taken to build the model: {1}".format(init_random_logloss, utils_for_glm_hglm_tests.find_model_iterations(h2o_glm_random_init))) print(glm.getConstraintsInfo(h2o_glm_random_init)) # GLM model with GLM coefficients with default initialization - h2o_glm_default_init = 
utils_for_glm_tests.constraint_glm_gridsearch(train, predictors, response, solver="IRLSM", + h2o_glm_default_init = utils_for_glm_hglm_tests.constraint_glm_gridsearch(train, predictors, response, solver="IRLSM", family="binomial", linear_constraints=linear_constraints2, beta_constraints=beta_constraints, @@ -191,7 +191,7 @@ def test_constraints_binomial(): return_best=False) default_init_logloss = h2o_glm_default_init.model_performance()._metric_json['logloss'] print("logloss with constraints and default coefficients initialization: {0}, number of iterations" - " taken to build the model: {1}".format(default_init_logloss, utils_for_glm_tests.find_glm_iterations(h2o_glm_default_init))) + " taken to build the model: {1}".format(default_init_logloss, utils_for_glm_hglm_tests.find_model_iterations(h2o_glm_default_init))) print(glm.getConstraintsInfo(h2o_glm_default_init)) diff --git a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_beta_linear_constraints_binomial_objective_likelihood.py b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_beta_linear_constraints_binomial_objective_likelihood.py index add3f09430e5..a29114bd44e6 100644 --- a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_beta_linear_constraints_binomial_objective_likelihood.py +++ b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_beta_linear_constraints_binomial_objective_likelihood.py @@ -1,7 +1,7 @@ import h2o from h2o.estimators.glm import H2OGeneralizedLinearEstimator as glm from tests import pyunit_utils -from tests.pyunit_utils import utils_for_glm_tests +from tests.pyunit_utils import utils_for_glm_hglm_tests def test_constraints_objective_likelihood(): ''' @@ -94,7 +94,7 @@ def test_constraints_objective_likelihood(): obj_optimal = h2o_glm_optimal_init.average_objective() print("logloss with constraints and coefficients initialized with glm model built without constraints: {0}, aic: " "{2}, llh: {3}, average_objective: {4}, number of iterations taken to build the model: " - "{1}".format(init_logloss, utils_for_glm_tests.find_glm_iterations(h2o_glm_optimal_init), aic_optimal, + "{1}".format(init_logloss, utils_for_glm_hglm_tests.find_model_iterations(h2o_glm_optimal_init), aic_optimal, ll_optimal, obj_optimal)) print(glm.getConstraintsInfo(h2o_glm_optimal_init)) @@ -124,7 +124,7 @@ def test_constraints_objective_likelihood(): init_random_logloss = h2o_glm_random_init.model_performance()._metric_json['logloss'] print("logloss with constraints and coefficients initialized random initial values: {0}, aic: {2}, llh: {3}, " "average objective: {4}, number of iterations taken to build the model: " - "{1}".format(init_random_logloss, utils_for_glm_tests.find_glm_iterations(h2o_glm_random_init), aic_random, + "{1}".format(init_random_logloss, utils_for_glm_hglm_tests.find_model_iterations(h2o_glm_random_init), aic_random, ll_random, obj_random)) print(glm.getConstraintsInfo(h2o_glm_random_init)) @@ -141,7 +141,7 @@ def test_constraints_objective_likelihood(): default_init_logloss = h2o_glm_default_init.model_performance()._metric_json['logloss'] print("logloss with constraints and default coefficients initialization: {0}, aic: {2}, llh: {3}, average objective:" " {4}, number of iterations taken to build the model: " - "{1}".format(default_init_logloss, utils_for_glm_tests.find_glm_iterations(h2o_glm_default_init), aic_default, + "{1}".format(default_init_logloss, utils_for_glm_hglm_tests.find_model_iterations(h2o_glm_default_init), aic_default, ll_default, obj_default)) print(glm.getConstraintsInfo(h2o_glm_default_init)) diff --git 
a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_equality_constraints_only_binomial.py b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_equality_constraints_only_binomial.py index 8c29822f0b87..f3399f798f95 100644 --- a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_equality_constraints_only_binomial.py +++ b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_equality_constraints_only_binomial.py @@ -1,7 +1,7 @@ import h2o from h2o.estimators.glm import H2OGeneralizedLinearEstimator as glm from tests import pyunit_utils -from tests.pyunit_utils import utils_for_glm_tests +from tests.pyunit_utils import utils_for_glm_hglm_tests def test_equality_constraints_only_binomial(): ''' @@ -94,7 +94,7 @@ def test_equality_constraints_only_binomial(): constraint_alpha = [0.01] constraint_beta = [0.1] constraint_c0 = [15, 20] - h2o_glm_random_init = utils_for_glm_tests.constraint_glm_gridsearch(train, predictors, response, solver="IRLSM", + h2o_glm_random_init = utils_for_glm_hglm_tests.constraint_glm_gridsearch(train, predictors, response, solver="IRLSM", family="binomial", linear_constraints=linear_constraints2, startval=random_coef, @@ -110,7 +110,7 @@ def test_equality_constraints_only_binomial(): print(glm.getConstraintsInfo(h2o_glm_random_init)) # GLM model with GLM coefficients with default initialization - h2o_glm_default_init = utils_for_glm_tests.constraint_glm_gridsearch(train, predictors, response, solver="IRLSM", + h2o_glm_default_init = utils_for_glm_hglm_tests.constraint_glm_gridsearch(train, predictors, response, solver="IRLSM", family="binomial", linear_constraints=linear_constraints2, constraint_eta0=constraint_eta0, diff --git a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_equality_loose_lessthan_linear_constraints_binomial.py b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_equality_loose_lessthan_linear_constraints_binomial.py index dd884aaea48b..62980c816cc0 100644 --- a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_equality_loose_lessthan_linear_constraints_binomial.py +++ b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_equality_loose_lessthan_linear_constraints_binomial.py @@ -1,7 +1,7 @@ import h2o from h2o.estimators.glm import H2OGeneralizedLinearEstimator as glm from tests import pyunit_utils -from tests.pyunit_utils import utils_for_glm_tests +from tests.pyunit_utils import utils_for_glm_hglm_tests def test_equality_linear_constraints_binomial(): ''' @@ -128,7 +128,7 @@ def test_equality_linear_constraints_binomial(): constraint_alpha = [0.1] constraint_beta = [0.9] constraint_c0 = [10] # initial value - h2o_glm_random_init = utils_for_glm_tests.constraint_glm_gridsearch(train, predictors, response, solver="IRLSM", + h2o_glm_random_init = utils_for_glm_hglm_tests.constraint_glm_gridsearch(train, predictors, response, solver="IRLSM", family="binomial", linear_constraints=linear_constraints2, startval=random_coef, @@ -139,11 +139,11 @@ def test_equality_linear_constraints_binomial(): constraint_c0=constraint_c0, return_best=False) init_random_logloss = h2o_glm_random_init.model_performance()._metric_json['logloss'] print("logloss with constraints and coefficients initialized random initial values: {0}, number of iterations" - " taken to build the model: {1}".format(init_random_logloss, utils_for_glm_tests.find_glm_iterations(h2o_glm_random_init))) + " taken to build the model: {1}".format(init_random_logloss, utils_for_glm_hglm_tests.find_model_iterations(h2o_glm_random_init))) print(glm.getConstraintsInfo(h2o_glm_random_init)) # GLM model with GLM coefficients with default 
initialization - h2o_glm_default_init = utils_for_glm_tests.constraint_glm_gridsearch(train, predictors, response, solver="IRLSM", + h2o_glm_default_init = utils_for_glm_hglm_tests.constraint_glm_gridsearch(train, predictors, response, solver="IRLSM", family="binomial", linear_constraints=linear_constraints2, constraint_eta0=constraint_eta0, @@ -153,7 +153,7 @@ def test_equality_linear_constraints_binomial(): constraint_c0=constraint_c0, return_best=False) default_init_logloss = h2o_glm_default_init.model_performance()._metric_json['logloss'] print("logloss with constraints and default coefficients initialization: {0}, number of iterations" - " taken to build the model: {1}".format(default_init_logloss, utils_for_glm_tests.find_glm_iterations(h2o_glm_default_init))) + " taken to build the model: {1}".format(default_init_logloss, utils_for_glm_hglm_tests.find_model_iterations(h2o_glm_default_init))) print(glm.getConstraintsInfo(h2o_glm_default_init)) assert abs(logloss-init_logloss)<1e-6, "logloss from optimal GLM {0} and logloss from GLM with loose constraints " \ diff --git a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_light_tight_beta_equality_lessthan_constraints_binomial.py b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_light_tight_beta_equality_lessthan_constraints_binomial.py index 732bdda9d8e5..e9048d9e2d43 100644 --- a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_light_tight_beta_equality_lessthan_constraints_binomial.py +++ b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_light_tight_beta_equality_lessthan_constraints_binomial.py @@ -1,7 +1,7 @@ import h2o from h2o.estimators.glm import H2OGeneralizedLinearEstimator as glm from tests import pyunit_utils -from tests.pyunit_utils import utils_for_glm_tests +from tests.pyunit_utils import utils_for_glm_hglm_tests def test_light_tight_linear_constraints_binomial(): ''' @@ -132,7 +132,7 @@ def test_light_tight_linear_constraints_binomial(): constraint_beta = [0.9] constraint_c0 = [5, 10] # initial value # GLM model with with GLM coefficients set to GLM model coefficients built without constraints - h2o_glm_optimal_init = utils_for_glm_tests.constraint_glm_gridsearch(train, predictors, response, solver="IRLSM", + h2o_glm_optimal_init = utils_for_glm_hglm_tests.constraint_glm_gridsearch(train, predictors, response, solver="IRLSM", family="binomial", linear_constraints=linear_constraints2, beta_constraints=beta_constraints, @@ -145,10 +145,10 @@ def test_light_tight_linear_constraints_binomial(): return_best=False, epsilon=0.5) optimal_init_logloss = h2o_glm_optimal_init.model_performance()._metric_json['logloss'] print("logloss with optimal GLM coefficient initializaiton: {0}, number of iterations taken to build the model: " - "{1}".format(optimal_init_logloss, utils_for_glm_tests.find_glm_iterations(h2o_glm_optimal_init))) + "{1}".format(optimal_init_logloss, utils_for_glm_hglm_tests.find_model_iterations(h2o_glm_optimal_init))) print(glm.getConstraintsInfo(h2o_glm_optimal_init)) - h2o_glm_default_init = utils_for_glm_tests.constraint_glm_gridsearch(train, predictors, response, solver="IRLSM", + h2o_glm_default_init = utils_for_glm_hglm_tests.constraint_glm_gridsearch(train, predictors, response, solver="IRLSM", family="binomial", linear_constraints=linear_constraints2, beta_constraints=beta_constraints, @@ -161,7 +161,7 @@ def test_light_tight_linear_constraints_binomial(): return_best=False, epsilon=0.5) default_init_logloss = h2o_glm_default_init.model_performance()._metric_json['logloss'] print("logloss with default GLM 
coefficient initializaiton: {0}, number of iterations taken to build the model: " - "{1}".format(default_init_logloss, utils_for_glm_tests.find_glm_iterations(h2o_glm_default_init))) + "{1}".format(default_init_logloss, utils_for_glm_hglm_tests.find_model_iterations(h2o_glm_default_init))) print(glm.getConstraintsInfo(h2o_glm_default_init)) random_coef = [0.9740393731418461, 0.9021970400494406, 0.8337282995102272, 0.20588758679724872, 0.12522385214612453, 0.6390730524643073, 0.7055779213989253, 0.9004255614099713, 0.4075431157767999, 0.161093231584713, @@ -178,7 +178,7 @@ def test_light_tight_linear_constraints_binomial(): 0.4941250734508458, 0.5446841276322587, 0.19222703209695946, 0.9232239752817498, 0.8824688635063289, 0.224690851359456, 0.5809304720756304, 0.36863807988348585] - h2o_glm_random_init = utils_for_glm_tests.constraint_glm_gridsearch(train, predictors, response, solver="IRLSM", + h2o_glm_random_init = utils_for_glm_hglm_tests.constraint_glm_gridsearch(train, predictors, response, solver="IRLSM", family="binomial", linear_constraints=linear_constraints2, beta_constraints=beta_constraints, @@ -192,7 +192,7 @@ def test_light_tight_linear_constraints_binomial(): return_best=False, epsilon=0.5) random_init_logloss = h2o_glm_random_init.model_performance()._metric_json['logloss'] print("logloss with random GLM coefficient initializaiton: {0}, number of iterations taken to build the model: " - "{1}".format(random_init_logloss, utils_for_glm_tests.find_glm_iterations(h2o_glm_random_init))) + "{1}".format(random_init_logloss, utils_for_glm_hglm_tests.find_model_iterations(h2o_glm_random_init))) print(glm.getConstraintsInfo(h2o_glm_random_init)) assert logloss <= optimal_init_logloss, "logloss from optimal GLM {0} should be lower than logloss from GLM with light tight" \ diff --git a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_light_tight_equality_lessthan_constraints_binomial.py b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_light_tight_equality_lessthan_constraints_binomial.py index 8e50603e9f15..f5c7050d3c56 100644 --- a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_light_tight_equality_lessthan_constraints_binomial.py +++ b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_light_tight_equality_lessthan_constraints_binomial.py @@ -1,7 +1,7 @@ import h2o from h2o.estimators.glm import H2OGeneralizedLinearEstimator as glm from tests import pyunit_utils -from tests.pyunit_utils import utils_for_glm_tests +from tests.pyunit_utils import utils_for_glm_hglm_tests def test_light_tight_linear_constraints_only_binomial(): ''' @@ -111,7 +111,7 @@ def test_light_tight_linear_constraints_only_binomial(): constraint_beta = [0.9] constraint_c0 = [10, 20] # initial value # GLM model with with GLM coefficients set to GLM model coefficients built without constraints - h2o_glm_optimal_init = utils_for_glm_tests.constraint_glm_gridsearch(train, predictors, response, solver="IRLSM", + h2o_glm_optimal_init = utils_for_glm_hglm_tests.constraint_glm_gridsearch(train, predictors, response, solver="IRLSM", family="binomial", linear_constraints=linear_constraints2, init_optimal_glm=True, @@ -123,10 +123,10 @@ def test_light_tight_linear_constraints_only_binomial(): return_best=False) optimal_init_logloss = h2o_glm_optimal_init.model_performance()._metric_json['logloss'] print("logloss with optimal GLM coefficient initializaiton: {0}, number of iterations taken to build the model: " - "{1}".format(optimal_init_logloss, utils_for_glm_tests.find_glm_iterations(h2o_glm_optimal_init))) + 
"{1}".format(optimal_init_logloss, utils_for_glm_hglm_tests.find_model_iterations(h2o_glm_optimal_init))) print(glm.getConstraintsInfo(h2o_glm_optimal_init)) - h2o_glm_default_init = utils_for_glm_tests.constraint_glm_gridsearch(train, predictors, response, solver="IRLSM", + h2o_glm_default_init = utils_for_glm_hglm_tests.constraint_glm_gridsearch(train, predictors, response, solver="IRLSM", family="binomial", linear_constraints=linear_constraints2, init_optimal_glm=False, @@ -139,7 +139,7 @@ def test_light_tight_linear_constraints_only_binomial(): epsilon=5e-1) default_init_logloss = h2o_glm_default_init.model_performance()._metric_json['logloss'] print("logloss with default GLM coefficient initializaiton: {0}, number of iterations taken to build the model: " - "{1}".format(default_init_logloss, utils_for_glm_tests.find_glm_iterations(h2o_glm_default_init))) + "{1}".format(default_init_logloss, utils_for_glm_hglm_tests.find_model_iterations(h2o_glm_default_init))) print(glm.getConstraintsInfo(h2o_glm_default_init)) random_coef = [0.9740393731418461, 0.9021970400494406, 0.8337282995102272, 0.20588758679724872, 0.12522385214612453, 0.6390730524643073, 0.7055779213989253, 0.9004255614099713, 0.4075431157767999, 0.161093231584713, @@ -156,7 +156,7 @@ def test_light_tight_linear_constraints_only_binomial(): 0.4941250734508458, 0.5446841276322587, 0.19222703209695946, 0.9232239752817498, 0.8824688635063289, 0.224690851359456, 0.5809304720756304, 0.36863807988348585] - h2o_glm_random_init = utils_for_glm_tests.constraint_glm_gridsearch(train, predictors, response, solver="IRLSM", + h2o_glm_random_init = utils_for_glm_hglm_tests.constraint_glm_gridsearch(train, predictors, response, solver="IRLSM", family="binomial", linear_constraints=linear_constraints2, startval=random_coef, @@ -170,7 +170,7 @@ def test_light_tight_linear_constraints_only_binomial(): epsilon=5e-1) random_init_logloss = h2o_glm_random_init.model_performance()._metric_json['logloss'] print("logloss with random GLM coefficient initializaiton: {0}, number of iterations taken to build the model: " - "{1}".format(random_init_logloss, utils_for_glm_tests.find_glm_iterations(h2o_glm_random_init))) + "{1}".format(random_init_logloss, utils_for_glm_hglm_tests.find_model_iterations(h2o_glm_random_init))) print(glm.getConstraintsInfo(h2o_glm_random_init)) assert abs(logloss - optimal_init_logloss)<1e-6, "logloss from optimal GLM {0} should be close to logloss from GLM with light tight" \ diff --git a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_light_tight_linear_constraints_only_binomial.py b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_light_tight_linear_constraints_only_binomial.py index d59c80fd99a5..dca483a98834 100644 --- a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_light_tight_linear_constraints_only_binomial.py +++ b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_light_tight_linear_constraints_only_binomial.py @@ -1,7 +1,7 @@ import h2o from h2o.estimators.glm import H2OGeneralizedLinearEstimator as glm from tests import pyunit_utils -from tests.pyunit_utils import utils_for_glm_tests +from tests.pyunit_utils import utils_for_glm_hglm_tests def test_light_tight_linear_constraints_only_binomial(): ''' @@ -125,7 +125,7 @@ def test_light_tight_linear_constraints_only_binomial(): constraint_beta = [0.9] constraint_c0 = [1.2, 5] # initial value # GLM model with with GLM coefficients set to GLM model coefficients built without constraints - h2o_glm_optimal_init = utils_for_glm_tests.constraint_glm_gridsearch(train, predictors, 
response, solver="IRLSM", + h2o_glm_optimal_init = utils_for_glm_hglm_tests.constraint_glm_gridsearch(train, predictors, response, solver="IRLSM", family="binomial", linear_constraints=linear_constraints2, init_optimal_glm=True, @@ -137,11 +137,11 @@ def test_light_tight_linear_constraints_only_binomial(): return_best=False) optimal_init_logloss = h2o_glm_optimal_init.model_performance()._metric_json['logloss'] print("logloss with optimal GLM coefficient initializaiton: {0}, number of iterations taken to build the model: " - "{1}".format(optimal_init_logloss, utils_for_glm_tests.find_glm_iterations(h2o_glm_optimal_init))) + "{1}".format(optimal_init_logloss, utils_for_glm_hglm_tests.find_model_iterations(h2o_glm_optimal_init))) print(glm.getConstraintsInfo(h2o_glm_optimal_init)) print("All constraints satisfied: {0}".format(glm.allConstraintsPassed(h2o_glm_optimal_init))) - h2o_glm_default_init = utils_for_glm_tests.constraint_glm_gridsearch(train, predictors, response, solver="IRLSM", + h2o_glm_default_init = utils_for_glm_hglm_tests.constraint_glm_gridsearch(train, predictors, response, solver="IRLSM", family="binomial", linear_constraints=linear_constraints2, init_optimal_glm=False, @@ -153,7 +153,7 @@ def test_light_tight_linear_constraints_only_binomial(): return_best=False) default_init_logloss = h2o_glm_default_init.model_performance()._metric_json['logloss'] print("logloss with default GLM coefficient initializaiton: {0}, number of iterations taken to build the model: " - "{1}".format(default_init_logloss, utils_for_glm_tests.find_glm_iterations(h2o_glm_default_init))) + "{1}".format(default_init_logloss, utils_for_glm_hglm_tests.find_model_iterations(h2o_glm_default_init))) print(glm.getConstraintsInfo(h2o_glm_default_init)) print("All constraints satisfied: {0}".format(glm.allConstraintsPassed(h2o_glm_default_init))) @@ -172,7 +172,7 @@ def test_light_tight_linear_constraints_only_binomial(): 0.4941250734508458, 0.5446841276322587, 0.19222703209695946, 0.9232239752817498, 0.8824688635063289, 0.224690851359456, 0.5809304720756304, 0.36863807988348585] - h2o_glm_random_init = utils_for_glm_tests.constraint_glm_gridsearch(train, predictors, response, solver="IRLSM", + h2o_glm_random_init = utils_for_glm_hglm_tests.constraint_glm_gridsearch(train, predictors, response, solver="IRLSM", family="binomial", linear_constraints=linear_constraints2, startval=random_coef, @@ -185,7 +185,7 @@ def test_light_tight_linear_constraints_only_binomial(): return_best=False) random_init_logloss = h2o_glm_random_init.model_performance()._metric_json['logloss'] print("logloss with random GLM coefficient initializaiton: {0}, number of iterations taken to build the model: " - "{1}".format(random_init_logloss, utils_for_glm_tests.find_glm_iterations(h2o_glm_random_init))) + "{1}".format(random_init_logloss, utils_for_glm_hglm_tests.find_model_iterations(h2o_glm_random_init))) print(glm.getConstraintsInfo(h2o_glm_random_init)) print("All constraints satisfied: {0}".format(glm.allConstraintsPassed(h2o_glm_random_init))) diff --git a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_loose_beta_linear_constraints_binomial.py b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_loose_beta_linear_constraints_binomial.py index 08778437db07..00178a814543 100644 --- a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_loose_beta_linear_constraints_binomial.py +++ b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_loose_beta_linear_constraints_binomial.py @@ -1,7 +1,7 @@ import h2o from h2o.estimators.glm import 
H2OGeneralizedLinearEstimator as glm from tests import pyunit_utils -from tests.pyunit_utils import utils_for_glm_tests +from tests.pyunit_utils import utils_for_glm_hglm_tests def test_loose_beta_linear_constraints_binomial(): ''' @@ -115,7 +115,7 @@ def test_loose_beta_linear_constraints_binomial(): constraint_beta = [0.5] constraint_c0 = [2] # initial value # GLM model with GLM coefficients with default initialization - h2o_glm_random_init = utils_for_glm_tests.constraint_glm_gridsearch(train, predictors, response, solver="IRLSM", + h2o_glm_random_init = utils_for_glm_hglm_tests.constraint_glm_gridsearch(train, predictors, response, solver="IRLSM", family="binomial", beta_constraints=beta_constraints, linear_constraints=linear_constraints2, @@ -127,11 +127,11 @@ def test_loose_beta_linear_constraints_binomial(): constraint_c0=constraint_c0) init_random_logloss = h2o_glm_random_init.model_performance()._metric_json['logloss'] print("logloss with constraints and coefficients initialized random initial values: {0}, number of iterations" - " taken to build the model: {1}".format(init_random_logloss, utils_for_glm_tests.find_glm_iterations(h2o_glm_random_init))) + " taken to build the model: {1}".format(init_random_logloss, utils_for_glm_hglm_tests.find_model_iterations(h2o_glm_random_init))) print(glm.getConstraintsInfo(h2o_glm_random_init)) # GLM model with GLM coefficients with default initialization - h2o_glm_default_init = utils_for_glm_tests.constraint_glm_gridsearch(train, predictors, response, solver="IRLSM", + h2o_glm_default_init = utils_for_glm_hglm_tests.constraint_glm_gridsearch(train, predictors, response, solver="IRLSM", family="binomial", beta_constraints=beta_constraints, linear_constraints=linear_constraints2, @@ -142,7 +142,7 @@ def test_loose_beta_linear_constraints_binomial(): constraint_c0=constraint_c0) default_init_logloss = h2o_glm_default_init.model_performance()._metric_json['logloss'] print("logloss with constraints and default coefficients initialization: {0}, number of iterations" - " taken to build the model: {1}".format(default_init_logloss, utils_for_glm_tests.find_glm_iterations(h2o_glm_default_init))) + " taken to build the model: {1}".format(default_init_logloss, utils_for_glm_hglm_tests.find_model_iterations(h2o_glm_default_init))) print(glm.getConstraintsInfo(h2o_glm_default_init)) diff --git a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_loose_only_linear_constraints_binomial.py b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_loose_only_linear_constraints_binomial.py index f9b0d17f976d..d38fd22097f0 100644 --- a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_loose_only_linear_constraints_binomial.py +++ b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_loose_only_linear_constraints_binomial.py @@ -1,7 +1,7 @@ import h2o from h2o.estimators.glm import H2OGeneralizedLinearEstimator as glm from tests import pyunit_utils -from tests.pyunit_utils import utils_for_glm_tests +from tests.pyunit_utils import utils_for_glm_hglm_tests def test_loose_linear_constraints_binomial(): ''' @@ -92,7 +92,7 @@ def test_loose_linear_constraints_binomial(): constraint_alpha = [0.01] constraint_beta = [0.5] constraint_c0 = [5, 10] # initial value - h2o_glm_random_init = utils_for_glm_tests.constraint_glm_gridsearch(train, predictors, response, solver="IRLSM", + h2o_glm_random_init = utils_for_glm_hglm_tests.constraint_glm_gridsearch(train, predictors, response, solver="IRLSM", family="binomial", linear_constraints=linear_constraints2, startval=random_coef, @@ -103,11 +103,11 @@ 
def test_loose_linear_constraints_binomial(): constraint_c0=constraint_c0) random_init_logloss = h2o_glm_random_init.model_performance()._metric_json['logloss'] print("logloss with random coefficient initializaiton: {0}, number of iterations taken to build the model: " - "{1}".format(random_init_logloss, utils_for_glm_tests.find_glm_iterations(h2o_glm_random_init))) + "{1}".format(random_init_logloss, utils_for_glm_hglm_tests.find_model_iterations(h2o_glm_random_init))) print(glm.getConstraintsInfo(h2o_glm_random_init)) # GLM model with GLM coefficients with default initialization - h2o_glm_default_init = utils_for_glm_tests.constraint_glm_gridsearch(train, predictors, response, solver="IRLSM", + h2o_glm_default_init = utils_for_glm_hglm_tests.constraint_glm_gridsearch(train, predictors, response, solver="IRLSM", family="binomial", linear_constraints=linear_constraints2, constraint_eta0=constraint_eta0, @@ -117,7 +117,7 @@ def test_loose_linear_constraints_binomial(): constraint_c0=constraint_c0) default_init_logloss = h2o_glm_default_init.model_performance()._metric_json['logloss'] print("logloss with default coefficient initializaiton: {0}, number of iterations taken to build the model: " - "{1}".format(default_init_logloss, utils_for_glm_tests.find_glm_iterations(h2o_glm_default_init))) + "{1}".format(default_init_logloss, utils_for_glm_hglm_tests.find_model_iterations(h2o_glm_default_init))) print(glm.getConstraintsInfo(h2o_glm_default_init)) # since the constraints are loose, performance of GLM model without linear constraints and GLM model with linear diff --git a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_separate_linear_beta_gaussian.py b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_separate_linear_beta_gaussian.py index 6bcfc2fbcf45..084291387511 100644 --- a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_separate_linear_beta_gaussian.py +++ b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_separate_linear_beta_gaussian.py @@ -1,7 +1,6 @@ import h2o from h2o.estimators.glm import H2OGeneralizedLinearEstimator as glm from tests import pyunit_utils -from tests.pyunit_utils import utils_for_glm_tests def test_separate_linear_beta_gaussian(): ''' diff --git a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_tight_beta_equality_linear_constraints_binomial.py b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_tight_beta_equality_linear_constraints_binomial.py index f40887389093..5945f5606b94 100644 --- a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_tight_beta_equality_linear_constraints_binomial.py +++ b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_tight_beta_equality_linear_constraints_binomial.py @@ -1,7 +1,7 @@ import h2o from h2o.estimators.glm import H2OGeneralizedLinearEstimator as glm from tests import pyunit_utils -from tests.pyunit_utils import utils_for_glm_tests +from tests.pyunit_utils import utils_for_glm_hglm_tests def test_tight_beta_linear_constraints_binomial(): ''' @@ -185,7 +185,7 @@ def test_tight_beta_linear_constraints_binomial(): constraint_beta = [0.001, 0.5] constraint_c0 = [20, 30] # initial value - h2o_glm_default_init = utils_for_glm_tests.constraint_glm_gridsearch(train, predictors, response, solver="IRLSM", + h2o_glm_default_init = utils_for_glm_hglm_tests.constraint_glm_gridsearch(train, predictors, response, solver="IRLSM", family="binomial", linear_constraints=linear_constraints2, beta_constraints=beta_constraints, @@ -198,10 +198,10 @@ def test_tight_beta_linear_constraints_binomial(): return_best=False, epsilon=20) default_init_logloss = 
h2o_glm_default_init.model_performance()._metric_json['logloss'] print("logloss with default GLM coefficient initializaiton: {0}, number of iterations taken to build the model: " - "{1}".format(default_init_logloss, utils_for_glm_tests.find_glm_iterations(h2o_glm_default_init))) + "{1}".format(default_init_logloss, utils_for_glm_hglm_tests.find_model_iterations(h2o_glm_default_init))) # GLM model with with GLM coefficients set to GLM model coefficients built without constraints - h2o_glm_optimal_init = utils_for_glm_tests.constraint_glm_gridsearch(train, predictors, response, solver="IRLSM", + h2o_glm_optimal_init = utils_for_glm_hglm_tests.constraint_glm_gridsearch(train, predictors, response, solver="IRLSM", family="binomial", linear_constraints=linear_constraints2, beta_constraints=beta_constraints, @@ -215,11 +215,11 @@ def test_tight_beta_linear_constraints_binomial(): epsilon=20) optimal_init_logloss = h2o_glm_optimal_init.model_performance()._metric_json['logloss'] print("logloss with optimal GLM coefficient initializaiton: {0}, number of iterations taken to build the model: " - "{1}".format(optimal_init_logloss, utils_for_glm_tests.find_glm_iterations(h2o_glm_optimal_init))) + "{1}".format(optimal_init_logloss, utils_for_glm_hglm_tests.find_model_iterations(h2o_glm_optimal_init))) print(glm.getConstraintsInfo(h2o_glm_optimal_init)) print("All constraints satisfied: {0}".format(glm.allConstraintsPassed(h2o_glm_optimal_init))) - h2o_glm_default_init = utils_for_glm_tests.constraint_glm_gridsearch(train, predictors, response, solver="IRLSM", + h2o_glm_default_init = utils_for_glm_hglm_tests.constraint_glm_gridsearch(train, predictors, response, solver="IRLSM", family="binomial", linear_constraints=linear_constraints2, beta_constraints=beta_constraints, @@ -232,7 +232,7 @@ def test_tight_beta_linear_constraints_binomial(): return_best=False, epsilon=20) default_init_logloss = h2o_glm_default_init.model_performance()._metric_json['logloss'] print("logloss with default GLM coefficient initializaiton: {0}, number of iterations taken to build the model: " - "{1}".format(default_init_logloss, utils_for_glm_tests.find_glm_iterations(h2o_glm_default_init))) + "{1}".format(default_init_logloss, utils_for_glm_hglm_tests.find_model_iterations(h2o_glm_default_init))) print(glm.getConstraintsInfo(h2o_glm_default_init)) print("All constraints satisfied: {0}".format(glm.allConstraintsPassed(h2o_glm_default_init))) @@ -251,7 +251,7 @@ def test_tight_beta_linear_constraints_binomial(): 0.4941250734508458, 0.5446841276322587, 0.19222703209695946, 0.9232239752817498, 0.8824688635063289, 0.224690851359456, 0.5809304720756304, 0.36863807988348585] - h2o_glm_random_init = utils_for_glm_tests.constraint_glm_gridsearch(train, predictors, response, solver="IRLSM", + h2o_glm_random_init = utils_for_glm_hglm_tests.constraint_glm_gridsearch(train, predictors, response, solver="IRLSM", family="binomial", linear_constraints=linear_constraints2, beta_constraints=beta_constraints, @@ -265,7 +265,7 @@ def test_tight_beta_linear_constraints_binomial(): return_best=False, epsilon=20) random_init_logloss = h2o_glm_random_init.model_performance()._metric_json['logloss'] print("logloss with random GLM coefficient initializaiton: {0}, number of iterations taken to build the model: " - "{1}".format(random_init_logloss, utils_for_glm_tests.find_glm_iterations(h2o_glm_random_init))) + "{1}".format(random_init_logloss, utils_for_glm_hglm_tests.find_model_iterations(h2o_glm_random_init))) 
print(glm.getConstraintsInfo(h2o_glm_random_init)) print("All constraints satisfied: {0}".format(glm.allConstraintsPassed(h2o_glm_random_init))) diff --git a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_tight_equality_linear_constraints_binomial.py b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_tight_equality_linear_constraints_binomial.py index 94ac1155c494..a87a8f891dbd 100644 --- a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_tight_equality_linear_constraints_binomial.py +++ b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_tight_equality_linear_constraints_binomial.py @@ -1,7 +1,7 @@ import h2o from h2o.estimators.glm import H2OGeneralizedLinearEstimator as glm from tests import pyunit_utils -from tests.pyunit_utils import utils_for_glm_tests +from tests.pyunit_utils import utils_for_glm_hglm_tests def test_tight_equality_linear_constraints_binomial(): ''' @@ -161,7 +161,7 @@ def test_tight_equality_linear_constraints_binomial(): constraint_beta = [0.001] constraint_c0 = [1.5, 5] # initial value # GLM model with with GLM coefficients set to GLM model coefficients built without constraints - h2o_glm_optimal_init = utils_for_glm_tests.constraint_glm_gridsearch(train, predictors, response, solver="IRLSM", + h2o_glm_optimal_init = utils_for_glm_hglm_tests.constraint_glm_gridsearch(train, predictors, response, solver="IRLSM", family="binomial", linear_constraints=linear_constraints2, init_optimal_glm=True, @@ -173,11 +173,11 @@ def test_tight_equality_linear_constraints_binomial(): return_best=False) optimal_init_logloss = h2o_glm_optimal_init.model_performance()._metric_json['logloss'] print("logloss with optimal GLM coefficient initializaiton: {0}, number of iterations taken to build the model: " - "{1}".format(optimal_init_logloss, utils_for_glm_tests.find_glm_iterations(h2o_glm_optimal_init))) + "{1}".format(optimal_init_logloss, utils_for_glm_hglm_tests.find_model_iterations(h2o_glm_optimal_init))) print(glm.getConstraintsInfo(h2o_glm_optimal_init)) print("All constraints satisfied: {0}".format(glm.allConstraintsPassed(h2o_glm_optimal_init))) - h2o_glm_default_init = utils_for_glm_tests.constraint_glm_gridsearch(train, predictors, response, solver="IRLSM", + h2o_glm_default_init = utils_for_glm_hglm_tests.constraint_glm_gridsearch(train, predictors, response, solver="IRLSM", family="binomial", linear_constraints=linear_constraints2, init_optimal_glm=False, @@ -189,7 +189,7 @@ def test_tight_equality_linear_constraints_binomial(): return_best=False) default_init_logloss = h2o_glm_default_init.model_performance()._metric_json['logloss'] print("logloss with default GLM coefficient initializaiton: {0}, number of iterations taken to build the model: " - "{1}".format(default_init_logloss, utils_for_glm_tests.find_glm_iterations(h2o_glm_default_init))) + "{1}".format(default_init_logloss, utils_for_glm_hglm_tests.find_model_iterations(h2o_glm_default_init))) print(glm.getConstraintsInfo(h2o_glm_default_init)) print("All constraints satisfied: {0}".format(glm.allConstraintsPassed(h2o_glm_default_init))) @@ -208,7 +208,7 @@ def test_tight_equality_linear_constraints_binomial(): 0.4941250734508458, 0.5446841276322587, 0.19222703209695946, 0.9232239752817498, 0.8824688635063289, 0.224690851359456, 0.5809304720756304, 0.36863807988348585] - h2o_glm_random_init = utils_for_glm_tests.constraint_glm_gridsearch(train, predictors, response, solver="IRLSM", + h2o_glm_random_init = utils_for_glm_hglm_tests.constraint_glm_gridsearch(train, predictors, response, solver="IRLSM", family="binomial", 
linear_constraints=linear_constraints2, startval=random_coef, @@ -221,7 +221,7 @@ def test_tight_equality_linear_constraints_binomial(): return_best=False) random_init_logloss = h2o_glm_random_init.model_performance()._metric_json['logloss'] print("logloss with random GLM coefficient initializaiton: {0}, number of iterations taken to build the model: " - "{1}".format(random_init_logloss, utils_for_glm_tests.find_glm_iterations(h2o_glm_random_init))) + "{1}".format(random_init_logloss, utils_for_glm_hglm_tests.find_model_iterations(h2o_glm_random_init))) print(glm.getConstraintsInfo(h2o_glm_random_init)) print("All constraints satisfied: {0}".format(glm.allConstraintsPassed(h2o_glm_random_init))) diff --git a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_tight_linear_constraints_only_binomial.py b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_tight_linear_constraints_only_binomial.py index cc5b5385c8d3..f14b2f860907 100644 --- a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_tight_linear_constraints_only_binomial.py +++ b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_tight_linear_constraints_only_binomial.py @@ -1,7 +1,7 @@ import h2o from h2o.estimators.glm import H2OGeneralizedLinearEstimator as glm from tests import pyunit_utils -from tests.pyunit_utils import utils_for_glm_tests +from tests.pyunit_utils import utils_for_glm_hglm_tests def test_tight_linear_constraints_binomial(): ''' @@ -125,7 +125,7 @@ def test_tight_linear_constraints_binomial(): constraint_beta = [0.9] constraint_c0 = [10, 12] # initial value # GLM model with with GLM coefficients set to GLM model coefficients built without constraints - h2o_glm_optimal_init = utils_for_glm_tests.constraint_glm_gridsearch(train, predictors, response, solver="IRLSM", + h2o_glm_optimal_init = utils_for_glm_hglm_tests.constraint_glm_gridsearch(train, predictors, response, solver="IRLSM", family="binomial", linear_constraints=linear_constraints2, init_optimal_glm=True, @@ -137,11 +137,11 @@ def test_tight_linear_constraints_binomial(): return_best=False) optimal_init_logloss = h2o_glm_optimal_init.model_performance()._metric_json['logloss'] print("logloss with optimal GLM coefficient initializaiton: {0}, number of iterations taken to build the model: " - "{1}".format(optimal_init_logloss, utils_for_glm_tests.find_glm_iterations(h2o_glm_optimal_init))) + "{1}".format(optimal_init_logloss, utils_for_glm_hglm_tests.find_model_iterations(h2o_glm_optimal_init))) print(glm.getConstraintsInfo(h2o_glm_optimal_init)) print("All constraints satisfied: {0}".format(glm.allConstraintsPassed(h2o_glm_optimal_init))) - h2o_glm_default_init = utils_for_glm_tests.constraint_glm_gridsearch(train, predictors, response, solver="IRLSM", + h2o_glm_default_init = utils_for_glm_hglm_tests.constraint_glm_gridsearch(train, predictors, response, solver="IRLSM", family="binomial", linear_constraints=linear_constraints2, init_optimal_glm=False, @@ -153,7 +153,7 @@ def test_tight_linear_constraints_binomial(): return_best=False) default_init_logloss = h2o_glm_default_init.model_performance()._metric_json['logloss'] print("logloss with default GLM coefficient initializaiton: {0}, number of iterations taken to build the model: " - "{1}".format(default_init_logloss, utils_for_glm_tests.find_glm_iterations(h2o_glm_default_init))) + "{1}".format(default_init_logloss, utils_for_glm_hglm_tests.find_model_iterations(h2o_glm_default_init))) print(glm.getConstraintsInfo(h2o_glm_default_init)) print("All constraints satisfied: 
{0}".format(glm.allConstraintsPassed(h2o_glm_default_init))) @@ -172,7 +172,7 @@ def test_tight_linear_constraints_binomial(): 0.4941250734508458, 0.5446841276322587, 0.19222703209695946, 0.9232239752817498, 0.8824688635063289, 0.224690851359456, 0.5809304720756304, 0.36863807988348585] - h2o_glm_random_init = utils_for_glm_tests.constraint_glm_gridsearch(train, predictors, response, solver="IRLSM", + h2o_glm_random_init = utils_for_glm_hglm_tests.constraint_glm_gridsearch(train, predictors, response, solver="IRLSM", family="binomial", linear_constraints=linear_constraints2, startval=random_coef, @@ -185,7 +185,7 @@ def test_tight_linear_constraints_binomial(): return_best=False) random_init_logloss = h2o_glm_random_init.model_performance()._metric_json['logloss'] print("logloss with random GLM coefficient initializaiton: {0}, number of iterations taken to build the model: " - "{1}".format(random_init_logloss, utils_for_glm_tests.find_glm_iterations(h2o_glm_random_init))) + "{1}".format(random_init_logloss, utils_for_glm_hglm_tests.find_model_iterations(h2o_glm_random_init))) print(glm.getConstraintsInfo(h2o_glm_random_init)) print("All constraints satisfied: {0}".format(glm.allConstraintsPassed(h2o_glm_random_init))) diff --git a/h2o-py/tests/testdir_algos/glm/pyunit_PUBDEV_6876_HGLM_compare_R_large.py b/h2o-py/tests/testdir_algos/glm/pyunit_PUBDEV_6876_HGLM_compare_R_large.py deleted file mode 100644 index 4cf65e8d418b..000000000000 --- a/h2o-py/tests/testdir_algos/glm/pyunit_PUBDEV_6876_HGLM_compare_R_large.py +++ /dev/null @@ -1,34 +0,0 @@ -from builtins import range -import sys -sys.path.insert(1,"../../../") -import h2o -from tests import pyunit_utils -from h2o.estimators.glm import H2OGeneralizedLinearEstimator - -# in this test, I compare the results obtained from R run with H2O-3 runs using a much larger datasets to test -# multiple chunks operation. 
- -def test_HGLM_R(): - tot=1e-6 - h2o_data = h2o.import_file(path=pyunit_utils.locate("smalldata/glm_test/HGLM_5KRows_100Z.csv"), - col_types=["enum", "enum", "enum", "enum", "numeric", "numeric", "numeric", - "numeric"]) - y = "response" - x = ["enum1","enum2","enum3","num1","num2","num3"] - z = 0 - h2o_glm = H2OGeneralizedLinearEstimator(HGLM=True, family="gaussian", rand_family=["gaussian"], random_columns=[z], - calc_like=True) - h2o_glm.train(x=x, y=y, training_frame=h2o_data) - modelMetrics = h2o_glm.training_model_metrics() - rmodelMetrics = {"hlik":-23643.3076231, "caic":47019.7968491, "pvh":-23491.5738429, "pbvh": -23490.2982034, - "dfrefe":4953.0, "varfix":703.86912057} - - metricsNames = ["hlik", "caic", "pvh", "pbvh", "dfrefe", "varfix"] - for kNames in metricsNames: - assert abs(rmodelMetrics[kNames]-modelMetrics[kNames]) 2): + for ind in list(range(1, model_iterations)): + p_ind = ind-1 + assert scoring_history.cell_values[p_ind][3] <= scoring_history.cell_values[ind][3], \ + "training llg {0} from iteration {1} should be smaller than training llg {2} from iteration " \ + "{3}".format(scoring_history.cell_values[p_ind][3], p_ind, scoring_history.cell_values[ind][3], ind) + assert scoring_history_valid.cell_values[p_ind][3] <= scoring_history_valid.cell_values[ind][3], \ + "validation llg {0} from iteration {1} should be smaller than validation llg {2} from iteration " \ + "{3}".format(scoring_history_valid.cell_values[p_ind][3], p_ind, scoring_history_valid.cell_values[ind][3], ind) + +if __name__ == "__main__": + pyunit_utils.standalone_test(test_scoring_history_model_summary) +else: + test_scoring_history_model_summary() diff --git a/h2o-py/tests/testdir_algos/hglm/pyunit_GH_8487_2_noise_var_init_beta_ubeta_tmat.py b/h2o-py/tests/testdir_algos/hglm/pyunit_GH_8487_2_noise_var_init_beta_ubeta_tmat.py new file mode 100644 index 000000000000..06d973540281 --- /dev/null +++ b/h2o-py/tests/testdir_algos/hglm/pyunit_GH_8487_2_noise_var_init_beta_ubeta_tmat.py @@ -0,0 +1,65 @@ +import sys +sys.path.insert(1,"../../../") +import h2o +from tests import pyunit_utils +from h2o.estimators.hglm import H2OHGLMEstimator as hglm +from tests.pyunit_utils import utils_for_glm_hglm_tests + +# in this test, want to check the following with random intercept: +# 1.scoring history (both training and valid) +# 2. the model summary +# 3. Fixed effect coefficients, normal and standardized +# 4. icc +# 5. 
residual variance +def test_scoring_history_model_summary(): + h2o_data = h2o.import_file(path=pyunit_utils.locate("smalldata/hglm_test/gaussian_0GC_123R_all5Numeric_p2noise_p08T_woIntercept_standardize.gz")) + beta = [1.5606284972932365, -0.0002347762275008978, -0.007899880335654788, 0.0018421903682971376, + 0.6654323495890934, -0.6544609203736372] + ubeta = [[-0.9319187693195115, 0.6070501821727673, 0.8394540491750797], + [-1.3823145230494698, 0.21486874352840676, 0.8366860141888742], + [-0.552534049777237, 0.24577758770128783, 0.8172622402154629], + [-0.7632283839126288, 0.3662979940622124, 0.8382611342477616], + [-0.7660574987463035, 0.5278044590884986, 0.8421686869476276], + [-1.2704526364630178, 0.3882261064670864, 0.8626801006264753], + [-1.2615857701992563, 0.39167873788423885, 0.8448421359246485], + [-1.1863349889243804, 0.4802231651611951, 0.852783164270973]] + ubeta_init = h2o.H2OFrame(ubeta) + t_mat = [[1.1086713375915982, -0.40493787563311834, -0.8561132576680854], + [-0.40493787563311834, 0.17812207973788066, 0.33964543424526844], + [-0.8561132576680854, 0.33964543424526844, 0.709024192121366]] + t_mat_init = h2o.H2OFrame(t_mat) + y = "response" + x = h2o_data.names + x.remove("response") + x.remove("C1") + random_columns = ["C2", "C3", "C4"] + # hglm_model = hglm(random_columns=random_columns, group_column = "C1", score_each_iteration=True, seed=12345, + # max_iterations = 20, random_intercept = False) + hglm_model = hglm(random_columns = random_columns, group_column = "C1", seed = 12345, max_iterations = 0, + random_intercept = False, initial_fixed_effects = beta, initial_random_effects = ubeta_init, + initial_t_matrix = t_mat_init) + hglm_model.train(x=x, y=y, training_frame=h2o_data) + # check and make sure the fixed effect coeffs, random effect coeffs and matrix T from model should equal to the + # original initial values since we set max_iterations = 0 + beta_model = hglm_model.coef() + # compare intital beta + for index in range(4): + assert abs(beta[index]-beta_model[x[index]]) < 1e-6, \ + "fixed coefficients for {0} from model: {1}, from initialization: {2} should be the same but is " \ + "not.".format(x[index], beta_model[x[index]], beta[index]) + ubeta_model = hglm_model.coefs_random() + level_2_names = hglm_model.level_2_names() + for index in range(len(level_2_names)): + pyunit_utils.equal_two_arrays(ubeta[index], ubeta_model[level_2_names[index]]) + t_mat_model = hglm_model.matrix_T() + for index in range(len(t_mat_model)): + pyunit_utils.equal_two_arrays(t_mat[index], t_mat_model[index]) + + + + + +if __name__ == "__main__": + pyunit_utils.standalone_test(test_scoring_history_model_summary) +else: + test_scoring_history_model_summary() diff --git a/h2o-py/tests/testdir_algos/hglm/pyunit_GH_8487_2_noise_var_scoring_history_summary.py b/h2o-py/tests/testdir_algos/hglm/pyunit_GH_8487_2_noise_var_scoring_history_summary.py new file mode 100644 index 000000000000..0ff356a873bc --- /dev/null +++ b/h2o-py/tests/testdir_algos/hglm/pyunit_GH_8487_2_noise_var_scoring_history_summary.py @@ -0,0 +1,89 @@ +import sys +sys.path.insert(1,"../../../") +import h2o +from tests import pyunit_utils +from h2o.estimators.hglm import H2OHGLMEstimator as hglm +from tests.pyunit_utils import utils_for_glm_hglm_tests + +# in this test, want to check the following with random intercept: +# 1.scoring history (both training and valid) +# 2. the model summary +# 3. Fixed effect coefficients, normal and standardized +# 4. icc +# 5. 
residual variance +def test_scoring_history_model_summary(): + h2o_data = h2o.import_file(path=pyunit_utils.locate("smalldata/hglm_test/gaussian_0GC_123R_all5Numeric_p2noise_p08T_woIntercept_standardize.gz")) + train, valid = h2o_data.split_frame(ratios = [.8], seed = 1234) + y = "response" + x = h2o_data.names + x.remove("response") + x.remove("C1") + random_columns = ["C2", "C3", "C4"] + hglm_model = hglm(random_columns=random_columns, group_column = "C1", score_each_iteration=True, seed=12345, + max_iterations = 20, random_intercept = False) + hglm_model.train(x=x, y=y, training_frame=train, validation_frame=valid) + # grab various metrics + model_metrics = hglm_model.training_model_metrics() + scoring_history = hglm_model.scoring_history(as_data_frame=False) + scoring_history_valid = hglm_model.scoring_history_valid(as_data_frame=False) + model_summary = hglm_model.summary() + coef_random_names = hglm_model.coefs_random_names() + t_mat = hglm_model.matrix_T() + residual_var = hglm_model.residual_variance() + mse = hglm_model.mse() + mse_fixed = hglm_model.mean_residual_fixed() + mse_fixed_valid = hglm_model.mean_residual_fixed(train=False) + icc = hglm_model.icc() + level2_names = hglm_model.level_2_names() + + # check to make sure metrics/coefficients make sense + residual_var_true = 2.0 + assert abs(residual_var-residual_var_true) < 0.05, \ + "Expected variance: {1}, actual: {0}. The difference is too big.".format(residual_var, residual_var_true) + # residual error taking account into only fixed effect coefficients should be greater than mse, mse_valid + assert mse < mse_fixed, "residual error with only fixed effects {0} should exceed that of mse {1} but is" \ + " not.".format(mse_fixed, mse) + assert mse < mse_fixed_valid, "residual error with only fixed effects from validation frames {0} should exceed that" \ + " of mse {1} but is not.".format(mse_fixed_valid, mse) + # make sure level 2 values are captured correctly + group2_value = train["C1"].unique() + utils_for_glm_hglm_tests.compare_list_h2o_frame(level2_names, group2_value, "C1.") + # assert icc is calculated correctly. 
+ assert len(t_mat) == len(coef_random_names), "expected T matrix size: {0}, actual: {1} and they are not " \ + "equal.".format(len(coef_random_names), len(t_mat)) + utils_for_glm_hglm_tests.check_icc_calculation(t_mat, residual_var, icc) + # check model summary and model metrics if contain the same information should equal to each other + model_iterations = model_metrics["iterations"] + assert model_iterations == model_summary.cell_values[0][1], \ + "model metrics iterations {0} should equal model_summary iterations {1}".format(model_iterations, model_summary.cell_values[0][1]) + last_mse = model_metrics["MSE"] + assert abs(last_mse - model_summary.cell_values[0][3]) < 1e-6, \ + "model metrics MSE {0} should equal to model summary MSE {1}.".format(last_mse, model_summary.cell_values[0][3]) + last_llg = model_metrics["log_likelihood"] + assert abs(last_llg - model_summary.cell_values[0][2]) < 1e-6,\ + "model metrics llg {0} should equal to model summary llg {1}.".format(last_llg, model_summary.cell_values[0][2]) + # check scoring history last entry with model metric values + assert len(scoring_history.cell_values) == model_iterations, \ + "length of scoring history {0} should equal to number of model iterations {1}".format(len(scoring_history.cell_values), model_iterations) + last_sc_index = model_iterations-1 + assert abs(scoring_history.cell_values[last_sc_index][3] - last_llg) < 1e-6, \ + "last scoring history llg {0} should equal to model metrics llg {1}".format(scoring_history.cell_values[last_sc_index][3], last_llg) + assert abs(scoring_history.cell_values[last_sc_index][4] - last_mse) < 1e-6, \ + "last scoring history MSE {0} should equal to model metrics MSE {1}.".format(scoring_history.cell_values[last_sc_index][4], last_mse) + # check and make sure the llg from training and validation frame should be increasing in values + # this is only true when the true residual variance is high. 
For low true residual variance, it is only + # true for the last few iterations when the residual variance estimate is close to the true residual variance + if (residual_var_true > 2): + for ind in list(range(1, model_iterations)): + p_ind = ind-1 + assert scoring_history.cell_values[p_ind][3] <= scoring_history.cell_values[ind][3], \ + "training llg {0} from iteration {1} should be smaller than training llg {2} from iteration " \ + "{3}".format(scoring_history.cell_values[p_ind][3], p_ind, scoring_history.cell_values[ind][3], ind) + assert scoring_history_valid.cell_values[p_ind][3] <= scoring_history_valid.cell_values[ind][3], \ + "validation llg {0} from iteration {1} should be smaller than validation llg {2} from iteration " \ + "{3}".format(scoring_history_valid.cell_values[p_ind][3], p_ind, scoring_history_valid.cell_values[ind][3], ind) + +if __name__ == "__main__": + pyunit_utils.standalone_test(test_scoring_history_model_summary) +else: + test_scoring_history_model_summary() diff --git a/h2o-py/tests/testdir_algos/hglm/pyunit_GH_8487_3_noise_variance_random_intercept_only.py b/h2o-py/tests/testdir_algos/hglm/pyunit_GH_8487_3_noise_variance_random_intercept_only.py new file mode 100644 index 000000000000..c60e1125f832 --- /dev/null +++ b/h2o-py/tests/testdir_algos/hglm/pyunit_GH_8487_3_noise_variance_random_intercept_only.py @@ -0,0 +1,33 @@ +import sys +sys.path.insert(1,"../../../") +import h2o +from tests import pyunit_utils +from h2o.estimators.hglm import H2OHGLMEstimator as hglm +from tests.pyunit_utils import utils_for_glm_hglm_tests + +# Test that model built with random intercept work properly +def test_model_with_random_intercept_only(): + h2o_data = h2o.import_file(path=pyunit_utils.locate("smalldata/hglm_test/gaussian_0GC_allRC_2enum2numeric_3noise_p08T_wIntercept_standardize.gz")) + train, valid = h2o_data.split_frame(ratios = [.8], seed = 1234) + y = "response" + x = h2o_data.names + x.remove("response") + x.remove("C1") + random_columns = ["C2", "C3", "C10", "C20"] + hglm_model = hglm(random_columns=random_columns, group_column = "C1", score_each_iteration=True, seed=12345, + random_intercept = True, max_iterations=10) + hglm_model.train(x=x, y=y, training_frame=train, validation_frame=valid) + hglm_model_random_intercept = hglm(group_column = "C1", score_each_iteration=True, seed=12345, + random_intercept = True, max_iterations=10) + hglm_model_random_intercept.train(x=x, y=y, training_frame=train, validation_frame=valid) + mse = hglm_model.mse() + mse_random_intercept = hglm_model_random_intercept.mse() + + # check to make sure metrics/coefficients make sense + assert mse < mse_random_intercept, "MSE {0} with random_columns should be lower than model built with random " \ + "intercept only MSE {1}".format(mse, mse_random_intercept) + +if __name__ == "__main__": + pyunit_utils.standalone_test(test_model_with_random_intercept_only) +else: + test_model_with_random_intercept_only() diff --git a/h2o-py/tests/testdir_algos/hglm/pyunit_GH_8487_3_noise_variance_scoring_history_summary.py b/h2o-py/tests/testdir_algos/hglm/pyunit_GH_8487_3_noise_variance_scoring_history_summary.py new file mode 100644 index 000000000000..552fc2750103 --- /dev/null +++ b/h2o-py/tests/testdir_algos/hglm/pyunit_GH_8487_3_noise_variance_scoring_history_summary.py @@ -0,0 +1,86 @@ +import sys +sys.path.insert(1,"../../../") +import h2o +from tests import pyunit_utils +from h2o.estimators.hglm import H2OHGLMEstimator as hglm +from tests.pyunit_utils import utils_for_glm_hglm_tests + +# in this 
test, want to check the following with random intercept: +# 1. scoring history (both training and valid) +# 2. the model summary +# 3. Fixed effect coefficients, normal and standardized +# 4. icc +# 5. residual variance +def test_scoring_history_model_summary(): + h2o_data = h2o.import_file(path=pyunit_utils.locate("smalldata/hglm_test/gaussian_0GC_allRC_2enum2numeric_3noise_p08T_wIntercept_standardize.gz")) + train, valid = h2o_data.split_frame(ratios = [.8], seed = 1234) + y = "response" + x = h2o_data.names + x.remove("response") + x.remove("C1") + random_columns = ["C2", "C3", "C10", "C20"] + hglm_model = hglm(random_columns=random_columns, group_column = "C1", score_each_iteration=True, seed=12345, + random_intercept = True, max_iterations=10) + hglm_model.train(x=x, y=y, training_frame=train, validation_frame=valid) + print(hglm_model) # make sure this one works. + # grab various metrics + model_metrics = hglm_model.training_model_metrics() + scoring_history = hglm_model.scoring_history(as_data_frame=False) + scoring_history_valid = hglm_model.scoring_history_valid(as_data_frame=False) + model_summary = hglm_model.summary() + coef_random_names = hglm_model.coefs_random_names() + t_mat = hglm_model.matrix_T() + residual_var = hglm_model.residual_variance() + mse = hglm_model.mse() + mse_fixed = hglm_model.mean_residual_fixed() + mse_fixed_valid = hglm_model.mean_residual_fixed(train=False) + icc = hglm_model.icc() + level2_names = hglm_model.level_2_names() + # check to make sure metrics/coefficients make sense + residual_var_true = 3.0 + assert abs(residual_var-residual_var_true) < 0.05, \ + "Expected variance: {1}, actual: {0}. The difference is too big.".format(residual_var, residual_var_true) + # residual error taking into account only fixed effect coefficients should be greater than mse, mse_valid + assert mse < mse_fixed, "residual error with only fixed effects {0} should exceed that of mse {1} but is" \ + " not.".format(mse_fixed, mse) + assert mse < mse_fixed_valid, "residual error with only fixed effects from validation frames {0} should exceed that" \ + " of mse {1} but is not.".format(mse_fixed_valid, mse) + # make sure level 2 values are captured correctly + group2_value = train["C1"].unique() + utils_for_glm_hglm_tests.compare_list_h2o_frame(level2_names, group2_value, "C1.") + # assert icc is calculated correctly. 
+ assert len(t_mat) == len(coef_random_names), "expected T matrix size: {0}, actual: {1} and they are not " \ + "equal.".format(len(coef_random_names), len(t_mat)) + utils_for_glm_hglm_tests.check_icc_calculation(t_mat, residual_var, icc) + # check model summary and model metrics if contain the same information should equal to each other + model_iterations = model_metrics["iterations"] + assert model_iterations == model_summary.cell_values[0][1], \ + "model metrics iterations {0} should equal model_summary iterations {1}".format(model_iterations, model_summary.cell_values[0][1]) + last_mse = model_metrics["MSE"] + assert abs(last_mse - model_summary.cell_values[0][3]) < 1e-6, \ + "model metrics MSE {0} should equal to model summary MSE {1}.".format(last_mse, model_summary.cell_values[0][3]) + last_llg = model_metrics["log_likelihood"] + assert abs(last_llg - model_summary.cell_values[0][2]) < 1e-6,\ + "model metrics llg {0} should equal to model summary llg {1}.".format(last_llg, model_summary.cell_values[0][2]) + # check scoring history last entry with model metric values + assert len(scoring_history.cell_values) == model_iterations, \ + "length of scoring history {0} should equal to number of model iterations {1}".format(len(scoring_history.cell_values), model_iterations) + last_sc_index = model_iterations-1 + assert abs(scoring_history.cell_values[last_sc_index][3] - last_llg) < 1e-6, \ + "last scoring history llg {0} should equal to model metrics llg {1}".format(scoring_history.cell_values[last_sc_index][3], last_llg) + assert abs(scoring_history.cell_values[last_sc_index][4] - last_mse) < 1e-6, \ + "last scoring history MSE {0} should equal to model metrics MSE {1}.".format(scoring_history.cell_values[last_sc_index][4], last_mse) + # check and make sure the llg from training and validation frame should be increasing in values + for ind in list(range(1, model_iterations)): + p_ind = ind-1 + assert scoring_history.cell_values[p_ind][3] <= scoring_history.cell_values[ind][3], \ + "training llg {0} from iteration {1} should be smaller than training llg {2} from iteration " \ + "{3}".format(scoring_history.cell_values[p_ind][3], p_ind, scoring_history.cell_values[ind][3], ind) + assert scoring_history_valid.cell_values[p_ind][3] <= scoring_history_valid.cell_values[ind][3], \ + "validation llg {0} from iteration {1} should be smaller than validation llg {2} from iteration " \ + "{3}".format(scoring_history_valid.cell_values[p_ind][3], p_ind, scoring_history_valid.cell_values[ind][3], ind) + +if __name__ == "__main__": + pyunit_utils.standalone_test(test_scoring_history_model_summary) +else: + test_scoring_history_model_summary() diff --git a/h2o-py/tests/testdir_algos/hglm/pyunit_GH_8487_coefficients_check.py b/h2o-py/tests/testdir_algos/hglm/pyunit_GH_8487_coefficients_check.py new file mode 100644 index 000000000000..f5f68353b02c --- /dev/null +++ b/h2o-py/tests/testdir_algos/hglm/pyunit_GH_8487_coefficients_check.py @@ -0,0 +1,50 @@ +import sys +sys.path.insert(1,"../../../") +import h2o +from tests import pyunit_utils +from h2o.estimators.hglm import H2OHGLMEstimator as hglm +from tests.pyunit_utils import utils_for_glm_hglm_tests + +# in this test, want to check to make sure we are getting our coefficients +# 1. Fixed effect coefficients; +# 2. Random effect coefficients. 
+def test_scoring_history_model_summary(): + h2o_data = h2o.import_file(path=pyunit_utils.locate("smalldata/hglm_test/gaussian_0GC_allRC_2enum2numeric_3noise_p08T_wIntercept_standardize.gz")) + train, valid = h2o_data.split_frame(ratios = [.8], seed = 1234) + y = "response" + x = h2o_data.names + x.remove("response") + x.remove("C1") + random_columns = ["C2", "C3", "C10", "C20"] + hglm_model = hglm(random_columns=random_columns, group_column = "C1", score_each_iteration=True, seed=12345, + max_iterations=10) + hglm_model.train(x=x, y=y, training_frame=train, validation_frame=valid) + # grab various metrics + coef = hglm_model.coef() + coef_random = hglm_model.coefs_random() + coef_random_names = hglm_model.coefs_random_names() + residual_var = hglm_model.residual_variance() + mse = hglm_model.mse() + mse_fixed = hglm_model.mean_residual_fixed() + mse_fixed_valid = hglm_model.mean_residual_fixed(train=False) + level2_names = hglm_model.level_2_names() + # check to make sure metrics/coefficients make sense + + true_residual_var = 3.0 + assert abs(residual_var-true_residual_var) < 5.0e-2, \ + "Expected variance: {1}, actual: {0}. The difference is too big.".format(residual_var, true_residual_var) + # residual error taking account into only fixed effect coefficients should be greater than mse, mse_valid + assert mse < mse_fixed, "residual error with only fixed effects {0} should exceed that of mse {1} but is" \ + " not.".format(mse_fixed, mse) + assert mse < mse_fixed_valid, "residual error with only fixed effects from validation frames {0} should exceed that" \ + " of mse {1} but is not.".format(mse_fixed_valid, mse) + assert len(coef) == len(coef_random_names), "fixed coefficient length {0} should equal to random coefficient names" \ + " length: {1}".format(len(coef), len(coef_random_names)) + assert len(level2_names) == len(coef_random), \ + "expected random coefficient length: {0}, actual random coefficient names length " \ + "{1}".format(len(level2_names),len(coef_random)) + +if __name__ == "__main__": + pyunit_utils.standalone_test(test_scoring_history_model_summary) +else: + test_scoring_history_model_summary() diff --git a/h2o-py/tests/testdir_algos/hglm/pyunit_GH_8487_p05_noise_variance_scoring_history_summary.py b/h2o-py/tests/testdir_algos/hglm/pyunit_GH_8487_p05_noise_variance_scoring_history_summary.py new file mode 100644 index 000000000000..c23f2128cc5b --- /dev/null +++ b/h2o-py/tests/testdir_algos/hglm/pyunit_GH_8487_p05_noise_variance_scoring_history_summary.py @@ -0,0 +1,89 @@ +import sys +sys.path.insert(1,"../../../") +import h2o +from tests import pyunit_utils +from h2o.estimators.hglm import H2OHGLMEstimator as hglm +from tests.pyunit_utils import utils_for_glm_hglm_tests + +# in this test, want to check the following with standardization and with random intercept: +# 1.scoring history (both training and valid) +# 2. the model summary +# 3. Fixed effect coefficients, normal and standardized +# 4. icc +# 5. 
residual variance
+def test_scoring_history_model_summary():
+    h2o_data = h2o.import_file(path=pyunit_utils.locate("smalldata/hglm_test/gaussian_0GC_678R_6enum_5num_p05oise_p08T_woIntercept_standardize.gz"))
+    train, valid = h2o_data.split_frame(ratios = [.8], seed = 1234)
+    y = "response"
+    x = h2o_data.names
+    x.remove("response")
+    x.remove("C1")
+    random_columns = ["C10", "C20", "C30"]
+    hglm_model = hglm(random_columns=random_columns, group_column="C1", score_each_iteration=True, seed=12345,
+                      random_intercept=False, max_iterations=10, em_epsilon=0.000001)
+    hglm_model.train(x=x, y=y, training_frame=train, validation_frame=valid)
+    print(hglm_model) # make sure this one works.
+    # grab various metrics
+    model_metrics = hglm_model.training_model_metrics()
+    scoring_history = hglm_model.scoring_history(as_data_frame=False)
+    scoring_history_valid = hglm_model.scoring_history_valid(as_data_frame=False)
+    model_summary = hglm_model.summary()
+    coef_random_names = hglm_model.coefs_random_names()
+    t_mat = hglm_model.matrix_T()
+    residual_var = hglm_model.residual_variance()
+    mse = hglm_model.mse()
+    mse_fixed = hglm_model.mean_residual_fixed()
+    mse_fixed_valid = hglm_model.mean_residual_fixed(train=False)
+    icc = hglm_model.icc()
+    level2_names = hglm_model.level_2_names()
+    # check to make sure metrics/coefficients make sense
+    residual_var_true = 0.05
+    assert abs(residual_var-residual_var_true) < 0.05, \
+        "Expected variance: {1}, actual: {0}. The difference is too big.".format(residual_var, residual_var_true)
+    # the residual error computed from only the fixed effect coefficients should be greater than mse, mse_valid
+    assert mse < mse_fixed, "residual error with only fixed effects {0} should exceed mse {1} but does" \
+                            " not.".format(mse_fixed, mse)
+    assert mse < mse_fixed_valid, "residual error with only fixed effects from the validation frame {0} should exceed" \
+                                  " mse {1} but does not.".format(mse_fixed_valid, mse)
+    # make sure level 2 values are captured correctly
+    group2_value = train["C1"].unique()
+    utils_for_glm_hglm_tests.compare_list_h2o_frame(level2_names, group2_value, "C1.")
+    # assert icc is calculated correctly.
+ assert len(t_mat) == len(coef_random_names), "expected T matrix size: {0}, actual: {1} and they are not " \ + "equal.".format(len(coef_random_names), len(t_mat)) + utils_for_glm_hglm_tests.check_icc_calculation(t_mat, residual_var, icc) + # check model summary and model metrics if contain the same information should equal to each other + model_iterations = model_metrics["iterations"] + assert model_iterations == model_summary.cell_values[0][1], \ + "model metrics iterations {0} should equal model_summary iterations {1}".format(model_iterations, model_summary.cell_values[0][1]) + last_mse = model_metrics["MSE"] + assert abs(last_mse - model_summary.cell_values[0][3]) < 1e-6, \ + "model metrics MSE {0} should equal to model summary MSE {1}.".format(last_mse, model_summary.cell_values[0][3]) + last_llg = model_metrics["log_likelihood"] + assert abs(last_llg - model_summary.cell_values[0][2]) < 1e-6,\ + "model metrics llg {0} should equal to model summary llg {1}.".format(last_llg, model_summary.cell_values[0][2]) + # check scoring history last entry with model metric values + assert len(scoring_history.cell_values) == model_iterations, \ + "length of scoring history {0} should equal to number of model iterations {1}".format(len(scoring_history.cell_values), model_iterations) + last_sc_index = model_iterations-1 + assert abs(scoring_history.cell_values[last_sc_index][3] - last_llg) < 1e-6, \ + "last scoring history llg {0} should equal to model metrics llg {1}".format(scoring_history.cell_values[last_sc_index][3], last_llg) + assert abs(scoring_history.cell_values[last_sc_index][4] - last_mse) < 1e-6, \ + "last scoring history MSE {0} should equal to model metrics MSE {1}.".format(scoring_history.cell_values[last_sc_index][4], last_mse) + # check and make sure the llg from training and validation frame should be increasing in values + # this is only true when the true residual variance is high. 
For low true residual variance, it is only + # true for the last few iterations when the residual variance estimate is close to the true residual variance + if (residual_var_true >= 2): + for ind in list(range(1, model_iterations)): + p_ind = ind-1 + assert scoring_history.cell_values[p_ind][3] <= scoring_history.cell_values[ind][3], \ + "training llg {0} from iteration {1} should be smaller than training llg {2} from iteration " \ + "{3}".format(scoring_history.cell_values[p_ind][3], p_ind, scoring_history.cell_values[ind][3], ind) + assert scoring_history_valid.cell_values[p_ind][3] <= scoring_history_valid.cell_values[ind][3], \ + "validation llg {0} from iteration {1} should be smaller than validation llg {2} from iteration " \ + "{3}".format(scoring_history_valid.cell_values[p_ind][3], p_ind, scoring_history_valid.cell_values[ind][3], ind) + +if __name__ == "__main__": + pyunit_utils.standalone_test(test_scoring_history_model_summary) +else: + test_scoring_history_model_summary() diff --git a/h2o-py/tests/testdir_algos/hglm/pyunit_GH_8487_p5_noise_var_scoring_history_summary.py b/h2o-py/tests/testdir_algos/hglm/pyunit_GH_8487_p5_noise_var_scoring_history_summary.py new file mode 100644 index 000000000000..0aa576ac51bc --- /dev/null +++ b/h2o-py/tests/testdir_algos/hglm/pyunit_GH_8487_p5_noise_var_scoring_history_summary.py @@ -0,0 +1,89 @@ +import sys +sys.path.insert(1,"../../../") +import h2o +from tests import pyunit_utils +from h2o.estimators.hglm import H2OHGLMEstimator as hglm +from tests.pyunit_utils import utils_for_glm_hglm_tests + +# in this test, want to check the following with standardization and with random intercept: +# 1.scoring history (both training and valid) +# 2. the model summary +# 3. Fixed effect coefficients, normal and standardized +# 4. icc +# 5. residual variance +def test_scoring_history_model_summary(): + h2o_data = h2o.import_file(path=pyunit_utils.locate("smalldata/hglm_test/gaussian_0GC_123R_all5Enum_p5oise_p08T_wIntercept_standardize.gz")) + train, valid = h2o_data.split_frame(ratios = [.8], seed = 1234) + y = "response" + x = h2o_data.names + x.remove("response") + x.remove("C1") + random_columns = ["C2", "C3", "C4"] + hglm_model = hglm(random_columns=random_columns, group_column = "C1", score_each_iteration=True, seed=12345, + max_iterations = 10, random_intercept = True) + hglm_model.train(x=x, y=y, training_frame=train, validation_frame=valid) + # grab various metrics + model_metrics = hglm_model.training_model_metrics() + scoring_history = hglm_model.scoring_history(as_data_frame=False) + scoring_history_valid = hglm_model.scoring_history_valid(as_data_frame=False) + model_summary = hglm_model.summary() + coef_random_names = hglm_model.coefs_random_names() + t_mat = hglm_model.matrix_T() + residual_var = hglm_model.residual_variance() + mse = hglm_model.mse() + mse_fixed = hglm_model.mean_residual_fixed() + mse_fixed_valid = hglm_model.mean_residual_fixed(train=False) + icc = hglm_model.icc() + level2_names = hglm_model.level_2_names() + residual_var_true = 0.5 + + # check to make sure metrics/coefficients make sense + assert abs(residual_var-residual_var_true) < 0.05, \ + "Expected variance: {1}, actual: {0}. 
The difference is too big.".format(residual_var, residual_var_true) + # residual error taking account into only fixed effect coefficients should be greater than mse, mse_valid + assert mse < mse_fixed, "residual error with only fixed effects {0} should exceed that of mse {1} but is" \ + " not.".format(mse_fixed, mse) + assert mse < mse_fixed_valid, "residual error with only fixed effects from validation frames {0} should exceed that" \ + " of mse {1} but is not.".format(mse_fixed_valid, mse) + # make sure level 2 values are captured correctly + group2_value = train["C1"].unique() + utils_for_glm_hglm_tests.compare_list_h2o_frame(level2_names, group2_value, "C1.") + # assert icc is calculated correctly. + assert len(t_mat) == len(coef_random_names), "expected T matrix size: {0}, actual: {1} and they are not " \ + "equal.".format(len(coef_random_names), len(t_mat)) + utils_for_glm_hglm_tests.check_icc_calculation(t_mat, residual_var, icc) + # check model summary and model metrics if contain the same information should equal to each other + model_iterations = model_metrics["iterations"] + assert model_iterations == model_summary.cell_values[0][1], \ + "model metrics iterations {0} should equal model_summary iterations {1}".format(model_iterations, model_summary.cell_values[0][1]) + last_mse = model_metrics["MSE"] + assert abs(last_mse - model_summary.cell_values[0][3]) < 1e-6, \ + "model metrics MSE {0} should equal to model summary MSE {1}.".format(last_mse, model_summary.cell_values[0][3]) + last_llg = model_metrics["log_likelihood"] + assert abs(last_llg - model_summary.cell_values[0][2]) < 1e-6,\ + "model metrics llg {0} should equal to model summary llg {1}.".format(last_llg, model_summary.cell_values[0][2]) + # check scoring history last entry with model metric values + assert len(scoring_history.cell_values) == model_iterations, \ + "length of scoring history {0} should equal to number of model iterations {1}".format(len(scoring_history.cell_values), model_iterations) + last_sc_index = model_iterations-1 + assert abs(scoring_history.cell_values[last_sc_index][3] - last_llg) < 1e-6, \ + "last scoring history llg {0} should equal to model metrics llg {1}".format(scoring_history.cell_values[last_sc_index][3], last_llg) + assert abs(scoring_history.cell_values[last_sc_index][4] - last_mse) < 1e-6, \ + "last scoring history MSE {0} should equal to model metrics MSE {1}.".format(scoring_history.cell_values[last_sc_index][4], last_mse) + # check and make sure the llg from training and validation frame should be increasing in values + # this is only true when the true residual variance is high. 
For low true residual variance, it is only + # true for the last few iterations when the residual variance estimate is close to the true residual variance + if (residual_var_true > 2): + for ind in list(range(1, model_iterations)): + p_ind = ind-1 + assert scoring_history.cell_values[p_ind][3] <= scoring_history.cell_values[ind][3], \ + "training llg {0} from iteration {1} should be smaller than training llg {2} from iteration " \ + "{3}".format(scoring_history.cell_values[p_ind][3], p_ind, scoring_history.cell_values[ind][3], ind) + assert scoring_history_valid.cell_values[p_ind][3] <= scoring_history_valid.cell_values[ind][3], \ + "validation llg {0} from iteration {1} should be smaller than validation llg {2} from iteration " \ + "{3}".format(scoring_history_valid.cell_values[p_ind][3], p_ind, scoring_history_valid.cell_values[ind][3], ind) + +if __name__ == "__main__": + pyunit_utils.standalone_test(test_scoring_history_model_summary) +else: + test_scoring_history_model_summary() diff --git a/h2o-py/tests/testdir_algos/hglm/pyunit_generate_synthetic_HGLM_data.py b/h2o-py/tests/testdir_algos/hglm/pyunit_generate_synthetic_HGLM_data.py new file mode 100644 index 000000000000..f03604c518e0 --- /dev/null +++ b/h2o-py/tests/testdir_algos/hglm/pyunit_generate_synthetic_HGLM_data.py @@ -0,0 +1,106 @@ +import sys +sys.path.insert(1,"../../../") +import h2o +from tests import pyunit_utils +from h2o.estimators.hglm import H2OHGLMEstimator as hglm + +# This test will generate synthetic HGLM dataset. If given to a HGLM model, it should be able to perform well with +# this dataset since the assumptions associated with HGLM are used to generate the dataset. However, pay attention +# to the data types and you may have to cast enum columns to factors manually since during the save, column types +# information may be lost. 
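The comments continuing below describe how to persist the generated frame together with its column types so that enum columns survive the save/load round trip. A minimal sketch of that workflow, using placeholder file names and a stand-in frame, could look like this:

```python
import numpy as np
import h2o

h2o.init()
# Stand-in for the synthetic frame produced below; any H2OFrame works for the round trip.
hglm_dataset = h2o.H2OFrame({"C2": ["a", "b", "a"], "response": [1.2, 0.7, 3.4]})
hglm_dataset["C2"] = hglm_dataset["C2"].asfactor()

h2o.download_csv(hglm_dataset, "hglm_synthetic.csv")      # save the data
np.save("hglm_synthetic_types.npy", hglm_dataset.types)   # save the column types (dict: name -> type)

# Later, restore the column types explicitly so enum columns stay enums after re-import.
types_dict = np.load("hglm_synthetic_types.npy", allow_pickle=True).item()
train = h2o.import_file("hglm_synthetic.csv", col_types=types_dict)
```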
+# +# Apart from saving the dataset using h2o.download_csv, remember to save the column types as +# np.save('my_file.npy', dictionary) np.save('my_file.npy', varDict) +# +# when you want to load the dataset, remember to load the types dictionary as +# types_dict = np.load('my_file.npy',allow_pickle='TRUE').item() +# +# then load your synthetic dataset specifying the column type as +# train = h2o.import_file("mydata.csv", col_types=types_dict) +def test_define_dataset(): + family = 'gaussian' # can be any valid GLM families + nrow = 40000 + nenum = 3 + nreal = 3 # last one is the response + # to generate data in hglm_test/gaussian_0GC_123R_all5Numeric_p2noise_p08T_wIntercept_standardize.csv, 1 cat, 5 numeric + # 1 response, seed = 12345 + # startval = [1.9011867, -1.2616812, 0.4293167, 0.9802659, 0.7680827, -0.6359531] + + # gaussian_0GC_123R_all5Numeric_p2noise_p08T_woIntercept_standardize.csv + + # to generate data in hglm_test/gaussian_0GC_123R_all5Enum_p5oise_p08T_wIntercept_standardize.csv, 6 cat, 0 numeric + # 1 response, seed = 12345 + # startval = [0.7906251, 1.8005780, -3.5665564, -0.8804172, -1.5809320, 1.5188019, -1.6089287, 1.7509011, + # -0.5286826, -1.1203812, -2.3159930, 0.1674759, -0.9065857, -0.7587694, -0.8578529, 0.3007900, + # 1.5765745, 1.1725489, -0.6935900, -1.1467158, 1.3960304, -1.7078175, -2.8960526, 0.9847858, + # -1.0951275, 0.1393349, -0.6782085, 3.3711444, -2.0059428, 1.3293327, -0.5083064, 2.7324153, + # 0.2036385, -1.6967069, 0.699569, -0.4288891] + # hglm_test/gaussian_0GC_123R_all5Enum_p5oise_p08T_woIntercept_standardize.csv + + # to generate data in hglm_test/gaussian_0GC_123R_6enum_5num_1p5oise_p08T_wIntercept_standardize.csv, seed=12345, + # startval = [3.93013069, 0.54472937, 1.00317237, 0.45930296, 2.41925257, -3.09530556, -3.56112954, 1.63825546, + # -0.09974517, 0.09546386, -0.67192248, -0.71572626, 0.78566524, -0.58579001, -1.91637762, 0.85650108, + # 0.91881537, 2.35773321, -0.20756380, 0.40147277, -1.10384921, 0.75619311, -0.57409532, 1.44300300, + # 2.01180669, -1.90782107,-0.41173998, -0.50159384, 1.22944372, -1.18281946, -2.96645841, 2.14272813, + # -0.32555483, -1.00719124, 0.74755600, 1.09900559, 2.30948122, 1.23596162, 0.66856774, -2.56878032, + # 0.05599762] + # hglm_test/gaussian_0GC_123R_6enum_5num_1p5oise_p08T_woIntercept_standardize.gz + # hglm_test/gaussian_0GC_678R_6enum_5num_p05oise_p08T_wIntercept_standardize + # hglm_test/gaussian_0GC_678R_6enum_5num_p05oise_p08T_woIntercept_standardize + # hglm_test/gaussian_0GC_1267R_6enum_5num_p08oise_p08T_wIntercept_standardize + # hglm_test/gaussian_0GC_1267R_6enum_5num_p08oise_p08T_woIntercept_standardize + + + # hglm_test/gaussian_0GC_allenum_allRC_2p5noise_p08T_wIntercept_standardize + #startval = [1.10825995, -0.37625500, 0.01522888, -2.33646889, -1.39787749, 0.10817416, -0.48015694, 2.47842056, + # -3.45931533, 0.25396556, -2.52770259, 0.96282659, -2.40216594, -2.79117384, -2.21220306] + # hglm_test/gaussian_0GC_allnumeric_allRC_2p5noise_p08T_woIntercept_standardize + #startval = [-0.9414337, -2.0222721, -2.4312540] + + # hglm_test/gaussian_0GC_allRC_2enum2numeric_3noise_p08T_wIntercept_standardize + startval = [-1.4313612, 0.6795744, 1.9795154, -3.1187255, 0.2058840, -1.6596187, 0.3460812, -0.7809777, + 1.6617960, -0.5174034, 1.8273497, -2.4161541, 0.9474324, 2.3616221, 0.7710148, 0.2706556, 1.0541668] + # hglm_test/gaussian_0GC_allRC_2enum2numeric_3noise_p08T_woIntercept_standardize + enum_columns = pyunit_utils.random_dataset_enums_only(nrow, nenum, factorL=8, misFrac=0.0) + 
real_columns = pyunit_utils.random_dataset_real_only(nrow, nreal, realR = 2, misFrac=0.0) + dataset = enum_columns.cbind(real_columns) + dataset.set_name(dataset.ncol-1, "response") + cnames = dataset.names + group_column=cnames[0] + random_intercept = False + vare = 3 + varu = 0.08 + random_columns = [cnames[1], cnames[2], cnames[3], cnames[4]] + hglmDataSet = generate_dataset(family, dataset, group_column, random_columns, startval, random_intercept, + vare, varu) + print("Done!") + #h2o.download_csv(hglmDataSet, "/Users/wendycwong/temp/dataset.csv") # save dataset + + +def generate_dataset(family, trainData, group_column, random_columns, startval, random_intercept, vare, varu): + myX = trainData.names + myY = 'response' + myX.remove(myY) + myX.remove(group_column) + + names_without_response = trainData.names + names_without_response.remove(myY) + + m = hglm(family=family, max_iterations=0, random_columns=random_columns, group_column=group_column, + tau_u_var_init = varu, tau_e_var_init = vare, random_intercept = random_intercept, gen_syn_data=True, + seed = 12345, initial_fixed_effects=startval) + m.train(training_frame=trainData, y = "response", x =myX) + f2 = m.predict(trainData) + finalDataset = trainData[names_without_response] + finalDataset = finalDataset.cbind(f2[0]) + finalDataset.set_name(col=finalDataset.ncols-1, name='response') + + return finalDataset + + + + +if __name__ == "__main__": + pyunit_utils.standalone_test(test_define_dataset) +else: + test_define_dataset() diff --git a/h2o-py/tests/testdir_misc/explain/pyunit_explain.py b/h2o-py/tests/testdir_misc/explain/pyunit_explain.py index cdcd8a690b22..b580cb96badb 100644 --- a/h2o-py/tests/testdir_misc/explain/pyunit_explain.py +++ b/h2o-py/tests/testdir_misc/explain/pyunit_explain.py @@ -583,22 +583,6 @@ def test_learning_curve_for_algos_not_present_in_automl(): assert isinstance(glm_model.learning_curve_plot().figure(), matplotlib.pyplot.Figure) matplotlib.pyplot.close() - # HGLM - h2o_data = h2o.import_file(pyunit_utils.locate("smalldata/glm_test/semiconductor.csv")) - y = "y" - x = ["x1", "x3", "x5", "x6"] - z = 0 - h2o_data["Device"] = h2o_data["Device"].asfactor() - hglm_model = H2OGeneralizedLinearEstimator(HGLM=True, - family="gaussian", - rand_family=["gaussian"], - random_columns=[z], - rand_link=["identity"], - calc_like=True) - hglm_model.train(x=x, y=y, training_frame=h2o_data) - assert isinstance(hglm_model.learning_curve_plot().figure(), matplotlib.pyplot.Figure) - matplotlib.pyplot.close() - # GAM knots1 = [-1.99905699, -0.98143075, 0.02599159, 1.00770987, 1.99942290] frameKnots1 = h2o.H2OFrame(python_obj=knots1) diff --git a/h2o-py/tests_rest_smoke/testdir_multi_jvm/test_rest_api.py b/h2o-py/tests_rest_smoke/testdir_multi_jvm/test_rest_api.py index 13c93693ae8e..01b348364111 100644 --- a/h2o-py/tests_rest_smoke/testdir_multi_jvm/test_rest_api.py +++ b/h2o-py/tests_rest_smoke/testdir_multi_jvm/test_rest_api.py @@ -23,7 +23,7 @@ algos = ['coxph', 'kmeans', 'deeplearning', 'drf', 'glm', 'gbm', 'pca', 'naivebayes', 'glrm', 'svd', 'isotonicregression', 'psvm', 'aggregator', 'word2vec', 'stackedensemble', 'xgboost', 'isolationforest', 'gam', 'generic', 'targetencoder', 'rulefit', 'extendedisolationforest', 'anovaglm', 'modelselection', - 'upliftdrf', 'infogram', 'dt', 'adaboost'] + 'upliftdrf', 'infogram', 'dt', 'adaboost', 'hglm'] algo_additional_default_params = { 'grep' : { 'regex' : '.*' }, 'kmeans' : { 'k' : 2 }, diff --git a/h2o-r/H2O_Load.R b/h2o-r/H2O_Load.R index 74af068e4558..5d1d59e1347f 100755 --- 
a/h2o-r/H2O_Load.R +++ b/h2o-r/H2O_Load.R @@ -18,7 +18,7 @@ function() { "deeplearning.R", "naivebayes.R", "randomforest.R", "svd.R", "locate.R", "predict.R", "rulefit.R", "isolationforest.R", "psvm.R", "tf-idf.R", "permutation_varimp.R", "extendedisolationforest.R", "anovaglm.R", "modelselection.R", "upliftrandomforest.R", "infogram.R", "admissibleml.R", "decisiontree.R", - "adaBoost.R") + "adaBoost.R", "hglm.R") require(jsonlite); require(RCurl) invisible(lapply(to_src,function(x){source(paste(FULL.PATH, x, sep = ""))})) } diff --git a/h2o-r/h2o-DESCRIPTION.template b/h2o-r/h2o-DESCRIPTION.template index d4c56118bfa3..1f16d2eb915a 100644 --- a/h2o-r/h2o-DESCRIPTION.template +++ b/h2o-r/h2o-DESCRIPTION.template @@ -41,7 +41,7 @@ Description: R interface for 'H2O', the scalable open source machine learning Models (GLM), Gradient Boosting Machines (including XGBoost), Random Forests, Deep Neural Networks (Deep Learning), Stacked Ensembles, Naive Bayes, Generalized Additive Models (GAM), ANOVA GLM, Cox Proportional Hazards, K-Means, PCA, ModelSelection, - Word2Vec, as well as a fully automatic machine learning algorithm (H2O AutoML). + Word2Vec, Hierarchical Generalized Linear Models (HGLM), as well as a fully automatic machine learning algorithm (H2O AutoML). License: Apache License (== 2.0) URL: https://github.com/h2oai/h2o-3 BugReports: https://github.com/h2oai/h2o-3/issues diff --git a/h2o-r/h2o-package/R/classes.R b/h2o-r/h2o-package/R/classes.R index 296105062f62..d34d35b395c3 100755 --- a/h2o-r/h2o-package/R/classes.R +++ b/h2o-r/h2o-package/R/classes.R @@ -726,28 +726,8 @@ setMethod("show", "H2ORegressionMetrics", function(object) { cat("RMSE: ", object@metrics$RMSE, "\n", sep="") cat("MAE: ", object@metrics$mae, "\n", sep="") cat("RMSLE: ", object@metrics$rmsle, "\n", sep="") - if(!is.null(object@algorithm) && object@algorithm %in% c("glm") && exists("sefe", where=object@metrics)) { - cat("sefe: ", object@metrics$sefe, "\n", sep="") - cat("sere: ", object@metrics$sere, "\n", sep="") - cat("fixedf: ", object@metrics$fixedf, "\n", sep="") - cat("ranef: ", object@metrics$ranef, "\n", sep="") - cat("randc: ", object@metrics$randc, "\n", sep="") - cat("varfix: ", object@metrics$varfix, "\n", sep="") - cat("varranef: ", object@metrics$varranef, "\n", sep="") - cat("converge: ", object@metrics$converge, "\n", sep="") - cat("dfrefe: ", object@metrics$dfrefe, "\n", sep="") - cat("summvc1: ", object@metrics$summvc1, "\n", sep="") - cat("summvc2: ", object@metrics$summvc2, "\n", sep="") - cat("bad: ", object@metrics$bad, "\n", sep="") - if (exists("hlik", where=object@metrics) && !is.null(object@metrics$hlik)) { - cat("hlik: ", object@metrics$hlik, "\n", sep="") - cat("pvh: ", object@metrics$pvh, "\n", sep="") - cat("pbvh: ", object@metrics$pbvh, "\n", sep="") - cat("caic: ", object@metrics$caic, "\n", sep="") - } - } else { - cat("Mean Residual Deviance : ", h2o.mean_residual_deviance(object), "\n", sep="") - } + cat("Mean Residual Deviance : ", h2o.mean_residual_deviance(object), "\n", sep="") + if(!is.null(object@algorithm) && object@algorithm %in% c("gam","glm","generic") && exists("r2", where=object@metrics)) { if (!is.na(h2o.r2(object))) cat("R^2 : ", h2o.r2(object), "\n", sep="") null_dev <- h2o.null_deviance(object) diff --git a/h2o-r/h2o-package/R/explain.R b/h2o-r/h2o-package/R/explain.R index 67220f3a323e..3a08abe8a338 100644 --- a/h2o-r/h2o-package/R/explain.R +++ b/h2o-r/h2o-package/R/explain.R @@ -3155,11 +3155,8 @@ h2o.learning_curve_plot <- function(model, sh <- 
.preprocess_scoring_history(model, sh) if (model@algorithm %in% c("glm", "gam")) { - hglm <- !is.null(model@parameters$HGLM) && model@parameters$HGLM if (model@allparameters$lambda_search) { allowed_timesteps <- "iteration" - } else if (!is.null(hglm) && hglm) { - allowed_timesteps <- "iterations" } else { allowed_timesteps <- "iterations" } @@ -3213,7 +3210,6 @@ h2o.learning_curve_plot <- function(model, validation_metric <- "UNDEFINED" } else if ("deviance" == metric && model@algorithm %in% c("gam", "glm") && - !hglm && "deviance_train" %in% names(sh)) { training_metric <- "deviance_train" validation_metric <- "deviance_test" diff --git a/h2o-r/h2o-package/R/glm.R b/h2o-r/h2o-package/R/glm.R index 48b09a7fbea4..8e9bf31fe508 100644 --- a/h2o-r/h2o-package/R/glm.R +++ b/h2o-r/h2o-package/R/glm.R @@ -28,7 +28,6 @@ #' stratify the folds based on the response variable, for classification problems. Must be one of: "AUTO", #' "Random", "Modulo", "Stratified". Defaults to AUTO. #' @param fold_column Column with cross-validation fold index assignment per observation. -#' @param random_columns random columns indices for HGLM. #' @param ignore_const_cols \code{Logical}. Ignore constant columns. Defaults to TRUE. #' @param score_each_iteration \code{Logical}. Whether to score during each iteration of model training. Defaults to FALSE. #' @param score_iteration_interval Perform scoring for every score_iteration_interval iterations. Defaults to -1. @@ -43,8 +42,6 @@ #' @param family Family. Use binomial for classification with logistic regression, others are for regression problems. Must be #' one of: "AUTO", "gaussian", "binomial", "fractionalbinomial", "quasibinomial", "ordinal", "multinomial", #' "poisson", "gamma", "tweedie", "negativebinomial". Defaults to AUTO. -#' @param rand_family Random Component Family array. One for each random component. Only support gaussian for now. Must be one of: -#' "[gaussian]". #' @param tweedie_variance_power Tweedie variance power Defaults to 0. #' @param tweedie_link_power Tweedie link power. Defaults to 1. #' @param theta Theta Defaults to 1e-10. @@ -90,12 +87,9 @@ #' the conditional values above are 1E-8 and 1E-6 respectively. Defaults to -1. #' @param link Link function. Must be one of: "family_default", "identity", "logit", "log", "inverse", "tweedie", "ologit". #' Defaults to family_default. -#' @param rand_link Link function array for random component in HGLM. Must be one of: "[identity]", "[family_default]". -#' @param startval double array to initialize fixed and random coefficients for HGLM, coefficients for GLM. If standardize is -#' true, the standardized coefficients should be used. Otherwise, use the regular coefficients. +#' @param startval double array to initialize coefficients for GLM. If standardize is true, the standardized coefficients should +#' be used. Otherwise, use the regular coefficients. #' @param calc_like \code{Logical}. if true, will return likelihood function value. Defaults to FALSE. -#' @param HGLM \code{Logical}. If set to true, will return HGLM model. Otherwise, normal GLM model will be returned. -#' Defaults to FALSE. #' @param prior Prior probability for y==1. To be used only for logistic regression iff the data has been sampled and the mean #' of response does not reflect reality. Defaults to -1. #' @param cold_start \code{Logical}. Only applicable to multiple alpha/lambda values. 
If false, build the next model for next set @@ -229,14 +223,12 @@ h2o.glm <- function(x, keep_cross_validation_fold_assignment = FALSE, fold_assignment = c("AUTO", "Random", "Modulo", "Stratified"), fold_column = NULL, - random_columns = NULL, ignore_const_cols = TRUE, score_each_iteration = FALSE, score_iteration_interval = -1, offset_column = NULL, weights_column = NULL, family = c("AUTO", "gaussian", "binomial", "fractionalbinomial", "quasibinomial", "ordinal", "multinomial", "poisson", "gamma", "tweedie", "negativebinomial"), - rand_family = c("[gaussian]"), tweedie_variance_power = 0, tweedie_link_power = 1, theta = 1e-10, @@ -260,10 +252,8 @@ h2o.glm <- function(x, beta_epsilon = 0.0001, gradient_epsilon = -1, link = c("family_default", "identity", "logit", "log", "inverse", "tweedie", "ologit"), - rand_link = c("[identity]", "[family_default]"), startval = NULL, calc_like = FALSE, - HGLM = FALSE, prior = -1, cold_start = FALSE, lambda_min_ratio = -1, @@ -331,11 +321,6 @@ h2o.glm <- function(x, parms <- list() parms$training_frame <- training_frame args <- .verify_dataxy(training_frame, x, y) - if (HGLM && is.null(random_columns)) stop("HGLM: must specify random effect column!") - if (HGLM && (!is.null(random_columns))) { - temp <- .verify_dataxy(training_frame, random_columns, y) - random_columns <- temp$x_i-1 # change column index to numeric column indices starting from 0 - } if( !missing(offset_column) && !is.null(offset_column)) args$x_ignore <- args$x_ignore[!( offset_column == args$x_ignore )] if( !missing(weights_column) && !is.null(weights_column)) args$x_ignore <- args$x_ignore[!( weights_column == args$x_ignore )] if( !missing(fold_column) && !is.null(fold_column)) args$x_ignore <- args$x_ignore[!( fold_column == args$x_ignore )] @@ -362,8 +347,6 @@ h2o.glm <- function(x, parms$fold_assignment <- fold_assignment if (!missing(fold_column)) parms$fold_column <- fold_column - if (!missing(random_columns)) - parms$random_columns <- random_columns if (!missing(ignore_const_cols)) parms$ignore_const_cols <- ignore_const_cols if (!missing(score_each_iteration)) @@ -376,8 +359,6 @@ h2o.glm <- function(x, parms$weights_column <- weights_column if (!missing(family)) parms$family <- family - if (!missing(rand_family)) - parms$rand_family <- rand_family if (!missing(tweedie_variance_power)) parms$tweedie_variance_power <- tweedie_variance_power if (!missing(tweedie_link_power)) @@ -422,14 +403,10 @@ h2o.glm <- function(x, parms$gradient_epsilon <- gradient_epsilon if (!missing(link)) parms$link <- link - if (!missing(rand_link)) - parms$rand_link <- rand_link if (!missing(startval)) parms$startval <- startval if (!missing(calc_like)) parms$calc_like <- calc_like - if (!missing(HGLM)) - parms$HGLM <- HGLM if (!missing(prior)) parms$prior <- prior if (!missing(cold_start)) @@ -539,14 +516,12 @@ h2o.glm <- function(x, keep_cross_validation_fold_assignment = FALSE, fold_assignment = c("AUTO", "Random", "Modulo", "Stratified"), fold_column = NULL, - random_columns = NULL, ignore_const_cols = TRUE, score_each_iteration = FALSE, score_iteration_interval = -1, offset_column = NULL, weights_column = NULL, family = c("AUTO", "gaussian", "binomial", "fractionalbinomial", "quasibinomial", "ordinal", "multinomial", "poisson", "gamma", "tweedie", "negativebinomial"), - rand_family = c("[gaussian]"), tweedie_variance_power = 0, tweedie_link_power = 1, theta = 1e-10, @@ -570,10 +545,8 @@ h2o.glm <- function(x, beta_epsilon = 0.0001, gradient_epsilon = -1, link = c("family_default", "identity", 
"logit", "log", "inverse", "tweedie", "ologit"), - rand_link = c("[identity]", "[family_default]"), startval = NULL, calc_like = FALSE, - HGLM = FALSE, prior = -1, cold_start = FALSE, lambda_min_ratio = -1, @@ -648,11 +621,6 @@ h2o.glm <- function(x, parms <- list() parms$training_frame <- training_frame args <- .verify_dataxy(training_frame, x, y) - if (HGLM && is.null(random_columns)) stop("HGLM: must specify random effect column!") - if (HGLM && (!is.null(random_columns))) { - temp <- .verify_dataxy(training_frame, random_columns, y) - random_columns <- temp$x_i-1 # change column index to numeric column indices starting from 0 - } if( !missing(offset_column) && !is.null(offset_column)) args$x_ignore <- args$x_ignore[!( offset_column == args$x_ignore )] if( !missing(weights_column) && !is.null(weights_column)) args$x_ignore <- args$x_ignore[!( weights_column == args$x_ignore )] if( !missing(fold_column) && !is.null(fold_column)) args$x_ignore <- args$x_ignore[!( fold_column == args$x_ignore )] @@ -677,8 +645,6 @@ h2o.glm <- function(x, parms$fold_assignment <- fold_assignment if (!missing(fold_column)) parms$fold_column <- fold_column - if (!missing(random_columns)) - parms$random_columns <- random_columns if (!missing(ignore_const_cols)) parms$ignore_const_cols <- ignore_const_cols if (!missing(score_each_iteration)) @@ -691,8 +657,6 @@ h2o.glm <- function(x, parms$weights_column <- weights_column if (!missing(family)) parms$family <- family - if (!missing(rand_family)) - parms$rand_family <- rand_family if (!missing(tweedie_variance_power)) parms$tweedie_variance_power <- tweedie_variance_power if (!missing(tweedie_link_power)) @@ -737,14 +701,10 @@ h2o.glm <- function(x, parms$gradient_epsilon <- gradient_epsilon if (!missing(link)) parms$link <- link - if (!missing(rand_link)) - parms$rand_link <- rand_link if (!missing(startval)) parms$startval <- startval if (!missing(calc_like)) parms$calc_like <- calc_like - if (!missing(HGLM)) - parms$HGLM <- HGLM if (!missing(prior)) parms$prior <- prior if (!missing(cold_start)) diff --git a/h2o-r/h2o-package/R/hglm.R b/h2o-r/h2o-package/R/hglm.R new file mode 100644 index 000000000000..6e4fa2cbed30 --- /dev/null +++ b/h2o-r/h2o-package/R/hglm.R @@ -0,0 +1,411 @@ +# This file is auto-generated by h2o-3/h2o-bindings/bin/gen_R.py +# Copyright 2016 H2O.ai; Apache License Version 2.0 (see LICENSE for details) +# +# -------------------------- HGLM Model -------------------------- # +#' +#' Fits a HGLM model with both the residual noise and random effect being modeled by Gaussian distribution. The fixed +#' effect coefficients are specified in parameter x, the random effect coefficients are specified in parameter +#' random_columns. The column specified in group_column will contain the level 2 index value and must be an enum column. +#' +#' +#' @param x (Optional) A vector containing the names or indices of the predictor variables to use in building the model. +#' If x is missing, then all columns except y are used. +#' @param y The name or column index of the response variable in the data. +#' The response must be either a numeric or a categorical/factor variable. +#' If the response is numeric, then a regression model will be trained, otherwise it will train a classification model. +#' @param training_frame Id of the training data frame. +#' @param random_columns Random columns indices for HGLM. 
+#' @param group_column Group column is the column that is categorical and used to generate the groups in HGLM +#' @param model_id Destination id for this model; auto-generated if not specified. +#' @param validation_frame Id of the validation data frame. +#' @param ignore_const_cols \code{Logical}. Ignore constant columns. Defaults to TRUE. +#' @param offset_column Offset column. This will be added to the combination of columns before applying the link function. +#' @param weights_column Column with observation weights. Giving some observation a weight of zero is equivalent to excluding it from +#' the dataset; giving an observation a relative weight of 2 is equivalent to repeating that row twice. Negative +#' weights are not allowed. Note: Weights are per-row observation weights and do not increase the size of the +#' data frame. This is typically the number of times a row is repeated, but non-integer values are supported as +#' well. During training, rows with higher weights matter more, due to the larger loss function pre-factor. If +#' you set weight = 0 for a row, the returned prediction frame at that row is zero and this is incorrect. To get +#' an accurate prediction, remove all rows with weight == 0. +#' @param max_runtime_secs Maximum allowed runtime in seconds for model training. Use 0 to disable. Defaults to 0. +#' @param custom_metric_func Reference to custom evaluation function, format: `language:keyName=funcName` +#' @param score_each_iteration \code{Logical}. Whether to score during each iteration of model training. Defaults to FALSE. +#' @param score_iteration_interval Perform scoring for every score_iteration_interval iterations. Defaults to 5. +#' @param seed Seed for random numbers (affects certain parts of the algo that are stochastic and those might or might not be enabled by default). +#' Defaults to -1 (time-based random number). +#' @param missing_values_handling Handling of missing values. Either MeanImputation, Skip or PlugValues. Must be one of: "MeanImputation", +#' "Skip", "PlugValues". Defaults to MeanImputation. +#' @param plug_values Plug Values (a single row frame containing values that will be used to impute missing values of the +#' training/validation frame, use with conjunction missing_values_handling = PlugValues). +#' @param family Family. Only gaussian is supported now. Must be one of: "gaussian". Defaults to gaussian. +#' @param rand_family Set distribution of random effects. Only Gaussian is implemented now. Must be one of: "gaussian". +#' @param max_iterations Maximum number of iterations. Value should >=1. A value of 0 is only set when only the model coefficient +#' names and model coefficient dimensions are needed. Defaults to -1. +#' @param initial_fixed_effects An array that contains initial values of the fixed effects coefficient. +#' @param initial_random_effects A H2OFrame id that contains initial values of the random effects coefficient. The row names shouldbe the +#' random coefficient names. If you are not sure what the random coefficient names are, build HGLM model with +#' max_iterations = 0 and checkout the model output field random_coefficient_names. The number of rows of this +#' frame should be the number of level 2 units. Again, to figure this out, build HGLM model with +#' max_iterations=0 and check out the model output field group_column_names. The number of rows should equal the +#' length of thegroup_column_names. +#' @param initial_t_matrix A H2OFrame id that contains initial values of the T matrix. 
It should be a positive symmetric matrix.
+#' @param tau_u_var_init Initial variance of random coefficient effects. If set, should provide a value > 0.0. If not set, will be
+#' randomly set in the model building process. Defaults to 0.
+#' @param tau_e_var_init Initial variance of random noise. If set, should provide a value > 0.0. If not set, will be randomly set in
+#' the model building process. Defaults to 0.
+#' @param method We only implemented EM as a method to obtain the fixed, random coefficients and the various variances. Must be
+#' one of: "EM". Defaults to EM.
+#' @param em_epsilon Converge if beta/ubeta/tmat/tauEVar changes less (using L-infinity norm) than em_epsilon. ONLY applies to EM
+#' method. Defaults to 0.001.
+#' @param random_intercept \code{Logical}. If true, will allow a random component in the GLM coefficients. Defaults to TRUE.
+#' @param gen_syn_data \code{Logical}. If true, add gaussian noise with variance specified in parms._tau_e_var_init. Defaults to
+#' FALSE.
+#' @examples
+#' \dontrun{
+#' library(h2o)
+#' h2o.init()
+#' # build a HGLM model with the prostate dataset
+#' prostate_path <- system.file("extdata", "prostate.csv", package = "h2o")
+#' prostate <- h2o.uploadFile(path = prostate_path)
+#' prostate$CAPSULE <- as.factor(prostate$CAPSULE)
+#' prostate$RACE <- as.factor(prostate$RACE)
+#' model <- h2o.hglm(y="VOL", x=c("AGE","RACE","DPROS"), random_columns = c("AGE"), group_column = "RACE", training_frame=prostate)
+#'
+#' }
+#' @export
+h2o.hglm <- function(x,
+                     y,
+                     training_frame,
+                     random_columns,
+                     group_column,
+                     model_id = NULL,
+                     validation_frame = NULL,
+                     ignore_const_cols = TRUE,
+                     offset_column = NULL,
+                     weights_column = NULL,
+                     max_runtime_secs = 0,
+                     custom_metric_func = NULL,
+                     score_each_iteration = FALSE,
+                     score_iteration_interval = 5,
+                     seed = -1,
+                     missing_values_handling = c("MeanImputation", "Skip", "PlugValues"),
+                     plug_values = NULL,
+                     family = c("gaussian"),
+                     rand_family = c("gaussian"),
+                     max_iterations = -1,
+                     initial_fixed_effects = NULL,
+                     initial_random_effects = NULL,
+                     initial_t_matrix = NULL,
+                     tau_u_var_init = 0,
+                     tau_e_var_init = 0,
+                     method = c("EM"),
+                     em_epsilon = 0.001,
+                     random_intercept = TRUE,
+                     gen_syn_data = FALSE)
+{
+  # Validate required training_frame first and other frame args: should be a valid key or an H2OFrame object
+  training_frame <- .validate.H2OFrame(training_frame, required=TRUE)
+  validation_frame <- .validate.H2OFrame(validation_frame, required=FALSE)
+
+  # Validate other required args
+  # If x is missing, then assume user wants to use all columns as features.
+ if (missing(x)) { + if (is.numeric(y)) { + x <- setdiff(col(training_frame), y) + } else { + x <- setdiff(colnames(training_frame), y) + } + } + + # Build parameter list to send to model builder + parms <- list() + parms$training_frame <- training_frame + args <- .verify_dataxy(training_frame, x, y) + if (!missing(random_columns)) { + parms$random_columns <- random_columns + } else { + stop("random_columns is required.") + } + if (!missing(group_column)) { + parms$group_column <- group_column + } else { + stop("group_column is required.") + } + parms$ignored_columns <- args$x_ignore + parms$response_column <- args$y + + if (!missing(model_id)) + parms$model_id <- model_id + if (!missing(validation_frame)) + parms$validation_frame <- validation_frame + if (!missing(ignore_const_cols)) + parms$ignore_const_cols <- ignore_const_cols + if (!missing(offset_column)) + parms$offset_column <- offset_column + if (!missing(weights_column)) + parms$weights_column <- weights_column + if (!missing(max_runtime_secs)) + parms$max_runtime_secs <- max_runtime_secs + if (!missing(custom_metric_func)) + parms$custom_metric_func <- custom_metric_func + if (!missing(score_each_iteration)) + parms$score_each_iteration <- score_each_iteration + if (!missing(score_iteration_interval)) + parms$score_iteration_interval <- score_iteration_interval + if (!missing(seed)) + parms$seed <- seed + if (!missing(missing_values_handling)) + parms$missing_values_handling <- missing_values_handling + if (!missing(plug_values)) + parms$plug_values <- plug_values + if (!missing(family)) + parms$family <- family + if (!missing(rand_family)) + parms$rand_family <- rand_family + if (!missing(max_iterations)) + parms$max_iterations <- max_iterations + if (!missing(initial_fixed_effects)) + parms$initial_fixed_effects <- initial_fixed_effects + if (!missing(initial_random_effects)) + parms$initial_random_effects <- initial_random_effects + if (!missing(initial_t_matrix)) + parms$initial_t_matrix <- initial_t_matrix + if (!missing(tau_u_var_init)) + parms$tau_u_var_init <- tau_u_var_init + if (!missing(tau_e_var_init)) + parms$tau_e_var_init <- tau_e_var_init + if (!missing(random_columns)) + parms$random_columns <- random_columns + if (!missing(method)) + parms$method <- method + if (!missing(em_epsilon)) + parms$em_epsilon <- em_epsilon + if (!missing(random_intercept)) + parms$random_intercept <- random_intercept + if (!missing(group_column)) + parms$group_column <- group_column + if (!missing(gen_syn_data)) + parms$gen_syn_data <- gen_syn_data + + # Error check and build model + model <- .h2o.modelJob('hglm', parms, h2oRestApiVersion=3, verbose=FALSE) + return(model) +} +.h2o.train_segments_hglm <- function(x, + y, + training_frame, + random_columns, + group_column, + validation_frame = NULL, + ignore_const_cols = TRUE, + offset_column = NULL, + weights_column = NULL, + max_runtime_secs = 0, + custom_metric_func = NULL, + score_each_iteration = FALSE, + score_iteration_interval = 5, + seed = -1, + missing_values_handling = c("MeanImputation", "Skip", "PlugValues"), + plug_values = NULL, + family = c("gaussian"), + rand_family = c("gaussian"), + max_iterations = -1, + initial_fixed_effects = NULL, + initial_random_effects = NULL, + initial_t_matrix = NULL, + tau_u_var_init = 0, + tau_e_var_init = 0, + method = c("EM"), + em_epsilon = 0.001, + random_intercept = TRUE, + gen_syn_data = FALSE, + segment_columns = NULL, + segment_models_id = NULL, + parallelism = 1) +{ + # formally define variables that were excluded from function 
parameters + model_id <- NULL + verbose <- NULL + destination_key <- NULL + # Validate required training_frame first and other frame args: should be a valid key or an H2OFrame object + training_frame <- .validate.H2OFrame(training_frame, required=TRUE) + validation_frame <- .validate.H2OFrame(validation_frame, required=FALSE) + + # Validate other required args + # If x is missing, then assume user wants to use all columns as features. + if (missing(x)) { + if (is.numeric(y)) { + x <- setdiff(col(training_frame), y) + } else { + x <- setdiff(colnames(training_frame), y) + } + } + + # Build parameter list to send to model builder + parms <- list() + parms$training_frame <- training_frame + args <- .verify_dataxy(training_frame, x, y) + if (!missing(random_columns)) { + parms$random_columns <- random_columns + } else { + stop("random_columns is required.") + } + if (!missing(group_column)) { + parms$group_column <- group_column + } else { + stop("group_column is required.") + } + parms$ignored_columns <- args$x_ignore + parms$response_column <- args$y + + if (!missing(validation_frame)) + parms$validation_frame <- validation_frame + if (!missing(ignore_const_cols)) + parms$ignore_const_cols <- ignore_const_cols + if (!missing(offset_column)) + parms$offset_column <- offset_column + if (!missing(weights_column)) + parms$weights_column <- weights_column + if (!missing(max_runtime_secs)) + parms$max_runtime_secs <- max_runtime_secs + if (!missing(custom_metric_func)) + parms$custom_metric_func <- custom_metric_func + if (!missing(score_each_iteration)) + parms$score_each_iteration <- score_each_iteration + if (!missing(score_iteration_interval)) + parms$score_iteration_interval <- score_iteration_interval + if (!missing(seed)) + parms$seed <- seed + if (!missing(missing_values_handling)) + parms$missing_values_handling <- missing_values_handling + if (!missing(plug_values)) + parms$plug_values <- plug_values + if (!missing(family)) + parms$family <- family + if (!missing(rand_family)) + parms$rand_family <- rand_family + if (!missing(max_iterations)) + parms$max_iterations <- max_iterations + if (!missing(initial_fixed_effects)) + parms$initial_fixed_effects <- initial_fixed_effects + if (!missing(initial_random_effects)) + parms$initial_random_effects <- initial_random_effects + if (!missing(initial_t_matrix)) + parms$initial_t_matrix <- initial_t_matrix + if (!missing(tau_u_var_init)) + parms$tau_u_var_init <- tau_u_var_init + if (!missing(tau_e_var_init)) + parms$tau_e_var_init <- tau_e_var_init + if (!missing(random_columns)) + parms$random_columns <- random_columns + if (!missing(method)) + parms$method <- method + if (!missing(em_epsilon)) + parms$em_epsilon <- em_epsilon + if (!missing(random_intercept)) + parms$random_intercept <- random_intercept + if (!missing(group_column)) + parms$group_column <- group_column + if (!missing(gen_syn_data)) + parms$gen_syn_data <- gen_syn_data + + # Build segment-models specific parameters + segment_parms <- list() + if (!missing(segment_columns)) + segment_parms$segment_columns <- segment_columns + if (!missing(segment_models_id)) + segment_parms$segment_models_id <- segment_models_id + segment_parms$parallelism <- parallelism + + # Error check and build segment models + segment_models <- .h2o.segmentModelsJob('hglm', segment_parms, parms, h2oRestApiVersion=3) + return(segment_models) +} + + +#' Extracts the random effects coefficients of an HGLM model. +#' +#' @param model is a H2O HGLM model. 
+#' @export +h2o.coef_random <- function(model) { + if (is(model, "H2OModel") && (model@algorithm=="hglm")) + return(model@model$ubeta) +} + +#' Extracts the group_column levels of an HGLM model. The group_column is usually referred to as level 2 predictor. +#' +#' @param model is a H2O HGLM model. +#' @export +h2o.level_2_names <- function(model) { + if (is(model, "H2OModel") && (model@algorithm=="hglm")) + return(model@model$group_column_names) +} + +#' Extracts the coefficient names of random effect coefficients. +#' +#' @param model is a H2O HGLM model. +#' @export +h2o.coefs_random_names <- function(model) { + if (is(model, "H2OModel") && (model@algorithm=="hglm")) + return(model@model$random_coefficient_names) +} + +#' Extracts scoring history of validation dataframe during training +#' +#' @param model is a H2O HGLM model. +#' @export +h2o.scoring_history_valid <- function(model) { + if (is(model, "H2OModel") && (model@algorithm=="hglm")) + return(model@model$scoring_history_valid) +} + +#' Extracts scoring history of training dataframe during training +#' +#' @param model is a H2O HGLM model. +#' @export +h2o.scoring_history <- function(model) { + if (is(model, "H2OModel") && (model@algorithm=="hglm")) + return(model@model$scoring_history) +} + +#' Extracts T matrix which is the covariance of random effect coefficients. +#' +#' @param model is a H2O HGLM model. +#' @export +h2o.matrix_T <- function(model) { + if (is(model, "H2OModel") && (model@algorithm=="hglm")) + return(model@model$tmat) +} + +#' Extracts the variance of residuals of the HGLM model. +#' +#' @param model is a H2O HGLM model. +#' @export +h2o.residual_variance <- function(model) { + if (is(model, "H2OModel") && (model@algorithm=="hglm")) + return(model@model$residual_variance) +} + +#' Extracts the ICC of the HGLM model. +#' +#' @param model is a H2O HGLM model. +#' @export +h2o.icc <- function(model) { + if (is(model, "H2OModel") && (model@algorithm=="hglm")) + return(model@model$icc) +} + +#' Extracts the mean residual error taking into account only the fixed effect coefficients. +#' +#' @param model is a H2O HGLM model. +#' @param train is true for training and false for validation dataset +#' @export +h2o.mean_residual_fixed <- function(model, train=TRUE) { + if (is(model, "H2OModel") && (model@algorithm=="hglm")) { + if (train) + return(model@model$mean_residual_fixed) + else + return(model@model$mean_residual_fixed_valid) + } +} + diff --git a/h2o-r/h2o-package/R/kvstore.R b/h2o-r/h2o-package/R/kvstore.R index aba10a750f82..5460bd9f0a52 100644 --- a/h2o-r/h2o-package/R/kvstore.R +++ b/h2o-r/h2o-package/R/kvstore.R @@ -358,17 +358,6 @@ h2o.getModel <- function(model_id) { } } - if (identical("glm", json$algo) && allparams$HGLM) { - .newH2OModel(Class = Class, - model_id = model_id, - algorithm = json$algo, - parameters = parameters, - allparameters = allparams, - params = params, - have_pojo = FALSE, - have_mojo = FALSE, - model = model) - } else { .newH2OModel(Class = Class, model_id = model_id, algorithm = json$algo, @@ -378,7 +367,6 @@ h2o.getModel <- function(model_id) { have_pojo = json$have_pojo, have_mojo = json$have_mojo, model = model) - } } #' Retrieves an instance of \linkS4class{H2OSegmentModels} for a given id. 
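The R helpers above mirror accessors that are exercised on the Python side by the pyunit tests earlier in this patch. For orientation, a minimal Python sketch using those accessors is shown here; the dataset path comes from the tests (relative to an h2o-3 checkout), while the choice of random columns is only illustrative:

```python
import h2o
from h2o.estimators.hglm import H2OHGLMEstimator

h2o.init()
train = h2o.import_file("smalldata/hglm_test/gaussian_0GC_allRC_2enum2numeric_3noise_p08T_wIntercept_standardize.gz")
predictors = [c for c in train.names if c not in ("response", "C1")]

model = H2OHGLMEstimator(random_columns=["C2", "C3"], group_column="C1",
                         random_intercept=True, max_iterations=10, seed=12345)
model.train(x=predictors, y="response", training_frame=train)

print(model.coef())                 # fixed effect coefficients
print(model.coefs_random_names())   # names of the random effect coefficients
print(model.coefs_random())         # random effect coefficients, one set per level-2 group
print(model.level_2_names())        # levels of the group column (level-2 units)
print(model.matrix_T())             # covariance matrix of the random effects
print(model.residual_variance())    # estimated residual variance
print(model.icc())                  # intra-class correlation per variance component
print(model.mean_residual_fixed())  # residual error using only the fixed effects
```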
diff --git a/h2o-r/h2o-package/R/models.R b/h2o-r/h2o-package/R/models.R index 7bc4b6d85e49..764d262b100d 100755 --- a/h2o-r/h2o-package/R/models.R +++ b/h2o-r/h2o-package/R/models.R @@ -2559,19 +2559,6 @@ h2o.mean_residual_deviance <- function(object, train=FALSE, valid=FALSE, xval=FA invisible(NULL) } -#' Retrieve HGLM ModelMetrics -#' -#' @param object an H2OModel object or H2OModelMetrics. -#' @export -h2o.HGLMMetrics <- function(object) { - if( is(object, "H2OModel") ) { - model.parts <- .model.parts(object) - return(model.parts$tm@metrics) - } - warning(paste0("No HGLM Metric for ",class(object))) - invisible(NULL) -} - #' Retrieve the GINI Coefficcient #' #' Retrieves the GINI coefficient from an \linkS4class{H2OBinomialMetrics}. @@ -2831,7 +2818,7 @@ h2o.get_variable_inflation_factors <- function(object) { #' @export h2o.coef <- function(object, predictorSize = -1) { if (is(object, "H2OModel") && - object@algorithm %in% c("glm", "gam", "coxph", "modelselection")) { + object@algorithm %in% c("glm", "gam", "coxph", "modelselection", "hglm")) { if ((object@algorithm == "glm" || object@algorithm == "gam") && (object@allparameters$family %in% c("multinomial", "ordinal"))) { @@ -2893,7 +2880,7 @@ h2o.coef <- function(object, predictorSize = -1) { } } } else { - stop("Can only extract coefficients from GAM, GLM and CoxPH models") + stop("Can only extract coefficients from GAM, GLM, HGLM and CoxPH models") } } @@ -3187,7 +3174,7 @@ h2o.coef_norm <- function(object, predictorSize=-1) { ) } } else { - stop("Can only extract coefficients from GAMs/GLMs") + stop("Can only extract coefficients from GAMs/GLMs/CoxPHs/ModelSelections") } } @@ -5047,9 +5034,6 @@ plot.H2OModel <- function(x, timestep = "AUTO", metric = "AUTO", ...) { allowed_metrics <- c("deviance_train", "deviance_test", "deviance_xval") allowed_timesteps <- c("iteration", "duration") df <- df[df["alpha"] == x@model$alpha_best,] - } else if (!is.null(x@allparameters$HGLM) && x@allparameters$HGLM) { - allowed_metrics <- c("convergence", "sumetaieta02") - allowed_timesteps <- c("iterations", "duration") } else { allowed_metrics <- c("objective", "negative_log_likelihood") allowed_timesteps <- c("iterations", "duration") diff --git a/h2o-r/h2o-package/R/modelselection.R b/h2o-r/h2o-package/R/modelselection.R index 3171bd8fa268..a0ae2a49155f 100644 --- a/h2o-r/h2o-package/R/modelselection.R +++ b/h2o-r/h2o-package/R/modelselection.R @@ -77,7 +77,7 @@ #' (of -1.0) indicates: If lambda_search is set to False and lambda is equal to zero, the default value of #' gradient_epsilon is equal to .000001, otherwise the default value is .0001. If lambda_search is set to True, #' the conditional values above are 1E-8 and 1E-6 respectively. Defaults to -1. -#' @param startval double array to initialize fixed and random coefficients for HGLM, coefficients for GLM. +#' @param startval Double array to initialize coefficients for GLM. #' @param prior Prior probability for y==1. To be used only for logistic regression iff the data has been sampled and the mean #' of response does not reflect reality. Defaults to 0. #' @param cold_start \code{Logical}. Only applicable to multiple alpha/lambda values. If false, build the next model for next set diff --git a/h2o-r/h2o-package/R/segment.R b/h2o-r/h2o-package/R/segment.R index 76261276b21d..4039256f4a16 100644 --- a/h2o-r/h2o-package/R/segment.R +++ b/h2o-r/h2o-package/R/segment.R @@ -11,7 +11,7 @@ #' #' Start Segmented-Data bulk Model Training for a given algorithm and parameters. 
#' -#' @param algorithm Name of algorithm to use in training segment models (gbm, randomForest, kmeans, glm, deeplearning, naivebayes, psvm, +#' @param algorithm Name of algorithm to use in training segment models (gbm, randomForest, kmeans, glm, hglm, deeplearning, naivebayes, psvm, #' xgboost, pca, svd, targetencoder, aggregator, word2vec, coxph, isolationforest, kmeans, stackedensemble, glrm, gam, anovaglm, modelselection). #' @param segment_columns A list of columns to segment-by. H2O will group the training (and validation) dataset by the segment-by columns #' and train a separate model for each segment (group of rows). diff --git a/h2o-r/h2o-package/pkgdown/_pkgdown.yml b/h2o-r/h2o-package/pkgdown/_pkgdown.yml index 3b66ce2221d1..b4c42b3fa98b 100644 --- a/h2o-r/h2o-package/pkgdown/_pkgdown.yml +++ b/h2o-r/h2o-package/pkgdown/_pkgdown.yml @@ -74,6 +74,8 @@ reference: - h2o.coef - h2o.coef_names - h2o.coef_norm + - h2o.coef_random + - h2o.coefs_random_names - h2o.coef_with_p_values - h2o.colnames - h2o.columns_by_type @@ -168,9 +170,11 @@ reference: - h2o.group_by - h2o.gsub - h2o.head + - h2o.hglm - h2o.hist - h2o.hit_ratio_table - h2o.hour + - h2o.icc - h2o.ice_plot - h2o.ifelse - h2o.import_hive_table @@ -198,6 +202,7 @@ reference: - h2o.kolmogorov_smirnov - h2o.kurtosis - h2o.learning_curve_plot + - h2o.level_2_names - h2o.levels - h2o.list_all_extensions - h2o.list_api_extensions @@ -221,10 +226,12 @@ reference: - h2o.make_metrics - h2o.makeGLMModel - h2o.match + - h2o.matrix_T - h2o.max - h2o.modelSelection - h2o.mean_per_class_error - h2o.mean_residual_deviance + - h2o.mean_residual_fixed - h2o.mean - h2o.median - h2o.melt @@ -287,6 +294,7 @@ reference: - h2o.residual_deviance - h2o.residual_dof - h2o.residual_analysis_plot + - h2o.residual_variance - h2o.result - h2o.rm - h2o.rmse @@ -303,6 +311,8 @@ reference: - h2o.save_frame - h2o.scale - h2o.scoreHistory + - h2o.scoring_history + - h2o.scoring_history_valid - h2o.scoreHistoryGAM - h2o.sd - h2o.sdev diff --git a/h2o-r/scripts/h2o-r-test-setup.R b/h2o-r/scripts/h2o-r-test-setup.R index 9fbcecca52a9..7b9f2ea7099a 100755 --- a/h2o-r/scripts/h2o-r-test-setup.R +++ b/h2o-r/scripts/h2o-r-test-setup.R @@ -186,7 +186,7 @@ function() { # "coxph.R", "coxphutils.R", "gbm.R", "glm.R", "gam.R", "anovaglm.R", "glrm.R", "kmeans.R", "deeplearning.R", "randomforest.R", "generic.R", # "naivebayes.R", "pca.R", "svd.R", "locate.R", "grid.R", "word2vec.R", "w2vutils.R", "stackedensemble.R", "rulefit.R", "modelselection.R", # "predict.R", "xgboost.R", "isolationforest.R", "psvm.R", "segment.R", "tf-idf.R", "explain.R", "permutation_varimp.R", "extendedisolationforest.R", - # "upliftrandomforest.R", "infogram.R", "isotonicregression.R", "admissibleml.R", "decisiontree.R", "adaboost.R") + # "upliftrandomforest.R", "infogram.R", "isotonicregression.R", "admissibleml.R", "decisiontree.R", "adaboost.R", "hglm.R) # src_path <- paste(h2oRDir,"h2o-package","R",sep=.Platform$file.sep) # invisible(lapply(to_src,function(x){source(paste(src_path, x, sep = .Platform$file.sep))})) diff --git a/h2o-r/tests/testdir_algos/glm/runit_PUBDEV_6876_HGLM_setInitialValues.R b/h2o-r/tests/testdir_algos/glm/runit_PUBDEV_6876_HGLM_setInitialValues.R deleted file mode 100644 index 16a02d536326..000000000000 --- a/h2o-r/tests/testdir_algos/glm/runit_PUBDEV_6876_HGLM_setInitialValues.R +++ /dev/null @@ -1,42 +0,0 @@ -setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f"))) -source("../../../scripts/h2o-r-test-setup.R") -## -# We build two HGLM models, one with 
user defined initial values, one without. However, since we use the same initial values -# we expected the model metrics to be the same. -## - -test.HGLMData1 <- function() { - tol = 1e-4 - h2odata <- h2o.importFile(locate("smalldata/glm_test/semiconductor.csv")) - h2odata$Device <- h2o.asfactor(h2odata$Device) - yresp <- "y" - xlist <- c("x1", "x3", "x5", "x6") - z <- c(1) - m1H2O <- h2o.glm(x=xlist, y=yresp, family="gaussian", rand_family = c("gaussian"), rand_link=c("identity"), - training_frame=h2odata, HGLM=TRUE, random_columns=z, calc_like=TRUE) - modelMetrics = h2o.HGLMMetrics(m1H2O) # grab HGLM model metrics - - initialVs = c(0.001929687,0.002817188,-0.001707812,-0.003889062,0.010685937,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.1,0.1) - m2H2OInitialValues <- h2o.glm(x=xlist, y=yresp, family="gaussian", rand_family = c("gaussian"), rand_link=c("identity"), - training_frame=h2odata, HGLM=TRUE, random_columns=z, calc_like=TRUE, startval=initialVs) - rModelMetrics = h2o.HGLMMetrics( m2H2OInitialValues) - - # compare H2O output with no startval and with startval assigned by user to be the same as the machine generated ones - expect_true(abs(rModelMetrics$hlik-modelMetrics$hlik) Date: Tue, 29 Oct 2024 22:45:34 +0100 Subject: [PATCH 2/2] [GH-15810] Allow the user to adjust parquet import timezone [nocheck] (#16304) * [GH-15810] Allow the user to adjust parquet import timezone * new approach * new approach part 2 * add export option * fix test * fix java tests compilation --- .../java/water/bindings/examples/Example.java | 2 + .../main/java/water/api/FramesHandler.java | 2 +- .../src/main/java/water/api/ParseHandler.java | 2 +- .../java/water/api/schemas3/FramesV3.java | 3 ++ .../java/water/api/schemas3/ParseSetupV3.java | 3 ++ .../main/java/water/api/schemas3/ParseV3.java | 3 ++ h2o-core/src/main/java/water/fvec/Frame.java | 4 +- .../main/java/water/parser/ARFFParser.java | 2 +- .../water/parser/BinaryFormatExporter.java | 2 +- .../water/parser/BinaryParserProvider.java | 2 +- .../src/main/java/water/parser/CsvParser.java | 4 +- .../main/java/water/parser/ParseSetup.java | 52 ++++++++++++------- .../java/water/parser/ParserProvider.java | 2 +- .../java/water/parser/SVMLightParser.java | 2 +- .../src/main/java/water/parser/XlsParser.java | 4 +- .../parser/ParseCompressedAndXLSTest.java | 2 +- .../test/java/water/parser/ParserTest2.java | 8 +-- .../java/water/parser/avro/AvroParser.java | 2 +- .../water/parser/parquet/ChunkConverter.java | 36 ++++++++++--- .../parser/parquet/ChunkReadSupport.java | 6 ++- .../parquet/ChunkRecordMaterializer.java | 4 +- .../parser/parquet/FrameParquetExporter.java | 16 ++++-- .../water/parser/parquet/ParquetExporter.java | 11 ++-- .../water/parser/parquet/ParquetParser.java | 24 ++++----- .../parser/parquet/ParquetParserProvider.java | 2 +- .../java/water/parser/parquet/TypeUtils.java | 7 +++ .../parser/parquet/VecParquetReader.java | 14 ++--- h2o-py/h2o/frame.py | 12 +++-- h2o-py/h2o/h2o.py | 17 +++--- .../pyunit_parquet_adjust_timezone.py | 40 ++++++++++++++ .../src/main/java/water/TestUtil.java | 6 +-- 31 files changed, 204 insertions(+), 92 deletions(-) create mode 100644 h2o-py/tests/testdir_parser/pyunit_parquet_adjust_timezone.py diff --git a/h2o-bindings/src/main/java/water/bindings/examples/Example.java b/h2o-bindings/src/main/java/water/bindings/examples/Example.java index 857b26e9bf59..88958325649e 100644 --- a/h2o-bindings/src/main/java/water/bindings/examples/Example.java +++ b/h2o-bindings/src/main/java/water/bindings/examples/Example.java @@ -112,6 
+112,7 @@ public static void gbm_example_flow() { null, (byte)'\\', false, + false, null).execute().body(); System.out.println("parseSetupBody: " + parseSetupBody); @@ -140,6 +141,7 @@ public static void gbm_example_flow() { null, null, parseSetupBody.escapechar, + false, null).execute().body(); System.out.println("parseBody: " + parseBody); diff --git a/h2o-core/src/main/java/water/api/FramesHandler.java b/h2o-core/src/main/java/water/api/FramesHandler.java index 5ac14ff7eb10..283d4dc9f9c5 100644 --- a/h2o-core/src/main/java/water/api/FramesHandler.java +++ b/h2o-core/src/main/java/water/api/FramesHandler.java @@ -252,7 +252,7 @@ public FramesV3 export(int version, FramesV3 s) { if (s.parallel) { Log.warn("Parallel export to a single file is not supported for parquet format! Export will continue with a parquet-specific setup."); } - s.job = new JobV3(Frame.exportParquet(fr, s.path, s.force, s.compression, s.write_checksum)); + s.job = new JobV3(Frame.exportParquet(fr, s.path, s.force, s.compression, s.write_checksum, s.tz_adjust_from_local)); } else { Frame.CSVStreamParams csvParms = new Frame.CSVStreamParams() .setSeparator(s.separator) diff --git a/h2o-core/src/main/java/water/api/ParseHandler.java b/h2o-core/src/main/java/water/api/ParseHandler.java index c70e06ffff9e..105cd64e50dd 100644 --- a/h2o-core/src/main/java/water/api/ParseHandler.java +++ b/h2o-core/src/main/java/water/api/ParseHandler.java @@ -28,7 +28,7 @@ public ParseV3 parse(int version, ParseV3 parse) { new ParseWriter.ParseErr[0], parse.chunk_size, parse.decrypt_tool != null ? parse.decrypt_tool.key() : null, parse.skipped_columns, parse.custom_non_data_line_markers != null ? parse.custom_non_data_line_markers.getBytes(): null, - parse.escapechar, parse.force_col_types); + parse.escapechar, parse.force_col_types, parse.tz_adjust_to_local); if (parse.source_frames == null) throw new H2OIllegalArgumentException("Data for Frame '" + parse.destination_frame.name + "' is not available. Please check that the path is valid (for all H2O nodes).'"); diff --git a/h2o-core/src/main/java/water/api/schemas3/FramesV3.java b/h2o-core/src/main/java/water/api/schemas3/FramesV3.java index 610ee0adf1de..3be470401f51 100644 --- a/h2o-core/src/main/java/water/api/schemas3/FramesV3.java +++ b/h2o-core/src/main/java/water/api/schemas3/FramesV3.java @@ -53,6 +53,9 @@ public class FramesV3 extends RequestSchemaV3 { @API(help="Specifies if checksum should be written next to data files on export (if supported by export format).") public boolean write_checksum = true; + @API(help="Specifies if the timezone should be adjusted from local to UTC timezone (parquet only).") + public boolean tz_adjust_from_local = false; + @API(help="Field separator (default ',')") public byte separator = Frame.CSVStreamParams.DEFAULT_SEPARATOR; diff --git a/h2o-core/src/main/java/water/api/schemas3/ParseSetupV3.java b/h2o-core/src/main/java/water/api/schemas3/ParseSetupV3.java index 2866d95a2516..a51318432036 100644 --- a/h2o-core/src/main/java/water/api/schemas3/ParseSetupV3.java +++ b/h2o-core/src/main/java/water/api/schemas3/ParseSetupV3.java @@ -90,6 +90,9 @@ public class ParseSetupV3 extends RequestSchemaV3 { " will happen without setting this parameter. 
Defaults to false.", direction=API.Direction.INPUT) public boolean force_col_types; + @API(help="Adjust the imported time from GMT timezone to cluster timezone.", direction=API.Direction.INPUT) + public boolean tz_adjust_to_local; + @Override public ParseSetup fillImpl(ParseSetup impl) { ParseSetup parseSetup = fillImpl(impl, new String[] {"parse_type"}); diff --git a/h2o-core/src/main/java/water/api/schemas3/ParseV3.java b/h2o-core/src/main/java/water/api/schemas3/ParseV3.java index 5bc4ddc46672..b6eafb3c1002 100644 --- a/h2o-core/src/main/java/water/api/schemas3/ParseV3.java +++ b/h2o-core/src/main/java/water/api/schemas3/ParseV3.java @@ -79,4 +79,7 @@ public class ParseV3 extends RequestSchemaV3 { @API(help="One ASCII character used to escape other characters.", direction=API.Direction.INOUT) public byte escapechar = ParseSetup.DEFAULT_ESCAPE_CHAR; + + @API(help="Adjust the imported time from GMT timezone to cluster timezone.", direction=API.Direction.INPUT) + public boolean tz_adjust_to_local; } diff --git a/h2o-core/src/main/java/water/fvec/Frame.java b/h2o-core/src/main/java/water/fvec/Frame.java index a1ac60dae984..a600f5ef12be 100644 --- a/h2o-core/src/main/java/water/fvec/Frame.java +++ b/h2o-core/src/main/java/water/fvec/Frame.java @@ -1614,7 +1614,7 @@ public static Job export(Frame fr, String path, String frameName, boolean overwr return job.start(t, fr.anyVec().nChunks()); } - public static Job exportParquet(Frame fr, String path, boolean overwrite, String compression, boolean writeChecksum) { + public static Job exportParquet(Frame fr, String path, boolean overwrite, String compression, boolean writeChecksum, boolean tzAdjustFromLocal) { // Validate input if (H2O.getPM().isFileAccessDenied(path)) { throw new H2OFileAccessDeniedException("File " + path + " access denied"); @@ -1638,7 +1638,7 @@ public static Job exportParquet(Frame fr, String path, boolean overwrite, String } Job job = new Job<>(fr._key, "water.fvec.Frame", "Export dataset"); - H2O.H2OCountedCompleter t = parquetExporter.export(fr, path, overwrite, compression, writeChecksum); + H2O.H2OCountedCompleter t = parquetExporter.export(fr, path, overwrite, compression, writeChecksum, tzAdjustFromLocal); return job.start(t, fr.anyVec().nChunks()); } diff --git a/h2o-core/src/main/java/water/parser/ARFFParser.java b/h2o-core/src/main/java/water/parser/ARFFParser.java index 680b4db3d991..2a8ece36b953 100644 --- a/h2o-core/src/main/java/water/parser/ARFFParser.java +++ b/h2o-core/src/main/java/water/parser/ARFFParser.java @@ -129,7 +129,7 @@ static ParseSetup guessSetup(ByteVec bv, byte[] bits, byte sep, boolean singleQu naStrings = addDefaultNAs(naStrings, ncols); // Return the final setup - return new ParseSetup(ARFF_INFO, sep, singleQuotes, ParseSetup.NO_HEADER, ncols, labels, ctypes, domains, naStrings, data, nonDataLineMarkers, escapechar); + return new ParseSetup(ARFF_INFO, sep, singleQuotes, ParseSetup.NO_HEADER, ncols, labels, ctypes, domains, naStrings, data, nonDataLineMarkers, escapechar, false); } private static String[][] addDefaultNAs(String[][] naStrings, int nCols) { diff --git a/h2o-core/src/main/java/water/parser/BinaryFormatExporter.java b/h2o-core/src/main/java/water/parser/BinaryFormatExporter.java index f90fc9803735..ac21e3452b88 100644 --- a/h2o-core/src/main/java/water/parser/BinaryFormatExporter.java +++ b/h2o-core/src/main/java/water/parser/BinaryFormatExporter.java @@ -6,7 +6,7 @@ public interface BinaryFormatExporter { - H2O.H2OCountedCompleter export(Frame frame, String path, boolean force, 
String compression, boolean writeChecksum); + H2O.H2OCountedCompleter export(Frame frame, String path, boolean force, String compression, boolean writeChecksum, boolean tzAdjustFromLocal); boolean supports(ExportFileFormat format); } diff --git a/h2o-core/src/main/java/water/parser/BinaryParserProvider.java b/h2o-core/src/main/java/water/parser/BinaryParserProvider.java index 0cd96f0f0147..2873d51ae035 100644 --- a/h2o-core/src/main/java/water/parser/BinaryParserProvider.java +++ b/h2o-core/src/main/java/water/parser/BinaryParserProvider.java @@ -23,7 +23,7 @@ public abstract class BinaryParserProvider extends ParserProvider { @Deprecated public final ParseSetup guessSetup(ByteVec v, byte[] bits, byte sep, int ncols, boolean singleQuotes, int checkHeader, String[] columnNames, byte[] columnTypes, String[][] domains, String[][] naStrings) { ParseSetup ps = new ParseSetup(null, sep, singleQuotes, checkHeader, - ncols, columnNames, columnTypes, domains, naStrings, null); + ncols, columnNames, columnTypes, domains, naStrings, null, false); return guessSetup(v, bits, ps); } diff --git a/h2o-core/src/main/java/water/parser/CsvParser.java b/h2o-core/src/main/java/water/parser/CsvParser.java index e02907c8005f..c1ad7d20299b 100644 --- a/h2o-core/src/main/java/water/parser/CsvParser.java +++ b/h2o-core/src/main/java/water/parser/CsvParser.java @@ -731,7 +731,7 @@ else if (ParseUUID.isUUID(str)) } //FIXME should set warning message and let fall through return new ParseSetup(CSV_INFO, GUESS_SEP, singleQuotes, checkHeader, 1, null, ctypes, domains, naStrings, data, new ParseWriter.ParseErr[0],FileVec.DFLT_CHUNK_SIZE, - nonDataLineMarkers, escapechar); + nonDataLineMarkers, escapechar, false); } } data[0] = determineTokens(lines[0], sep, singleQuotes, escapechar); @@ -791,7 +791,7 @@ else if (ParseUUID.isUUID(str)) // Assemble the setup understood so far ParseSetup resSetup = new ParseSetup(CSV_INFO, sep, singleQuotes, checkHeader, ncols, labels, null, null /*domains*/, naStrings, data, - nonDataLineMarkers, escapechar); + nonDataLineMarkers, escapechar, false); // now guess the types if (columnTypes == null || ncols != columnTypes.length) { diff --git a/h2o-core/src/main/java/water/parser/ParseSetup.java b/h2o-core/src/main/java/water/parser/ParseSetup.java index 825b19d061b7..1b16631c1555 100644 --- a/h2o-core/src/main/java/water/parser/ParseSetup.java +++ b/h2o-core/src/main/java/water/parser/ParseSetup.java @@ -44,6 +44,7 @@ public class ParseSetup extends Iced { int[] _parse_columns_indices; // store column indices to be parsed into the final file byte[] _nonDataLineMarkers; boolean _force_col_types = false; // at end of parsing, change column type to users specified ones + boolean _tz_adjust_to_local = false; String[] _orig_column_types; // copy over the original column type setup before translating to byte[] String[] _synthetic_column_names; // Columns with constant values to be added to parsed Frame @@ -73,35 +74,35 @@ public ParseSetup(ParseSetup ps) { ps._separator, ps._single_quotes, ps._check_header, ps._number_columns, ps._column_names, ps._column_types, ps._domains, ps._na_strings, ps._data, new ParseWriter.ParseErr[0], ps._chunk_size, ps._decrypt_tool, ps._skipped_columns, - ps._nonDataLineMarkers, ps._escapechar); + ps._nonDataLineMarkers, ps._escapechar, ps._tz_adjust_to_local); } public static ParseSetup makeSVMLightSetup(){ return new ParseSetup(SVMLight_INFO, ParseSetup.GUESS_SEP, false,ParseSetup.NO_HEADER,1,null,new byte[]{Vec.T_NUM},null,null,null, new ParseWriter.ParseErr[0], - 
null); + null, false); } // This method was called during guess setup, lot of things are null, like ctypes. // when it is called again, it either contains the guess column types or it will have user defined column types public ParseSetup(ParserInfo parse_type, byte sep, boolean singleQuotes, int checkHeader, int ncols, String[] columnNames, byte[] ctypes, String[][] domains, String[][] naStrings, String[][] data, ParseWriter.ParseErr[] errs, - int chunkSize, byte[] nonDataLineMarkers, byte escapeChar) { + int chunkSize, byte[] nonDataLineMarkers, byte escapeChar, boolean tzAdjustToLocal) { this(parse_type, sep, singleQuotes, checkHeader, ncols, columnNames, ctypes, domains, naStrings, data, errs, - chunkSize, null, null, nonDataLineMarkers, escapeChar); + chunkSize, null, null, nonDataLineMarkers, escapeChar, tzAdjustToLocal); } public ParseSetup(ParserInfo parse_type, byte sep, boolean singleQuotes, int checkHeader, int ncols, String[] columnNames, byte[] ctypes, String[][] domains, String[][] naStrings, String[][] data, ParseWriter.ParseErr[] errs, - int chunkSize, Key decrypt_tool, int[] skipped_columns, byte[] nonDataLineMarkers, byte escapeChar) { + int chunkSize, Key decrypt_tool, int[] skipped_columns, byte[] nonDataLineMarkers, byte escapeChar, boolean tzAdjustToLocal) { this(parse_type, sep, singleQuotes, checkHeader, ncols, columnNames, ctypes, domains, naStrings, data, errs, - chunkSize, decrypt_tool, skipped_columns, nonDataLineMarkers, escapeChar, false); + chunkSize, decrypt_tool, skipped_columns, nonDataLineMarkers, escapeChar, false, tzAdjustToLocal); } public ParseSetup(ParserInfo parse_type, byte sep, boolean singleQuotes, int checkHeader, int ncols, String[] columnNames, byte[] ctypes, String[][] domains, String[][] naStrings, String[][] data, ParseWriter.ParseErr[] errs, int chunkSize, Key decrypt_tool, int[] skipped_columns, byte[] nonDataLineMarkers, - byte escapeChar, boolean force_col_types) { + byte escapeChar, boolean force_col_types, boolean tz_adjust_to_local) { _parse_type = parse_type; _separator = sep; _nonDataLineMarkers = nonDataLineMarkers; @@ -119,6 +120,7 @@ public ParseSetup(ParserInfo parse_type, byte sep, boolean singleQuotes, int che _skipped_columns = skipped_columns; _escapechar = escapeChar; _force_col_types = force_col_types; + _tz_adjust_to_local = tz_adjust_to_local; setParseColumnIndices(ncols, _skipped_columns); } @@ -172,7 +174,7 @@ ps.column_names, strToColumnTypes(ps.column_types), ps.chunk_size, ps.decrypt_tool != null ? ps.decrypt_tool.key() : null, ps.skipped_columns, ps.custom_non_data_line_markers != null ? ps.custom_non_data_line_markers.getBytes() : null, - ps.escapechar, ps.force_col_types); + ps.escapechar, ps.force_col_types, ps.tz_adjust_to_local); this._force_col_types = ps.force_col_types; this._orig_column_types = this._force_col_types ? (ps.column_types == null ? 
null : ps.column_types.clone()) : null; } @@ -185,9 +187,9 @@ ps.column_names, strToColumnTypes(ps.column_types), */ public ParseSetup(ParserInfo parseType, byte sep, boolean singleQuotes, int checkHeader, int ncols, String[] columnNames, byte[] ctypes, - String[][] domains, String[][] naStrings, String[][] data, byte[] nonDataLineMarkers, byte escapeChar) { + String[][] domains, String[][] naStrings, String[][] data, byte[] nonDataLineMarkers, byte escapeChar, boolean tzAdjustToLocal) { this(parseType, sep, singleQuotes, checkHeader, ncols, columnNames, ctypes, - domains, naStrings, data, new ParseWriter.ParseErr[0], FileVec.DFLT_CHUNK_SIZE, nonDataLineMarkers, escapeChar); + domains, naStrings, data, new ParseWriter.ParseErr[0], FileVec.DFLT_CHUNK_SIZE, nonDataLineMarkers, escapeChar, tzAdjustToLocal); } /** @@ -198,30 +200,30 @@ public ParseSetup(ParserInfo parseType, byte sep, boolean singleQuotes, int chec */ public ParseSetup(ParserInfo parseType, byte sep, boolean singleQuotes, int checkHeader, int ncols, String[] columnNames, byte[] ctypes, - String[][] domains, String[][] naStrings, String[][] data, byte escapeChar) { + String[][] domains, String[][] naStrings, String[][] data, byte escapeChar, boolean tzAdjustToLocal) { this(parseType, sep, singleQuotes, checkHeader, ncols, columnNames, ctypes, - domains, naStrings, data, new ParseWriter.ParseErr[0], FileVec.DFLT_CHUNK_SIZE, null, escapeChar); + domains, naStrings, data, new ParseWriter.ParseErr[0], FileVec.DFLT_CHUNK_SIZE, null, escapeChar, tzAdjustToLocal); } public ParseSetup(ParserInfo parseType, byte sep, boolean singleQuotes, int checkHeader, int ncols, String[] columnNames, byte[] ctypes, - String[][] domains, String[][] naStrings, String[][] data) { + String[][] domains, String[][] naStrings, String[][] data, boolean tzAdjustToLocal) { this(parseType, sep, singleQuotes, checkHeader, ncols, columnNames, ctypes, - domains, naStrings, data, new ParseWriter.ParseErr[0], FileVec.DFLT_CHUNK_SIZE, null, ParseSetup.DEFAULT_ESCAPE_CHAR); + domains, naStrings, data, new ParseWriter.ParseErr[0], FileVec.DFLT_CHUNK_SIZE, null, ParseSetup.DEFAULT_ESCAPE_CHAR, tzAdjustToLocal); } public ParseSetup(ParserInfo parseType, byte sep, boolean singleQuotes, int checkHeader, int ncols, String[] columnNames, byte[] ctypes, - String[][] domains, String[][] naStrings, String[][] data, ParseWriter.ParseErr[] errs, byte[] nonDataLineMarkers) { + String[][] domains, String[][] naStrings, String[][] data, ParseWriter.ParseErr[] errs, byte[] nonDataLineMarkers, boolean tzAdjustToLocal) { this(parseType, sep, singleQuotes, checkHeader, ncols, columnNames, ctypes, - domains, naStrings, data, errs, FileVec.DFLT_CHUNK_SIZE, nonDataLineMarkers, ParseSetup.DEFAULT_ESCAPE_CHAR); + domains, naStrings, data, errs, FileVec.DFLT_CHUNK_SIZE, nonDataLineMarkers, ParseSetup.DEFAULT_ESCAPE_CHAR, tzAdjustToLocal); } public ParseSetup(ParserInfo parseType, byte sep, boolean singleQuotes, int checkHeader, int ncols, String[] columnNames, byte[] ctypes, String[][] domains, String[][] naStrings, String[][] data, ParseWriter.ParseErr[] errs) { this(parseType, sep, singleQuotes, checkHeader, ncols, columnNames, ctypes, - domains, naStrings, data, errs, FileVec.DFLT_CHUNK_SIZE, null, ParseSetup.DEFAULT_ESCAPE_CHAR); + domains, naStrings, data, errs, FileVec.DFLT_CHUNK_SIZE, null, ParseSetup.DEFAULT_ESCAPE_CHAR, false); } /** @@ -230,7 +232,7 @@ public ParseSetup(ParserInfo parseType, byte sep, boolean singleQuotes, int chec * Typically used by file type parsers for 
returning final invalid results */ public ParseSetup(ParserInfo parseType, byte sep, boolean singleQuotes, int checkHeader, int ncols, String[][] data, ParseWriter.ParseErr[] errs) { - this(parseType, sep, singleQuotes, checkHeader, ncols, null, null, null, null, data, errs, FileVec.DFLT_CHUNK_SIZE, null, ParseSetup.DEFAULT_ESCAPE_CHAR); + this(parseType, sep, singleQuotes, checkHeader, ncols, null, null, null, null, data, errs, FileVec.DFLT_CHUNK_SIZE, null, ParseSetup.DEFAULT_ESCAPE_CHAR, false); } /** @@ -258,6 +260,10 @@ public String[] getOrigColumnTypes() { public boolean getForceColTypes() { return _force_col_types; } + + public boolean gettzAdjustToLocal() { + return _tz_adjust_to_local; + } public byte[] getColumnTypes() { return _column_types; } @@ -558,6 +564,7 @@ public GuessSetupTsk(ParseSetup userSetup) { } if (_gblSetup==null) throw new RuntimeException("This H2O node couldn't find the file(s) to parse. Please check files and/or working directories."); + _gblSetup.settzAdjustToLocal(_userSetup.gettzAdjustToLocal()); _gblSetup.setFileName(FileUtils.keyToFileName(key)); } @@ -587,6 +594,7 @@ public void reduce(GuessSetupTsk other) { else _gblSetup._na_strings = _userSetup._na_strings; } + _gblSetup._tz_adjust_to_local = _gblSetup._tz_adjust_to_local || _userSetup._tz_adjust_to_local; // if(_gblSetup._errs != null) for(ParseWriter.ParseErr err:_gblSetup._errs) Log.warn("ParseSetup: " + err.toString()); @@ -600,6 +608,7 @@ private ParseSetup mergeSetups(ParseSetup setupA, ParseSetup setupB, String file } ParseSetup mergedSetup = setupA; + mergedSetup._tz_adjust_to_local = setupA._tz_adjust_to_local || setupB._tz_adjust_to_local; mergedSetup._check_header = unifyCheckHeader(setupA._check_header, setupB._check_header); mergedSetup._separator = unifyColumnSeparators(setupA._separator, setupB._separator); @@ -707,7 +716,7 @@ public static ParseSetup guessSetup(ByteVec bv, byte [] bits, ParseSetup userSet */ private ParseSetup toInitialSetup() { return new ParseSetup(_parse_type, _separator, _single_quotes, _check_header, GUESS_COL_CNT, _column_names, - _column_types, null, null, null, _nonDataLineMarkers, _escapechar); + _column_types, null, null, null, _nonDataLineMarkers, _escapechar, false); } /** @@ -878,6 +887,11 @@ public ParseSetup setForceColTypes(boolean force_col_types) { return this; } + public ParseSetup settzAdjustToLocal(boolean tz_adjust_to_local) { + this._tz_adjust_to_local = tz_adjust_to_local; + return this; + } + public ParseSetup setDomains(String[][] domains) { this._domains = domains; return this; diff --git a/h2o-core/src/main/java/water/parser/ParserProvider.java b/h2o-core/src/main/java/water/parser/ParserProvider.java index 4e2b462285da..376478d74bcb 100644 --- a/h2o-core/src/main/java/water/parser/ParserProvider.java +++ b/h2o-core/src/main/java/water/parser/ParserProvider.java @@ -37,7 +37,7 @@ public final ParseSetup guessSetup(ByteVec v, byte[] bits, ParseSetup userSetup) */ protected ParseSetup guessSetup_impl(ByteVec v, byte[] bits, ParseSetup userSetup) { ParseSetup ps = guessInitSetup(v, bits, userSetup); - return guessFinalSetup(v, bits, ps); + return guessFinalSetup(v, bits, ps).settzAdjustToLocal(userSetup._tz_adjust_to_local); } /** diff --git a/h2o-core/src/main/java/water/parser/SVMLightParser.java b/h2o-core/src/main/java/water/parser/SVMLightParser.java index ed3e55f5cf94..f627b6732861 100644 --- a/h2o-core/src/main/java/water/parser/SVMLightParser.java +++ b/h2o-core/src/main/java/water/parser/SVMLightParser.java @@ -30,7 +30,7 @@ public 
static ParseSetup guessSetup(byte [] bits) { if(lastNewline > 0) bits = Arrays.copyOf(bits,lastNewline+1); SVMLightParser p = new SVMLightParser(new ParseSetup(SVMLight_INFO, ParseSetup.GUESS_SEP, false,ParseSetup.GUESS_HEADER,ParseSetup.GUESS_COL_CNT, - null,null,null,null,null), null); + null,null,null,null,null, false), null); SVMLightInspectParseWriter dout = new SVMLightInspectParseWriter(); p.parseChunk(0,new ByteAryData(bits,0), dout); if (dout._ncols > 0 && dout._nlines > 0 && dout._nlines > dout._invalidLines) diff --git a/h2o-core/src/main/java/water/parser/XlsParser.java b/h2o-core/src/main/java/water/parser/XlsParser.java index dbaf47f151b9..d776da61a2b9 100644 --- a/h2o-core/src/main/java/water/parser/XlsParser.java +++ b/h2o-core/src/main/java/water/parser/XlsParser.java @@ -80,7 +80,7 @@ private void readAtLeast(int lim) throws IOException{ /** Try to parse the bytes as XLS format */ public static ParseSetup guessSetup( byte[] bytes ) { XlsParser p = new XlsParser(new ParseSetup(XLS_INFO, ParseSetup.GUESS_SEP, false, - ParseSetup.GUESS_HEADER, ParseSetup.GUESS_COL_CNT, null, null, null, null, null), null); + ParseSetup.GUESS_HEADER, ParseSetup.GUESS_COL_CNT, null, null, null, null, null, false), null); p._buf = bytes; // No need to copy already-unpacked data; just use it directly p._lim = bytes.length; PreviewParseWriter dout = new PreviewParseWriter(); @@ -88,7 +88,7 @@ public static ParseSetup guessSetup( byte[] bytes ) { if (dout._ncols > 0 && dout._nlines > 0 && dout._nlines > dout._invalidLines) return new ParseSetup(XLS_INFO, ParseSetup.GUESS_SEP, false, dout.colNames()==null?ParseSetup.NO_HEADER:ParseSetup.HAS_HEADER,dout._ncols, - dout.colNames(), dout.guessTypes(),null,null,dout._data); + dout.colNames(), dout.guessTypes(),null,null,dout._data, false); else throw new ParseDataset.H2OParseException("Could not parse file as an XLS file."); } diff --git a/h2o-core/src/test/java/water/parser/ParseCompressedAndXLSTest.java b/h2o-core/src/test/java/water/parser/ParseCompressedAndXLSTest.java index 8793e0011fcc..5145474e228b 100644 --- a/h2o-core/src/test/java/water/parser/ParseCompressedAndXLSTest.java +++ b/h2o-core/src/test/java/water/parser/ParseCompressedAndXLSTest.java @@ -59,7 +59,7 @@ public class ParseCompressedAndXLSTest extends TestUtil { 12, // ncols new String[]{"fYear", "fMonth", "fDayofMonth", "fDayOfWeek", "DepTime", "ArrTime", "UniqueCarrier", "Origin", "Dest", "Distance", "IsDepDelayed", "IsDepDelayed_REC"}, ctypes, - null, null, null, null, null); + null, null, null, null, null, false); try { k1 = ParseDataset.parse(Key.make(), new Key[]{nfs._key}, true, setup, true)._job.get(); assertTrue("Should have thrown ParseException since file isn't XLS file",false); // fail - should've thrown diff --git a/h2o-core/src/test/java/water/parser/ParserTest2.java b/h2o-core/src/test/java/water/parser/ParserTest2.java index 0055058590b5..30bb6235302c 100644 --- a/h2o-core/src/test/java/water/parser/ParserTest2.java +++ b/h2o-core/src/test/java/water/parser/ParserTest2.java @@ -58,7 +58,7 @@ private static void testParsed(Frame fr, String[][] expected) { Key rkey = FVecFactory.makeByteVec(data); ParseSetup ps = new ParseSetup(CSV_INFO, (byte)',', false, ParseSetup.HAS_HEADER, 9, new String[]{"'C1Chunk'","C1SChunk", "'C2Chunk'", "'C2SChunk'", "'C4Chunk'", "'C4FChunk'", "'C8Chunk'", "'C8DChunk'", "'Categorical'"}, - ParseSetup.strToColumnTypes(new String[]{"Numeric", "Numeric", "Numeric", "Numeric", "Numeric", "Numeric", "Numeric", "Numeric", "Enum"}), null, null, null, 
null, null); + ParseSetup.strToColumnTypes(new String[]{"Numeric", "Numeric", "Numeric", "Numeric", "Numeric", "Numeric", "Numeric", "Numeric", "Enum"}), null, null, null, null, null, false); Frame fr = ParseDataset.parse(Key.make("na_test.hex"), new Key[]{rkey}, true, ps); int nlines = (int)fr.numRows(); Assert.assertEquals(9,nlines); @@ -101,7 +101,7 @@ private static void testParsed(Frame fr, String[][] expected) { ParseSetup ps = new ParseSetup(CSV_INFO, (byte)',', false, ParseSetup.HAS_HEADER, 4, new String[]{"'C1'","'C2'", "'C3'", "'C4'"}, ParseSetup.strToColumnTypes(new String[]{"Numeric", "Numeric", "Numeric", "Numeric"}), - null, null, null, null, null); + null, null, null, null, null, false); Frame fr = null; try { fr = ParseDataset.parse(Key.make("blank_lines_test.hex"), new Key[]{dataKey}, true, ps); @@ -124,7 +124,7 @@ private static void testParsed(Frame fr, String[][] expected) { ar("last","'line''s","trailing","piece'") }; Key k = FVecFactory.makeByteVec(data); - ParseSetup gSetupF = ParseSetup.guessSetup(null, StringUtils.bytesOf(data[0]), new ParseSetup(CSV_INFO, (byte)',', false/*single quote*/, 4, ParseSetup.NO_HEADER, null, null, null, null, null)); + ParseSetup gSetupF = ParseSetup.guessSetup(null, StringUtils.bytesOf(data[0]), new ParseSetup(CSV_INFO, (byte)',', false/*single quote*/, 4, ParseSetup.NO_HEADER, null, null, null, null, null, false)); gSetupF._column_types = ParseSetup.strToColumnTypes(new String[]{"Enum", "Enum", "Enum", "Enum"}); Frame frF = ParseDataset.parse(Key.make(), new Key[]{k}, false, gSetupF); testParsed(frF,expectFalse); @@ -132,7 +132,7 @@ private static void testParsed(Frame fr, String[][] expected) { String[][] expectTrue = new String[][] { ar("Tomass,test,first,line", null), ar("Tomas''stest2","test2"), ar("last", "lines trailing piece") }; - ParseSetup gSetupT = ParseSetup.guessSetup(null, StringUtils.bytesOf(data[0]), new ParseSetup(CSV_INFO, (byte)',', true/*single quote*/, 2, ParseSetup.NO_HEADER, null, null, null, null, null)); + ParseSetup gSetupT = ParseSetup.guessSetup(null, StringUtils.bytesOf(data[0]), new ParseSetup(CSV_INFO, (byte)',', true/*single quote*/, 2, ParseSetup.NO_HEADER, null, null, null, null, null, false)); gSetupT._column_types = ParseSetup.strToColumnTypes(new String[]{"Enum", "Enum", "Enum", "Enum"}); Frame frT = ParseDataset.parse(Key.make(), new Key[]{k}, true, gSetupT); //testParsed(frT,expectTrue); // not currently passing diff --git a/h2o-parsers/h2o-avro-parser/src/main/java/water/parser/avro/AvroParser.java b/h2o-parsers/h2o-avro-parser/src/main/java/water/parser/avro/AvroParser.java index 3542455f2692..f29817421691 100644 --- a/h2o-parsers/h2o-avro-parser/src/main/java/water/parser/avro/AvroParser.java +++ b/h2o-parsers/h2o-avro-parser/src/main/java/water/parser/avro/AvroParser.java @@ -264,7 +264,7 @@ public AvroParseSetup(int ncols, String[][] data, byte[] header, long blockSize) { - super(AvroParserProvider.AVRO_INFO, (byte) '|', true, HAS_HEADER , ncols, columnNames, ctypes, domains, naStrings, data); + super(AvroParserProvider.AVRO_INFO, (byte) '|', true, HAS_HEADER , ncols, columnNames, ctypes, domains, naStrings, data, false); this.header = header; this.blockSize = blockSize; this.setChunkSize((int) blockSize); diff --git a/h2o-parsers/h2o-parquet-parser/src/main/java/water/parser/parquet/ChunkConverter.java b/h2o-parsers/h2o-parquet-parser/src/main/java/water/parser/parquet/ChunkConverter.java index 9fe6dc03e4b7..5dc09579fe65 100644 --- 
a/h2o-parsers/h2o-parquet-parser/src/main/java/water/parser/parquet/ChunkConverter.java +++ b/h2o-parsers/h2o-parquet-parser/src/main/java/water/parser/parquet/ChunkConverter.java @@ -6,13 +6,20 @@ import org.apache.parquet.io.api.GroupConverter; import org.apache.parquet.io.api.PrimitiveConverter; import org.apache.parquet.schema.*; +import org.joda.time.DateTime; +import org.joda.time.DateTimeUtils; +import org.joda.time.DateTimeZone; import water.fvec.Vec; +import water.logging.Logger; import water.parser.BufferedString; +import water.parser.ParseTime; import water.parser.parquet.ext.DecimalUtils; import water.util.StringUtils; import java.time.Instant; +import static water.parser.parquet.TypeUtils.getTimestampAdjustmentFromUtcToLocalInMillis; + /** * Implementation of Parquet's GroupConverter for H2O's chunks. * @@ -29,9 +36,11 @@ class ChunkConverter extends GroupConverter { private final Converter[] _converters; private long _currentRecordIdx = -1; + private boolean _adjustTimezone; - ChunkConverter(MessageType parquetSchema, byte[] chunkSchema, WriterDelegate writer, boolean[] keepColumns) { + ChunkConverter(MessageType parquetSchema, byte[] chunkSchema, WriterDelegate writer, boolean[] keepColumns, boolean adjustTimezone) { _writer = writer; + _adjustTimezone = adjustTimezone; int colIdx = 0; // index to columns actually parsed _converters = new Converter[chunkSchema.length]; @@ -134,7 +143,12 @@ private PrimitiveConverter newConverter(int colIdx, byte vecType, PrimitiveType case Vec.T_UUID: case Vec.T_TIME: if (OriginalType.TIMESTAMP_MILLIS.equals(parquetType.getOriginalType()) || parquetType.getPrimitiveTypeName().equals(PrimitiveType.PrimitiveTypeName.INT96)) { - return new TimestampConverter(colIdx, _writer); + if (_adjustTimezone) { + long timestampAdjustmentMillis = getTimestampAdjustmentFromUtcToLocalInMillis(); + return new TimestampConverter(colIdx, _writer, timestampAdjustmentMillis); + } else { + return new TimestampConverter(colIdx, _writer, 0L); + } } else if (OriginalType.DATE.equals(parquetType.getOriginalType()) || parquetType.getPrimitiveTypeName().equals(PrimitiveType.PrimitiveTypeName.INT32)){ return new DateConverter(colIdx, _writer); } else { @@ -303,26 +317,32 @@ public void addBinary(Binary value) { } private static class TimestampConverter extends PrimitiveConverter { - private final int _colIdx; private final WriterDelegate _writer; + private final long timestampAdjustmentMillis; - TimestampConverter(int _colIdx, WriterDelegate _writer) { - this._colIdx = _colIdx; - this._writer = _writer; + TimestampConverter(int colIdx, WriterDelegate writer, long timestampAdjustmentMillis) { + this._colIdx = colIdx; + this._writer = writer; + this.timestampAdjustmentMillis = timestampAdjustmentMillis; } @Override public void addLong(long value) { - _writer.addNumCol(_colIdx, value, 0); + _writer.addNumCol(_colIdx, adjustTimeStamp(value), 0); } @Override public void addBinary(Binary value) { final long timestampMillis = ParquetInt96TimestampConverter.getTimestampMillis(value); - _writer.addNumCol(_colIdx, timestampMillis); + _writer.addNumCol(_colIdx, adjustTimeStamp(timestampMillis)); + } + + private long adjustTimeStamp(long ts) { + return ts + timestampAdjustmentMillis; } + } private static class DateConverter extends PrimitiveConverter { diff --git a/h2o-parsers/h2o-parquet-parser/src/main/java/water/parser/parquet/ChunkReadSupport.java b/h2o-parsers/h2o-parquet-parser/src/main/java/water/parser/parquet/ChunkReadSupport.java index 1d3ecc5c2f62..77a97d0a113b 100644 --- 
a/h2o-parsers/h2o-parquet-parser/src/main/java/water/parser/parquet/ChunkReadSupport.java +++ b/h2o-parsers/h2o-parquet-parser/src/main/java/water/parser/parquet/ChunkReadSupport.java @@ -13,11 +13,13 @@ public class ChunkReadSupport extends ReadSupport { private WriterDelegate _writer; private byte[] _chunkSchema; private boolean[] _keepColumns; + private boolean _adjustTimezone; - public ChunkReadSupport(WriterDelegate writer, byte[] chunkSchema, boolean[] keepcolumns) { + public ChunkReadSupport(WriterDelegate writer, byte[] chunkSchema, boolean[] keepcolumns, boolean adjustTimezone) { _writer = writer; _chunkSchema = chunkSchema; _keepColumns = keepcolumns; + _adjustTimezone = adjustTimezone; } @Override @@ -28,7 +30,7 @@ public ReadContext init(InitContext context) { @Override public RecordMaterializer prepareForRead(Configuration configuration, Map keyValueMetaData, MessageType fileSchema, ReadContext readContext) { - return new ChunkRecordMaterializer(fileSchema, _chunkSchema, _writer, _keepColumns); + return new ChunkRecordMaterializer(fileSchema, _chunkSchema, _writer, _keepColumns, _adjustTimezone); } } diff --git a/h2o-parsers/h2o-parquet-parser/src/main/java/water/parser/parquet/ChunkRecordMaterializer.java b/h2o-parsers/h2o-parquet-parser/src/main/java/water/parser/parquet/ChunkRecordMaterializer.java index d9a2266b86fa..3f4e760c11e9 100644 --- a/h2o-parsers/h2o-parquet-parser/src/main/java/water/parser/parquet/ChunkRecordMaterializer.java +++ b/h2o-parsers/h2o-parquet-parser/src/main/java/water/parser/parquet/ChunkRecordMaterializer.java @@ -15,8 +15,8 @@ class ChunkRecordMaterializer extends RecordMaterializer { private ChunkConverter _converter; - ChunkRecordMaterializer(MessageType parquetSchema, byte[] chunkSchema, WriterDelegate writer, boolean[] keepColumns) { - _converter = new ChunkConverter(parquetSchema, chunkSchema, writer, keepColumns); + ChunkRecordMaterializer(MessageType parquetSchema, byte[] chunkSchema, WriterDelegate writer, boolean[] keepColumns, boolean adjustTimezone) { + _converter = new ChunkConverter(parquetSchema, chunkSchema, writer, keepColumns, adjustTimezone); } @Override diff --git a/h2o-parsers/h2o-parquet-parser/src/main/java/water/parser/parquet/FrameParquetExporter.java b/h2o-parsers/h2o-parquet-parser/src/main/java/water/parser/parquet/FrameParquetExporter.java index e1fc1f8a85ed..845997b5e8b2 100644 --- a/h2o-parsers/h2o-parquet-parser/src/main/java/water/parser/parquet/FrameParquetExporter.java +++ b/h2o-parsers/h2o-parquet-parser/src/main/java/water/parser/parquet/FrameParquetExporter.java @@ -19,6 +19,7 @@ import static org.apache.parquet.hadoop.metadata.CompressionCodecName.UNCOMPRESSED; import static org.apache.parquet.schema.MessageTypeParser.parseMessageType; import static water.fvec.Vec.*; +import static water.parser.parquet.TypeUtils.getTimestampAdjustmentFromUtcToLocalInMillis; import org.apache.parquet.hadoop.ParquetWriter; import org.apache.hadoop.fs.Path; @@ -30,7 +31,7 @@ public class FrameParquetExporter { - public void export(H2O.H2OCountedCompleter completer, String path, Frame frame, boolean force, String compression, boolean writeChecksum) { + public void export(H2O.H2OCountedCompleter completer, String path, Frame frame, boolean force, String compression, boolean writeChecksum, boolean tzAdjustFromLocal) { File f = new File(path); new FrameParquetExporter.PartExportParquetTask( completer, @@ -41,7 +42,8 @@ public void export(H2O.H2OCountedCompleter completer, String path, Frame fram frame.domains(), force, compression, - 
writeChecksum + writeChecksum, + tzAdjustFromLocal ).dfork(frame); } @@ -54,10 +56,11 @@ private static class PartExportParquetTask extends MRTask final String[][] _domains; final boolean _force; final boolean _writeChecksum; + final boolean _tzAdjustFromLocal; PartExportParquetTask(H2O.H2OCountedCompleter completer, String path, String messageTypeString, String[] colNames, byte[] colTypes, String[][] domains, - boolean force, String compression, boolean writeChecksum) { + boolean force, String compression, boolean writeChecksum, boolean tzAdjustFromLocal) { super(completer); _path = path; _compressionCodecName = getCompressionCodecName(compression); @@ -67,6 +70,7 @@ private static class PartExportParquetTask extends MRTask _domains = domains; _force = force; _writeChecksum = writeChecksum; + _tzAdjustFromLocal = tzAdjustFromLocal; } CompressionCodecName getCompressionCodecName(String compression) { @@ -100,7 +104,7 @@ public void map(Chunk[] cs) { try (ParquetWriter writer = buildWriter(new Path(partPath), _compressionCodecName, PersistHdfs.CONF, parseMessageType(_messageTypeString), getMode(_force), _writeChecksum)) { String currColName; byte currColType; - + long timeStampAdjustment = _tzAdjustFromLocal ? getTimestampAdjustmentFromUtcToLocalInMillis() : 0L; for (int i = 0; i < anyChunk._len; i++) { Group group = fact.newGroup(); for (int j = 0; j < cs.length; j++) { @@ -109,7 +113,9 @@ public void map(Chunk[] cs) { switch (currColType) { case (T_UUID): case (T_TIME): - group = group.append(currColName, cs[j].at8(i)); + long timestamp = cs[j].at8(i); + long adjustedTimestamp = timestamp - timeStampAdjustment; + group = group.append(currColName, adjustedTimestamp); break; case (T_STR): if (!cs[j].isNA(i)) { diff --git a/h2o-parsers/h2o-parquet-parser/src/main/java/water/parser/parquet/ParquetExporter.java b/h2o-parsers/h2o-parquet-parser/src/main/java/water/parser/parquet/ParquetExporter.java index 76559c20eb15..c42c483a3015 100644 --- a/h2o-parsers/h2o-parquet-parser/src/main/java/water/parser/parquet/ParquetExporter.java +++ b/h2o-parsers/h2o-parquet-parser/src/main/java/water/parser/parquet/ParquetExporter.java @@ -8,8 +8,8 @@ public class ParquetExporter implements BinaryFormatExporter { @Override - public H2O.H2OCountedCompleter export(Frame frame, String path, boolean force, String compression, boolean writeChecksum) { - return new ExportParquetDriver(frame, path, force, compression, writeChecksum); + public H2O.H2OCountedCompleter export(Frame frame, String path, boolean force, String compression, boolean writeChecksum, boolean tzAdjustFromLocal) { + return new ExportParquetDriver(frame, path, force, compression, writeChecksum, tzAdjustFromLocal); } @Override @@ -25,19 +25,22 @@ private class ExportParquetDriver extends H2O.H2OCountedCompleter jobKey) { @Override public ParseSetup guessInitSetup(ByteVec v, byte[] bits, ParseSetup userSetup) { - return ParquetParser.guessFormatSetup(v, bits); + return ParquetParser.guessFormatSetup(v, bits, userSetup.gettzAdjustToLocal()); } @Override diff --git a/h2o-parsers/h2o-parquet-parser/src/main/java/water/parser/parquet/TypeUtils.java b/h2o-parsers/h2o-parquet-parser/src/main/java/water/parser/parquet/TypeUtils.java index 27b6234eea9d..295ee0c18c24 100644 --- a/h2o-parsers/h2o-parquet-parser/src/main/java/water/parser/parquet/TypeUtils.java +++ b/h2o-parsers/h2o-parquet-parser/src/main/java/water/parser/parquet/TypeUtils.java @@ -14,6 +14,8 @@ package water.parser.parquet; +import org.joda.time.DateTimeZone; + /** * Several helper methods 
inspired by Guava library - https://github.com/google/guava/. We want to avoid bringing guava dependency when possible. * @@ -44,4 +46,9 @@ public static long longFromBytes( public static int intFromBytes(byte b1, byte b2, byte b3, byte b4) { return b1 << 24 | (b2 & 0xFF) << 16 | (b3 & 0xFF) << 8 | (b4 & 0xFF); } + + public static int getTimestampAdjustmentFromUtcToLocalInMillis() { + DateTimeZone clusterLocalTimezone = DateTimeZone.getDefault(); + return clusterLocalTimezone.getOffset(null); + } } diff --git a/h2o-parsers/h2o-parquet-parser/src/main/java/water/parser/parquet/VecParquetReader.java b/h2o-parsers/h2o-parquet-parser/src/main/java/water/parser/parquet/VecParquetReader.java index 2c989cb3a931..c767e2fdf6d9 100644 --- a/h2o-parsers/h2o-parquet-parser/src/main/java/water/parser/parquet/VecParquetReader.java +++ b/h2o-parsers/h2o-parquet-parser/src/main/java/water/parser/parquet/VecParquetReader.java @@ -56,18 +56,20 @@ public class VecParquetReader implements Closeable { private final byte[] chunkSchema; // contains column types of all columns, not just the skipped one private ParquetReader reader; - private boolean[] _keepColumns; + private boolean[] keepColumns; + private boolean adjustTimezone; - public VecParquetReader(Vec vec, ParquetMetadata metadata, ParseWriter writer, byte[] chunkSchema, boolean[] keepcolumns, int parseColumnNumber) { - this(vec, metadata, new WriterDelegate(writer, parseColumnNumber), chunkSchema, keepcolumns); + public VecParquetReader(Vec vec, ParquetMetadata metadata, ParseWriter writer, byte[] chunkSchema, boolean[] keepcolumns, int parseColumnNumber, boolean adjustTimezone) { + this(vec, metadata, new WriterDelegate(writer, parseColumnNumber), chunkSchema, keepcolumns, adjustTimezone); } - VecParquetReader(Vec vec, ParquetMetadata metadata, WriterDelegate writer, byte[] chunkSchema, boolean[] keepcolumns) { + VecParquetReader(Vec vec, ParquetMetadata metadata, WriterDelegate writer, byte[] chunkSchema, boolean[] keepcolumns, boolean adjustTimezone) { this.vec = vec; this.metadata = metadata; this.writer = writer; this.chunkSchema = chunkSchema; - _keepColumns = keepcolumns; + this.keepColumns = keepcolumns; + this.adjustTimezone = adjustTimezone; } /** @@ -85,7 +87,7 @@ public Long read() throws IOException { private void initReader() throws IOException { assert reader == null; final VecReaderEnv env = VecReaderEnv.make(vec); - ChunkReadSupport crSupport = new ChunkReadSupport(writer, chunkSchema, _keepColumns); + ChunkReadSupport crSupport = new ChunkReadSupport(writer, chunkSchema, keepColumns, adjustTimezone); ParquetReader.Builder prBuilder = ParquetReader.builder(crSupport, env.getPath()) .withConf(env.getConf()) .withFilter(new FilterCompat.Filter() { diff --git a/h2o-py/h2o/frame.py b/h2o-py/h2o/frame.py index 432e197016cb..802cdac82575 100644 --- a/h2o-py/h2o/frame.py +++ b/h2o-py/h2o/frame.py @@ -452,12 +452,13 @@ def type(self, col): def _import_parse(self, path, pattern, destination_frame, header, separator, column_names, column_types, na_strings, skipped_columns=None, force_col_types=False, custom_non_data_line_markers=None, partition_by=None, - quotechar=None, escapechar=None): + quotechar=None, escapechar=None, tz_adjust_to_local=False): if H2OFrame.__LOCAL_EXPANSION_ON_SINGLE_IMPORT__ and is_type(path, str) and "://" not in path: # fixme: delete those 2 lines, cf. 
https://github.com/h2oai/h2o-3/issues/12573 path = os.path.abspath(path) rawkey = h2o.lazy_import(path, pattern) self._parse(rawkey, destination_frame, header, separator, column_names, column_types, na_strings, - skipped_columns, force_col_types, custom_non_data_line_markers, partition_by, quotechar, escapechar) + skipped_columns, force_col_types, custom_non_data_line_markers, partition_by, quotechar, + escapechar, tz_adjust_to_local) return self def _upload_parse(self, path, destination_frame, header, sep, column_names, column_types, na_strings, @@ -470,9 +471,9 @@ def _upload_parse(self, path, destination_frame, header, sep, column_names, colu def _parse(self, rawkey, destination_frame="", header=None, separator=None, column_names=None, column_types=None, na_strings=None, skipped_columns=None, force_col_types=False, custom_non_data_line_markers=None, partition_by=None, quotechar=None, - escapechar=None): + escapechar=None, tz_adjust_to_local=False): setup = h2o.parse_setup(rawkey, destination_frame, header, separator, column_names, column_types, na_strings, - skipped_columns, force_col_types, custom_non_data_line_markers, partition_by, quotechar, escapechar) + skipped_columns, force_col_types, custom_non_data_line_markers, partition_by, quotechar, escapechar, tz_adjust_to_local) return self._parse_raw(setup) def _parse_raw(self, setup): @@ -491,7 +492,8 @@ def _parse_raw(self, setup): "custom_non_data_line_markers": None, "partition_by": None, "single_quotes": None, - "escapechar": None + "escapechar": None, + "tz_adjust_to_local": False } if setup["column_names"]: p["column_names"] = None diff --git a/h2o-py/h2o/h2o.py b/h2o-py/h2o/h2o.py index db67d4fb93be..60408518d50a 100644 --- a/h2o-py/h2o/h2o.py +++ b/h2o-py/h2o/h2o.py @@ -414,7 +414,7 @@ def upload_file(path, destination_frame=None, header=0, sep=None, col_names=None def import_file(path=None, destination_frame=None, parse=True, header=0, sep=None, col_names=None, col_types=None, na_strings=None, pattern=None, skipped_columns=None, force_col_types=False, custom_non_data_line_markers=None, - partition_by=None, quotechar=None, escapechar=None): + partition_by=None, quotechar=None, escapechar=None, tz_adjust_to_local=False): """ Import files into an H2O cluster. The default behavior is to pass-through to the parse phase automatically. @@ -490,6 +490,7 @@ def import_file(path=None, destination_frame=None, parse=True, header=0, sep=Non assert_is_type(partition_by, None, [str], str) assert_is_type(quotechar, None, U("'", '"')) assert_is_type(escapechar, None, I(str, lambda s: len(s) == 1)) + assert_is_type(tz_adjust_to_local, bool) assert isinstance(skipped_columns, (type(None), list)), "The skipped_columns should be an list of column names!" 
check_frame_id(destination_frame) patharr = path if isinstance(path, list) else [path] @@ -500,7 +501,7 @@ def import_file(path=None, destination_frame=None, parse=True, header=0, sep=Non return lazy_import(path, pattern) else: return H2OFrame()._import_parse(path, pattern, destination_frame, header, sep, col_names, col_types, na_strings, - skipped_columns, force_col_types, custom_non_data_line_markers, partition_by, quotechar, escapechar) + skipped_columns, force_col_types, custom_non_data_line_markers, partition_by, quotechar, escapechar, tz_adjust_to_local) def load_grid(grid_file_path, load_params_references=False): @@ -743,7 +744,8 @@ def import_sql_select(connection_url, select_query, username, password, optimize def parse_setup(raw_frames, destination_frame=None, header=0, separator=None, column_names=None, column_types=None, na_strings=None, skipped_columns=None, force_col_types=False, - custom_non_data_line_markers=None, partition_by=None, quotechar=None, escapechar=None): + custom_non_data_line_markers=None, partition_by=None, quotechar=None, escapechar=None, + tz_adjust_to_local=False): """ Retrieve H2O's best guess as to what the structure of the data file is. @@ -795,6 +797,7 @@ def parse_setup(raw_frames, destination_frame=None, header=0, separator=None, co :param partition_by: A list of columns the dataset has been partitioned by. None by default. :param quotechar: A hint for the parser which character to expect as quoting character. Only single quote, double quote or None (default) are allowed. None means automatic detection. :param escapechar: (Optional) One ASCII character used to escape other characters. + :param tz_adjust_to_local: (Optional) Adjust the imported data timezone from GMT to cluster timezone. :returns: a dictionary containing parse parameters guessed by the H2O backend. @@ -829,6 +832,7 @@ def parse_setup(raw_frames, destination_frame=None, header=0, separator=None, co assert_is_type(partition_by, None, [str], str) assert_is_type(quotechar, None, U("'", '"')) assert_is_type(escapechar, None, I(str, lambda s: len(s) == 1)) + assert_is_type(tz_adjust_to_local, bool) check_frame_id(destination_frame) # The H2O backend only accepts things that are quoted @@ -837,7 +841,8 @@ def parse_setup(raw_frames, destination_frame=None, header=0, separator=None, co # temporary dictionary just to pass the following information to the parser: header, separator kwargs = {"check_header": header, "source_frames": [quoted(frame_id) for frame_id in raw_frames], - "single_quotes": quotechar == "'"} + "single_quotes": quotechar == "'", + "tz_adjust_to_local": tz_adjust_to_local} if separator: kwargs["separator"] = ord(separator) @@ -1604,7 +1609,7 @@ def load_model(path): def export_file(frame, path, force=False, sep=",", compression=None, parts=1, header=True, quote_header=True, - parallel=False, format="csv", write_checksum=True): + parallel=False, format="csv", write_checksum=True, tz_adjust_from_local=False): """ Export a given H2OFrame to a path on the machine this python session is currently connected to. 
@@ -1659,7 +1664,7 @@ def export_file(frame, path, force=False, sep=",", compression=None, parts=1, he data={"path": path, "num_parts": parts, "force": force, "compression": compression, "separator": ord(sep), "header": header, "quote_header": quote_header, "parallel": parallel, - "format": format, "write_checksum": write_checksum} + "format": format, "write_checksum": write_checksum, "tz_adjust_from_local": tz_adjust_from_local} ), "Export File").poll() diff --git a/h2o-py/tests/testdir_parser/pyunit_parquet_adjust_timezone.py b/h2o-py/tests/testdir_parser/pyunit_parquet_adjust_timezone.py new file mode 100644 index 000000000000..e02e93937b51 --- /dev/null +++ b/h2o-py/tests/testdir_parser/pyunit_parquet_adjust_timezone.py @@ -0,0 +1,40 @@ +import sys + +sys.path.insert(1, "../../../") +from tests import pyunit_utils +from h2o.frame import H2OFrame +from datetime import datetime, timezone, timedelta +import tempfile +import h2o + +''' +Adjust timestamp parquet +''' + +test_local_user_timezone = "America/Chicago" +time_format = '%Y-%m-%d %H:%M:%S' + + +def adjust_timestamp_parquet(): + with tempfile.TemporaryDirectory() as dir: + # prepare the file which will be imported + input_timestamp = '2024-08-02 12:00:00' + original_timestamp_df = H2OFrame({"timestamp": input_timestamp}) + h2o.export_file(original_timestamp_df, path=dir + "/import", format="parquet", write_checksum=False) + + # import the file and see tz_adjust_to_local works + imported_df = h2o.import_file(dir + "/import", tz_adjust_to_local=True) + expected_timestamp = datetime.strptime(input_timestamp, time_format).replace(tzinfo=timezone.utc) + expected_df = H2OFrame({"timestamp": expected_timestamp.astimezone().strftime(time_format)}) + assert imported_df[0, 0] == expected_df[0, 0] + + # export the file and see tz_adjust_from_local works + h2o.export_file(imported_df, path=dir + "/export", tz_adjust_from_local=True) + reimported_without_adjustment_df = h2o.import_file(dir + "/import") + assert original_timestamp_df[0, 0] == reimported_without_adjustment_df[0, 0] + + +if __name__ == "__main__": + pyunit_utils.standalone_test(adjust_timestamp_parquet) +else: + adjust_timestamp_parquet() diff --git a/h2o-test-support/src/main/java/water/TestUtil.java b/h2o-test-support/src/main/java/water/TestUtil.java index 2762e717999b..348ca6186c25 100644 --- a/h2o-test-support/src/main/java/water/TestUtil.java +++ b/h2o-test-support/src/main/java/water/TestUtil.java @@ -915,7 +915,7 @@ public static Frame parseTestFile(String fname, String na_string, int check_head // create new parseSetup in order to store our na_string ParseSetup p = ParseSetup.guessSetup(res, new ParseSetup(DefaultParserProviders.GUESS_INFO, (byte) ',', false, - check_header, 0, null, null, null, null, null, null, null)); + check_header, 0, null, null, null, null, null, null, null, false)); if (skippedColumns != null) { p.setSkippedColumns(skippedColumns); p.setParseColumnIndices(p.getNumberColumns(), skippedColumns); @@ -953,7 +953,7 @@ public static Frame parseTestFile(String fname, String na_string, int check_head // create new parseSetup in order to store our na_string ParseSetup p = ParseSetup.guessSetup(res, new ParseSetup(DefaultParserProviders.GUESS_INFO, (byte) ',', false, - check_header, 0, null, null, null, null, null, null, null)); + check_header, 0, null, null, null, null, null, null, null, false)); if (skippedColumns != null) { p.setSkippedColumns(skippedColumns); p.setParseColumnIndices(p.getNumberColumns(), skippedColumns); @@ -1093,7 +1093,7 @@ protected 
static Frame parseTestFolder(String fname, String na_string, int check // create new parseSetup in order to store our na_string ParseSetup p = ParseSetup.guessSetup(res, new ParseSetup(DefaultParserProviders.GUESS_INFO, (byte) ',', true, - check_header, 0, null, null, null, null, null, null, null)); + check_header, 0, null, null, null, null, null, null, null, false)); if (skipped_columns != null) { p.setSkippedColumns(skipped_columns); p.setParseColumnIndices(p.getNumberColumns(), skipped_columns);
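
Reviewer note (not part of the change set): a minimal end-to-end sketch of the two new flags introduced by this patch, tz_adjust_to_local on import/parse and tz_adjust_from_local on parquet export. It assumes a reachable H2O cluster; the h2o.init() call and the /tmp/tz_demo_* paths are illustrative placeholders, and the paths must be visible to every H2O node.

import h2o
from h2o.frame import H2OFrame

h2o.init()  # connect to (or start) a local H2O cluster

# One UTC timestamp written out as parquet without any adjustment.
original = H2OFrame({"timestamp": "2024-08-02 12:00:00"})
h2o.export_file(original, path="/tmp/tz_demo_in", format="parquet", write_checksum=False)

# tz_adjust_to_local=True shifts the stored timestamps from GMT/UTC to the
# cluster's local timezone while parsing.
local_view = h2o.import_file("/tmp/tz_demo_in", tz_adjust_to_local=True)

# tz_adjust_from_local=True undoes that shift on parquet export, so a plain
# re-import of the exported file matches the original UTC values again.
h2o.export_file(local_view, path="/tmp/tz_demo_out", format="parquet",
                write_checksum=False, tz_adjust_from_local=True)
roundtrip = h2o.import_file("/tmp/tz_demo_out")
assert original[0, 0] == roundtrip[0, 0]

The adjustment itself is a single millisecond offset: TypeUtils.getTimestampAdjustmentFromUtcToLocalInMillis() reads the cluster's default Joda DateTimeZone offset, ChunkConverter adds it when parsing timestamp columns, and FrameParquetExporter subtracts it when writing them, so the import/export pair is symmetric, which mirrors what pyunit_parquet_adjust_timezone.py asserts.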