From 4adc28d4c829e144a3324cf951f4a5da3962c123 Mon Sep 17 00:00:00 2001 From: wendycwong Date: Mon, 21 Oct 2024 10:26:11 -0700 Subject: [PATCH] remove standardize from HGLM as the convention does not do standardization. --- h2o-algos/src/main/java/hex/hglm/HGLM.java | 13 +- .../src/main/java/hex/hglm/HGLMModel.java | 53 +--- .../src/main/java/hex/hglm/HGLMScore.java | 2 +- .../src/main/java/hex/hglm/HGLMTask.java | 4 +- .../src/main/java/hex/hglm/HGLMUtils.java | 134 --------- .../main/java/hex/hglm/MetricBuilderHGLM.java | 4 +- .../main/java/hex/schemas/HGLMModelV3.java | 77 ++--- .../src/main/java/hex/schemas/HGLMV3.java | 5 - .../src/test/java/hex/hglm/HGLMBasicTest.java | 269 ++---------------- h2o-bindings/bin/custom/R/gen_hglm.py | 19 -- h2o-bindings/bin/custom/python/gen_hglm.py | 17 +- .../java/hex/ModelMetricsRegressionHGLM.java | 28 -- h2o-py/h2o/estimators/hglm.py | 34 --- .../pyunit_utils/utils_for_glm_hglm_tests.py | 69 ----- ...7_1p5_noise_var_scoring_history_summary.py | 4 +- ...487_2_noise_var_scoring_history_summary.py | 4 +- ..._noise_variance_scoring_history_summary.py | 4 +- .../hglm/pyunit_GH_8487_coefficients_check.py | 23 +- ..._noise_variance_scoring_history_summary.py | 2 +- ...87_p5_noise_var_scoring_history_summary.py | 2 +- h2o-r/h2o-package/R/hglm.R | 26 -- h2o-r/h2o-package/R/models.R | 4 +- h2o-r/h2o-package/pkgdown/_pkgdown.yml | 2 - 23 files changed, 73 insertions(+), 726 deletions(-) diff --git a/h2o-algos/src/main/java/hex/hglm/HGLM.java b/h2o-algos/src/main/java/hex/hglm/HGLM.java index 465982b332f0..7888242bca1b 100644 --- a/h2o-algos/src/main/java/hex/hglm/HGLM.java +++ b/h2o-algos/src/main/java/hex/hglm/HGLM.java @@ -183,12 +183,6 @@ else if (!trainFrame.vec(_parms._group_column).isCategorical()) if (_parms._tau_e_var_init <= 0) error("tau_e_var_init", "If gen_syn_data is true, tau_e_var_init must be > 0."); } - - if (!_parms._random_intercept && _parms._standardize) - warn("random_intercept and standardize", - "If random_intercept is false and standardize is true, model building process can be unstable" + - " due to the denormalization process which can create singular T matrix. If encounter singlar" + - " T matrix problem, set standardize to false in this case to ensure model building can finish."); } } @@ -214,8 +208,8 @@ public void computeImpl() { * 3. Set modelOutput fields. */ // _dinfo._adaptedFrame will contain group_column. Check and make sure clients will pass that along as well. - _dinfo = new DataInfo(_train.clone(), null, 1, _parms._use_all_factor_levels, _parms._standardize ? - DataInfo.TransformType.STANDARDIZE : DataInfo.TransformType.NONE, DataInfo.TransformType.NONE, + _dinfo = new DataInfo(_train.clone(), null, 1, _parms._use_all_factor_levels, + DataInfo.TransformType.NONE, DataInfo.TransformType.NONE, _parms.missingValuesHandling() == Skip, _parms.missingValuesHandling() == MeanImputation || _parms.missingValuesHandling() == PlugValues, @@ -293,7 +287,6 @@ void fitEM(HGLMModel model, Job job, ScoringHistory scTrain, ScoringHistory scVa if (_parms._showFixedMatVecs) model._output.setModelOutputFixMatVec(engineTask); _state = new ComputationStateHGLM(_job, _parms, _dinfo, engineTask, iteration); - generateNonStandardizeZTZArjTArs(_parms, model); // generate not standardized transpose(Z)*Z, transpose(Zj)*Zj try { if (_parms._max_iterations > 0) { // grab current value of fixed beta, tauEVar, tauUVar @@ -360,7 +353,7 @@ public boolean progress(double[] beta, double[][] ubeta, double[][] tmat, double if (_parms.valid() != null) scoreAndUpdateModel(model, false, scValid); } else { - // calculate log likelihood with current parameter settings, standardize if parms._standardize and vice versa + // calculate log likelihood with current parameter settings double logLikelihood = calHGLMllg(_state._nobs, tmat, tauEVarE10, model._output._arjtarj, rLlh._sse_fixed, rLlh._yMinusXTimesZ); scTrain.addIterationScore(_state._iter, logLikelihood, tauEVarE10); diff --git a/h2o-algos/src/main/java/hex/hglm/HGLMModel.java b/h2o-algos/src/main/java/hex/hglm/HGLMModel.java index a827caec626a..d612293a20ee 100644 --- a/h2o-algos/src/main/java/hex/hglm/HGLMModel.java +++ b/h2o-algos/src/main/java/hex/hglm/HGLMModel.java @@ -61,7 +61,7 @@ protected PredictScoreResult predictScoreImpl(Frame fr, Frame adaptFrm, String d if (gs._computeMetrics) { // only calculate log-likelihood, mse and other metrics if _computeMetrics mb = gs._mb; if (forTraining) { - _output._yminusxtimesz_score = gs._yMinusXTimesZ; + _output._yminusxtimesz = gs._yMinusXTimesZ; _output._yMinusfixPredSquare = mb._yMinusfixPredSquare; } else { // store for all frames other than the training frame _output._yminusxtimesz_valid = gs._yMinusXTimesZ; @@ -88,7 +88,6 @@ private HGLMScore makeScoringTask(Frame adaptFrm, boolean makePredictions, Job j public static class HGLMParameters extends Model.Parameters { public long _seed = -1; public GLMModel.GLMParameters.Family _family; - public boolean _standardize = false; public int _max_iterations = -1; public double[] _initial_fixed_effects; // initial values of fixed coefficients public Key _initial_random_effects; // frame key that contains the initial starting values of random coefficient effects @@ -174,13 +173,10 @@ public static class HGLMModelOutput extends Model.Output { final GLMModel.GLMParameters.Family _random_family; public String[] _fixed_coefficient_names; // include intercept only if _parms._intercept is true public String[] _random_coefficient_names; // include intercept only if _parms._random_intercept = true - public String[] _random_coefficient_names_normalized; public String[] _group_column_names; public long _training_time_ms; - public double[] _beta; // fixed coefficients, not normalized - public double[][] _ubeta; // random coefficients, not normalized - public double[] _beta_normalized; - public double[][] _ubeta_normalized; + public double[] _beta; // fixed coefficients + public double[][] _ubeta; // random coefficients public double[][] _tmat; // calculated with non-standardize random effects coefficients double _tauUVar; public double _tau_e_var; @@ -191,14 +187,10 @@ public static class HGLMModelOutput extends Model.Output { public double[][][] _arjtarj; public double[][][] _afjtarj; //public double[][] _zttimesz; // calculate from standardized or non-standardized Zj - public double[][][] _arjtarj_score; // used during scoring for metrics calculation. Not standardized - // public double[][] _zttimesz_score; // used during scoring for metrics calculation. Not standardized public double[][] _yminusxtimesz; // generate during training - public double[][] _yminusxtimesz_score; // generate during scoring public double[][] _yminusxtimesz_valid; // store same value for frames other than training frame public int _num_fixed_coeffs; public int _num_random_coeffs; - public int _num_random_coeffs_normalized; int[] _randomCatIndices; int[] _randomNumIndices; int[] _randomCatArrayStartIndices; @@ -217,7 +209,6 @@ public static class HGLMModelOutput extends Model.Output { public double _yMinusfixPredSquare; public double _yMinusfixPredSquare_valid; public TwoDimTable _scoring_history_valid; - public double _mse_fixed; // mse with fixed effect only /** * For debugging only. Copy over the generated fixed matrices to model._output. @@ -250,13 +241,13 @@ public HGLMModelOutput(HGLM b, DataInfo dinfo) { _family = b._parms._family; _random_family = b._parms._random_family; } - + public void setModelOutputFields(ComputationStateHGLM state) { _fixed_coefficient_names = state.get_fixedCofficientNames(); _random_coefficient_names = state.get_randomCoefficientNames(); _group_column_names = state.get_groupColumnNames(); _tauUVar = state.get_tauUVar(); - // _tau_e_var = state.get_tauEVarE17(); + // _tau_e_var = state.get_tauEVarE17(); _tau_e_var = state.get_tauEVarE10(); _tmat = state.get_T(); _num_fixed_coeffs = state.get_numFixedCoeffs(); @@ -264,41 +255,11 @@ public void setModelOutputFields(ComputationStateHGLM state) { _numLevel2Units = state.get_numLevel2Units(); _level2UnitIndex = state.get_level2UnitIndex(); _nobs = state._nobs; - if (state._parms._standardize) { // for random coefficients, the names of random coefficients names may change - _beta_normalized = state.get_beta(); - _ubeta_normalized = state.get_ubeta(); - _beta = denormalizedOneBeta(_beta_normalized, _fixed_coefficient_names, _dinfo._adaptedFrame.names(), - state._parms.train(), true); - _ubeta = denormalizedUBeta(_ubeta_normalized, _random_coefficient_names, state._parms._random_columns, - state._parms.train(), state._parms._random_intercept); - _random_coefficient_names_normalized = _random_coefficient_names.clone(); - if (_ubeta_normalized[0].length < _ubeta[0].length) // added intercept term, need to add name to random coeff names - _random_coefficient_names = copyCoefAddIntercept(_random_coefficient_names_normalized); - _tmat = generateNewTmat(_ubeta); - } else { - _beta = state.get_beta(); - _beta_normalized = normalizedOneBeta(_beta, _fixed_coefficient_names, _dinfo._adaptedFrame.names(), - state._parms.train(), true); - _ubeta = state.get_ubeta(); - _ubeta_normalized = normalizedUBeta(_ubeta, _random_coefficient_names, state._parms._random_columns, - state._parms.train(), state._parms._random_intercept); - if (_ubeta[0].length == _ubeta_normalized[0].length) - _random_coefficient_names_normalized = _random_coefficient_names; - else - _random_coefficient_names_normalized = copyCoefAddIntercept(_random_coefficient_names); - } - _num_random_coeffs_normalized = _ubeta_normalized[0].length; + _beta = state.get_beta(); + _ubeta = state.get_ubeta(); _num_random_coeffs = _ubeta[0].length; _iterations = state._iter; } - - public static String[] copyCoefAddIntercept(String[] originalNames) { - int nameLen = originalNames.length; - String[] longerNames = new String[nameLen+1]; - System.arraycopy(originalNames, 0, longerNames, 0, nameLen); - longerNames[nameLen] = "intercept"; - return longerNames; - } @Override public int nclasses() { // only support Gaussian now diff --git a/h2o-algos/src/main/java/hex/hglm/HGLMScore.java b/h2o-algos/src/main/java/hex/hglm/HGLMScore.java index a8c4a56394f4..b1dc7faec689 100644 --- a/h2o-algos/src/main/java/hex/hglm/HGLMScore.java +++ b/h2o-algos/src/main/java/hex/hglm/HGLMScore.java @@ -65,7 +65,7 @@ public HGLMScore(final Job j, final HGLMModel model, DataInfo dinfo, final Strin _randomCatArrayStartIndices = model._output._randomCatArrayStartIndices; _predStartIndexRandom = model._output._predStartIndexRandom; _randomSlopeToo = model._output._randomSlopeToo; - _randomIntercept = _parms._random_intercept || (_parms._standardize); + _randomIntercept = _parms._random_intercept; _tmat = model._output._tmat; // generated from non-standardized random coefficients randomObj = new Random(_parms._seed); _noiseStd = Math.sqrt(_parms._tau_e_var_init); // not affected by standardization/normalization diff --git a/h2o-algos/src/main/java/hex/hglm/HGLMTask.java b/h2o-algos/src/main/java/hex/hglm/HGLMTask.java index 4954e766a78d..0c54f24fad33 100644 --- a/h2o-algos/src/main/java/hex/hglm/HGLMTask.java +++ b/h2o-algos/src/main/java/hex/hglm/HGLMTask.java @@ -80,7 +80,7 @@ public void map(Chunk[] chks) { double residualFixed; DataInfo.Row r = _dinfo.newDenseRow(); for (int rowInd = 0; rowInd < chkLen; rowInd++) { - _dinfo.extractDenseRow(chks, rowInd, r); // numerical values are standardized automatically if standardize=true + _dinfo.extractDenseRow(chks, rowInd, r); if (!r.isBad() && !(r.weight == 0)) { y = r.response(0); level2Index = _parms._use_all_factor_levels ? r.binIds[_level2UnitIndex] - _dinfo._catOffsets[_level2UnitIndex] : @@ -266,7 +266,7 @@ public void map(Chunk[] chks) { int chkLen = chks[0].len(); DataInfo.Row r = _dinfo.newDenseRow(); for (int rowInd = 0; rowInd < chkLen; rowInd++) { - _dinfo.extractDenseRow(chks, rowInd, r); // numerical values are standardized automatically if standardize=true + _dinfo.extractDenseRow(chks, rowInd, r); if (!r.isBad() && !(r.weight == 0)) { y = r.response(0); _YjTYjSum += y * y; diff --git a/h2o-algos/src/main/java/hex/hglm/HGLMUtils.java b/h2o-algos/src/main/java/hex/hglm/HGLMUtils.java index 628b6602af49..5de8e398e423 100644 --- a/h2o-algos/src/main/java/hex/hglm/HGLMUtils.java +++ b/h2o-algos/src/main/java/hex/hglm/HGLMUtils.java @@ -1,19 +1,13 @@ package hex.hglm; import Jama.Matrix; -import hex.DataInfo; import water.DKV; import water.Key; import water.fvec.Frame; import water.util.ArrayUtils; import java.util.Arrays; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; -import static hex.glm.GLMModel.GLMParameters.MissingValuesHandling.*; import static water.util.ArrayUtils.*; public class HGLMUtils { @@ -137,106 +131,6 @@ public static double calTauEvarEq17(double residualSquare, double tauEVar, doubl double sigmaTrace = tauEVar * trace(cInvArjTArj) ; return (residualSquare + sigmaTrace)*oneOverN; } - - public static double[] denormalizedOneBeta(double[] beta, String[] coeffNames, String[] colNames, - Frame train, boolean interceptPresent) { - int numRandomCoeff = beta.length; - Map coefMean = new HashMap<>(); - Map coefStd = new HashMap<>(); - List randomColList = Arrays.stream(colNames).collect(Collectors.toList()); - genMeanStd(coeffNames, randomColList, train, coefMean, coefStd); - int interceptIndex = interceptPresent ? numRandomCoeff - 1 : numRandomCoeff; - double[] denormalizedUBeta = new double[interceptIndex + 1]; - if (interceptPresent) - denormalizedUBeta[interceptIndex] = beta[interceptIndex]; - - String coefName; - for (int coefInd = 0; coefInd < numRandomCoeff; coefInd++) { - coefName = coeffNames[coefInd]; - if (randomColList.contains(coefName)) { // pick out the numerical columns - denormalizedUBeta[coefInd] = beta[coefInd] / coefStd.get(coefName); - denormalizedUBeta[interceptIndex] -= beta[coefInd] * coefMean.get(coefName) / coefStd.get(coefName); - } else if (coefName != "intercept") { - denormalizedUBeta[coefInd] = beta[coefInd]; - } - } - return denormalizedUBeta; - } - - public static double[][] denormalizedUBeta(double[][] ubeta, String[] randomCoeffNames, String[] randomColNames, - Frame train, boolean randomIntercept) { - int numLevel2 = ubeta.length; - double[][] denormalizedBeta = new double[numLevel2][]; - boolean onlyEnumRandomCols = randomColAllEnum(train, randomColNames); - for (int index=0; index colNamesList = Arrays.stream(columnNames).collect(Collectors.toList()); - Map coefMean = new HashMap<>(); - Map coefStd = new HashMap<>(); - genMeanStd(coeffNames, colNamesList, train, coefMean, coefStd); - - if (interceptPresent) - normalizedBeta[interceptIndex] = beta[interceptIndex]; - - String coefName; - for (int coefInd=0; coefInd < numCoeff; coefInd++) { - coefName = coeffNames[coefInd]; - if (colNamesList.contains(coefName)) { // pick out numerical columns - normalizedBeta[coefInd] = beta[coefInd] * coefStd.get(coefName); - normalizedBeta[interceptIndex] += normalizedBeta[coefInd] * coefMean.get(coefName)/coefStd.get(coefName); - } else if (coefName != "intercept"){ // no change to enum columns - normalizedBeta[coefInd] = beta[coefInd]; - } - } - return normalizedBeta; - } - - /** - * Normalize ubeta, intercept is always the last one - */ - public static double[][] normalizedUBeta(double[][] ubeta, String[] randomCoeffNames, String[] randomColNames, - Frame train, boolean randomIntercept) { - int numLevel2 = ubeta.length; - double[][] normalizedUBeta = new double[numLevel2][]; - boolean onlyEnumRandomCols = randomColAllEnum(train, randomColNames); - for (int index=0; index randomColNames, Frame train, - Map coefMean, Map coefSTD) { - int numCoeff = randomCoeffNames.length; - String coefName; - double colMean; - double colStd; - for (int index=0; index= 0; } - public static void generateNonStandardizeZTZArjTArs(HGLMModel.HGLMParameters parms, HGLMModel model) { - if (parms._standardize) { - boolean orignalRandomIntercept = parms._random_intercept; - parms._random_intercept = parms._random_intercept || !randomColAllEnum(parms.train(), parms._random_columns); - List colNames = Arrays.asList(parms.train().names()); - boolean hasWeights = model._parms._weights_column != null && colNames.contains(model._parms._weights_column); - boolean hasOffsets = model._parms._offset_column != null && colNames.contains(model._parms._offset_column); - DataInfo dinfo = new DataInfo(parms.train().clone(), null, 1, parms._use_all_factor_levels, - DataInfo.TransformType.NONE, DataInfo.TransformType.NONE, - parms.missingValuesHandling() == Skip, parms.missingValuesHandling() == MeanImputation - || parms.missingValuesHandling() == PlugValues, parms.makeImputer(), false, hasWeights, - hasOffsets, false, null); - HGLMTask.ComputationEngineTask engineTask = new HGLMTask.ComputationEngineTask(null, parms, dinfo); - engineTask.doAll(dinfo._adaptedFrame); - model._output._arjtarj_score = engineTask._ArjTArj; - // model._output._zttimesz_score = engineTask._zTTimesZ; - parms._random_intercept = orignalRandomIntercept; - } else { - model._output._arjtarj_score = model._output._arjtarj; - // model._output._zttimesz_score = model._output._zttimesz; - } - } - public static double[][] generateNewTmat(double[][] ubeta) { int numIndex2 = ubeta.length; double oneOverJ = 1.0/numIndex2; @@ -292,9 +163,4 @@ public static double[][] generateNewTmat(double[][] ubeta) { mult(newTmat, oneOverJ); return newTmat; } - - public static boolean randomColAllEnum(Frame train, String[] randomColumns) { - int numRandCols = randomColumns.length; - return Arrays.stream(randomColumns).filter(x -> train.vec(x).isCategorical()).count() == numRandCols; - } } diff --git a/h2o-algos/src/main/java/hex/hglm/MetricBuilderHGLM.java b/h2o-algos/src/main/java/hex/hglm/MetricBuilderHGLM.java index b8341d00b87b..00efc9d7c2c6 100644 --- a/h2o-algos/src/main/java/hex/hglm/MetricBuilderHGLM.java +++ b/h2o-algos/src/main/java/hex/hglm/MetricBuilderHGLM.java @@ -77,8 +77,8 @@ public ModelMetrics makeModelMetrics(Model m, Frame f, Frame adaptedFrame, Frame double[][] tmat = hglmM._output._tmat; // already set with non-standardized random coefficients if (forTraining) { - double loglikelihood = calHGLMllg(metricsRegression._nobs, tmat, hglmM._output._tau_e_var, hglmM._output._arjtarj_score, - this._yMinusfixPredSquare, hglmM._output._yminusxtimesz_score); + double loglikelihood = calHGLMllg(metricsRegression._nobs, tmat, hglmM._output._tau_e_var, hglmM._output._arjtarj, + this._yMinusfixPredSquare, hglmM._output._yminusxtimesz); mm = new ModelMetricsRegressionHGLM(m, f, metricsRegression._nobs, this.weightedSigma(), loglikelihood, this._customMetric, hglmM._output._iterations, hglmM._output._beta, hglmM._output._ubeta, tmat, hglmM._output._tau_e_var, metricsRegression._MSE, this._yMinusfixPredSquare / metricsRegression._nobs, diff --git a/h2o-algos/src/main/java/hex/schemas/HGLMModelV3.java b/h2o-algos/src/main/java/hex/schemas/HGLMModelV3.java index 1e81286b6ebe..ad8c585752fd 100644 --- a/h2o-algos/src/main/java/hex/schemas/HGLMModelV3.java +++ b/h2o-algos/src/main/java/hex/schemas/HGLMModelV3.java @@ -34,23 +34,14 @@ public static final class HGLMModelOutputV3 extends ModelOutputSchemaV3 randomCoeffNamesLen) { // model building, added intercept to coeffs_normalized, extend coeffs - coeffNamesused = coeffNamesNormalized; - coeffsUsed = addInterceptValue(coeffs); - coeffsNormalizedUsed = coeffs_normalized; - } else if (randomCoefNameLenNorm < randomCoeffNamesLen) { // model building with standardization, added intercept to coeffs, extend coeffsNormalized - coeffNamesused = coeffNames; - coeffsUsed = coeffs; - coeffsNormalizedUsed = addInterceptValue(coeffs_normalized); - } else { - coeffNamesused = coeffNames; - coeffsUsed = coeffs; - coeffsNormalizedUsed = coeffs_normalized; - } + coeffNamesused = coeffNames; + coeffsUsed = coeffs; + double[] fCoeffValues = flattenArray(coeffsUsed); - double[] fCoeffValuesNormalized = flattenArray(coeffsNormalizedUsed); String[] fCoeffNames = extendCoeffNames(coeffNamesused, numLevel2Index); String[] fLevel2Vals = extendLevel2Ind(level2Domain, coeffsUsed[0].length); - - String[] colnames = new String[] {"coefficient names", "coefficients", "standardized_coefficients"}; - String[] colFormats = new String[] {"%s", "%.5f", "%.5f"}; - String[] colTypes = new String[] {"string", "double", "double"}; + + String[] colnames = new String[]{"coefficient names", "coefficients"}; + String[] colFormats = new String[]{"%s", "%.5f"}; + String[] colTypes = new String[]{"string", "double"}; TwoDimTable tdt = new TwoDimTable(title1, title2, fLevel2Vals, colnames, colTypes, colFormats, "names"); int tableLen = fCoeffNames.length; - for (int index=0; index rowValues = new ArrayList<>(); int catVal; for (String enumName : enumPredNames) { @@ -535,10 +520,7 @@ public double[] grabRow2Arrays(String[] enumPredNames, String[] numPredNames, bo } for (String numName:numPredNames) { double val = fr.vec(numName).at(rowInd); - if (standardize) - rowValues.add((val - fr.vec(numName).mean())/fr.vec(numName).sigma()); - else - rowValues.add(val); + rowValues.add(val); } if (hasIntercept) @@ -629,7 +611,6 @@ public void testSetInitBetasTvar() { params._initial_fixed_effects = initBetas; params._initial_t_matrix = tMat._key; params._initial_random_effects = ubetaFrame._key; - params._standardize = false; params._tau_e_var_init = sigmaEpsilon; params._max_iterations = 0; HGLMModel model = new HGLM(params).trainModel().get(); @@ -678,7 +659,6 @@ public void testSetInitT() { params._group_column = "RACE"; params._use_all_factor_levels = true; params._random_columns = new String[]{"GLEASON", "DPROS", "DCAPS"}; - params._standardize = false; params._tau_u_var_init = sigmaU; params._max_iterations = 0; HGLMModel model = new HGLM(params).trainModel().get(); @@ -695,190 +675,6 @@ public void checkCorrectTMat(HGLMModel model, double sigmaU) { correctTMat[ind][ind] = sigmaU; checkDoubleArrays(correctTMat, model._output._tmat, 1e-6); } - - /*** - * In this test, I check and make sure the normalization (standardization) and de-normalization (de-standardization) - * of coefficients are done correctly. First, I setup initial coefficient values to build a model that has - * standardize = true. In this case, the initial coefficients are treated as normalized coefficients. - * - * model1 is built and the following should be true: - * 1. model1._output._beta_normalized should equal initBetaStandardize; - * 2. model1._output._ubeta_normalized should equal to the transpose of initUBetaStandardize; - * - * Next, we build a model2 with standardize = false and the initial coefficients are set to model1._output._beta and - * model1._output._ubeta. If the normalization and de-normalization is done correctly, the following should be true: - * 1. model2._output._beta == model1._output._beta; - * 2. model2._output._ubeta == model1._output._ubeta; - * 3. model2._output._beta_normalized = model1._output._beta_normalized == initBetaStandardize; - * 4. model2._output._ubeta_normalized = model1._output._ubeta_normalized == transpose of initUBetaStandardize; - * - * We will be checking all the statements. - */ - @Test - public void testCoeffDeNNormalizationWithRandomIntercept() { - try { - Scope.enter(); - double[] initBetaStandardize = new double[]{0.57305, 0.95066, 0.4277, 0.2814, 0.3727}; - double[][] initUBetaStandardize = new double[][]{{-1.4257393174908208, 1.9459515904358207, -1.5121424866231998, - 0.757565557144771, 1.6454093526843507, 0.521525656276774, 0.15102292603863332, - -0.5629664504958487, 0.39941437871543806, -0.17666156140184344, -0.9012256565441157, - 0.4013361512547679, -0.7655048415710769, 0.9625031349421274, -1.6916150004681492, 0.8967295711861796}, - {0.7307560306666573, -0.43350728257793125, 0.761204681372934, - -0.9665905711121056, -0.0485193797802151, -0.6595712372715338, -0.4616825414753406, - 0.7886590178655907, 0.27241373557806586, -0.04301812863182515, -0.10936899265127145, - 0.8173502195208687, -0.1473779447485634, -2.1395714941712223, -0.9096112739244531, -1.8557521580762681}, - {-1.818395521031121, 0.3423166377645478, 2.803250124441809, - 0.36788162518520634, 0.2854761765342526, 1.9802144801614998, 1.0295144701971513, - -0.0195871711309739, -0.04015765623938129, -0.22232686097490753, -1.1551081071985216, - 0.4799532222692264, 0.1858090583440908, -0.25703386958964214, 1.3293865207793107, -0.6641399983332995}}; - Frame ubetaFrameStandardize = new TestFrameBuilder() - .withColNames("x1", "x3", "intercept") - .withVecTypes(T_NUM, T_NUM, T_NUM) - .withDataForCol(0, initUBetaStandardize[0]) - .withDataForCol(1, initUBetaStandardize[1]) - .withDataForCol(2, initUBetaStandardize[2]) - .build(); - Scope.track(ubetaFrameStandardize); - - Frame fr = parseTestFile("smalldata/hglm_test/semiconductor.csv"); - fr.replace(0, fr.vec(0).toCategoricalVec()).remove(); - DKV.put(fr); - Scope.track(fr); - HGLMModel.HGLMParameters parms = new HGLMModel.HGLMParameters(); - parms._train = fr._key; - parms._response_column = "y"; - parms._ignored_columns = new String[]{"x2", "x4"}; - parms._ignore_const_cols = true; - parms._family = GLMModel.GLMParameters.Family.gaussian; - parms._random_columns = new String[]{"x1", "x3"}; - parms._group_column = "Device"; - parms._max_iterations = 0; - parms._seed = 1234; - parms._initial_fixed_effects = initBetaStandardize; - parms._initial_random_effects = ubetaFrameStandardize._key; - parms._random_intercept = true; - parms._standardize = true; - - // just make sure it runs - HGLMModel model1 = new HGLM(parms).trainModel().get(); - Scope.track_generic(model1); - // the initial coefficients are set to model as standardized coefficients, get the denormalized coefficients here - Frame uinitBetaFrame = makeUBetaFrame(model1._output._ubeta); - Scope.track(uinitBetaFrame); - // build model 2 that is not standardized - parms._standardize = false; - parms._initial_fixed_effects = model1._output._beta; - parms._initial_random_effects = uinitBetaFrame._key; - HGLMModel model2 = new HGLM(parms).trainModel().get(); - Scope.track_generic(model2); - // if all the normalization and de-normalization is done correctly, we should have - TestUtil.checkArrays(model1._output._beta_normalized, initBetaStandardize, 1e-12); - TestUtil.checkDoubleArrays(model1._output._ubeta_normalized, new Matrix(initUBetaStandardize).transpose().getArray(), - 1e-12); - TestUtil.checkArrays(model1._output._beta, model2._output._beta, 1e-12); - TestUtil.checkArrays(model1._output._beta_normalized, model2._output._beta_normalized, 1e-12); - TestUtil.checkDoubleArrays(model1._output._ubeta, model2._output._ubeta, 1e-12); - TestUtil.checkDoubleArrays(model1._output._ubeta_normalized, model2._output._ubeta_normalized, 1e-12); - // manually check a few cases to make sure things are actually running okay - assertEquals(model1._output._ubeta[0][0], model1._output._ubeta_normalized[0][0]/fr.vec("x1").sigma(), 1e-6); - assertEquals(model1._output._ubeta_normalized[1][2], - model1._output._ubeta[1][2]+fr.vec("x1").mean()*model1._output._ubeta[1][0]/fr.vec("x1").sigma() + - fr.vec("x2").mean()*model1._output._ubeta[1][1]/fr.vec("x2").sigma(), 1e-6); - assertEquals(model2._output._beta_normalized[3], model2._output._beta[3]*fr.vec("x6").sigma(), 1e-6); - assertEquals(model2._output._beta_normalized[4], model2._output._beta[4]+ - fr.vec("x1").mean()*model2._output._beta_normalized[0]/fr.vec("x1").sigma()+ - fr.vec("x3").mean()*model2._output._beta_normalized[1]/fr.vec("x3").sigma()+ - fr.vec("x5").mean()*model2._output._beta_normalized[2]/fr.vec("x5").sigma()+ - fr.vec("x6").mean()*model2._output._beta_normalized[3]/fr.vec("x6").sigma(), 1e-6); - } finally { - Scope.exit(); - } - } - - /*** - * This test is exactly like the one in testCoeffDeNNormalizationWithRandomIntercept with the exception that there - * is no random intercept. - */ - @Test - public void testCoeffDeNNormalizationWORandomIntercept() { - try { - Scope.enter(); - double[] initBetaStandardize = new double[]{0.57305, 0.95066, 0.4277, 0.2814, 0.3727}; - double[][] initUBetaStandardize = new double[][]{{-1.4257393174908208, 1.9459515904358207, -1.5121424866231998, - 0.757565557144771, 1.6454093526843507, 0.521525656276774, 0.15102292603863332, - -0.5629664504958487, 0.39941437871543806, -0.17666156140184344, -0.9012256565441157, - 0.4013361512547679, -0.7655048415710769, 0.9625031349421274, -1.6916150004681492, 0.8967295711861796}, - {0.7307560306666573, -0.43350728257793125, 0.761204681372934, - -0.9665905711121056, -0.0485193797802151, -0.6595712372715338, -0.4616825414753406, - 0.7886590178655907, 0.27241373557806586, -0.04301812863182515, -0.10936899265127145, - 0.8173502195208687, -0.1473779447485634, -2.1395714941712223, -0.9096112739244531, -1.8557521580762681}}; - Frame ubetaFrameStandardize = new TestFrameBuilder() - .withColNames("x1", "x3") - .withVecTypes(T_NUM, T_NUM) - .withDataForCol(0, initUBetaStandardize[0]) - .withDataForCol(1, initUBetaStandardize[1]) - .build(); - Scope.track(ubetaFrameStandardize); - - Frame fr = parseTestFile("smalldata/hglm_test/semiconductor.csv"); - fr.replace(0, fr.vec(0).toCategoricalVec()).remove(); - DKV.put(fr); - Scope.track(fr); - HGLMModel.HGLMParameters parms = new HGLMModel.HGLMParameters(); - parms._train = fr._key; - parms._response_column = "y"; - parms._ignored_columns = new String[]{"x2", "x4"}; - parms._ignore_const_cols = true; - parms._family = GLMModel.GLMParameters.Family.gaussian; - parms._random_columns = new String[]{"x1", "x3"}; - parms._group_column = "Device"; - parms._max_iterations = 0; - parms._seed = 1234; - parms._initial_fixed_effects = initBetaStandardize; - parms._initial_random_effects = ubetaFrameStandardize._key; - parms._random_intercept = false; - parms._standardize = true; - - // just make sure it runs - HGLMModel model1 = new HGLM(parms).trainModel().get(); - Scope.track_generic(model1); - // the initial coefficients are set to model as standardized coefficients, get the denormalized coefficients here - Frame uinitBetaFrame = makeUBetaFrame(model1._output._ubeta); - Scope.track(uinitBetaFrame); - // build model 2 that is not standardized - parms._standardize = false; - parms._random_intercept = true; - parms._initial_fixed_effects = model1._output._beta; - parms._initial_random_effects = uinitBetaFrame._key; - HGLMModel model2 = new HGLM(parms).trainModel().get(); - Scope.track_generic(model2); - // If all the normalization and de-normalization is done correctly, we should have - TestUtil.checkArrays(model1._output._beta_normalized, initBetaStandardize, 1e-12); - TestUtil.checkDoubleArrays(model1._output._ubeta_normalized, new Matrix(initUBetaStandardize).transpose().getArray(), - 1e-12); - TestUtil.checkArrays(model1._output._beta, model2._output._beta, 1e-12); - TestUtil.checkDoubleArrays(model1._output._ubeta, model2._output._ubeta, 1e-12); - TestUtil.checkArrays(model1._output._beta_normalized, model2._output._beta_normalized, 1e-12); - // Again, an intercept term is added when you normalize beta. model2 will have an extra column in its ubeta. - // The last column contains value close to 0. - double[][] temp = new Matrix(model2._output._ubeta_normalized).transpose().getArray(); - double[][] model2ubetaN = new Matrix(new double[][] {temp[0], temp[1]}).transpose().getArray(); - TestUtil.checkDoubleArrays(model1._output._ubeta_normalized, model2ubetaN, 1e-12); - // manually check a few cases to make sure things are actually running okay - assertEquals(model1._output._ubeta[0][0], model1._output._ubeta_normalized[0][0]/fr.vec("x1").sigma(), 1e-6); - assertEquals(0, - fr.vec("x1").mean()*model1._output._ubeta[1][0]/fr.vec("x1").sigma() + - fr.vec("x2").mean()*model1._output._ubeta[1][1]/fr.vec("x2").sigma(), 1e-6); - assertEquals(model2._output._beta_normalized[3], model2._output._beta[3]*fr.vec("x6").sigma(), 1e-6); - assertEquals(model2._output._beta_normalized[4], model2._output._beta[4]+ - fr.vec("x1").mean()*model2._output._beta_normalized[0]/fr.vec("x1").sigma()+ - fr.vec("x3").mean()*model2._output._beta_normalized[1]/fr.vec("x3").sigma()+ - fr.vec("x5").mean()*model2._output._beta_normalized[2]/fr.vec("x5").sigma()+ - fr.vec("x6").mean()*model2._output._beta_normalized[3]/fr.vec("x6").sigma(), 1e-6); - } finally { - Scope.exit(); - } - } public static Frame makeUBetaFrame(double[][] initUBeta) { double[][] initUBetaT = new Matrix(initUBeta).transpose().getArray(); @@ -915,7 +711,6 @@ public void testSemiconductor() { parms._random_intercept = true; parms._group_column = "Device"; parms._max_iterations = 1; - parms._standardize = true; // just make sure it runs HGLMModel model = new HGLM(parms).trainModel().get(); @@ -986,16 +781,6 @@ public void testPredictionMetricsSumaryScoringHistoryWRIntercept() { parms._initial_fixed_effects = initBeta; parms._initial_random_effects = ubetaInitFrame._key; parms._random_intercept = true; - parms._standardize = true; - // check prediction with standardize = true - HGLMModel model = new HGLM(parms).trainModel().get(); - Scope.track_generic(model); - Frame predFrame = model.score(fr); - Scope.track(predFrame); - checkPrediction(fr, predFrame, model, 0.0); - - // check prediction again with standardize = false - parms._standardize = false; HGLMModel modelNS = new HGLM(parms).trainModel().get(); Scope.track_generic(modelNS); Frame predFrameNS = modelNS.score(fr); @@ -1052,17 +837,7 @@ public void testPredictionMetricsSumaryScoringHistoryWORIntercept() { parms._initial_fixed_effects = initBeta; parms._initial_random_effects = ubetaInitFrame._key; parms._random_intercept = false; - parms._standardize = true; - - // check prediction without random intercept and with standardization - HGLMModel model = new HGLM(parms).trainModel().get(); - Scope.track_generic(model); - Frame predFrame = model.score(fr); - Scope.track(predFrame); - checkPrediction(fr, predFrame, model, 0.0); - // check prediction without random intercept and without standardization - parms._standardize = false; HGLMModel modelNS = new HGLM(parms).trainModel().get(); Scope.track_generic(modelNS); Frame predFrameNS = modelNS.score(fr); diff --git a/h2o-bindings/bin/custom/R/gen_hglm.py b/h2o-bindings/bin/custom/R/gen_hglm.py index a09e3793c1de..84fc2d2ce9c7 100644 --- a/h2o-bindings/bin/custom/R/gen_hglm.py +++ b/h2o-bindings/bin/custom/R/gen_hglm.py @@ -26,15 +26,6 @@ return(model@model$ubeta) } -#' Extracts the normalized/standardized random effects coefficients of an HGLM model. -#' -#' @param model is a H2O HGLM model. -#' @export -h2o.coef_random_norm <- function(model) { - if (is(model, "H2OModel") && (model@algorithm=="hglm")) - return(model@model$ubeta_normalized) -} - #' Extracts the group_column levels of an HGLM model. The group_column is usually referred to as level 2 predictor. #' #' @param model is a H2O HGLM model. @@ -53,16 +44,6 @@ return(model@model$random_coefficient_names) } -#' Extracts the coefficient names of normalized/standardized random effect coefficients. If no random intercept is -#' set, during the normalization/de-normalization process, an random intercept will be added. -#' -#' @param model is a H2O HGLM model. -#' @export -h2o.coefs_random_names_norm <- function(model) { - if (is(model, "H2OModel") && (model@algorithm=="hglm")) - return(model@model$random_coefficient_names_normalized) -} - #' Extracts scoring history of validation dataframe during training #' #' @param model is a H2O HGLM model. diff --git a/h2o-bindings/bin/custom/python/gen_hglm.py b/h2o-bindings/bin/custom/python/gen_hglm.py index 03c6387c23ad..c1d62db85614 100644 --- a/h2o-bindings/bin/custom/python/gen_hglm.py +++ b/h2o-bindings/bin/custom/python/gen_hglm.py @@ -16,14 +16,7 @@ def coefs_random_names(self): Get the random effect coefficient names including the intercept if applicable. """ return self._model_json["output"]["random_coefficient_names"] - - def coefs_random_names_norm(self): - """ - Get the random effect coefficient names including the intercept if applicable for normalized/standardized - random effect coefficients. - """ - return self._model_json["output"]["random_coefficient_names_normalized"] - + def coefs_random(self): """ Get the random coefficients of the model. @@ -32,14 +25,6 @@ def coefs_random(self): random_coefs = self._model_json["output"]["ubeta"] return dict(zip(level_2_names, random_coefs)) - def coefs_random_norm(self): - """ - Get the normalized/standardized random coefficients of the model. - """ - level_2_names = self.level_2_names() - random_coefs = self._model_json["output"]["ubeta_normalized"] - return dict(zip(level_2_names, random_coefs)) - def scoring_history_valid(self, as_data_frame=True): """ Retrieve Model Score History for validation data frame if present diff --git a/h2o-core/src/main/java/hex/ModelMetricsRegressionHGLM.java b/h2o-core/src/main/java/hex/ModelMetricsRegressionHGLM.java index e7117d845b9c..0a055d83e694 100644 --- a/h2o-core/src/main/java/hex/ModelMetricsRegressionHGLM.java +++ b/h2o-core/src/main/java/hex/ModelMetricsRegressionHGLM.java @@ -37,35 +37,7 @@ public ModelMetricsRegressionHGLM(Model model, Frame frame, long nobs, double si _mse_fixed = mse_fixed; } - - /** - * - * This method calculates the log-likelihood as described in section II.V of the doc. - */ -/* public static double calHGLMllg(long nobs, double[][] tmat, double varResidual, double[][][] zjTTimesZj, - double yMinsXFixSquared, double[][] yMinusXFixTimesZ) { - int numLevel2 = zjTTimesZj.length; - double[][] tmatInv = new Matrix(tmat).inverse().getArray(); - double tmatDeterminant = new Matrix(tmat).det(); - double oneOVar = 1.0 / varResidual; - double oneOVarSq = oneOVar * oneOVar; - double llg = nobs * LOG_2PI + oneOVar * yMinsXFixSquared; - double[][] invTPlusZjTZ; - Matrix yMinusXjFixed; - Matrix yjMinusXjFixed; - for (int ind2 = 0; ind2 < numLevel2; ind2++) { - invTPlusZjTZ = calInvTPZjTZ(tmatInv, zjTTimesZj[ind2], oneOVar); - llg += Math.log(varResidual * new Matrix(invTPlusZjTZ).det() * tmatDeterminant); - yMinusXjFixed = new Matrix(new double[][]{yMinusXFixTimesZ[ind2]}); - yjMinusXjFixed = yMinusXjFixed.times(new Matrix(invTPlusZjTZ).inverse().times(yMinusXjFixed.transpose())); - llg -= oneOVarSq * yjMinusXjFixed.getArray()[0][0]; - } - return -0.5 * llg; - } - public static double[][] calInvTPZjTZ(double[][] tmatInv, double[][] zjTTimesZj, double oneOVar) { - return new Matrix(tmatInv).plus(new Matrix(zjTTimesZj).times(oneOVar)).getArray(); - }*/ /*** * diff --git a/h2o-py/h2o/estimators/hglm.py b/h2o-py/h2o/estimators/hglm.py index cf27a0f7e713..73a1152715cf 100644 --- a/h2o-py/h2o/estimators/hglm.py +++ b/h2o-py/h2o/estimators/hglm.py @@ -41,7 +41,6 @@ def __init__(self, plug_values=None, # type: Optional[Union[None, str, H2OFrame]] family="gaussian", # type: Literal["gaussian"] rand_family=None, # type: Optional[Literal["gaussian"]] - standardize=False, # type: bool max_iterations=-1, # type: int initial_fixed_effects=None, # type: Optional[List[float]] initial_random_effects=None, # type: Optional[Union[None, str, H2OFrame]] @@ -115,9 +114,6 @@ def __init__(self, :param rand_family: rand_family. Set distribution of random effects. Only Gaussian is implemented now. Defaults to ``None``. :type rand_family: Literal["gaussian"], optional - :param standardize: Standardize numeric columns to have zero mean and unit variance. - Defaults to ``False``. - :type standardize: bool :param max_iterations: Maximum number of iterations. Value should >=1. A value of 0 is only set when only the model coefficient names and model coefficient dimensions are needed. Defaults to ``-1``. @@ -185,7 +181,6 @@ def __init__(self, self.plug_values = plug_values self.family = family self.rand_family = rand_family - self.standardize = standardize self.max_iterations = max_iterations self.initial_fixed_effects = initial_fixed_effects self.initial_random_effects = initial_random_effects @@ -427,20 +422,6 @@ def rand_family(self, rand_family): assert_is_type(rand_family, None, Enum("gaussian")) self._parms["rand_family"] = rand_family - @property - def standardize(self): - """ - Standardize numeric columns to have zero mean and unit variance. - - Type: ``bool``, defaults to ``False``. - """ - return self._parms.get("standardize") - - @standardize.setter - def standardize(self, standardize): - assert_is_type(standardize, None, bool) - self._parms["standardize"] = standardize - @property def max_iterations(self): """ @@ -640,13 +621,6 @@ def coefs_random_names(self): """ return self._model_json["output"]["random_coefficient_names"] - def coefs_random_names_norm(self): - """ - Get the random effect coefficient names including the intercept if applicable for normalized/standardized - random effect coefficients. - """ - return self._model_json["output"]["random_coefficient_names_normalized"] - def coefs_random(self): """ Get the random coefficients of the model. @@ -655,14 +629,6 @@ def coefs_random(self): random_coefs = self._model_json["output"]["ubeta"] return dict(zip(level_2_names, random_coefs)) - def coefs_random_norm(self): - """ - Get the normalized/standardized random coefficients of the model. - """ - level_2_names = self.level_2_names() - random_coefs = self._model_json["output"]["ubeta_normalized"] - return dict(zip(level_2_names, random_coefs)) - def scoring_history_valid(self, as_data_frame=True): """ Retrieve Model Score History for validation data frame if present diff --git a/h2o-py/tests/pyunit_utils/utils_for_glm_hglm_tests.py b/h2o-py/tests/pyunit_utils/utils_for_glm_hglm_tests.py index d5439952aeeb..d855cfc254d9 100644 --- a/h2o-py/tests/pyunit_utils/utils_for_glm_hglm_tests.py +++ b/h2o-py/tests/pyunit_utils/utils_for_glm_hglm_tests.py @@ -147,30 +147,6 @@ def find_model_iterations(glm_model): iteration_index = glm_model._model_json["output"]["model_summary"].col_header.index("number_of_iterations") return cell_values[lengths-1][iteration_index] -def normalize_denormalize_random_coefs(random_coefs, random_coefs_names, level_2_names, numerical_cols, training_frame, normalize = True): - """ - Given a random effect coefficients dict, this method will standardize/normalize the coefficients - - :param random_coefs: python dict with random column names and a list of random coefficients for each level 2 index - :param random_coefs_names: python list of random coefficient name - :param level_2_names: python string list of level 2 values - :param numerical_cols: numerical columns of the frame - :param training_frame: h2o frame used to build the model - :return: python dict with random columns names and a list of normalized/standardized random coefficients - """ - normalized_coefs = dict() - # extract random coefficients for each level 2 value - for level2_val in level_2_names: - # extract dict for one level 2 value - dictLevel2 = extract_coef_dict(random_coefs, level2_val, random_coefs_names) - if normalize: - transform_one_coef = normalize_coefs(dictLevel2, numerical_cols, training_frame) - else: - transform_one_coef = denormalize_coefs(dictLevel2, numerical_cols, training_frame) - - add_to_random_coef_dict(normalized_coefs, transform_one_coef, level2_val, random_coefs_names) - return normalized_coefs - def add_to_random_coef_dict(normalized_coefs, normalized_one_coefs, level2_val, random_coefs_names): one_list = [] for one_name in random_coefs_names: @@ -184,51 +160,6 @@ def extract_coef_dict(random_coeffs, level2_name, random_coefs_names): random_coef_level2[cname] = random_coeffs[level2_name][index] index = index+1 return random_coef_level2 - - - -def normalize_coefs(coefs, numerical_cols, training_frame): - """ - Given a coefficient as a dict, the method will normalized/standardized the given coefficents and return it in another - dict. - - :param coefs: coefficients as a dict without normalization/standardization - :param numerical_cols: column names of numerical columns - :param training_frame: H2O frame used to train the model - :return: a python dict with normalized/standardized coefficients - """ - intercept_adjust = 0 - all_coefs_names = coefs.keys() - normalized_coefs = coefs.copy() - # only numerical coefficients are changed. - for cname in numerical_cols: - if cname in all_coefs_names: - cmean = training_frame[cname].mean()[0,0] - csigma = training_frame[cname].sd()[0] - normalized_coefs[cname] = coefs[cname] * csigma - intercept_adjust = intercept_adjust + normalized_coefs[cname]*cmean/csigma - if "intercept" in all_coefs_names: - normalized_coefs["intercept"] = coefs["intercept"]+intercept_adjust - else: - normalized_coefs["intercept"] = intercept_adjust - return normalized_coefs - -def denormalize_coefs(coefs_normalized, numerical_cols, training_frame): - intercept_adjust = 0 - all_coefs_names = coefs_normalized.keys() - denormalize_coefs = coefs_normalized.copy() - for cname in numerical_cols: - if cname in all_coefs_names: - cmean = training_frame[cname].mean()[0,0] - csigma = training_frame[cname].sd()[0] - denormalize_coefs[cname] = coefs_normalized[cname] / csigma - intercept_adjust = intercept_adjust - cmean * coefs_normalized[cname] / csigma - - if "intercept" in all_coefs_names: - denormalize_coefs["intercept"] = denormalize_coefs["intercept"] + intercept_adjust - else: - denormalize_coefs["intercept"] = intercept_adjust - return denormalize_coefs def compare_dicts_with_tupple(dict1, dict2, tolerance=1e-6): keys = dict1.keys() diff --git a/h2o-py/tests/testdir_algos/hglm/pyunit_GH_8487_1p5_noise_var_scoring_history_summary.py b/h2o-py/tests/testdir_algos/hglm/pyunit_GH_8487_1p5_noise_var_scoring_history_summary.py index 02279374e526..2e37c3798148 100644 --- a/h2o-py/tests/testdir_algos/hglm/pyunit_GH_8487_1p5_noise_var_scoring_history_summary.py +++ b/h2o-py/tests/testdir_algos/hglm/pyunit_GH_8487_1p5_noise_var_scoring_history_summary.py @@ -5,7 +5,7 @@ from h2o.estimators.hglm import H2OHGLMEstimator as hglm from tests.pyunit_utils import utils_for_glm_hglm_tests -# in this test, want to check the following with standardization and with random intercept: +# in this test, want to check the following with random intercept: # 1.scoring history (both training and valid) # 2. the model summary # 3. Fixed effect coefficients, normal and standardized @@ -20,7 +20,7 @@ def test_scoring_history_model_summary(): x.remove("C1") random_columns = ["C2", "C3", "C4"] hglm_model = hglm(random_columns=random_columns, group_column = "C1", score_each_iteration=True, seed=12345, - max_iterations = 20, random_intercept = False, standardize=False) + max_iterations = 20, random_intercept = False) hglm_model.train(x=x, y=y, training_frame=train, validation_frame=valid) # grab various metrics model_metrics = hglm_model.training_model_metrics() diff --git a/h2o-py/tests/testdir_algos/hglm/pyunit_GH_8487_2_noise_var_scoring_history_summary.py b/h2o-py/tests/testdir_algos/hglm/pyunit_GH_8487_2_noise_var_scoring_history_summary.py index 4ea4954f4697..0ff356a873bc 100644 --- a/h2o-py/tests/testdir_algos/hglm/pyunit_GH_8487_2_noise_var_scoring_history_summary.py +++ b/h2o-py/tests/testdir_algos/hglm/pyunit_GH_8487_2_noise_var_scoring_history_summary.py @@ -5,7 +5,7 @@ from h2o.estimators.hglm import H2OHGLMEstimator as hglm from tests.pyunit_utils import utils_for_glm_hglm_tests -# in this test, want to check the following with standardization and with random intercept: +# in this test, want to check the following with random intercept: # 1.scoring history (both training and valid) # 2. the model summary # 3. Fixed effect coefficients, normal and standardized @@ -20,7 +20,7 @@ def test_scoring_history_model_summary(): x.remove("C1") random_columns = ["C2", "C3", "C4"] hglm_model = hglm(random_columns=random_columns, group_column = "C1", score_each_iteration=True, seed=12345, - max_iterations = 20, random_intercept = False, standardize=False) + max_iterations = 20, random_intercept = False) hglm_model.train(x=x, y=y, training_frame=train, validation_frame=valid) # grab various metrics model_metrics = hglm_model.training_model_metrics() diff --git a/h2o-py/tests/testdir_algos/hglm/pyunit_GH_8487_3_noise_variance_scoring_history_summary.py b/h2o-py/tests/testdir_algos/hglm/pyunit_GH_8487_3_noise_variance_scoring_history_summary.py index 8c9bff043de2..552fc2750103 100644 --- a/h2o-py/tests/testdir_algos/hglm/pyunit_GH_8487_3_noise_variance_scoring_history_summary.py +++ b/h2o-py/tests/testdir_algos/hglm/pyunit_GH_8487_3_noise_variance_scoring_history_summary.py @@ -5,7 +5,7 @@ from h2o.estimators.hglm import H2OHGLMEstimator as hglm from tests.pyunit_utils import utils_for_glm_hglm_tests -# in this test, want to check the following with standardization and with random intercept: +# in this test, want to check the following with random intercept: # 1.scoring history (both training and valid) # 2. the model summary # 3. Fixed effect coefficients, normal and standardized @@ -20,7 +20,7 @@ def test_scoring_history_model_summary(): x.remove("C1") random_columns = ["C2", "C3", "C10", "C20"] hglm_model = hglm(random_columns=random_columns, group_column = "C1", score_each_iteration=True, seed=12345, - random_intercept = True, standardize = False, max_iterations=10) + random_intercept = True, max_iterations=10) hglm_model.train(x=x, y=y, training_frame=train, validation_frame=valid) print(hglm_model) # make sure this one works. # grab various metrics diff --git a/h2o-py/tests/testdir_algos/hglm/pyunit_GH_8487_coefficients_check.py b/h2o-py/tests/testdir_algos/hglm/pyunit_GH_8487_coefficients_check.py index a6589b29fede..22df5d17ae5b 100644 --- a/h2o-py/tests/testdir_algos/hglm/pyunit_GH_8487_coefficients_check.py +++ b/h2o-py/tests/testdir_algos/hglm/pyunit_GH_8487_coefficients_check.py @@ -5,7 +5,7 @@ from h2o.estimators.hglm import H2OHGLMEstimator as hglm from tests.pyunit_utils import utils_for_glm_hglm_tests -# in this test, want to check the following with standardization and de-standardization and with random intercept: +# in this test, want to check to make sure we are getting our coefficients # 1. Fixed effect coefficients; # 2. Random effect coefficients. def test_scoring_history_model_summary(): @@ -17,15 +17,12 @@ def test_scoring_history_model_summary(): x.remove("C1") random_columns = ["C2", "C3", "C10", "C20"] hglm_model = hglm(random_columns=random_columns, group_column = "C1", score_each_iteration=True, seed=12345, - max_iterations=10, standardize=True) + max_iterations=10) hglm_model.train(x=x, y=y, training_frame=train, validation_frame=valid) # grab various metrics coef = hglm_model.coef() - coef_norm = hglm_model.coef_norm() coef_random = hglm_model.coefs_random() coef_random_names = hglm_model.coefs_random_names() - coef_random_norm = hglm_model.coefs_random_norm() - coef_random_names_norm = hglm_model.coefs_random_names_norm() residual_var = hglm_model.residual_variance() mse = hglm_model.mse() mse_fixed = hglm_model.mean_residual_fixed() @@ -41,17 +38,11 @@ def test_scoring_history_model_summary(): " not.".format(mse_fixed, mse) assert mse < mse_fixed_valid, "residual error with only fixed effects from validation frames {0} should exceed that" \ " of mse {1} but is not.".format(mse_fixed_valid, mse) - # check coefficients and normalized coefficients are converted correctly. - numerical_columns = ["C10", "C20", "C30", "C40", "C50"] - coef_norm_manually = utils_for_glm_hglm_tests.normalize_coefs(coef, numerical_columns, train) - pyunit_utils.assertCoefDictEqual(coef_norm, coef_norm_manually, 1e-6) - coef_manually = utils_for_glm_hglm_tests.denormalize_coefs(coef_norm, numerical_columns, train) - pyunit_utils.assertCoefDictEqual(coef, coef_manually, 1e-6) - # check random effect coefficients and normalized random effect coefficients are converted correctly. - random_coeffs_norm_manually = utils_for_glm_hglm_tests.normalize_denormalize_random_coefs(coef_random, coef_random_names, level2_names, numerical_columns, train, normalize=True) - random_coeffs_manually = utils_for_glm_hglm_tests.normalize_denormalize_random_coefs(coef_random_norm, coef_random_names_norm, level2_names, numerical_columns, train, normalize=False) - utils_for_glm_hglm_tests.compare_dicts_with_tupple(coef_random, random_coeffs_manually, tolerance=1e-6) - utils_for_glm_hglm_tests.compare_dicts_with_tupple(coef_random_norm, random_coeffs_norm_manually, tolerance=1e-6) + assert len(coef) == len(coef_random_names), "fixed coefficient length {0} should equal to random coefficient names" \ + " length: {1}".format(len(coef), len(coef_random_names)) + assert len(coef)*len(level2_names) == len(coef_random), \ + "expected random coefficient length: {0}, actual random coefficient names length " \ + "{1}".format(len(coef)*len(level2_names),len(coef_random)) if __name__ == "__main__": pyunit_utils.standalone_test(test_scoring_history_model_summary) diff --git a/h2o-py/tests/testdir_algos/hglm/pyunit_GH_8487_p05_noise_variance_scoring_history_summary.py b/h2o-py/tests/testdir_algos/hglm/pyunit_GH_8487_p05_noise_variance_scoring_history_summary.py index ab741897ac61..c23f2128cc5b 100644 --- a/h2o-py/tests/testdir_algos/hglm/pyunit_GH_8487_p05_noise_variance_scoring_history_summary.py +++ b/h2o-py/tests/testdir_algos/hglm/pyunit_GH_8487_p05_noise_variance_scoring_history_summary.py @@ -20,7 +20,7 @@ def test_scoring_history_model_summary(): x.remove("C1") random_columns = ["C10", "C20", "C30"] hglm_model = hglm(random_columns=random_columns, group_column="C1", score_each_iteration=True, seed=12345, - random_intercept=False, max_iterations=10, standardize=False, em_epsilon=0.000001) + random_intercept=False, max_iterations=10, em_epsilon=0.000001) hglm_model.train(x=x, y=y, training_frame=train, validation_frame=valid) print(hglm_model) # make sure this one works. # grab various metrics diff --git a/h2o-py/tests/testdir_algos/hglm/pyunit_GH_8487_p5_noise_var_scoring_history_summary.py b/h2o-py/tests/testdir_algos/hglm/pyunit_GH_8487_p5_noise_var_scoring_history_summary.py index e209b7b452ab..0aa576ac51bc 100644 --- a/h2o-py/tests/testdir_algos/hglm/pyunit_GH_8487_p5_noise_var_scoring_history_summary.py +++ b/h2o-py/tests/testdir_algos/hglm/pyunit_GH_8487_p5_noise_var_scoring_history_summary.py @@ -20,7 +20,7 @@ def test_scoring_history_model_summary(): x.remove("C1") random_columns = ["C2", "C3", "C4"] hglm_model = hglm(random_columns=random_columns, group_column = "C1", score_each_iteration=True, seed=12345, - max_iterations = 10, random_intercept = True, standardize=False) + max_iterations = 10, random_intercept = True) hglm_model.train(x=x, y=y, training_frame=train, validation_frame=valid) # grab various metrics model_metrics = hglm_model.training_model_metrics() diff --git a/h2o-r/h2o-package/R/hglm.R b/h2o-r/h2o-package/R/hglm.R index 2c37e12d0bef..854c57a46ccb 100644 --- a/h2o-r/h2o-package/R/hglm.R +++ b/h2o-r/h2o-package/R/hglm.R @@ -40,7 +40,6 @@ #' @param family Family. Only gaussian is supported now. Must be one of: "gaussian". Defaults to gaussian. #' @param rand_family rand_family. Set distribution of random effects. Only Gaussian is implemented now. Must be one of: #' "gaussian". -#' @param standardize \code{Logical}. Standardize numeric columns to have zero mean and unit variance. Defaults to FALSE. #' @param max_iterations Maximum number of iterations. Value should >=1. A value of 0 is only set when only the model coefficient #' names and model coefficient dimensions are needed. Defaults to -1. #' @param initial_fixed_effects An array that contains initial values of the fixed effects coefficient. @@ -94,7 +93,6 @@ h2o.hglm <- function(x, plug_values = NULL, family = c("gaussian"), rand_family = c("gaussian"), - standardize = FALSE, max_iterations = -1, initial_fixed_effects = NULL, initial_random_effects = NULL, @@ -165,8 +163,6 @@ h2o.hglm <- function(x, parms$family <- family if (!missing(rand_family)) parms$rand_family <- rand_family - if (!missing(standardize)) - parms$standardize <- standardize if (!missing(max_iterations)) parms$max_iterations <- max_iterations if (!missing(initial_fixed_effects)) @@ -214,7 +210,6 @@ h2o.hglm <- function(x, plug_values = NULL, family = c("gaussian"), rand_family = c("gaussian"), - standardize = FALSE, max_iterations = -1, initial_fixed_effects = NULL, initial_random_effects = NULL, @@ -290,8 +285,6 @@ h2o.hglm <- function(x, parms$family <- family if (!missing(rand_family)) parms$rand_family <- rand_family - if (!missing(standardize)) - parms$standardize <- standardize if (!missing(max_iterations)) parms$max_iterations <- max_iterations if (!missing(initial_fixed_effects)) @@ -340,15 +333,6 @@ h2o.coef_random <- function(model) { return(model@model$ubeta) } -#' Extracts the normalized/standardized random effects coefficients of an HGLM model. -#' -#' @param model is a H2O HGLM model. -#' @export -h2o.coef_random_norm <- function(model) { - if (is(model, "H2OModel") && (model@algorithm=="hglm")) - return(model@model$ubeta_normalized) -} - #' Extracts the group_column levels of an HGLM model. The group_column is usually referred to as level 2 predictor. #' #' @param model is a H2O HGLM model. @@ -367,16 +351,6 @@ h2o.coefs_random_names <- function(model) { return(model@model$random_coefficient_names) } -#' Extracts the coefficient names of normalized/standardized random effect coefficients. If no random intercept is -#' set, during the normalization/de-normalization process, an random intercept will be added. -#' -#' @param model is a H2O HGLM model. -#' @export -h2o.coefs_random_names_norm <- function(model) { - if (is(model, "H2OModel") && (model@algorithm=="hglm")) - return(model@model$random_coefficient_names_normalized) -} - #' Extracts scoring history of validation dataframe during training #' #' @param model is a H2O HGLM model. diff --git a/h2o-r/h2o-package/R/models.R b/h2o-r/h2o-package/R/models.R index dcd2429fdf38..764d262b100d 100755 --- a/h2o-r/h2o-package/R/models.R +++ b/h2o-r/h2o-package/R/models.R @@ -3104,7 +3104,7 @@ h2o.coef_names <- function(object) { #' @export h2o.coef_norm <- function(object, predictorSize=-1) { if (is(object, "H2OModel") && - (object@algorithm %in% c("glm", "gam", "coxph", "modelselection", "hglm"))) { + (object@algorithm %in% c("glm", "gam", "coxph", "modelselection"))) { if (object@algorithm == "modelselection") { if (object@allparameters$mode == "maxrsweep" && !object@allparameters$build_glm_model) { @@ -3174,7 +3174,7 @@ h2o.coef_norm <- function(object, predictorSize=-1) { ) } } else { - stop("Can only extract coefficients from GAMs/GLMs/HGLMs/CoxPHs/ModelSelections") + stop("Can only extract coefficients from GAMs/GLMs/CoxPHs/ModelSelections") } } diff --git a/h2o-r/h2o-package/pkgdown/_pkgdown.yml b/h2o-r/h2o-package/pkgdown/_pkgdown.yml index 341749c6db11..b4c42b3fa98b 100644 --- a/h2o-r/h2o-package/pkgdown/_pkgdown.yml +++ b/h2o-r/h2o-package/pkgdown/_pkgdown.yml @@ -76,8 +76,6 @@ reference: - h2o.coef_norm - h2o.coef_random - h2o.coefs_random_names - - h2o.coefs_random_names_norm - - h2o.coef_random_norm - h2o.coef_with_p_values - h2o.colnames - h2o.columns_by_type