Skip to content

Commit

Permalink
GH-8487: remove standardize from HGLM as the convention does not do s…
Browse files Browse the repository at this point in the history
…tandardization.
  • Loading branch information
wendycwong committed Oct 22, 2024
1 parent e3a4ed5 commit a473673
Show file tree
Hide file tree
Showing 27 changed files with 99 additions and 752 deletions.
13 changes: 3 additions & 10 deletions h2o-algos/src/main/java/hex/hglm/HGLM.java
Original file line number Diff line number Diff line change
Expand Up @@ -183,12 +183,6 @@ else if (!trainFrame.vec(_parms._group_column).isCategorical())
if (_parms._tau_e_var_init <= 0)
error("tau_e_var_init", "If gen_syn_data is true, tau_e_var_init must be > 0.");
}

if (!_parms._random_intercept && _parms._standardize)
warn("random_intercept and standardize",
"If random_intercept is false and standardize is true, model building process can be unstable" +
" due to the denormalization process which can create singular T matrix. If encounter singlar" +
" T matrix problem, set standardize to false in this case to ensure model building can finish.");
}
}

Expand All @@ -214,8 +208,8 @@ public void computeImpl() {
* 3. Set modelOutput fields.
*/
// _dinfo._adaptedFrame will contain group_column. Check and make sure clients will pass that along as well.
_dinfo = new DataInfo(_train.clone(), null, 1, _parms._use_all_factor_levels, _parms._standardize ?
DataInfo.TransformType.STANDARDIZE : DataInfo.TransformType.NONE, DataInfo.TransformType.NONE,
_dinfo = new DataInfo(_train.clone(), null, 1, _parms._use_all_factor_levels,
DataInfo.TransformType.NONE, DataInfo.TransformType.NONE,
_parms.missingValuesHandling() == Skip,
_parms.missingValuesHandling() == MeanImputation
|| _parms.missingValuesHandling() == PlugValues,
Expand Down Expand Up @@ -293,7 +287,6 @@ void fitEM(HGLMModel model, Job job, ScoringHistory scTrain, ScoringHistory scVa
if (_parms._showFixedMatVecs)
model._output.setModelOutputFixMatVec(engineTask);
_state = new ComputationStateHGLM(_job, _parms, _dinfo, engineTask, iteration);
generateNonStandardizeZTZArjTArs(_parms, model); // generate not standardized transpose(Z)*Z, transpose(Zj)*Zj
try {
if (_parms._max_iterations > 0) {
// grab current value of fixed beta, tauEVar, tauUVar
Expand Down Expand Up @@ -360,7 +353,7 @@ public boolean progress(double[] beta, double[][] ubeta, double[][] tmat, double
if (_parms.valid() != null)
scoreAndUpdateModel(model, false, scValid);
} else {
// calculate log likelihood with current parameter settings, standardize if parms._standardize and vice versa
// calculate log likelihood with current parameter settings
double logLikelihood = calHGLMllg(_state._nobs, tmat, tauEVarE10, model._output._arjtarj, rLlh._sse_fixed,
rLlh._yMinusXTimesZ);
scTrain.addIterationScore(_state._iter, logLikelihood, tauEVarE10);
Expand Down
53 changes: 7 additions & 46 deletions h2o-algos/src/main/java/hex/hglm/HGLMModel.java
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ protected PredictScoreResult predictScoreImpl(Frame fr, Frame adaptFrm, String d
if (gs._computeMetrics) { // only calculate log-likelihood, mse and other metrics if _computeMetrics
mb = gs._mb;
if (forTraining) {
_output._yminusxtimesz_score = gs._yMinusXTimesZ;
_output._yminusxtimesz = gs._yMinusXTimesZ;
_output._yMinusfixPredSquare = mb._yMinusfixPredSquare;
} else { // store for all frames other than the training frame
_output._yminusxtimesz_valid = gs._yMinusXTimesZ;
Expand All @@ -88,7 +88,6 @@ private HGLMScore makeScoringTask(Frame adaptFrm, boolean makePredictions, Job j
public static class HGLMParameters extends Model.Parameters {
public long _seed = -1;
public GLMModel.GLMParameters.Family _family;
public boolean _standardize = false;
public int _max_iterations = -1;
public double[] _initial_fixed_effects; // initial values of fixed coefficients
public Key _initial_random_effects; // frame key that contains the initial starting values of random coefficient effects
Expand Down Expand Up @@ -174,13 +173,10 @@ public static class HGLMModelOutput extends Model.Output {
final GLMModel.GLMParameters.Family _random_family;
public String[] _fixed_coefficient_names; // include intercept only if _parms._intercept is true
public String[] _random_coefficient_names; // include intercept only if _parms._random_intercept = true
public String[] _random_coefficient_names_normalized;
public String[] _group_column_names;
public long _training_time_ms;
public double[] _beta; // fixed coefficients, not normalized
public double[][] _ubeta; // random coefficients, not normalized
public double[] _beta_normalized;
public double[][] _ubeta_normalized;
public double[] _beta; // fixed coefficients
public double[][] _ubeta; // random coefficients
public double[][] _tmat; // calculated with non-standardize random effects coefficients
double _tauUVar;
public double _tau_e_var;
Expand All @@ -191,14 +187,10 @@ public static class HGLMModelOutput extends Model.Output {
public double[][][] _arjtarj;
public double[][][] _afjtarj;
//public double[][] _zttimesz; // calculate from standardized or non-standardized Zj
public double[][][] _arjtarj_score; // used during scoring for metrics calculation. Not standardized
// public double[][] _zttimesz_score; // used during scoring for metrics calculation. Not standardized
public double[][] _yminusxtimesz; // generate during training
public double[][] _yminusxtimesz_score; // generate during scoring
public double[][] _yminusxtimesz_valid; // store same value for frames other than training frame
public int _num_fixed_coeffs;
public int _num_random_coeffs;
public int _num_random_coeffs_normalized;
int[] _randomCatIndices;
int[] _randomNumIndices;
int[] _randomCatArrayStartIndices;
Expand All @@ -217,7 +209,6 @@ public static class HGLMModelOutput extends Model.Output {
public double _yMinusfixPredSquare;
public double _yMinusfixPredSquare_valid;
public TwoDimTable _scoring_history_valid;
public double _mse_fixed; // mse with fixed effect only

/**
* For debugging only. Copy over the generated fixed matrices to model._output.
Expand Down Expand Up @@ -250,55 +241,25 @@ public HGLMModelOutput(HGLM b, DataInfo dinfo) {
_family = b._parms._family;
_random_family = b._parms._random_family;
}

/**
 * Copies the quantities held by the computation state (coefficient names, variance components,
 * T matrix, coefficient counts, level-2 unit info, coefficients and iteration count) into the
 * fields of this model output.
 *
 * NOTE(review): this span looks like it interleaves the pre-change and post-change lines of a
 * diff view (the standardize branch followed by unconditional assignments to the same fields).
 * Confirm which version is intended before relying on the combined text.
 *
 * @param state computation state whose getters and fields supply the values stored here
 */
public void setModelOutputFields(ComputationStateHGLM state) {
_fixed_coefficient_names = state.get_fixedCofficientNames();
_random_coefficient_names = state.get_randomCoefficientNames();
_group_column_names = state.get_groupColumnNames();
_tauUVar = state.get_tauUVar();
// _tau_e_var = state.get_tauEVarE17();
// _tau_e_var = state.get_tauEVarE17();
_tau_e_var = state.get_tauEVarE10();
_tmat = state.get_T();
_num_fixed_coeffs = state.get_numFixedCoeffs();
_num_random_coeffs = state.get_numRandomCoeffs();
_numLevel2Units = state.get_numLevel2Units();
_level2UnitIndex = state.get_level2UnitIndex();
_nobs = state._nobs;
// standardize=true: the state's coefficients are stored in the *_normalized fields and the plain
// fields are derived from them through the denormalize helpers.
if (state._parms._standardize) { // for random coefficients, the names of random coefficients names may change
_beta_normalized = state.get_beta();
_ubeta_normalized = state.get_ubeta();
_beta = denormalizedOneBeta(_beta_normalized, _fixed_coefficient_names, _dinfo._adaptedFrame.names(),
state._parms.train(), true);
_ubeta = denormalizedUBeta(_ubeta_normalized, _random_coefficient_names, state._parms._random_columns,
state._parms.train(), state._parms._random_intercept);
_random_coefficient_names_normalized = _random_coefficient_names.clone();
if (_ubeta_normalized[0].length < _ubeta[0].length) // added intercept term, need to add name to random coeff names
_random_coefficient_names = copyCoefAddIntercept(_random_coefficient_names_normalized);
// replaces the _tmat taken from state.get_T() above with one built from the denormalized _ubeta
_tmat = generateNewTmat(_ubeta);
} else {
// standardize=false: the state's coefficients are stored as-is and the *_normalized fields are
// derived from them through the normalize helpers.
_beta = state.get_beta();
_beta_normalized = normalizedOneBeta(_beta, _fixed_coefficient_names, _dinfo._adaptedFrame.names(),
state._parms.train(), true);
_ubeta = state.get_ubeta();
_ubeta_normalized = normalizedUBeta(_ubeta, _random_coefficient_names, state._parms._random_columns,
state._parms.train(), state._parms._random_intercept);
// same width means no intercept slot was appended, so the names can be shared
if (_ubeta[0].length == _ubeta_normalized[0].length)
_random_coefficient_names_normalized = _random_coefficient_names;
else
_random_coefficient_names_normalized = copyCoefAddIntercept(_random_coefficient_names);
}
_num_random_coeffs_normalized = _ubeta_normalized[0].length;
// NOTE(review): the three assignments below overwrite _beta, _ubeta and _num_random_coeffs that
// were already set above; presumably they are the post-change replacement for the branch — confirm.
_beta = state.get_beta();
_ubeta = state.get_ubeta();
_num_random_coeffs = _ubeta[0].length;
_iterations = state._iter;
}

/**
 * Returns a new array holding every entry of {@code originalNames} in the same order,
 * followed by one extra trailing entry {@code "intercept"}. The input array is not modified.
 *
 * @param originalNames coefficient names to copy
 * @return a new array one element longer than the input, ending with "intercept"
 */
public static String[] copyCoefAddIntercept(String[] originalNames) {
  final int interceptSlot = originalNames.length;
  // copyOf pads the extra trailing slot with null, which is filled in right below
  final String[] withIntercept = java.util.Arrays.copyOf(originalNames, interceptSlot + 1);
  withIntercept[interceptSlot] = "intercept";
  return withIntercept;
}

@Override
public int nclasses() { // only support Gaussian now
Expand Down
2 changes: 1 addition & 1 deletion h2o-algos/src/main/java/hex/hglm/HGLMScore.java
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ public HGLMScore(final Job j, final HGLMModel model, DataInfo dinfo, final Strin
_randomCatArrayStartIndices = model._output._randomCatArrayStartIndices;
_predStartIndexRandom = model._output._predStartIndexRandom;
_randomSlopeToo = model._output._randomSlopeToo;
_randomIntercept = _parms._random_intercept || (_parms._standardize);
_randomIntercept = _parms._random_intercept;
_tmat = model._output._tmat; // generated from non-standardized random coefficients
randomObj = new Random(_parms._seed);
_noiseStd = Math.sqrt(_parms._tau_e_var_init); // not affected by standardization/normalization
Expand Down
4 changes: 2 additions & 2 deletions h2o-algos/src/main/java/hex/hglm/HGLMTask.java
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ public void map(Chunk[] chks) {
double residualFixed;
DataInfo.Row r = _dinfo.newDenseRow();
for (int rowInd = 0; rowInd < chkLen; rowInd++) {
_dinfo.extractDenseRow(chks, rowInd, r); // numerical values are standardized automatically if standardize=true
_dinfo.extractDenseRow(chks, rowInd, r);
if (!r.isBad() && !(r.weight == 0)) {
y = r.response(0);
level2Index = _parms._use_all_factor_levels ? r.binIds[_level2UnitIndex] - _dinfo._catOffsets[_level2UnitIndex] :
Expand Down Expand Up @@ -266,7 +266,7 @@ public void map(Chunk[] chks) {
int chkLen = chks[0].len();
DataInfo.Row r = _dinfo.newDenseRow();
for (int rowInd = 0; rowInd < chkLen; rowInd++) {
_dinfo.extractDenseRow(chks, rowInd, r); // numerical values are standardized automatically if standardize=true
_dinfo.extractDenseRow(chks, rowInd, r);
if (!r.isBad() && !(r.weight == 0)) {
y = r.response(0);
_YjTYjSum += y * y;
Expand Down
134 changes: 0 additions & 134 deletions h2o-algos/src/main/java/hex/hglm/HGLMUtils.java
Original file line number Diff line number Diff line change
@@ -1,19 +1,13 @@
package hex.hglm;

import Jama.Matrix;
import hex.DataInfo;
import water.DKV;
import water.Key;
import water.fvec.Frame;
import water.util.ArrayUtils;

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

import static hex.glm.GLMModel.GLMParameters.MissingValuesHandling.*;
import static water.util.ArrayUtils.*;

public class HGLMUtils {
Expand Down Expand Up @@ -137,106 +131,6 @@ public static double calTauEvarEq17(double residualSquare, double tauEVar, doubl
double sigmaTrace = tauEVar * trace(cInvArjTArj) ;
return (residualSquare + sigmaTrace)*oneOverN;
}

/**
 * Builds the de-standardized counterpart of one coefficient vector.
 *
 * A coefficient whose name appears in {@code colNames} is divided by that column's standard
 * deviation, and {@code beta * mean / std} is subtracted from the intercept slot. Any other
 * coefficient except the one named "intercept" is copied unchanged. Column means and standard
 * deviations are gathered by {@code genMeanStd} from {@code train}.
 *
 * The intercept slot is the last element of the result. When {@code interceptPresent} is false
 * the result is one element longer than {@code beta}, and that extra trailing slot receives the
 * accumulated intercept adjustment.
 *
 * @param beta             coefficient vector to convert
 * @param coeffNames       name of each entry of {@code beta}
 * @param colNames         column names whose coefficients are rescaled
 * @param train            frame the column statistics are read from
 * @param interceptPresent whether the last entry of {@code beta} is the intercept
 * @return a new array with the converted coefficients
 */
public static double[] denormalizedOneBeta(double[] beta, String[] coeffNames, String[] colNames,
                                           Frame train, boolean interceptPresent) {
  int numCoeff = beta.length;
  Map<String, Double> coefMean = new HashMap<>();
  Map<String, Double> coefStd = new HashMap<>();
  List<String> scaledColList = Arrays.stream(colNames).collect(Collectors.toList());
  genMeanStd(coeffNames, scaledColList, train, coefMean, coefStd);
  int interceptIndex = interceptPresent ? numCoeff - 1 : numCoeff;
  double[] denormalized = new double[interceptIndex + 1];
  if (interceptPresent) {
    denormalized[interceptIndex] = beta[interceptIndex];
  }

  for (int coefInd = 0; coefInd < numCoeff; coefInd++) {
    String coefName = coeffNames[coefInd];
    if (scaledColList.contains(coefName)) { // pick out the numerical columns
      denormalized[coefInd] = beta[coefInd] / coefStd.get(coefName);
      denormalized[interceptIndex] -= beta[coefInd] * coefMean.get(coefName) / coefStd.get(coefName);
    } else if (!"intercept".equals(coefName)) {
      // Compare by value: a reference comparison (!=) is true for any non-interned "intercept"
      // string and would overwrite the intercept adjustment accumulated above.
      denormalized[coefInd] = beta[coefInd];
    }
  }
  return denormalized;
}

/**
 * Produces the de-standardized random coefficients for every level-2 unit.
 *
 * When {@code randomColAllEnum} reports that all random columns are categorical, each row is
 * simply cloned; otherwise each row is converted with {@code denormalizedOneBeta}.
 *
 * @param ubeta            random coefficients, one row per level-2 unit
 * @param randomCoeffNames name of each random coefficient
 * @param randomColNames   names of the random columns
 * @param train            training frame
 * @param randomIntercept  forwarded to {@code denormalizedOneBeta} as its intercept flag
 * @return a new array with one converted row per level-2 unit
 */
public static double[][] denormalizedUBeta(double[][] ubeta, String[] randomCoeffNames, String[] randomColNames,
                                           Frame train, boolean randomIntercept) {
  final int unitCount = ubeta.length;
  final double[][] result = new double[unitCount][];
  final boolean copyAsIs = randomColAllEnum(train, randomColNames);
  int unit = 0;
  for (double[] unitBeta : ubeta) {
    result[unit++] = copyAsIs
        ? unitBeta.clone()
        : denormalizedOneBeta(unitBeta, randomCoeffNames, randomColNames, train, randomIntercept);
  }
  return result;
}

/**
 * Builds the standardized counterpart of one coefficient vector.
 *
 * A coefficient whose name appears in {@code columnNames} is multiplied by that column's standard
 * deviation, and the intercept slot is increased by the rescaled coefficient times
 * {@code mean / std}. Any other coefficient except the one named "intercept" is copied unchanged.
 * Column means and standard deviations are gathered by {@code genMeanStd} from {@code train}.
 *
 * The intercept slot is the last element of the result. When {@code interceptPresent} is false
 * the result is one element longer than {@code beta}, and that extra trailing slot receives the
 * accumulated intercept adjustment.
 *
 * @param beta             coefficient vector to convert
 * @param coeffNames       name of each entry of {@code beta}
 * @param columnNames      column names whose coefficients are rescaled
 * @param train            frame the column statistics are read from
 * @param interceptPresent whether the last entry of {@code beta} is the intercept
 * @return a new array with the converted coefficients
 */
public static double[] normalizedOneBeta(double[] beta, String[] coeffNames, String[] columnNames,
                                         Frame train, boolean interceptPresent) {
  int numCoeff = beta.length;
  int interceptIndex = interceptPresent ? numCoeff - 1 : numCoeff;
  double[] normalizedBeta = new double[interceptIndex + 1];
  List<String> colNamesList = Arrays.stream(columnNames).collect(Collectors.toList());
  Map<String, Double> coefMean = new HashMap<>();
  Map<String, Double> coefStd = new HashMap<>();
  genMeanStd(coeffNames, colNamesList, train, coefMean, coefStd);

  if (interceptPresent) {
    normalizedBeta[interceptIndex] = beta[interceptIndex];
  }

  for (int coefInd = 0; coefInd < numCoeff; coefInd++) {
    String coefName = coeffNames[coefInd];
    if (colNamesList.contains(coefName)) { // pick out numerical columns
      normalizedBeta[coefInd] = beta[coefInd] * coefStd.get(coefName);
      normalizedBeta[interceptIndex] += normalizedBeta[coefInd] * coefMean.get(coefName) / coefStd.get(coefName);
    } else if (!"intercept".equals(coefName)) { // no change to enum columns
      // Compare by value: a reference comparison (!=) is true for any non-interned "intercept"
      // string and would overwrite the intercept adjustment accumulated above.
      normalizedBeta[coefInd] = beta[coefInd];
    }
  }
  return normalizedBeta;
}

/**
 * Produces the standardized random coefficients for every level-2 unit; the intercept is always
 * the last entry of a row.
 *
 * When {@code randomColAllEnum} reports that all random columns are categorical, each row is
 * simply cloned; otherwise each row is converted with {@code normalizedOneBeta}.
 *
 * @param ubeta            random coefficients, one row per level-2 unit
 * @param randomCoeffNames name of each random coefficient
 * @param randomColNames   names of the random columns
 * @param train            training frame
 * @param randomIntercept  forwarded to {@code normalizedOneBeta} as its intercept flag
 * @return a new array with one converted row per level-2 unit
 */
public static double[][] normalizedUBeta(double[][] ubeta, String[] randomCoeffNames, String[] randomColNames,
                                         Frame train, boolean randomIntercept) {
  final int unitCount = ubeta.length;
  final double[][] result = new double[unitCount][];
  final boolean copyAsIs = randomColAllEnum(train, randomColNames);
  int unit = 0;
  for (double[] unitBeta : ubeta) {
    result[unit++] = copyAsIs
        ? unitBeta.clone()
        : normalizedOneBeta(unitBeta, randomCoeffNames, randomColNames, train, randomIntercept);
  }
  return result;
}

/**
 * Records the mean and standard deviation of every coefficient whose name is also listed in
 * {@code randomColNames}. The statistics are read from the column of the same name in
 * {@code train} and stored with {@code putIfAbsent}, so an entry already present is kept.
 *
 * @param randomCoeffNames coefficient names to examine
 * @param randomColNames   column names that qualify for statistics
 * @param train            frame the column statistics are read from
 * @param coefMean         output map, coefficient name to column mean
 * @param coefSTD          output map, coefficient name to column standard deviation
 */
public static void genMeanStd(String[] randomCoeffNames, List<String> randomColNames, Frame train,
                              Map<String, Double> coefMean, Map<String, Double> coefSTD) {
  for (String name : randomCoeffNames) {
    if (!randomColNames.contains(name)) {
      continue; // not one of the listed columns: nothing to record
    }
    double mean = train.vec(name).mean();
    double sigma = train.vec(name).sigma();
    coefMean.putIfAbsent(name, mean);
    coefSTD.putIfAbsent(name, sigma);
  }
}

public static double[][] fillZTTimesZ(double[][][] arjTArj) {
int numLevel2 = arjTArj.length;
Expand All @@ -258,29 +152,6 @@ public static boolean checkPositiveG(int numLevel2Units, double[][] tMat) {
return (new Matrix(gMat).det()) >= 0;
}

/**
 * Fills {@code model._output._arjtarj_score}.
 *
 * Without standardization the array computed during training ({@code model._output._arjtarj}) is
 * reused. With standardization a second {@code DataInfo} is built with
 * {@code TransformType.NONE} and a {@code ComputationEngineTask} is run over it; its
 * {@code _ArjTArj} becomes the score array.
 *
 * While that task runs, {@code parms._random_intercept} is temporarily forced to true when any
 * random column is not categorical. The original value is restored in a {@code finally} block so
 * the caller's parameters are not left modified if building the {@code DataInfo} or running the
 * task throws.
 *
 * @param parms model parameters; {@code _random_intercept} is changed temporarily and restored
 * @param model model whose output receives {@code _arjtarj_score}
 */
public static void generateNonStandardizeZTZArjTArs(HGLMModel.HGLMParameters parms, HGLMModel model) {
  if (!parms._standardize) {
    model._output._arjtarj_score = model._output._arjtarj;
    // model._output._zttimesz_score = model._output._zttimesz;
    return;
  }

  boolean originalRandomIntercept = parms._random_intercept;
  parms._random_intercept = parms._random_intercept || !randomColAllEnum(parms.train(), parms._random_columns);
  try {
    List<String> colNames = Arrays.asList(parms.train().names());
    boolean hasWeights = model._parms._weights_column != null && colNames.contains(model._parms._weights_column);
    boolean hasOffsets = model._parms._offset_column != null && colNames.contains(model._parms._offset_column);
    DataInfo dinfo = new DataInfo(parms.train().clone(), null, 1, parms._use_all_factor_levels,
        DataInfo.TransformType.NONE, DataInfo.TransformType.NONE,
        parms.missingValuesHandling() == Skip, parms.missingValuesHandling() == MeanImputation
        || parms.missingValuesHandling() == PlugValues, parms.makeImputer(), false, hasWeights,
        hasOffsets, false, null);
    HGLMTask.ComputationEngineTask engineTask = new HGLMTask.ComputationEngineTask(null, parms, dinfo);
    engineTask.doAll(dinfo._adaptedFrame);
    model._output._arjtarj_score = engineTask._ArjTArj;
    // model._output._zttimesz_score = engineTask._zTTimesZ;
  } finally {
    // always undo the temporary override, even when the task above fails
    parms._random_intercept = originalRandomIntercept;
  }
}

public static double[][] generateNewTmat(double[][] ubeta) {
int numIndex2 = ubeta.length;
double oneOverJ = 1.0/numIndex2;
Expand All @@ -292,9 +163,4 @@ public static double[][] generateNewTmat(double[][] ubeta) {
mult(newTmat, oneOverJ);
return newTmat;
}

/**
 * Tells whether every column named in {@code randomColumns} is categorical in {@code train}.
 * Every listed column is looked up; an empty list yields {@code true}.
 *
 * @param train         frame holding the columns
 * @param randomColumns names of the columns to check
 * @return {@code true} when the number of categorical columns equals the number of names given
 */
public static boolean randomColAllEnum(Frame train, String[] randomColumns) {
  int categoricalCount = 0;
  for (String column : randomColumns) {
    if (train.vec(column).isCategorical()) {
      categoricalCount++;
    }
  }
  return categoricalCount == randomColumns.length;
}
}
4 changes: 2 additions & 2 deletions h2o-algos/src/main/java/hex/hglm/MetricBuilderHGLM.java
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,8 @@ public ModelMetrics makeModelMetrics(Model m, Frame f, Frame adaptedFrame, Frame
double[][] tmat = hglmM._output._tmat; // already set with non-standardized random coefficients

if (forTraining) {
double loglikelihood = calHGLMllg(metricsRegression._nobs, tmat, hglmM._output._tau_e_var, hglmM._output._arjtarj_score,
this._yMinusfixPredSquare, hglmM._output._yminusxtimesz_score);
double loglikelihood = calHGLMllg(metricsRegression._nobs, tmat, hglmM._output._tau_e_var, hglmM._output._arjtarj,
this._yMinusfixPredSquare, hglmM._output._yminusxtimesz);
mm = new ModelMetricsRegressionHGLM(m, f, metricsRegression._nobs, this.weightedSigma(), loglikelihood,
this._customMetric, hglmM._output._iterations, hglmM._output._beta, hglmM._output._ubeta,
tmat, hglmM._output._tau_e_var, metricsRegression._MSE, this._yMinusfixPredSquare / metricsRegression._nobs,
Expand Down
Loading

0 comments on commit a473673

Please sign in to comment.