From 93239ffec2fec336176e531d2d486a29e6199163 Mon Sep 17 00:00:00 2001 From: Zebedee Nicholls Date: Thu, 21 Oct 2021 22:42:37 +1100 Subject: [PATCH 01/23] Sketch out prototype --- mesmer/prototype/calibrate.py | 148 +++++++++++++++++++++++++ mesmer/prototype/calibrate_multiple.py | 25 +++++ 2 files changed, 173 insertions(+) create mode 100644 mesmer/prototype/calibrate.py create mode 100644 mesmer/prototype/calibrate_multiple.py diff --git a/mesmer/prototype/calibrate.py b/mesmer/prototype/calibrate.py new file mode 100644 index 00000000..8d8075e6 --- /dev/null +++ b/mesmer/prototype/calibrate.py @@ -0,0 +1,148 @@ +import abc + + +class MesmerCalibrateBase(metaclass=abc.ABCMeta): + """ + Abstract base class for calibration + """ + + # then we could build up utils, whether they were static methods of the + # base class or their own utils module doesn't really matter... + # an example is below, but we could have others to handle grouping, adding + # dimension information back to numpy arrays etc. + @staticmethod + def _flatten(target, predictor): + """ + Flatten xarray dataarrays into basic numpy arrays + + Parameters + ---------- + target : xarray.DataArray + + predictor : xarray.DataArray + + Returns + ------- + (np.ndarray, np.ndarray, dict) + Flattened target, flattened predictor and info required to convert + back into xarray if desired + """ + + @staticmethod + def _get_calibration_groups(target, groups): + return target.groupby(groups) + + +class MesmerCalibrateTargetOnly(MesmerCalibrateBase): + @abc.abstractmethod + def calibrate(self, predictor, **kwargs): + """ + Calibrate a model + + Parameters + ---------- + predictor : xarray.DataArray + Predictor data array + + **kwargs + Passed onto the calibration method + + Returns + ------- + xarray.DataArray + Fitting coefficients (have to think about how best to label and + store) + """ + +class MesmerCalibrateTargetPredictor(MesmerCalibrateBase): + @abc.abstractmethod + def calibrate(self, predictor, target, **kwargs): + """ + Calibrate a model + + Parameters + ---------- + predictor : xarray.DataArray + Predictor data array + + target : xarray.DataArray + Target data + + **kwargs + Passed onto the calibration method + + Returns + ------- + xarray.DataArray + Fitting coefficients (have to think about how best to label and + store) + """ + + +class LinearRegression(MesmerCalibrateTargetPredictor): + """ + following + + https://github.com/MESMER-group/mesmer/blob/d73e8f521a2e1d081a48b775ba14dd764cb671e8/mesmer/calibrate_mesmer/train_lt.py#L165 + + All the lines above and below that line are basically just data + preparation, which makes it very hard to see what the model actually is + """ + @staticmethod + def _regress_single_point(target_point, predictor, weights=None): + # this is the method that actually does the regression + args = [predictor, target_point] + if weights is not None: + args.append(weights) + + reg = LinearRegression().fit(*args) + + return reg + + def calibrate(self, target, predictor, weights=None, groups=["gridpoint"]): + # this method handles the pre-processing to get the data ready for the + # regression, does the regression (or calls another method to do so), + # then puts it all back into a DataArray for output + + # would need to be smarter here too to handle multiple predictors and/ + # or target variables but that should be doable + res = { + "intercepts": [], + "coef": [], + } + for target_group in self._get_calibration_groups(target, groups): + res_group = target_group.apply(self._regress_single_point, 
predictor, weights) + # need to be smarter about this, but idea is to store things using + # xarray DataArray + res["intercepts"].append(res_group.intercept_) + res["coef"].append(res_group.coef_) + + res = xr.DataArray(res) + return res + + +class AutoRegression(MesmerCalibrateTargetOnly): + def _regress_single(target, lags): + res = AutoReg(target, lags=lags).fit() + + return res + + def calibrate(self, target, groups=["gridpoint"], lags=1): + # would need to be smarter here too to handle multiple predictors and/ + # or target variables but that should be doable + res = { + "intercepts": [], + "coef": [], + "stds": [], + } + for target_group in self._get_calibration_groups(target, groups): + res_group = target_group.apply(self._regress_single, lags) + # need to be smarter about this, but idea is to store things using + # xarray DataArray + res["intercepts"].append(res_group.params[0]) + res["coef"].append(res_group.params[1]) + res["stds"].append(np.sqrt(res_group.sigma2)) + + res = xr.DataArray(res) + + return res diff --git a/mesmer/prototype/calibrate_multiple.py b/mesmer/prototype/calibrate_multiple.py new file mode 100644 index 00000000..103d454c --- /dev/null +++ b/mesmer/prototype/calibrate_multiple.py @@ -0,0 +1,25 @@ +def calibrate_multiple_scenarios_and_ensemble_members(targets, predictors, calibration_class, calibration_kwargs, weighting_style): + """ + Calibrate based on multiple scenarios and ensemble members per scenario + + Parameters + ---------- + targets : xarray.DataArray + Target variables to calibrate to. This should have a scenario and an ensemble-member dimension (you'd need a different function to prepare things such that the dimensions all worked) + + predictors : xarray.DataArray + Predictor variables, as above re comments about prepping data + + calibration_class : mesmer.calibration.MesmerCalibrateBase + Class to use for calibration + + calibration_kwargs : dict + Keyword arguments to pass to the calibration class + + weighting_style : ["equal", "equal_scenario", "equal_ensemble_member"] + How to weight combinations of scenarios and ensemble members. "equal" --> all scenario - ensemble member combination are treated equally. "equal_scenario" --> all scenarios get equal weight (so if a scenario has more ensemble members then each ensemble member gets less weight). "equal_ensemble_member" --> all ensemble members get the same weight (so if an ensemble member is reported for more than one scenario then each scenario gets less weight for that ensemble member) + """ + # this function (or class) would handle all of the looping over scenarios + # and ensemble members. 
It would call the calibration class on each + # scenario and ensemble member and know how much weight to give each + # before combining them From 9ae897f43c8a048825597412f0fdc4b134b65846 Mon Sep 17 00:00:00 2001 From: Zebedee Nicholls Date: Sun, 24 Oct 2021 17:47:31 +1100 Subject: [PATCH 02/23] Mark path which is never used to simply things --- mesmer/calibrate_mesmer/train_lt.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mesmer/calibrate_mesmer/train_lt.py b/mesmer/calibrate_mesmer/train_lt.py index 13ae4e93..9c57317f 100644 --- a/mesmer/calibrate_mesmer/train_lt.py +++ b/mesmer/calibrate_mesmer/train_lt.py @@ -222,5 +222,5 @@ def train_lt(preds, targs, esm, cfg, save_params=True): if len(params_lv) > 0: return params_lt, params_lv - else: - return params_lt + + raise NotImplementedError("This path is never used") From 35b5fed8b08d85ff17dcea0b7f9d892134bd2272 Mon Sep 17 00:00:00 2001 From: Zebedee Nicholls Date: Sun, 24 Oct 2021 17:54:28 +1100 Subject: [PATCH 03/23] Block out another branch --- mesmer/calibrate_mesmer/train_lt.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mesmer/calibrate_mesmer/train_lt.py b/mesmer/calibrate_mesmer/train_lt.py index 9c57317f..41ea51d6 100644 --- a/mesmer/calibrate_mesmer/train_lt.py +++ b/mesmer/calibrate_mesmer/train_lt.py @@ -182,6 +182,9 @@ def train_lt(preds, targs, esm, cfg, save_params=True): ] coef_idx += 1 + else: + raise NotImplementedError() + # save the local trend paramters if requested if save_params: dir_mesmer_params = cfg.dir_mesmer_params From 92f1e4ec681d20d4cf51b13e97d3bf85e9bcd3c2 Mon Sep 17 00:00:00 2001 From: Zebedee Nicholls Date: Sun, 24 Oct 2021 19:46:34 +1100 Subject: [PATCH 04/23] Implemente prototype method --- mesmer/calibrate_mesmer/train_lt.py | 2 +- mesmer/prototype/calibrate.py | 80 ++++++++----- tests/integration/test_prototype.py | 168 ++++++++++++++++++++++++++++ 3 files changed, 222 insertions(+), 28 deletions(-) create mode 100644 tests/integration/test_prototype.py diff --git a/mesmer/calibrate_mesmer/train_lt.py b/mesmer/calibrate_mesmer/train_lt.py index 41ea51d6..31654139 100644 --- a/mesmer/calibrate_mesmer/train_lt.py +++ b/mesmer/calibrate_mesmer/train_lt.py @@ -84,13 +84,13 @@ def train_lt(preds, targs, esm, cfg, save_params=True): assumption is fulfilled or rewrite code such that no longer necessary) """ - targ_names = list(targs.keys()) targ_name = targ_names[0] # because same approach for each targ pred_names = list(preds.keys()) # specify necessary variables from config file wgt_scen_tr_eq = cfg.wgt_scen_tr_eq + # This code will ever only work with a single target variable method_lt = cfg.methods[targ_name]["lt"] method_lv = cfg.methods[targ_name]["lv"] method_lt_each_gp_sep = cfg.method_lt_each_gp_sep diff --git a/mesmer/prototype/calibrate.py b/mesmer/prototype/calibrate.py index 8d8075e6..a64ab2a5 100644 --- a/mesmer/prototype/calibrate.py +++ b/mesmer/prototype/calibrate.py @@ -1,37 +1,54 @@ import abc +import numpy as np +import sklearn.linear_model +import xarray as xr + class MesmerCalibrateBase(metaclass=abc.ABCMeta): """ Abstract base class for calibration """ + def _flatten_predictors(self, predictors, flat_dim_order): + predictors_flat = [] + predictor_order = [] + for v, vals in predictors.items(): + predictor_order.append(v) + predictors_flat.append(self._flatten(vals, flat_dim_order)) - # then we could build up utils, whether they were static methods of the - # base class or their own utils module doesn't really matter... 
- # an example is below, but we could have others to handle grouping, adding - # dimension information back to numpy arrays etc. - @staticmethod - def _flatten(target, predictor): - """ - Flatten xarray dataarrays into basic numpy arrays + predictors_flat = np.vstack(predictors_flat).T - Parameters - ---------- - target : xarray.DataArray + return predictors_flat, predictor_order - predictor : xarray.DataArray + @staticmethod + def _flatten(values, flat_dim_order, dropna=True): + out = values.transpose(*flat_dim_order).values.flatten() + # TODO: better handling of nan + # Probably a better solution is to pass all the flattening up a level + # i.e. should be done before calibration is attempted/have a function + # to handle flattening scenarios + if dropna: + out = out[np.where(~np.isnan(out))] - Returns - ------- - (np.ndarray, np.ndarray, dict) - Flattened target, flattened predictor and info required to convert - back into xarray if desired - """ + return out @staticmethod def _get_calibration_groups(target, groups): return target.groupby(groups) + @staticmethod + def _unflatten_calibrated_values(res, dim_names, coords): + res_flat = np.vstack(list(res.values())) + coords = { + **coords, + "predictor": list(res.keys()) + } + + out = xr.DataArray(res_flat, dims=["predictor"] + dim_names, coords=coords) + + return out + + class MesmerCalibrateTargetOnly(MesmerCalibrateBase): @abc.abstractmethod @@ -95,11 +112,11 @@ def _regress_single_point(target_point, predictor, weights=None): if weights is not None: args.append(weights) - reg = LinearRegression().fit(*args) + reg = sklearn.linear_model.LinearRegression().fit(*args) return reg - def calibrate(self, target, predictor, weights=None, groups=["gridpoint"]): + def calibrate(self, target, predictors, weights=None, groups="gridpoint"): # this method handles the pre-processing to get the data ready for the # regression, does the regression (or calls another method to do so), # then puts it all back into a DataArray for output @@ -107,17 +124,26 @@ def calibrate(self, target, predictor, weights=None, groups=["gridpoint"]): # would need to be smarter here too to handle multiple predictors and/ # or target variables but that should be doable res = { - "intercepts": [], - "coef": [], + **{k: [] for k in predictors}, + "intercept": [], } - for target_group in self._get_calibration_groups(target, groups): - res_group = target_group.apply(self._regress_single_point, predictor, weights) + + flat_dim_order = [d for d in target.dims if d != groups] + + predictor_numpy, predictor_order = self._flatten_predictors(predictors, flat_dim_order) + for group_val, target_group in self._get_calibration_groups(target, groups): + target_group_flat = self._flatten(target_group, flat_dim_order) + + res_group = self._regress_single_point(target_group_flat, predictor_numpy, weights) + # need to be smarter about this, but idea is to store things using # xarray DataArray - res["intercepts"].append(res_group.intercept_) - res["coef"].append(res_group.coef_) + res["intercept"].append(res_group.intercept_) + for i, p in enumerate(predictor_order): + res[p].append(res_group.coef_[i]) + + res = self._unflatten_calibrated_values(res, dim_names=[groups], coords={groups: getattr(target, groups)}) - res = xr.DataArray(res) return res diff --git a/tests/integration/test_prototype.py b/tests/integration/test_prototype.py new file mode 100644 index 00000000..5c059420 --- /dev/null +++ b/tests/integration/test_prototype.py @@ -0,0 +1,168 @@ +import numpy as np +import numpy.testing as 
npt +import xarray as xr + +from mesmer.calibrate_mesmer.train_lt import train_lt +from mesmer.prototype.calibrate import LinearRegression + + +class _MockConfig: + def __init__( + self, + method_lt="OLS", + method_lv="OLS_AR1_sci", + separate_gridpoints=True, + weight_scenarios_equally=True, + target_variable="tas", + ): + self.methods = {} + self.methods[target_variable] = {} + self.methods[target_variable]["lt"] = method_lt + self.methods[target_variable]["lv"] = method_lv + self.method_lt_each_gp_sep = separate_gridpoints + self.wgt_scen_tr_eq = weight_scenarios_equally + + +def _do_legacy_run( + emulator_tas, + emulator_tas_squared, + emulator_hfds, + global_variability, + esm_tas, + cfg, +): + preds_legacy = {} + for name, vals in ( + ("gttas", emulator_tas), + ("gttas2", emulator_tas_squared), + ("gthfds", emulator_hfds), + ("gvtas", global_variability), + ): + preds_legacy[name] = {} + for scenario, vals_scen in vals.groupby("scenario"): + # we have to force this to have an extra dimension, run, for legacy + # to work although this really isn't how it should be because it + # means you have some weird reshaping to do at a really low level + preds_legacy[name][scenario] = vals_scen.dropna(dim="time").values[np.newaxis, :] + + + targs_legacy = {} + for name, vals in ( + ("tas", esm_tas), + ): + targs_legacy[name] = {} + for scenario, vals_scen in vals.groupby("scenario"): + # we have to force this to have an extra dimension, run, for legacy + # to work although this really isn't how it should be + # order of dimensions is very important for legacy too + targs_legacy[name][scenario] = vals_scen.T.dropna(dim="time").values[np.newaxis, :, :] + + + res_legacy = train_lt( + preds_legacy, + targs_legacy, + esm="esm_name", + cfg=cfg, + save_params=False, + ) + + return res_legacy + + +def test_prototype_train_lt( +): + time = [1850, 1950, 2014, 2015, 2050, 2100, 2300] + scenarios = ["hist", "ssp126"] + + pred_dims = ["scenario", "time"] + pred_coords = dict( + time=time, + scenario=scenarios, + ) + + emulator_tas = xr.DataArray( + np.array([ + [0, 0.5, 1, np.nan, np.nan, np.nan, np.nan], + [np.nan, np.nan, np.nan, 1.1, 1.5, 1.4, 1.2], + ]), + dims=pred_dims, + coords=pred_coords, + ) + emulator_tas_squared = emulator_tas ** 2 + global_variability = xr.DataArray( + np.array([ + [-0.1, 0.1, 0.03, np.nan, np.nan, np.nan, np.nan], + [np.nan, np.nan, np.nan, 0.04, 0.2, -0.03, 0.0], + ]), + dims=pred_dims, + coords=pred_coords, + ) + emulator_hfds = xr.DataArray( + np.array([ + [0.5, 1.5, 2.0, np.nan, np.nan, np.nan, np.nan], + [np.nan, np.nan, np.nan, 2.1, 2.0, 1.5, 0.4], + ]), + dims=pred_dims, + coords=pred_coords, + ) + + # we wouldn't actually start like this, but we'd write a utils function + # to simply go from lat-lon to gridpoint and back + targ_dims = ["scenario", "gridpoint", "time"] + targ_coords = dict( + time=time, + scenario=scenarios, + gridpoint=[0, 1], + lat=(["gridpoint"], [-60, 60]), + lon=(["gridpoint"], [120, 240]), + ) + esm_tas = xr.DataArray( + np.array([ + [ + [0.6, 1.6, 2.6, np.nan, np.nan, np.nan, np.nan], + [0.43, 1.13, 2.21, np.nan, np.nan, np.nan, np.nan], + ], + [ + [np.nan, np.nan, np.nan, 2.11, 2.01, 1.54, 1.22], + [np.nan, np.nan, np.nan, 2.19, 2.04, 1.53, 1.21], + ] + ]), + dims=targ_dims, + coords=targ_coords, + ) + + res_legacy = _do_legacy_run( + emulator_tas, + emulator_tas_squared, + emulator_hfds, + global_variability, + esm_tas, + cfg=_MockConfig(), + ) + + + res_updated = LinearRegression().calibrate( + esm_tas, + predictors={ + "emulator_tas": 
emulator_tas, + "emulator_tas_squared": emulator_tas_squared, + "emulator_hfds": emulator_hfds, + "global_variability": global_variability, + } + ) + + # check that calibrated parameters match for each predictor variable + for updated_name, legacy_vals in ( + ("emulator_tas", res_legacy[0]["coef_gttas"]["tas"]), + ("emulator_tas_squared", res_legacy[0]["coef_gttas2"]["tas"]), + ("emulator_hfds", res_legacy[0]["coef_gthfds"]["tas"]), + ("global_variability", res_legacy[1]["coef_gvtas"]["tas"]), + ("intercept", res_legacy[0]["intercept"]["tas"]), + ): + npt.assert_allclose(res_updated.sel(predictor=updated_name), legacy_vals) + + +# things that aren't handled well: +# - checking that ensemble member and scenario actually make a sensible set +# - dropping of nans in prototyped method +# - units (should probably be using dataset rather than dataarray for outputs) From 47fa2c6618162e92e6c74de190d3548a93f99d24 Mon Sep 17 00:00:00 2001 From: Zebedee Nicholls Date: Mon, 25 Oct 2021 13:12:41 +1100 Subject: [PATCH 05/23] Tidy up --- mesmer/prototype/calibrate.py | 199 ++++++++++++++-------------- tests/integration/test_prototype.py | 8 +- 2 files changed, 106 insertions(+), 101 deletions(-) diff --git a/mesmer/prototype/calibrate.py b/mesmer/prototype/calibrate.py index a64ab2a5..40f6f761 100644 --- a/mesmer/prototype/calibrate.py +++ b/mesmer/prototype/calibrate.py @@ -9,90 +9,80 @@ class MesmerCalibrateBase(metaclass=abc.ABCMeta): """ Abstract base class for calibration """ - def _flatten_predictors(self, predictors, flat_dim_order): - predictors_flat = [] - predictor_order = [] - for v, vals in predictors.items(): - predictor_order.append(v) - predictors_flat.append(self._flatten(vals, flat_dim_order)) + @staticmethod + def _get_stack_coord_name(inp_array): + stack_coord_name = "stacked_coord" + if stack_coord_name in inp_array.dims: + stack_coord_name = "memser_stacked_coord" - predictors_flat = np.vstack(predictors_flat).T + if stack_coord_name in inp_array.dims: + raise NotImplementedError("You have dimensions we can't safely unstack yet") - return predictors_flat, predictor_order + return stack_coord_name @staticmethod - def _flatten(values, flat_dim_order, dropna=True): - out = values.transpose(*flat_dim_order).values.flatten() - # TODO: better handling of nan - # Probably a better solution is to pass all the flattening up a level - # i.e. 
should be done before calibration is attempted/have a function - # to handle flattening scenarios - if dropna: - out = out[np.where(~np.isnan(out))] - - return out + def _check_coords_match(obj, obj_other, check_coord): + coords_match = obj.coords[check_coord].equals(obj_other.coords[check_coord]) + if not coords_match: + raise AssertionError(f"{check_coord} is not the same on {obj} and {obj_other}") @staticmethod - def _get_calibration_groups(target, groups): - return target.groupby(groups) + def _flatten_predictors(predictors, dims_to_flatten, stack_coord_name): + predictors_flat = [] + for v, vals in predictors.items(): + if stack_coord_name in vals.dims: + raise AssertionError(f"{stack_coord_name} is already in {vals.dims}") - @staticmethod - def _unflatten_calibrated_values(res, dim_names, coords): - res_flat = np.vstack(list(res.values())) - coords = { - **coords, - "predictor": list(res.keys()) - } + vals_flat = vals.stack({stack_coord_name: dims_to_flatten}).dropna(stack_coord_name) + vals_flat.name = v + predictors_flat.append(vals_flat) - out = xr.DataArray(res_flat, dims=["predictor"] + dim_names, coords=coords) + out = xr.merge(predictors_flat).to_stacked_array("predictor", sample_dims=[stack_coord_name]) return out + def _flatten_predictors_and_target(self, predictors, target): + dims_to_flatten = self._get_predictor_dims(predictors) + stack_coord_name = self._get_stack_coord_name(target) + target_flattened = target.stack({stack_coord_name: dims_to_flatten}).dropna(stack_coord_name) + predictors_flattened = self._flatten_predictors(predictors, dims_to_flatten, stack_coord_name) + self._check_coords_match(target_flattened, predictors_flattened, stack_coord_name) -class MesmerCalibrateTargetOnly(MesmerCalibrateBase): - @abc.abstractmethod - def calibrate(self, predictor, **kwargs): - """ - Calibrate a model + return predictors_flattened, target_flattened, stack_coord_name - Parameters - ---------- - predictor : xarray.DataArray - Predictor data array + @staticmethod + def _get_calibration_groups(target_flattened, stack_coord_name): + calibration_groups = list(set(target_flattened.dims) - {stack_coord_name}) + if len(calibration_groups) > 1: + raise NotImplementedError() - **kwargs - Passed onto the calibration method - - Returns - ------- - xarray.DataArray - Fitting coefficients (have to think about how best to label and - store) - """ + return calibration_groups[0] class MesmerCalibrateTargetPredictor(MesmerCalibrateBase): @abc.abstractmethod - def calibrate(self, predictor, target, **kwargs): + def calibrate(self, target, predictor, **kwargs): """ Calibrate a model + [TODO: update this based on whatever LinearRegression.calibrate's docs + end up looking like] + Parameters ---------- - predictor : xarray.DataArray - Predictor data array - target : xarray.DataArray Target data + predictor : xarray.DataArray + Predictor data array + **kwargs Passed onto the calibration method Returns ------- xarray.DataArray - Fitting coefficients (have to think about how best to label and - store) + Fitting coefficients [TODO description of how labelled] """ @@ -106,7 +96,7 @@ class LinearRegression(MesmerCalibrateTargetPredictor): preparation, which makes it very hard to see what the model actually is """ @staticmethod - def _regress_single_point(target_point, predictor, weights=None): + def _regress_single_group(target_point, predictor, weights=None): # this is the method that actually does the regression args = [predictor, target_point] if weights is not None: @@ -116,59 +106,74 @@ def 
_regress_single_point(target_point, predictor, weights=None): return reg - def calibrate(self, target, predictors, weights=None, groups="gridpoint"): - # this method handles the pre-processing to get the data ready for the - # regression, does the regression (or calls another method to do so), - # then puts it all back into a DataArray for output - - # would need to be smarter here too to handle multiple predictors and/ - # or target variables but that should be doable - res = { - **{k: [] for k in predictors}, - "intercept": [], - } + @staticmethod + def _get_predictor_dims(predictors): + predictors_dims = {k: v.dims for k, v in predictors.items()} + predictors_dims_unique = set(predictors_dims.values()) + if len(predictors_dims_unique) > 1: + raise AssertionError(f"Dimensions of predictors are not all the same, we have: {predictors_dims}") - flat_dim_order = [d for d in target.dims if d != groups] + return list(predictors_dims_unique)[0] - predictor_numpy, predictor_order = self._flatten_predictors(predictors, flat_dim_order) - for group_val, target_group in self._get_calibration_groups(target, groups): - target_group_flat = self._flatten(target_group, flat_dim_order) + def _calibrate_groups(self, target_flattened, predictor_numpy, predictor_names, weights, calibration_groups): + def _calibrate_group(target_group): + target_group_numpy = target_group.values + res_group = self._regress_single_group(target_group_numpy, predictor_numpy, weights) - res_group = self._regress_single_point(target_group_flat, predictor_numpy, weights) + res_xr = xr.DataArray(np.concatenate([[res_group.intercept_], res_group.coef_]), dims=["predictor"], coords={"predictor": predictor_names}) - # need to be smarter about this, but idea is to store things using - # xarray DataArray - res["intercept"].append(res_group.intercept_) - for i, p in enumerate(predictor_order): - res[p].append(res_group.coef_[i]) + return res_xr - res = self._unflatten_calibrated_values(res, dim_names=[groups], coords={groups: getattr(target, groups)}) + res = target_flattened.groupby(calibration_groups).apply(_calibrate_group) return res + def calibrate(self, target, predictors, weights=None): + """ + Calibrate a linear regression model -class AutoRegression(MesmerCalibrateTargetOnly): - def _regress_single(target, lags): - res = AutoReg(target, lags=lags).fit() + Parameters + ---------- + target : xarray.DataArray + Target data - return res + predictors : dict[str: xarray.DataArray] + Predictors to use, each key gives the name of the predictor, the + values give the values of the predictor. - def calibrate(self, target, groups=["gridpoint"], lags=1): - # would need to be smarter here too to handle multiple predictors and/ - # or target variables but that should be doable - res = { - "intercepts": [], - "coef": [], - "stds": [], - } - for target_group in self._get_calibration_groups(target, groups): - res_group = target_group.apply(self._regress_single, lags) - # need to be smarter about this, but idea is to store things using - # xarray DataArray - res["intercepts"].append(res_group.params[0]) - res["coef"].append(res_group.params[1]) - res["stds"].append(np.sqrt(res_group.sigma2)) - - res = xr.DataArray(res) + weights : optional, xarray.DataArray + Weights to use for dimensions in ``predictors`` - return res + Returns + ------- + xarray.DataArray + Fitting coefficients, the dimensions of which are the combination + of "predictor" and all dimensions in ``target`` which are not in + ``predictors``. 
For example, the resulting dimensions could be + ["gridpoint", "predictor"]. + + Notes + ----- + We flatten ``target``, ``predictors`` and ``weights`` across the + dimensions shared by ``target`` and ``predictors`` to create the input + arrays for the linear regression. + """ + predictors_flattened, target_flattened, stack_coord_name = self._flatten_predictors_and_target( + predictors, target + ) + + # stacked coord has to go first for the regression to be setup correctly + predictors_flattened_dim_order = [stack_coord_name] + [d for d in predictors_flattened.dims if d != stack_coord_name] + predictors_flattened_reordered = predictors_flattened.transpose(*predictors_flattened_dim_order) + predictors_plus_intercept = ["intercept"] + list(predictors_flattened_reordered["variable"].values) + predictor_numpy = predictors_flattened_reordered.values + + calibration_groups = self._get_calibration_groups(target_flattened, stack_coord_name) + + return self._calibrate_groups( + target_flattened, + predictor_numpy, + predictor_names=predictors_plus_intercept, + weights=weights, + calibration_groups=calibration_groups, + ) diff --git a/tests/integration/test_prototype.py b/tests/integration/test_prototype.py index 5c059420..90ea8063 100644 --- a/tests/integration/test_prototype.py +++ b/tests/integration/test_prototype.py @@ -162,7 +162,7 @@ def test_prototype_train_lt( npt.assert_allclose(res_updated.sel(predictor=updated_name), legacy_vals) -# things that aren't handled well: -# - checking that ensemble member and scenario actually make a sensible set -# - dropping of nans in prototyped method -# - units (should probably be using dataset rather than dataarray for outputs) +# things that aren't tested well: +# - what happens if ensemble member and scenario don't actually make a coherent set +# - units (should probably be using dataset rather than dataarray for inputs and outputs?) 
+# - weights From b408db01dedda6a6702f7551061ea5a0fa2a7fa9 Mon Sep 17 00:00:00 2001 From: Zebedee Nicholls Date: Mon, 25 Oct 2021 13:13:26 +1100 Subject: [PATCH 06/23] Format --- mesmer/prototype/calibrate.py | 74 ++++++++++++++++++++------ mesmer/prototype/calibrate_multiple.py | 4 +- tests/integration/test_prototype.py | 68 ++++++++++++----------- 3 files changed, 97 insertions(+), 49 deletions(-) diff --git a/mesmer/prototype/calibrate.py b/mesmer/prototype/calibrate.py index 40f6f761..e54af425 100644 --- a/mesmer/prototype/calibrate.py +++ b/mesmer/prototype/calibrate.py @@ -9,6 +9,7 @@ class MesmerCalibrateBase(metaclass=abc.ABCMeta): """ Abstract base class for calibration """ + @staticmethod def _get_stack_coord_name(inp_array): stack_coord_name = "stacked_coord" @@ -24,7 +25,9 @@ def _get_stack_coord_name(inp_array): def _check_coords_match(obj, obj_other, check_coord): coords_match = obj.coords[check_coord].equals(obj_other.coords[check_coord]) if not coords_match: - raise AssertionError(f"{check_coord} is not the same on {obj} and {obj_other}") + raise AssertionError( + f"{check_coord} is not the same on {obj} and {obj_other}" + ) @staticmethod def _flatten_predictors(predictors, dims_to_flatten, stack_coord_name): @@ -33,11 +36,15 @@ def _flatten_predictors(predictors, dims_to_flatten, stack_coord_name): if stack_coord_name in vals.dims: raise AssertionError(f"{stack_coord_name} is already in {vals.dims}") - vals_flat = vals.stack({stack_coord_name: dims_to_flatten}).dropna(stack_coord_name) + vals_flat = vals.stack({stack_coord_name: dims_to_flatten}).dropna( + stack_coord_name + ) vals_flat.name = v predictors_flat.append(vals_flat) - out = xr.merge(predictors_flat).to_stacked_array("predictor", sample_dims=[stack_coord_name]) + out = xr.merge(predictors_flat).to_stacked_array( + "predictor", sample_dims=[stack_coord_name] + ) return out @@ -45,9 +52,15 @@ def _flatten_predictors_and_target(self, predictors, target): dims_to_flatten = self._get_predictor_dims(predictors) stack_coord_name = self._get_stack_coord_name(target) - target_flattened = target.stack({stack_coord_name: dims_to_flatten}).dropna(stack_coord_name) - predictors_flattened = self._flatten_predictors(predictors, dims_to_flatten, stack_coord_name) - self._check_coords_match(target_flattened, predictors_flattened, stack_coord_name) + target_flattened = target.stack({stack_coord_name: dims_to_flatten}).dropna( + stack_coord_name + ) + predictors_flattened = self._flatten_predictors( + predictors, dims_to_flatten, stack_coord_name + ) + self._check_coords_match( + target_flattened, predictors_flattened, stack_coord_name + ) return predictors_flattened, target_flattened, stack_coord_name @@ -59,6 +72,7 @@ def _get_calibration_groups(target_flattened, stack_coord_name): return calibration_groups[0] + class MesmerCalibrateTargetPredictor(MesmerCalibrateBase): @abc.abstractmethod def calibrate(self, target, predictor, **kwargs): @@ -95,6 +109,7 @@ class LinearRegression(MesmerCalibrateTargetPredictor): All the lines above and below that line are basically just data preparation, which makes it very hard to see what the model actually is """ + @staticmethod def _regress_single_group(target_point, predictor, weights=None): # this is the method that actually does the regression @@ -111,16 +126,31 @@ def _get_predictor_dims(predictors): predictors_dims = {k: v.dims for k, v in predictors.items()} predictors_dims_unique = set(predictors_dims.values()) if len(predictors_dims_unique) > 1: - raise AssertionError(f"Dimensions 
of predictors are not all the same, we have: {predictors_dims}") + raise AssertionError( + f"Dimensions of predictors are not all the same, we have: {predictors_dims}" + ) return list(predictors_dims_unique)[0] - def _calibrate_groups(self, target_flattened, predictor_numpy, predictor_names, weights, calibration_groups): + def _calibrate_groups( + self, + target_flattened, + predictor_numpy, + predictor_names, + weights, + calibration_groups, + ): def _calibrate_group(target_group): target_group_numpy = target_group.values - res_group = self._regress_single_group(target_group_numpy, predictor_numpy, weights) + res_group = self._regress_single_group( + target_group_numpy, predictor_numpy, weights + ) - res_xr = xr.DataArray(np.concatenate([[res_group.intercept_], res_group.coef_]), dims=["predictor"], coords={"predictor": predictor_names}) + res_xr = xr.DataArray( + np.concatenate([[res_group.intercept_], res_group.coef_]), + dims=["predictor"], + coords={"predictor": predictor_names}, + ) return res_xr @@ -158,17 +188,27 @@ def calibrate(self, target, predictors, weights=None): dimensions shared by ``target`` and ``predictors`` to create the input arrays for the linear regression. """ - predictors_flattened, target_flattened, stack_coord_name = self._flatten_predictors_and_target( - predictors, target - ) + ( + predictors_flattened, + target_flattened, + stack_coord_name, + ) = self._flatten_predictors_and_target(predictors, target) # stacked coord has to go first for the regression to be setup correctly - predictors_flattened_dim_order = [stack_coord_name] + [d for d in predictors_flattened.dims if d != stack_coord_name] - predictors_flattened_reordered = predictors_flattened.transpose(*predictors_flattened_dim_order) - predictors_plus_intercept = ["intercept"] + list(predictors_flattened_reordered["variable"].values) + predictors_flattened_dim_order = [stack_coord_name] + [ + d for d in predictors_flattened.dims if d != stack_coord_name + ] + predictors_flattened_reordered = predictors_flattened.transpose( + *predictors_flattened_dim_order + ) + predictors_plus_intercept = ["intercept"] + list( + predictors_flattened_reordered["variable"].values + ) predictor_numpy = predictors_flattened_reordered.values - calibration_groups = self._get_calibration_groups(target_flattened, stack_coord_name) + calibration_groups = self._get_calibration_groups( + target_flattened, stack_coord_name + ) return self._calibrate_groups( target_flattened, diff --git a/mesmer/prototype/calibrate_multiple.py b/mesmer/prototype/calibrate_multiple.py index 103d454c..f42369b5 100644 --- a/mesmer/prototype/calibrate_multiple.py +++ b/mesmer/prototype/calibrate_multiple.py @@ -1,4 +1,6 @@ -def calibrate_multiple_scenarios_and_ensemble_members(targets, predictors, calibration_class, calibration_kwargs, weighting_style): +def calibrate_multiple_scenarios_and_ensemble_members( + targets, predictors, calibration_class, calibration_kwargs, weighting_style +): """ Calibrate based on multiple scenarios and ensemble members per scenario diff --git a/tests/integration/test_prototype.py b/tests/integration/test_prototype.py index 90ea8063..dbb1dd7e 100644 --- a/tests/integration/test_prototype.py +++ b/tests/integration/test_prototype.py @@ -43,20 +43,20 @@ def _do_legacy_run( # we have to force this to have an extra dimension, run, for legacy # to work although this really isn't how it should be because it # means you have some weird reshaping to do at a really low level - preds_legacy[name][scenario] = 
vals_scen.dropna(dim="time").values[np.newaxis, :] - + preds_legacy[name][scenario] = vals_scen.dropna(dim="time").values[ + np.newaxis, : + ] targs_legacy = {} - for name, vals in ( - ("tas", esm_tas), - ): + for name, vals in (("tas", esm_tas),): targs_legacy[name] = {} for scenario, vals_scen in vals.groupby("scenario"): # we have to force this to have an extra dimension, run, for legacy # to work although this really isn't how it should be # order of dimensions is very important for legacy too - targs_legacy[name][scenario] = vals_scen.T.dropna(dim="time").values[np.newaxis, :, :] - + targs_legacy[name][scenario] = vals_scen.T.dropna(dim="time").values[ + np.newaxis, :, : + ] res_legacy = train_lt( preds_legacy, @@ -69,8 +69,7 @@ def _do_legacy_run( return res_legacy -def test_prototype_train_lt( -): +def test_prototype_train_lt(): time = [1850, 1950, 2014, 2015, 2050, 2100, 2300] scenarios = ["hist", "ssp126"] @@ -81,27 +80,33 @@ def test_prototype_train_lt( ) emulator_tas = xr.DataArray( - np.array([ - [0, 0.5, 1, np.nan, np.nan, np.nan, np.nan], - [np.nan, np.nan, np.nan, 1.1, 1.5, 1.4, 1.2], - ]), + np.array( + [ + [0, 0.5, 1, np.nan, np.nan, np.nan, np.nan], + [np.nan, np.nan, np.nan, 1.1, 1.5, 1.4, 1.2], + ] + ), dims=pred_dims, coords=pred_coords, ) emulator_tas_squared = emulator_tas ** 2 global_variability = xr.DataArray( - np.array([ - [-0.1, 0.1, 0.03, np.nan, np.nan, np.nan, np.nan], - [np.nan, np.nan, np.nan, 0.04, 0.2, -0.03, 0.0], - ]), + np.array( + [ + [-0.1, 0.1, 0.03, np.nan, np.nan, np.nan, np.nan], + [np.nan, np.nan, np.nan, 0.04, 0.2, -0.03, 0.0], + ] + ), dims=pred_dims, coords=pred_coords, ) emulator_hfds = xr.DataArray( - np.array([ - [0.5, 1.5, 2.0, np.nan, np.nan, np.nan, np.nan], - [np.nan, np.nan, np.nan, 2.1, 2.0, 1.5, 0.4], - ]), + np.array( + [ + [0.5, 1.5, 2.0, np.nan, np.nan, np.nan, np.nan], + [np.nan, np.nan, np.nan, 2.1, 2.0, 1.5, 0.4], + ] + ), dims=pred_dims, coords=pred_coords, ) @@ -117,16 +122,18 @@ def test_prototype_train_lt( lon=(["gridpoint"], [120, 240]), ) esm_tas = xr.DataArray( - np.array([ - [ - [0.6, 1.6, 2.6, np.nan, np.nan, np.nan, np.nan], - [0.43, 1.13, 2.21, np.nan, np.nan, np.nan, np.nan], - ], + np.array( [ - [np.nan, np.nan, np.nan, 2.11, 2.01, 1.54, 1.22], - [np.nan, np.nan, np.nan, 2.19, 2.04, 1.53, 1.21], + [ + [0.6, 1.6, 2.6, np.nan, np.nan, np.nan, np.nan], + [0.43, 1.13, 2.21, np.nan, np.nan, np.nan, np.nan], + ], + [ + [np.nan, np.nan, np.nan, 2.11, 2.01, 1.54, 1.22], + [np.nan, np.nan, np.nan, 2.19, 2.04, 1.53, 1.21], + ], ] - ]), + ), dims=targ_dims, coords=targ_coords, ) @@ -140,7 +147,6 @@ def test_prototype_train_lt( cfg=_MockConfig(), ) - res_updated = LinearRegression().calibrate( esm_tas, predictors={ @@ -148,7 +154,7 @@ def test_prototype_train_lt( "emulator_tas_squared": emulator_tas_squared, "emulator_hfds": emulator_hfds, "global_variability": global_variability, - } + }, ) # check that calibrated parameters match for each predictor variable From b93de5877fbe84a39f713b6d0fc1cd36ed78eac6 Mon Sep 17 00:00:00 2001 From: Zebedee Nicholls Date: Mon, 25 Oct 2021 13:21:12 +1100 Subject: [PATCH 07/23] Tidy up a bit more --- mesmer/prototype/calibrate.py | 60 +++++++++++++++++++++-------------- 1 file changed, 37 insertions(+), 23 deletions(-) diff --git a/mesmer/prototype/calibrate.py b/mesmer/prototype/calibrate.py index e54af425..c403d28e 100644 --- a/mesmer/prototype/calibrate.py +++ b/mesmer/prototype/calibrate.py @@ -10,6 +10,17 @@ class MesmerCalibrateBase(metaclass=abc.ABCMeta): Abstract base class for 
calibration """ + @staticmethod + def _get_predictor_dims(predictors): + predictors_dims = {k: v.dims for k, v in predictors.items()} + predictors_dims_unique = set(predictors_dims.values()) + if len(predictors_dims_unique) > 1: + raise AssertionError( + f"Dimensions of predictors are not all the same, we have: {predictors_dims}" + ) + + return list(predictors_dims_unique)[0] + @staticmethod def _get_stack_coord_name(inp_array): stack_coord_name = "stacked_coord" @@ -121,17 +132,6 @@ def _regress_single_group(target_point, predictor, weights=None): return reg - @staticmethod - def _get_predictor_dims(predictors): - predictors_dims = {k: v.dims for k, v in predictors.items()} - predictors_dims_unique = set(predictors_dims.values()) - if len(predictors_dims_unique) > 1: - raise AssertionError( - f"Dimensions of predictors are not all the same, we have: {predictors_dims}" - ) - - return list(predictors_dims_unique)[0] - def _calibrate_groups( self, target_flattened, @@ -158,6 +158,25 @@ def _calibrate_group(target_group): return res + @staticmethod + def _get_predictors_numpy_and_output_predictor_order( + predictors_flattened, stack_coord_name + ): + # stacked coord has to go first for the regression to be setup correctly + predictors_flattened_dim_order = [stack_coord_name] + [ + d for d in predictors_flattened.dims if d != stack_coord_name + ] + predictors_flattened_reordered = predictors_flattened.transpose( + *predictors_flattened_dim_order + ) + predictors_numpy = predictors_flattened_reordered.values + + predictors_plus_intercept_order = ["intercept"] + list( + predictors_flattened_reordered["variable"].values + ) + + return predictors_numpy, predictors_plus_intercept_order + def calibrate(self, target, predictors, weights=None): """ Calibrate a linear regression model @@ -194,17 +213,12 @@ def calibrate(self, target, predictors, weights=None): stack_coord_name, ) = self._flatten_predictors_and_target(predictors, target) - # stacked coord has to go first for the regression to be setup correctly - predictors_flattened_dim_order = [stack_coord_name] + [ - d for d in predictors_flattened.dims if d != stack_coord_name - ] - predictors_flattened_reordered = predictors_flattened.transpose( - *predictors_flattened_dim_order - ) - predictors_plus_intercept = ["intercept"] + list( - predictors_flattened_reordered["variable"].values + ( + predictors_numpy, + predictors_plus_intercept_order, + ) = self._get_predictors_numpy_and_output_predictor_order( + predictors_flattened, stack_coord_name ) - predictor_numpy = predictors_flattened_reordered.values calibration_groups = self._get_calibration_groups( target_flattened, stack_coord_name @@ -212,8 +226,8 @@ def calibrate(self, target, predictors, weights=None): return self._calibrate_groups( target_flattened, - predictor_numpy, - predictor_names=predictors_plus_intercept, + predictors_numpy, + predictor_names=predictors_plus_intercept_order, weights=weights, calibration_groups=calibration_groups, ) From de1d293cbcd42364c7d88ee78f467f1ac0bd60b3 Mon Sep 17 00:00:00 2001 From: Zebedee Nicholls Date: Tue, 26 Oct 2021 10:52:48 +1100 Subject: [PATCH 08/23] Change calibration pattern to simplify use of classes --- mesmer/prototype/__init__.py | 0 mesmer/prototype/calibrate.py | 195 ++++--------------------- mesmer/prototype/calibrate_multiple.py | 65 +++++++++ tests/integration/test_prototype.py | 15 +- 4 files changed, 109 insertions(+), 166 deletions(-) create mode 100644 mesmer/prototype/__init__.py diff --git a/mesmer/prototype/__init__.py 
b/mesmer/prototype/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/mesmer/prototype/calibrate.py b/mesmer/prototype/calibrate.py index c403d28e..26dc2cd5 100644 --- a/mesmer/prototype/calibrate.py +++ b/mesmer/prototype/calibrate.py @@ -10,79 +10,6 @@ class MesmerCalibrateBase(metaclass=abc.ABCMeta): Abstract base class for calibration """ - @staticmethod - def _get_predictor_dims(predictors): - predictors_dims = {k: v.dims for k, v in predictors.items()} - predictors_dims_unique = set(predictors_dims.values()) - if len(predictors_dims_unique) > 1: - raise AssertionError( - f"Dimensions of predictors are not all the same, we have: {predictors_dims}" - ) - - return list(predictors_dims_unique)[0] - - @staticmethod - def _get_stack_coord_name(inp_array): - stack_coord_name = "stacked_coord" - if stack_coord_name in inp_array.dims: - stack_coord_name = "memser_stacked_coord" - - if stack_coord_name in inp_array.dims: - raise NotImplementedError("You have dimensions we can't safely unstack yet") - - return stack_coord_name - - @staticmethod - def _check_coords_match(obj, obj_other, check_coord): - coords_match = obj.coords[check_coord].equals(obj_other.coords[check_coord]) - if not coords_match: - raise AssertionError( - f"{check_coord} is not the same on {obj} and {obj_other}" - ) - - @staticmethod - def _flatten_predictors(predictors, dims_to_flatten, stack_coord_name): - predictors_flat = [] - for v, vals in predictors.items(): - if stack_coord_name in vals.dims: - raise AssertionError(f"{stack_coord_name} is already in {vals.dims}") - - vals_flat = vals.stack({stack_coord_name: dims_to_flatten}).dropna( - stack_coord_name - ) - vals_flat.name = v - predictors_flat.append(vals_flat) - - out = xr.merge(predictors_flat).to_stacked_array( - "predictor", sample_dims=[stack_coord_name] - ) - - return out - - def _flatten_predictors_and_target(self, predictors, target): - dims_to_flatten = self._get_predictor_dims(predictors) - stack_coord_name = self._get_stack_coord_name(target) - - target_flattened = target.stack({stack_coord_name: dims_to_flatten}).dropna( - stack_coord_name - ) - predictors_flattened = self._flatten_predictors( - predictors, dims_to_flatten, stack_coord_name - ) - self._check_coords_match( - target_flattened, predictors_flattened, stack_coord_name - ) - - return predictors_flattened, target_flattened, stack_coord_name - - @staticmethod - def _get_calibration_groups(target_flattened, stack_coord_name): - calibration_groups = list(set(target_flattened.dims) - {stack_coord_name}) - if len(calibration_groups) > 1: - raise NotImplementedError() - - return calibration_groups[0] - class MesmerCalibrateTargetPredictor(MesmerCalibrateBase): @abc.abstractmethod @@ -124,110 +51,50 @@ class LinearRegression(MesmerCalibrateTargetPredictor): @staticmethod def _regress_single_group(target_point, predictor, weights=None): # this is the method that actually does the regression - args = [predictor, target_point] + args = [predictor.T, target_point.reshape(-1, 1)] if weights is not None: args.append(weights) - reg = sklearn.linear_model.LinearRegression().fit(*args) + out_array = np.concatenate([reg.intercept_, *reg.coef_]) - return reg + return out_array - def _calibrate_groups( + def calibrate( self, target_flattened, - predictor_numpy, - predictor_names, - weights, - calibration_groups, - ): - def _calibrate_group(target_group): - target_group_numpy = target_group.values - res_group = self._regress_single_group( - target_group_numpy, predictor_numpy, weights - ) - - 
res_xr = xr.DataArray( - np.concatenate([[res_group.intercept_], res_group.coef_]), - dims=["predictor"], - coords={"predictor": predictor_names}, - ) - - return res_xr - - res = target_flattened.groupby(calibration_groups).apply(_calibrate_group) - - return res - - @staticmethod - def _get_predictors_numpy_and_output_predictor_order( - predictors_flattened, stack_coord_name + predictors_flattened, + stack_coord_name, + predictor_name="predictor", + weights=None, + predictor_temporary_name="__pred_store__", ): - # stacked coord has to go first for the regression to be setup correctly - predictors_flattened_dim_order = [stack_coord_name] + [ - d for d in predictors_flattened.dims if d != stack_coord_name - ] - predictors_flattened_reordered = predictors_flattened.transpose( - *predictors_flattened_dim_order - ) - predictors_numpy = predictors_flattened_reordered.values - - predictors_plus_intercept_order = ["intercept"] + list( - predictors_flattened_reordered["variable"].values - ) - - return predictors_numpy, predictors_plus_intercept_order - - def calibrate(self, target, predictors, weights=None): """ - Calibrate a linear regression model - - Parameters - ---------- - target : xarray.DataArray - Target data - - predictors : dict[str: xarray.DataArray] - Predictors to use, each key gives the name of the predictor, the - values give the values of the predictor. + TODO: redo docstring + """ + if predictor_name not in predictors_flattened.dims: + raise AssertionError(f"{predictor_name} not in {predictors_flattened.dims}") - weights : optional, xarray.DataArray - Weights to use for dimensions in ``predictors`` + if predictor_temporary_name in predictors_flattened.dims: + raise AssertionError( + f"{predictor_temporary_name} already in {predictors_flattened.dims}, choose a different temporary name" + ) - Returns - ------- - xarray.DataArray - Fitting coefficients, the dimensions of which are the combination - of "predictor" and all dimensions in ``target`` which are not in - ``predictors``. For example, the resulting dimensions could be - ["gridpoint", "predictor"]. - - Notes - ----- - We flatten ``target``, ``predictors`` and ``weights`` across the - dimensions shared by ``target`` and ``predictors`` to create the input - arrays for the linear regression. 
- """ - ( - predictors_flattened, + res = xr.apply_ufunc( + self._regress_single_group, target_flattened, - stack_coord_name, - ) = self._flatten_predictors_and_target(predictors, target) - - ( - predictors_numpy, - predictors_plus_intercept_order, - ) = self._get_predictors_numpy_and_output_predictor_order( - predictors_flattened, stack_coord_name + predictors_flattened, + input_core_dims=[[stack_coord_name], [predictor_name, stack_coord_name]], + output_core_dims=((predictor_temporary_name,),), + vectorize=True, + kwargs=dict(weights=weights), ) - calibration_groups = self._get_calibration_groups( - target_flattened, stack_coord_name + # assuming that predictor's names are in the 'variable' coordinate + predictors_plus_intercept_order = ["intercept"] + list( + predictors_flattened["variable"].values ) + res = res.assign_coords( + {predictor_temporary_name: predictors_plus_intercept_order} + ).rename({predictor_temporary_name: predictor_name}) - return self._calibrate_groups( - target_flattened, - predictors_numpy, - predictor_names=predictors_plus_intercept_order, - weights=weights, - calibration_groups=calibration_groups, - ) + return res diff --git a/mesmer/prototype/calibrate_multiple.py b/mesmer/prototype/calibrate_multiple.py index f42369b5..231352a5 100644 --- a/mesmer/prototype/calibrate_multiple.py +++ b/mesmer/prototype/calibrate_multiple.py @@ -1,3 +1,68 @@ +import xarray as xr + + +def _get_predictor_dims(predictors): + predictors_dims = {k: v.dims for k, v in predictors.items()} + predictors_dims_unique = set(predictors_dims.values()) + if len(predictors_dims_unique) > 1: + raise AssertionError( + f"Dimensions of predictors are not all the same, we have: {predictors_dims}" + ) + + return list(predictors_dims_unique)[0] + + +def _get_stack_coord_name(inp_array): + stack_coord_name = "stacked_coord" + if stack_coord_name in inp_array.dims: + stack_coord_name = "memser_stacked_coord" + + if stack_coord_name in inp_array.dims: + raise NotImplementedError("You have dimensions we can't safely unstack yet") + + return stack_coord_name + + +def _check_coords_match(obj, obj_other, check_coord): + coords_match = obj.coords[check_coord].equals(obj_other.coords[check_coord]) + if not coords_match: + raise AssertionError(f"{check_coord} is not the same on {obj} and {obj_other}") + + +def _flatten_predictors(predictors, dims_to_flatten, stack_coord_name): + predictors_flat = [] + for v, vals in predictors.items(): + if stack_coord_name in vals.dims: + raise AssertionError(f"{stack_coord_name} is already in {vals.dims}") + + vals_flat = vals.stack({stack_coord_name: dims_to_flatten}).dropna( + stack_coord_name + ) + vals_flat.name = v + predictors_flat.append(vals_flat) + + out = xr.merge(predictors_flat).to_stacked_array( + "predictor", sample_dims=[stack_coord_name] + ) + + return out + + +def flatten_predictors_and_target(predictors, target): + dims_to_flatten = _get_predictor_dims(predictors) + stack_coord_name = _get_stack_coord_name(target) + + target_flattened = target.stack({stack_coord_name: dims_to_flatten}).dropna( + stack_coord_name + ) + predictors_flattened = _flatten_predictors( + predictors, dims_to_flatten, stack_coord_name + ) + _check_coords_match(target_flattened, predictors_flattened, stack_coord_name) + + return predictors_flattened, target_flattened, stack_coord_name + + def calibrate_multiple_scenarios_and_ensemble_members( targets, predictors, calibration_class, calibration_kwargs, weighting_style ): diff --git a/tests/integration/test_prototype.py 
b/tests/integration/test_prototype.py index dbb1dd7e..41dbcc54 100644 --- a/tests/integration/test_prototype.py +++ b/tests/integration/test_prototype.py @@ -4,6 +4,7 @@ from mesmer.calibrate_mesmer.train_lt import train_lt from mesmer.prototype.calibrate import LinearRegression +from mesmer.prototype.calibrate_multiple import flatten_predictors_and_target class _MockConfig: @@ -147,14 +148,24 @@ def test_prototype_train_lt(): cfg=_MockConfig(), ) - res_updated = LinearRegression().calibrate( - esm_tas, + ( + predictors_flattened, + target_flattened, + stack_coord_name, + ) = flatten_predictors_and_target( predictors={ "emulator_tas": emulator_tas, "emulator_tas_squared": emulator_tas_squared, "emulator_hfds": emulator_hfds, "global_variability": global_variability, }, + target=esm_tas, + ) + + res_updated = LinearRegression().calibrate( + target_flattened, + predictors_flattened, + stack_coord_name, ) # check that calibrated parameters match for each predictor variable From 3d97ec16d23a6e0a2d71bd592227932e0fb9da1b Mon Sep 17 00:00:00 2001 From: Zebedee Nicholls Date: Tue, 26 Oct 2021 12:46:31 +1100 Subject: [PATCH 09/23] Try doing auto-regression implementation --- mesmer/calibrate_mesmer/train_gv.py | 4 +- mesmer/create_emulations/create_emus_gv.py | 2 + mesmer/prototype/calibrate.py | 82 ++++++++++--- mesmer/prototype/calibrate_multiple.py | 81 +++++++++++++ tests/integration/test_prototype.py | 135 ++++++++++++++++++++- 5 files changed, 280 insertions(+), 24 deletions(-) diff --git a/mesmer/calibrate_mesmer/train_gv.py b/mesmer/calibrate_mesmer/train_gv.py index 3d65ec5d..561bd718 100644 --- a/mesmer/calibrate_mesmer/train_gv.py +++ b/mesmer/calibrate_mesmer/train_gv.py @@ -176,7 +176,7 @@ def train_gv_AR(params_gv, gv, max_lag, sel_crit): gv[scen][run], maxlag=max_lag, ic=sel_crit, old_names=False ).ar_lags # if order > 0 is selected,add selected order to vector - if len(run_ar_lags) > 0: + if run_ar_lags is not None: AR_order_runs_tmp[run] = run_ar_lags[-1] AR_order_scens_tmp[scen_idx] = np.percentile( @@ -201,6 +201,7 @@ def train_gv_AR(params_gv, gv, max_lag, sel_crit): AR_std_innovs_tmp = 0 for run in np.arange(nr_runs): + # AR parameters for scenario are average of runs AR_model_tmp = AutoReg( gv[scen][run], lags=AR_order_sel, old_names=False ).fit() @@ -208,6 +209,7 @@ def train_gv_AR(params_gv, gv, max_lag, sel_crit): AR_coefs_tmp += AR_model_tmp.params[1:] / nr_runs AR_std_innovs_tmp += np.sqrt(AR_model_tmp.sigma2) / nr_runs + # AR parameters are average over scenarios params_gv["AR_int"] += AR_int_tmp / nr_scens params_gv["AR_coefs"] += AR_coefs_tmp / nr_scens params_gv["AR_std_innovs"] += AR_std_innovs_tmp / nr_scens diff --git a/mesmer/create_emulations/create_emus_gv.py b/mesmer/create_emulations/create_emus_gv.py index 326c69a9..b2a02b07 100644 --- a/mesmer/create_emulations/create_emus_gv.py +++ b/mesmer/create_emulations/create_emus_gv.py @@ -149,6 +149,7 @@ def create_emus_gv_AR(params_gv, nr_emus_v, nr_ts_emus_v, seed): np.random.seed(seed) # buffer so that initial start at 0 does not influence overall result + # Should this buffer be based on the length of ar_lags instead of hard-coded? 
buffer = 50 # re-name params for easier reading of code below @@ -175,6 +176,7 @@ def create_emus_gv_AR(params_gv, nr_emus_v, nr_ts_emus_v, seed): for t in np.arange(ar_lags[-1], len(emus_gv[i])): # avoid misleading indices emus_gv[i, t] = ( ar_int + # could probably be replaced with ArmaProcess.generate_samples + sum( ar_coefs[k] * emus_gv[i, t - ar_lags[k]] for k in np.arange(len(ar_lags)) diff --git a/mesmer/prototype/calibrate.py b/mesmer/prototype/calibrate.py index 26dc2cd5..7bb646cb 100644 --- a/mesmer/prototype/calibrate.py +++ b/mesmer/prototype/calibrate.py @@ -2,6 +2,7 @@ import numpy as np import sklearn.linear_model +import statsmodels.tsa.ar_model import xarray as xr @@ -15,26 +16,8 @@ class MesmerCalibrateTargetPredictor(MesmerCalibrateBase): @abc.abstractmethod def calibrate(self, target, predictor, **kwargs): """ - Calibrate a model - - [TODO: update this based on whatever LinearRegression.calibrate's docs - end up looking like] - - Parameters - ---------- - target : xarray.DataArray - Target data - - predictor : xarray.DataArray - Predictor data array - - **kwargs - Passed onto the calibration method - - Returns - ------- - xarray.DataArray - Fitting coefficients [TODO description of how labelled] + [TODO: update this based on however LinearRegression.calibrate's docs + end up looking] """ @@ -98,3 +81,62 @@ def calibrate( ).rename({predictor_temporary_name: predictor_name}) return res + + +class MesmerCalibrateTarget(MesmerCalibrateBase): + @abc.abstractmethod + def calibrate(self, target, **kwargs): + """ + [TODO: update this based on however LinearRegression.calibrate's docs + end up looking] + """ + + @staticmethod + def _check_target_is_one_dimensional(target, return_numpy_values): + if len(target.dims) > 1: + raise AssertionError(f"More than one dimension, found {target.dims}") + + if not return_numpy_values: + return None + + return target.dropna(dim=target.dims[0]).values + + +class AutoRegression1DOrderSelection(MesmerCalibrateTarget): + def calibrate( + self, + target, + maxlag=12, + ic="bic", + ): + target_numpy = self._check_target_is_one_dimensional( + target, return_numpy_values=True + ) + + calibrated = statsmodels.tsa.ar_model.ar_select_order( + target_numpy, maxlag=maxlag, ic=ic, old_names=False + ) + + return calibrated.ar_lags + + +class AutoRegression1D(MesmerCalibrateTarget): + def calibrate( + self, + target, + order, + ): + target_numpy = self._check_target_is_one_dimensional( + target, return_numpy_values=True + ) + + calibrated = statsmodels.tsa.ar_model.AutoReg( + target_numpy, lags=order, old_names=False + ).fit() + + return { + "intercept": calibrated.params[0], + "lag_coefficients": calibrated.params[1:], + # I don't know what this is so a better name could probably be chosen + "standard_innovations": np.sqrt(calibrated.sigma2), + } diff --git a/mesmer/prototype/calibrate_multiple.py b/mesmer/prototype/calibrate_multiple.py index 231352a5..95166b7f 100644 --- a/mesmer/prototype/calibrate_multiple.py +++ b/mesmer/prototype/calibrate_multiple.py @@ -1,5 +1,8 @@ +import numpy as np import xarray as xr +from .calibrate import AutoRegression1DOrderSelection, AutoRegression1D + def _get_predictor_dims(predictors): predictors_dims = {k: v.dims for k, v in predictors.items()} @@ -63,6 +66,84 @@ def flatten_predictors_and_target(predictors, target): return predictors_flattened, target_flattened, stack_coord_name +def _select_auto_regressive_process_order( + target, + maxlag, + ic, + scenario_level="scenario", + ensemble_member_level="ensemble_member", 
+ q=50, + interpolation="nearest", +): + """ + + interpolation : str + Passed to :func:`numpy.percentile`. Interpolation is not a good way to + go here because it could lead to an AR order that wasn't actually chosen by any run. We recommend using the default value i.e. "nearest". + """ + order = [] + for _, scenario_vals in target.groupby(scenario_level): + scenario_orders = [] + for _, em_vals in scenario_vals.groupby(ensemble_member_level): + em_orders = AutoRegression1DOrderSelection().calibrate( + em_vals, maxlag=maxlag, ic=ic + ) + + if em_orders is not None: + scenario_orders.append(em_orders[-1]) + else: + scenario_orders.append(0) + + if scenario_orders: + order.append( + np.percentile(scenario_orders, q=q, interpolation=interpolation) + ) + else: + order.append(0) + + res = int(np.percentile(order, q=q, interpolation=interpolation)) + + return res + + +def _derive_auto_regressive_process_parameters( + target, order, scenario_level="scenario", ensemble_member_level="ensemble_member" +): + # I don't like the fact that I'm duplicating these loops, surely there is a better way + parameters = { + "intercept": [], + "lag_coefficients": [], + "standard_innovations": [], + } + for _, scenario_vals in target.groupby(scenario_level): + scenario_parameters = {k: [] for k in parameters} + for _, em_vals in scenario_vals.groupby(ensemble_member_level): + em_parameters = AutoRegression1D().calibrate(em_vals, order=order) + for k, v in em_parameters.items(): + scenario_parameters[k].append(v) + + scenario_parameters_average = { + k: np.vstack(v).mean(axis=0) for k, v in scenario_parameters.items() + } + for k, v in scenario_parameters_average.items(): + parameters[k].append(v) + + parameters_average = {k: np.vstack(v).mean(axis=0) for k, v in parameters.items()} + + return parameters_average + + +def calibrate_auto_regressive_process_multiple_scenarios_and_ensemble_members( + target, + maxlag=12, + ic="bic", +): + ar_order = _select_auto_regressive_process_order(target, maxlag, ic) + ar_params = _derive_auto_regressive_process_parameters(target, ar_order) + + return ar_params + + def calibrate_multiple_scenarios_and_ensemble_members( targets, predictors, calibration_class, calibration_kwargs, weighting_style ): diff --git a/tests/integration/test_prototype.py b/tests/integration/test_prototype.py index 41dbcc54..bad805dd 100644 --- a/tests/integration/test_prototype.py +++ b/tests/integration/test_prototype.py @@ -1,10 +1,16 @@ import numpy as np import numpy.testing as npt +import pytest import xarray as xr +from statsmodels.tsa.arima_process import ArmaProcess from mesmer.calibrate_mesmer.train_lt import train_lt +from mesmer.calibrate_mesmer.train_gv import train_gv from mesmer.prototype.calibrate import LinearRegression -from mesmer.prototype.calibrate_multiple import flatten_predictors_and_target +from mesmer.prototype.calibrate_multiple import ( + calibrate_auto_regressive_process_multiple_scenarios_and_ensemble_members, + flatten_predictors_and_target, +) class _MockConfig: @@ -12,6 +18,7 @@ def __init__( self, method_lt="OLS", method_lv="OLS_AR1_sci", + method_gv="AR", separate_gridpoints=True, weight_scenarios_equally=True, target_variable="tas", @@ -20,11 +27,18 @@ def __init__( self.methods[target_variable] = {} self.methods[target_variable]["lt"] = method_lt self.methods[target_variable]["lv"] = method_lv + self.methods[target_variable]["gv"] = method_gv + + # this has to be set but isn't actually used + self.preds = {} + self.preds[target_variable] = {} + 
self.preds[target_variable]["gv"] = None + self.method_lt_each_gp_sep = separate_gridpoints self.wgt_scen_tr_eq = weight_scenarios_equally -def _do_legacy_run( +def _do_legacy_run_train_lt( emulator_tas, emulator_tas_squared, emulator_hfds, @@ -139,7 +153,7 @@ def test_prototype_train_lt(): coords=targ_coords, ) - res_legacy = _do_legacy_run( + res_legacy = _do_legacy_run_train_lt( emulator_tas, emulator_tas_squared, emulator_hfds, @@ -179,6 +193,121 @@ def test_prototype_train_lt(): npt.assert_allclose(res_updated.sel(predictor=updated_name), legacy_vals) +def _do_legacy_run_train_gv( + esm_tas_global_variability, + cfg, +): + targs_legacy = {} + var_name = "tas" + + targs_legacy = {} + for scenario, vals_scen in esm_tas_global_variability.groupby("scenario"): + targs_legacy[scenario] = ( + vals_scen.T.dropna(dim="time").transpose("ensemble_member", "time").values + ) + + res_legacy = train_gv( + targs_legacy, + targ=var_name, + esm="esm_name", + cfg=cfg, + save_params=False, + max_lag=2, + ) + + return res_legacy + + +@pytest.mark.parametrize( + "ar", + ( + [1, 0.5, 0.3], + [1, 0.5, 0.3, 0.3, 0.7], + [0.9, 1, 0.2, -0.1], + ), +) +def test_prototype_train_gv(ar): + time_history = range(1850, 2014 + 1) + time_scenario = range(2015, 2100 + 1) + time = list(time_history) + list(time_scenario) + + magnitude = np.array([0.1]) + + scenarios = ["hist", "ssp126"] + + # we wouldn't actually start like this, but we'd write a utils function + # to simply go from lat-lon to gridpoint and back + targ_dims = ["scenario", "ensemble_member", "time"] + targ_coords = dict( + time=time, + scenario=scenarios, + ensemble_member=["r1i1p1f1", "r2i1p1f1"], + ) + esm_tas_global_variability = xr.DataArray( + np.array( + [ + [ + np.concatenate( + [ + ArmaProcess(ar, magnitude).generate_sample( + nsample=len(time_history) + ), + np.nan * np.zeros(len(time_scenario)), + ] + ), + np.concatenate( + [ + ArmaProcess(ar, magnitude).generate_sample( + nsample=len(time_history) + ), + np.nan * np.zeros(len(time_scenario)), + ] + ), + ], + [ + np.concatenate( + [ + np.nan * np.zeros(len(time_history)), + ArmaProcess(ar, magnitude).generate_sample( + nsample=len(time_scenario) + ), + ] + ), + np.concatenate( + [ + np.nan * np.zeros(len(time_history)), + ArmaProcess(ar, magnitude).generate_sample( + nsample=len(time_scenario) + ), + ] + ), + ], + ] + ), + dims=targ_dims, + coords=targ_coords, + ) + + res_legacy = _do_legacy_run_train_gv( + esm_tas_global_variability, + cfg=_MockConfig(), + ) + + res_updated = ( + calibrate_auto_regressive_process_multiple_scenarios_and_ensemble_members( + esm_tas_global_variability, + maxlag=2, + ) + ) + + for key, comparison in ( + ("intercept", res_legacy["AR_int"]), + ("lag_coefficients", res_legacy["AR_coefs"]), + ("standard_innovations", res_legacy["AR_std_innovs"]), + ): + npt.assert_allclose(res_updated[key], comparison) + + # things that aren't tested well: # - what happens if ensemble member and scenario don't actually make a coherent set # - units (should probably be using dataset rather than dataarray for inputs and outputs?) 
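For reference, the round trip that test_prototype_train_gv exercises above can be reproduced in isolation. The following is a minimal sketch rather than part of any commit in this series; it assumes a recent statsmodels and numpy, and the variable names are illustrative. As the patch shows, AutoRegression1DOrderSelection.calibrate and AutoRegression1D.calibrate are thin wrappers around the same ar_select_order and AutoReg calls.

import numpy as np
from statsmodels.tsa.ar_model import AutoReg, ar_select_order
from statsmodels.tsa.arima_process import ArmaProcess

# simulate an AR(1) process x_t = 0.5 * x_{t-1} + e_t
# (ArmaProcess takes lag-polynomial coefficients, hence the sign flip)
sample = ArmaProcess(ar=np.array([1.0, -0.5]), ma=np.array([1.0])).generate_sample(
    nsample=1000
)

# order selection, as wrapped by AutoRegression1DOrderSelection.calibrate
selected = ar_select_order(sample, maxlag=12, ic="bic")
chosen_lags = selected.ar_lags  # e.g. [1]; can be None if no lag is selected

# parameter estimation, as wrapped by AutoRegression1D.calibrate
fitted = AutoReg(sample, lags=1).fit()
intercept = fitted.params[0]  # close to 0 for this process
lag_coefficients = fitted.params[1:]  # close to [0.5]
standard_innovations = np.sqrt(fitted.sigma2)  # close to 1, the innovation std

The prototype then reduces these per-ensemble-member results across scenarios, which is what the comparison against train_gv above checks.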
From 14c429df8f3cf51f11bed6de5355543c733a78d2 Mon Sep 17 00:00:00 2001 From: Zebedee Nicholls Date: Wed, 27 Oct 2021 12:12:26 +1100 Subject: [PATCH 10/23] Clean up loops --- mesmer/calibrate_mesmer/train_lv.py | 8 +- mesmer/prototype/calibrate_multiple.py | 132 ++++++++++++------------- 2 files changed, 69 insertions(+), 71 deletions(-) diff --git a/mesmer/calibrate_mesmer/train_lv.py b/mesmer/calibrate_mesmer/train_lv.py index 23fce905..938d19d5 100644 --- a/mesmer/calibrate_mesmer/train_lv.py +++ b/mesmer/calibrate_mesmer/train_lv.py @@ -251,6 +251,12 @@ def train_lv_AR1_sci(params_lv, targs, y, wgt_scen_eq, aux, cfg): params_lv["AR1_std_innovs"][targ_name] += AR1_std_innovs_runs / nr_scens # determine localization radius, empirical cov matrix, and localized ecov matrix + import pdb + + pdb.set_trace() + # y_targ has dimensions (draw (flattened version of time and scenario and ensemble member), gridpoint) + # wgt_scen_eq has dimensions (draw (flattened version of time and scenario and ensemble member),) + # aux["phi_gc"][year] has dimensions (gridpoint, gridpoint) ( params_lv["L"][targ_name], params_lv["ecov"][targ_name], @@ -361,6 +367,7 @@ def train_lv_find_localized_ecov(y, wgt_scen_eq, aux, cfg): # each cv sample gets its own likelihood -> can sum them up for overall # likelihood # sum over all samples = wgt average * nr_samples + # TODO: check why this can't be done with sum llh_cv_fold_sum = np.average( llh_cv_each_sample, weights=wgt_scen_eq_cv ) * len(wgt_scen_eq_cv) @@ -369,7 +376,6 @@ def train_lv_find_localized_ecov(y, wgt_scen_eq, aux, cfg): llh_cv_sum[L] += llh_cv_fold_sum idx_L += 1 - if llh_cv_sum[L] > llh_max: L_sel = L llh_max = llh_cv_sum[L] diff --git a/mesmer/prototype/calibrate_multiple.py b/mesmer/prototype/calibrate_multiple.py index 95166b7f..8e40f8ea 100644 --- a/mesmer/prototype/calibrate_multiple.py +++ b/mesmer/prototype/calibrate_multiple.py @@ -1,4 +1,5 @@ import numpy as np +import pandas as pd import xarray as xr from .calibrate import AutoRegression1DOrderSelection, AutoRegression1D @@ -66,6 +67,21 @@ def flatten_predictors_and_target(predictors, target): return predictors_flattened, target_flattened, stack_coord_name +def _loop_levels(inp, levels): + # annoyingly, there doesn't seem to be an inbuilt solution for this + # https://github.com/pydata/xarray/issues/2438 + def _yield_level(inph, levels_left, out_names): + for name, values in inph.groupby(levels_left[0]): + out_names_here = out_names + [name] + if len(levels_left) == 1: + yield tuple(out_names_here), values + else: + yield from _yield_level(values, levels_left[1:], out_names_here) + + for names, values in _yield_level(inp, levels, []): + yield names, values + + def _select_auto_regressive_process_order( target, maxlag, @@ -81,27 +97,31 @@ def _select_auto_regressive_process_order( Passed to :func:`numpy.percentile`. Interpolation is not a good way to go here because it could lead to an AR order that wasn't actually chosen by any run. We recommend using the default value i.e. "nearest". 
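        For example, if the orders selected for two ensemble members were 1 and
        3, a "linear" 50th percentile would give 2.0, an order that neither run
        actually chose, whereas "nearest" returns one of the orders that was
        selected.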
""" - order = [] - for _, scenario_vals in target.groupby(scenario_level): - scenario_orders = [] - for _, em_vals in scenario_vals.groupby(ensemble_member_level): - em_orders = AutoRegression1DOrderSelection().calibrate( - em_vals, maxlag=maxlag, ic=ic - ) - - if em_orders is not None: - scenario_orders.append(em_orders[-1]) - else: - scenario_orders.append(0) + store = [] - if scenario_orders: - order.append( - np.percentile(scenario_orders, q=q, interpolation=interpolation) - ) - else: - order.append(0) + for (scenario, ensemble_member), values in _loop_levels( + target, (scenario_level, ensemble_member_level) + ): + orders = AutoRegression1DOrderSelection().calibrate( + values, maxlag=maxlag, ic=ic + ) + keep_order = 0 if orders is None else orders[-1] + store.append( + { + "scenario": scenario, + "ensemble_member": ensemble_member, + "order": keep_order, + } + ) - res = int(np.percentile(order, q=q, interpolation=interpolation)) + store = pd.DataFrame(store).set_index(["scenario", "ensemble_member"]) + res = ( + store.groupby("scenario")["order"] + # first operation gives result by scenario (i.e. over ensemble members) + .quantile(q=q / 100, interpolation=interpolation) + # second one gives result over all scenarios + .quantile(q=q / 100, interpolation=interpolation) + ) return res @@ -109,28 +129,29 @@ def _select_auto_regressive_process_order( def _derive_auto_regressive_process_parameters( target, order, scenario_level="scenario", ensemble_member_level="ensemble_member" ): - # I don't like the fact that I'm duplicating these loops, surely there is a better way - parameters = { - "intercept": [], - "lag_coefficients": [], - "standard_innovations": [], - } - for _, scenario_vals in target.groupby(scenario_level): - scenario_parameters = {k: [] for k in parameters} - for _, em_vals in scenario_vals.groupby(ensemble_member_level): - em_parameters = AutoRegression1D().calibrate(em_vals, order=order) - for k, v in em_parameters.items(): - scenario_parameters[k].append(v) - - scenario_parameters_average = { - k: np.vstack(v).mean(axis=0) for k, v in scenario_parameters.items() - } - for k, v in scenario_parameters_average.items(): - parameters[k].append(v) - - parameters_average = {k: np.vstack(v).mean(axis=0) for k, v in parameters.items()} - - return parameters_average + store = [] + for (scenario, ensemble_member), values in _loop_levels( + target, (scenario_level, ensemble_member_level) + ): + parameters = AutoRegression1D().calibrate(values, order=order) + parameters["scenario"] = scenario + parameters["ensemble_member"] = ensemble_member + store.append(parameters) + + store = pd.DataFrame(store).set_index(["scenario", "ensemble_member"]) + + def _axis_mean(inp): + return inp.apply(np.mean, axis=0) + + res = ( + store.groupby("scenario") + # first operation gives result by scenario (i.e. 
over ensemble members) + .apply(_axis_mean) + # second one gives result over all scenarios + .apply(np.mean, axis=0).to_dict() + ) + + return res def calibrate_auto_regressive_process_multiple_scenarios_and_ensemble_members( @@ -142,32 +163,3 @@ def calibrate_auto_regressive_process_multiple_scenarios_and_ensemble_members( ar_params = _derive_auto_regressive_process_parameters(target, ar_order) return ar_params - - -def calibrate_multiple_scenarios_and_ensemble_members( - targets, predictors, calibration_class, calibration_kwargs, weighting_style -): - """ - Calibrate based on multiple scenarios and ensemble members per scenario - - Parameters - ---------- - targets : xarray.DataArray - Target variables to calibrate to. This should have a scenario and an ensemble-member dimension (you'd need a different function to prepare things such that the dimensions all worked) - - predictors : xarray.DataArray - Predictor variables, as above re comments about prepping data - - calibration_class : mesmer.calibration.MesmerCalibrateBase - Class to use for calibration - - calibration_kwargs : dict - Keyword arguments to pass to the calibration class - - weighting_style : ["equal", "equal_scenario", "equal_ensemble_member"] - How to weight combinations of scenarios and ensemble members. "equal" --> all scenario - ensemble member combination are treated equally. "equal_scenario" --> all scenarios get equal weight (so if a scenario has more ensemble members then each ensemble member gets less weight). "equal_ensemble_member" --> all ensemble members get the same weight (so if an ensemble member is reported for more than one scenario then each scenario gets less weight for that ensemble member) - """ - # this function (or class) would handle all of the looping over scenarios - # and ensemble members. 
It would call the calibration class on each - # scenario and ensemble member and know how much weight to give each - # before combining them From 4fd57aae90c674a29c17cc1b46a101b3383ed4ee Mon Sep 17 00:00:00 2001 From: Zebedee Nicholls Date: Wed, 27 Oct 2021 12:18:34 +1100 Subject: [PATCH 11/23] Remove pdb statement --- mesmer/calibrate_mesmer/train_lv.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/mesmer/calibrate_mesmer/train_lv.py b/mesmer/calibrate_mesmer/train_lv.py index 938d19d5..c023b964 100644 --- a/mesmer/calibrate_mesmer/train_lv.py +++ b/mesmer/calibrate_mesmer/train_lv.py @@ -251,9 +251,6 @@ def train_lv_AR1_sci(params_lv, targs, y, wgt_scen_eq, aux, cfg): params_lv["AR1_std_innovs"][targ_name] += AR1_std_innovs_runs / nr_scens # determine localization radius, empirical cov matrix, and localized ecov matrix - import pdb - - pdb.set_trace() # y_targ has dimensions (draw (flattened version of time and scenario and ensemble member), gridpoint) # wgt_scen_eq has dimensions (draw (flattened version of time and scenario and ensemble member),) # aux["phi_gc"][year] has dimensions (gridpoint, gridpoint) From 1026161bbb6faccea29839fc26b0c7d038abac37 Mon Sep 17 00:00:00 2001 From: Zebedee Nicholls Date: Wed, 24 Nov 2021 13:06:12 +1100 Subject: [PATCH 12/23] Sketch out test for train lv --- tests/integration/test_prototype.py | 55 +++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/tests/integration/test_prototype.py b/tests/integration/test_prototype.py index bad805dd..db877d3c 100644 --- a/tests/integration/test_prototype.py +++ b/tests/integration/test_prototype.py @@ -5,6 +5,7 @@ from statsmodels.tsa.arima_process import ArmaProcess from mesmer.calibrate_mesmer.train_lt import train_lt +from mesmer.calibrate_mesmer.train_lv import train_lv from mesmer.calibrate_mesmer.train_gv import train_gv from mesmer.prototype.calibrate import LinearRegression from mesmer.prototype.calibrate_multiple import ( @@ -308,6 +309,60 @@ def test_prototype_train_gv(ar): npt.assert_allclose(res_updated[key], comparison) +def _do_legacy_run_train_lv( + esm_tas_global_variability, + cfg, +): + targs_legacy = {} + + targs_legacy = {} + for scenario, vals_scen in esm_tas_global_variability.groupby("scenario"): + targs_legacy[scenario] = ( + vals_scen.T.dropna(dim="time").transpose("ensemble_member", "time").values + ) + + aux = { + "phi_gc": calculate_gaspari_cohn_correlation_function(), + } + res_legacy = train_lv( + preds={}, + targs=targs_legacy, + esm="test", + cfg=cfg, + save_params=False, + aux=aux, + params_lv={}, # unclear why this is passed in + ) + + return res_legacy + + +def test_prototype_train_lv(): + # input is residual local variability we want to reproduce (for Leah, residual + # means after removing local trend and local variability due to global variability + # but it could be whatever in reality) + + # also input the phi gc stuff (that's part of the calibration but doesn't go in the train + # lv function for some reason --> put it in calibrate_local_variability prototype function) + + res_legacy = _do_legacy_run_train_lv( + esm_tas_residual_local_variability, + cfg=_MockConfig(), + ) + + res_updated = ( + calibrate_auto_regressive_process_multiple_scenarios_and_ensemble_members( + esm_tas_global_variability, + maxlag=2, + ) + ) + + for key, comparison in ( + ("intercept", res_legacy["AR_int"]), + ("lag_coefficients", res_legacy["AR_coefs"]), + ("standard_innovations", res_legacy["AR_std_innovs"]), + ): + npt.assert_allclose(res_updated[key], comparison) # 
things that aren't tested well: # - what happens if ensemble member and scenario don't actually make a coherent set # - units (should probably be using dataset rather than dataarray for inputs and outputs?) From d4e4e7fa1d0ae6ca0a8a47ba8c084403736cda6a Mon Sep 17 00:00:00 2001 From: Zebedee Nicholls Date: Thu, 25 Nov 2021 18:04:27 +1100 Subject: [PATCH 13/23] More notes about how to do train lv --- tests/integration/test_prototype.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/tests/integration/test_prototype.py b/tests/integration/test_prototype.py index db877d3c..91e3bd4a 100644 --- a/tests/integration/test_prototype.py +++ b/tests/integration/test_prototype.py @@ -310,13 +310,13 @@ def test_prototype_train_gv(ar): def _do_legacy_run_train_lv( - esm_tas_global_variability, + esm_tas_residual_local_variability, cfg, ): targs_legacy = {} targs_legacy = {} - for scenario, vals_scen in esm_tas_global_variability.groupby("scenario"): + for scenario, vals_scen in esm_tas_residual_local_variability.groupby("scenario"): targs_legacy[scenario] = ( vals_scen.T.dropna(dim="time").transpose("ensemble_member", "time").values ) @@ -337,13 +337,25 @@ def _do_legacy_run_train_lv( return res_legacy +# how train_lv works: +# 1. AR1 process for each individual gridpoint (use calibrate_auto_regressive_process_multiple_scenarios_and_ensemble_members) +# 2. find localised empirical covariance matrix +# 3. combine AR1 and localised empirical covariance matrix to get localised empirical covariance matrix +# of innovations (i.e. errors) which can be used for later draws (again, with a bit of custom code +# that feels like it should really be done using an existing library) + + def test_prototype_train_lv(): # input is residual local variability we want to reproduce (for Leah, residual # means after removing local trend and local variability due to global variability # but it could be whatever in reality) # also input the phi gc stuff (that's part of the calibration but doesn't go in the train - # lv function for some reason --> put it in calibrate_local_variability prototype function) + # lv function for some reason --> put it in calibrate_local_variability prototype function + # although split that function to allow for pre-calculating distance between points in future + # as a performance boost) + + # see how much code I can reuse for the AR1 calibration res_legacy = _do_legacy_run_train_lv( esm_tas_residual_local_variability, From 3470ee43543fc088ef533ab4f274c6f0ee977c9b Mon Sep 17 00:00:00 2001 From: Zebedee Nicholls Date: Thu, 25 Nov 2021 20:45:30 +1100 Subject: [PATCH 14/23] Start working on geodesic functions --- mesmer/prototype/utils.py | 70 ++++++++++++++++++++++++++ tests/integration/test_prototype.py | 78 +++++++++++++++++++++++++++-- 2 files changed, 144 insertions(+), 4 deletions(-) create mode 100644 mesmer/prototype/utils.py diff --git a/mesmer/prototype/utils.py b/mesmer/prototype/utils.py new file mode 100644 index 00000000..3c3fac79 --- /dev/null +++ b/mesmer/prototype/utils.py @@ -0,0 +1,70 @@ +import numpy as np +import pyproj + + +def calculate_gaspari_cohn_correlation_matrix( + latitudes, + longitudes, +): + """ + Calculate Gaspari-Cohn correlation matrix + + Parameters + ---------- + latitudes : :obj:`xr.DataArray` + Latitudes (one-dimensional) + + longitudes : :obj:`xr.DataArray` + Longitudes (one-dimensional) + """ + # I wonder if xarray can apply a function to all pairs of points in arrays + # or something + geodist = 
calculate_geodistance_exact(longitudes, latitudes) + + +def calculate_geodistance_exact(latitudes, longitudes): + """ + Calculate exact great circle distance based on WSG 84 + + Parameters + ---------- + latitudes : :obj:`xr.DataArray` + Latitudes (one-dimensional) + + longitudes : :obj:`xr.DataArray` + Longitudes (one-dimensional) + + Returns + ------- + geodist : np.array + 2D array of great circle distances. + """ + if longitudes.shape != latitudes.shape or longitudes.ndim != 1: + raise ValueError("lon and lat need to be 1D arrays of the same shape") + + geod = pyproj.Geod(ellps="WGS84") + + n_points = longitudes.shape[0] + + geodist = np.zeros([n_points, n_points]) + + # calculate only the upper right half of the triangle first + for i in range(n_points): + + # need to duplicate gridpoint (required by geod.inv) + lat = np.tile(latitudes[i], n_points - (i + 1)) + lon = np.tile(longitudes[i], n_points - (i + 1)) + + geodist[i, i + 1 :] = geod.inv(lon, lat, longitudes[i + 1 :], latitudes[i + 1 :])[2] + + # convert m to km + geodist /= 1000 + + # fill the lower left half of the triangle (in-place) + geodist += np.transpose(geodist) + + # should be able to keep co-ordinate information here too + import pdb + pdb.set_trace() + + return geodist diff --git a/tests/integration/test_prototype.py b/tests/integration/test_prototype.py index 91e3bd4a..a3e54bfe 100644 --- a/tests/integration/test_prototype.py +++ b/tests/integration/test_prototype.py @@ -12,6 +12,7 @@ calibrate_auto_regressive_process_multiple_scenarios_and_ensemble_members, flatten_predictors_and_target, ) +from mesmer.prototype.utils import calculate_gaspari_cohn_correlation_matrix class _MockConfig: @@ -236,8 +237,6 @@ def test_prototype_train_gv(ar): scenarios = ["hist", "ssp126"] - # we wouldn't actually start like this, but we'd write a utils function - # to simply go from lat-lon to gridpoint and back targ_dims = ["scenario", "ensemble_member", "time"] targ_coords = dict( time=time, @@ -318,11 +317,14 @@ def _do_legacy_run_train_lv( targs_legacy = {} for scenario, vals_scen in esm_tas_residual_local_variability.groupby("scenario"): targs_legacy[scenario] = ( - vals_scen.T.dropna(dim="time").transpose("ensemble_member", "time").values + vals_scen.T.dropna(dim="time").transpose("ensemble_member", "time", "gridpoint").values ) aux = { - "phi_gc": calculate_gaspari_cohn_correlation_function(), + "phi_gc": calculate_gaspari_cohn_correlation_matrix( + latitudes=esm_tas_residual_local_variability.lat, + longitudes=esm_tas_residual_local_variability.lon, + ), } res_legacy = train_lv( preds={}, @@ -356,6 +358,74 @@ def test_prototype_train_lv(): # as a performance boost) # see how much code I can reuse for the AR1 calibration + time_history = range(1850, 2014 + 1) + time_scenario = range(2015, 2100 + 1) + time = list(time_history) + list(time_scenario) + scenarios = ["hist", "ssp126"] + + # we wouldn't actually start like this, but we'd write a utils function + # to simply go from lat-lon to gridpoint and back + targ_dims = ["scenario", "ensemble_member", "gridpoint", "time"] + targ_coords = dict( + time=time, + scenario=scenarios, + ensemble_member=["r1i1p1f1", "r2i1p1f1"], + gridpoint=[0, 1], + lat=(["gridpoint"], [-60, 60]), + lon=(["gridpoint"], [120, 240]), + ) + + ar = np.array([1]) + magnitude = np.array([0.05]) + + def _get_history_sample(): + return np.concatenate( + [ + ArmaProcess(ar, magnitude).generate_sample( + nsample=len(time_history) + ), + np.nan * np.zeros(len(time_scenario)), + ] + ) + + def _get_scenario_sample(): 
+ return np.concatenate( + [ + np.nan * np.zeros(len(time_history)), + ArmaProcess(ar, magnitude).generate_sample( + nsample=len(time_scenario) + ), + ] + ) + + esm_tas_residual_local_variability = xr.DataArray( + np.array( + [ + [ + [ + _get_history_sample(), + _get_history_sample(), + ], + [ + _get_history_sample(), + _get_history_sample(), + ] + ], + [ + [ + _get_scenario_sample(), + _get_scenario_sample(), + ], + [ + _get_scenario_sample(), + _get_scenario_sample(), + ], + ], + ] + ), + dims=targ_dims, + coords=targ_coords, + ) res_legacy = _do_legacy_run_train_lv( esm_tas_residual_local_variability, From 3b23a6da0b2b3a3941406c62afc43f0498a8cf9f Mon Sep 17 00:00:00 2001 From: Zebedee Nicholls Date: Fri, 26 Nov 2021 15:58:26 +1100 Subject: [PATCH 15/23] Get legacy training running --- mesmer/prototype/utils.py | 92 +++++++++++++++++++++++++---- tests/integration/test_prototype.py | 33 +++++++---- 2 files changed, 100 insertions(+), 25 deletions(-) diff --git a/mesmer/prototype/utils.py b/mesmer/prototype/utils.py index 3c3fac79..1f749cf9 100644 --- a/mesmer/prototype/utils.py +++ b/mesmer/prototype/utils.py @@ -1,13 +1,15 @@ import numpy as np import pyproj +import xarray as xr -def calculate_gaspari_cohn_correlation_matrix( +def calculate_gaspari_cohn_correlation_matrices( latitudes, longitudes, + localisation_radii, ): """ - Calculate Gaspari-Cohn correlation matrix + Calculate Gaspari-Cohn correlation matrices for a range of localisation radiis Parameters ---------- @@ -16,10 +18,30 @@ def calculate_gaspari_cohn_correlation_matrix( longitudes : :obj:`xr.DataArray` Longitudes (one-dimensional) + + localisation_radii : list-like + Localisation radii to test (in metres) + + Returns + ------- + dict[float: :obj:`xr.DataArray`] + Gaspari-Cohn correlation matrix (values) for each localisation radius (keys) + + Notes + ----- + Values in ``localisation_radii`` should not exceed 10'000 by much because + it can lead to ``ValueError: the input matrix must be positive semidefinite`` """ # I wonder if xarray can apply a function to all pairs of points in arrays # or something - geodist = calculate_geodistance_exact(longitudes, latitudes) + geodistance = calculate_geodistance_exact(latitudes, longitudes) + + gaspari_cohn_correlation_matrices = { + lr : calculate_gaspari_cohn_values(geodistance / lr) + for lr in localisation_radii + } + + return gaspari_cohn_correlation_matrices def calculate_geodistance_exact(latitudes, longitudes): @@ -36,8 +58,9 @@ def calculate_geodistance_exact(latitudes, longitudes): Returns ------- - geodist : np.array - 2D array of great circle distances. 
+ :obj:`xr.DataArray` + 2D array of great circle distances between points represented by ``latitudes`` + and ``longitudes`` """ if longitudes.shape != latitudes.shape or longitudes.ndim != 1: raise ValueError("lon and lat need to be 1D arrays of the same shape") @@ -46,7 +69,7 @@ def calculate_geodistance_exact(latitudes, longitudes): n_points = longitudes.shape[0] - geodist = np.zeros([n_points, n_points]) + geodistance = np.zeros([n_points, n_points]) # calculate only the upper right half of the triangle first for i in range(n_points): @@ -55,16 +78,59 @@ def calculate_geodistance_exact(latitudes, longitudes): lat = np.tile(latitudes[i], n_points - (i + 1)) lon = np.tile(longitudes[i], n_points - (i + 1)) - geodist[i, i + 1 :] = geod.inv(lon, lat, longitudes[i + 1 :], latitudes[i + 1 :])[2] + geodistance[i, i + 1 :] = geod.inv(lon, lat, longitudes.values[i + 1 :], latitudes.values[i + 1 :])[2] # convert m to km - geodist /= 1000 + geodistance /= 1000 # fill the lower left half of the triangle (in-place) - geodist += np.transpose(geodist) + geodistance += np.transpose(geodistance) + + if latitudes.dims != longitudes.dims: + raise AssertionError(f"latitudes and longitudes have different dims: {latitudes.dims} vs. {longitudes.dims}") + + geodistance = xr.DataArray(geodistance, dims=list(latitudes.dims) * 2, coords=latitudes.coords) - # should be able to keep co-ordinate information here too - import pdb - pdb.set_trace() + return geodistance - return geodist + +def calculate_gaspari_cohn_values(inputs): + """ + Calculate smooth, exponentially decaying Gaspari-Cohn values + + Parameters + ---------- + inputs : :obj:`xr.DataArray` + Inputs at which to calculate the value of the smooth, exponentially decaying Gaspari-Cohn + correlation function (these could be e.g. 
normalised geographical distances) + + Returns + ------- + :obj:`xr.DataArray` + Gaspari-Cohn correlation function applied to each point in ``inputs`` + """ + inputs_abs = abs(inputs) + out = np.zeros_like(inputs) + + sel_zero_to_one = (inputs_abs.values >= 0) & (inputs_abs.values < 1) + r_s = inputs_abs.values[sel_zero_to_one] + out[sel_zero_to_one] = ( + 1 - 5 / 3 * r_s ** 2 + 5 / 8 * r_s ** 3 + 1 / 2 * r_s ** 4 - 1 / 4 * r_s ** 5 + ) + + sel_one_to_two = (inputs_abs.values >= 1) & (inputs_abs.values < 2) + r_s = inputs_abs.values[sel_one_to_two] + + out[sel_one_to_two] = ( + 4 + - 5 * r_s + + 5 / 3 * r_s ** 2 + + 5 / 8 * r_s ** 3 + - 1 / 2 * r_s ** 4 + + 1 / 12 * r_s ** 5 + - 2 / (3 * r_s) + ) + + out = xr.DataArray(out, dims=inputs.dims, coords=inputs.coords) + + return out diff --git a/tests/integration/test_prototype.py b/tests/integration/test_prototype.py index a3e54bfe..9eb4f6eb 100644 --- a/tests/integration/test_prototype.py +++ b/tests/integration/test_prototype.py @@ -12,7 +12,7 @@ calibrate_auto_regressive_process_multiple_scenarios_and_ensemble_members, flatten_predictors_and_target, ) -from mesmer.prototype.utils import calculate_gaspari_cohn_correlation_matrix +from mesmer.prototype.utils import calculate_gaspari_cohn_correlation_matrices class _MockConfig: @@ -24,6 +24,7 @@ def __init__( separate_gridpoints=True, weight_scenarios_equally=True, target_variable="tas", + cross_validation_max_iterations=30, ): self.methods = {} self.methods[target_variable] = {} @@ -39,6 +40,8 @@ def __init__( self.method_lt_each_gp_sep = separate_gridpoints self.wgt_scen_tr_eq = weight_scenarios_equally + self.max_iter_cv = cross_validation_max_iterations + def _do_legacy_run_train_lt( emulator_tas, @@ -310,22 +313,25 @@ def test_prototype_train_gv(ar): def _do_legacy_run_train_lv( esm_tas_residual_local_variability, + localisation_radii, cfg, ): - targs_legacy = {} - - targs_legacy = {} + targs_legacy = {"tas": {}} for scenario, vals_scen in esm_tas_residual_local_variability.groupby("scenario"): - targs_legacy[scenario] = ( + targs_legacy["tas"][scenario] = ( vals_scen.T.dropna(dim="time").transpose("ensemble_member", "time", "gridpoint").values ) - aux = { - "phi_gc": calculate_gaspari_cohn_correlation_matrix( - latitudes=esm_tas_residual_local_variability.lat, - longitudes=esm_tas_residual_local_variability.lon, - ), + gaspari_cohn_correlation_matrices = calculate_gaspari_cohn_correlation_matrices( + latitudes=esm_tas_residual_local_variability.lat, + longitudes=esm_tas_residual_local_variability.lon, + localisation_radii=localisation_radii, + ) + gaspari_cohn_correlation_matrices = { + k: v.values for k, v in gaspari_cohn_correlation_matrices.items() } + aux = {"phi_gc": gaspari_cohn_correlation_matrices} + res_legacy = train_lv( preds={}, targs=targs_legacy, @@ -357,6 +363,8 @@ def test_prototype_train_lv(): # although split that function to allow for pre-calculating distance between points in future # as a performance boost) + localisation_radii = np.arange(700, 2000, 1000) + # see how much code I can reuse for the AR1 calibration time_history = range(1850, 2014 + 1) time_scenario = range(2015, 2100 + 1) @@ -429,13 +437,14 @@ def _get_scenario_sample(): res_legacy = _do_legacy_run_train_lv( esm_tas_residual_local_variability, + localisation_radii, cfg=_MockConfig(), ) res_updated = ( calibrate_auto_regressive_process_multiple_scenarios_and_ensemble_members( - esm_tas_global_variability, - maxlag=2, + esm_tas_residual_local_variability, + localisation_radii, ) ) From 
88ec4ad5f9191f5bdb3f4356b5601af5b3deedaf Mon Sep 17 00:00:00 2001 From: Zebedee Nicholls Date: Fri, 26 Nov 2021 17:37:55 +1100 Subject: [PATCH 16/23] Finish reimplementing train_lv --- mesmer/prototype/calibrate_multiple.py | 115 +++++++++++++++++++++++++ tests/integration/test_prototype.py | 8 +- 2 files changed, 119 insertions(+), 4 deletions(-) diff --git a/mesmer/prototype/calibrate_multiple.py b/mesmer/prototype/calibrate_multiple.py index 8e40f8ea..71c196fc 100644 --- a/mesmer/prototype/calibrate_multiple.py +++ b/mesmer/prototype/calibrate_multiple.py @@ -1,8 +1,10 @@ import numpy as np import pandas as pd +import scipy.stats import xarray as xr from .calibrate import AutoRegression1DOrderSelection, AutoRegression1D +from .utils import calculate_gaspari_cohn_correlation_matrices def _get_predictor_dims(predictors): @@ -33,6 +35,15 @@ def _check_coords_match(obj, obj_other, check_coord): raise AssertionError(f"{check_coord} is not the same on {obj} and {obj_other}") +def _flatten(inp, dims_to_flatten): + stack_coord_name = _get_stack_coord_name(inp) + inp_flat = inp.stack({stack_coord_name: dims_to_flatten}).dropna( + stack_coord_name + ) + + return inp_flat, stack_coord_name + + def _flatten_predictors(predictors, dims_to_flatten, stack_coord_name): predictors_flat = [] for v, vals in predictors.items(): @@ -163,3 +174,107 @@ def calibrate_auto_regressive_process_multiple_scenarios_and_ensemble_members( ar_params = _derive_auto_regressive_process_parameters(target, ar_order) return ar_params + + +def calibrate_auto_regressive_process_with_spatially_correlated_errors_multiple_scenarios_and_ensemble_members( + target, + localisation_radii, + max_cross_validation_iterations=30, + gridpoint_dim_name="gridpoint", +): + gridpoint_autoregression_parameters = { + gridpoint: _derive_auto_regressive_process_parameters(gridpoint_vals, order=1) + for gridpoint, gridpoint_vals in target.groupby("gridpoint") + } + + gaspari_cohn_correlation_matrices = calculate_gaspari_cohn_correlation_matrices( + target.lat, + target.lon, + localisation_radii, + ) + + localised_empirical_covariance_matrix = _calculate_localised_empirical_covariance_matrix( + target, + localisation_radii, + gaspari_cohn_correlation_matrices, + max_cross_validation_iterations, + gridpoint_dim_name=gridpoint_dim_name, + ) + + gridpoint_autoregression_coeffcients = np.hstack([v["lag_coefficients"] for v in gridpoint_autoregression_parameters.values()]) + + localised_empirical_covariance_matrix_with_ar1_errors = ( + (1 - gridpoint_autoregression_coeffcients ** 2) + * localised_empirical_covariance_matrix + ) + + return localised_empirical_covariance_matrix_with_ar1_errors + + +def _calculate_localised_empirical_covariance_matrix( + target, + localisation_radii, + gaspari_cohn_correlation_matrices, + max_cross_validation_iterations, + gridpoint_dim_name="gridpoint", +): + dims_to_flatten = [d for d in target.dims if d != gridpoint_dim_name] + target_flattened, stack_coord_name = _flatten(target, dims_to_flatten) + target_flattened = target_flattened.transpose(stack_coord_name, gridpoint_dim_name) + + number_samples = target_flattened[stack_coord_name].shape[0] + number_iterations = min([number_samples, max_cross_validation_iterations]) + + # setup cross-validation stuff + index_cross_validation_out = np.zeros([number_iterations, number_samples], dtype=bool) + + for i in range(number_iterations): + index_cross_validation_out[i, i::max_cross_validation_iterations] = True + + # No idea what these are either + 
log_likelihood_cross_validation_sum_max = -10000 + + for lr in localisation_radii: + log_likelihood_cross_validation_sum = 0 + + for i in range(number_iterations): + # extract folds (no idea why these are called folds) + target_estimator = target_flattened.isel(**{stack_coord_name: ~index_cross_validation_out[i]}).values + target_cross_validation = target_flattened.isel(**{stack_coord_name: index_cross_validation_out[i]}).values + # selecting relevant weights goes in here + + empirical_covariance = np.cov(target_estimator, rowvar=False) + # must be a better way to handle ensuring that the dimensions line up correctly (rather than + # just cheating by using `.values`) + empirical_covariance_localised = empirical_covariance * gaspari_cohn_correlation_matrices[lr].values + + # calculate likelihood of cross validation samples + log_likelihood_cross_validation_samples = scipy.stats.multivariate_normal.logpdf( + target_cross_validation, + mean=np.zeros(gaspari_cohn_correlation_matrices[lr].shape[0]), + cov=empirical_covariance_localised, + allow_singular=True, + ) + log_likelihood_cross_validation_samples_weighted_sum = np.average( + log_likelihood_cross_validation_samples, + # weights=wgt_scen_eq_cv # TODO: weights handling + ) * log_likelihood_cross_validation_samples.shape[0] + + # add to full sum over all folds + log_likelihood_cross_validation_sum += log_likelihood_cross_validation_samples_weighted_sum + + if log_likelihood_cross_validation_sum > log_likelihood_cross_validation_sum_max: + log_likelihood_cross_validation_sum_max = log_likelihood_cross_validation_sum + else: + # experience tells us that once we start selecting large localisation radii, performance + # will not improve ==> break (reduces computational effort and number of singular matrices + # encountered) + break + + # TODO: replace print with logging + print(f"Selected localisation radius: {lr}") + + empirical_covariance = np.cov(target_flattened.values, rowvar=False) + empirical_covariance_localised = empirical_covariance * gaspari_cohn_correlation_matrices[lr].values + + return empirical_covariance_localised diff --git a/tests/integration/test_prototype.py b/tests/integration/test_prototype.py index 9eb4f6eb..e4e8c9f5 100644 --- a/tests/integration/test_prototype.py +++ b/tests/integration/test_prototype.py @@ -10,6 +10,7 @@ from mesmer.prototype.calibrate import LinearRegression from mesmer.prototype.calibrate_multiple import ( calibrate_auto_regressive_process_multiple_scenarios_and_ensemble_members, + calibrate_auto_regressive_process_with_spatially_correlated_errors_multiple_scenarios_and_ensemble_members, flatten_predictors_and_target, ) from mesmer.prototype.utils import calculate_gaspari_cohn_correlation_matrices @@ -442,18 +443,17 @@ def _get_scenario_sample(): ) res_updated = ( - calibrate_auto_regressive_process_multiple_scenarios_and_ensemble_members( + calibrate_auto_regressive_process_with_spatially_correlated_errors_multiple_scenarios_and_ensemble_members( esm_tas_residual_local_variability, localisation_radii, ) ) for key, comparison in ( - ("intercept", res_legacy["AR_int"]), - ("lag_coefficients", res_legacy["AR_coefs"]), - ("standard_innovations", res_legacy["AR_std_innovs"]), + ("localised_empirical_covariance_matrix_with_ar1_errors", res_legacy["loc_ecov_AR1_innovs"]["tas"]), ): npt.assert_allclose(res_updated[key], comparison) + # things that aren't tested well: # - what happens if ensemble member and scenario don't actually make a coherent set # - units (should probably be using dataset rather than 
dataarray for inputs and outputs?) From 08a51348058fe2c180596920573c26d89bba79eb Mon Sep 17 00:00:00 2001 From: Mathias Hauser Date: Thu, 21 Sep 2023 18:16:58 +0200 Subject: [PATCH 17/23] linting --- mesmer/prototype/calibrate_multiple.py | 85 ++++++++++++++++---------- mesmer/prototype/utils.py | 25 +++++--- tests/integration/test_prototype.py | 32 +++++----- 3 files changed, 85 insertions(+), 57 deletions(-) diff --git a/mesmer/prototype/calibrate_multiple.py b/mesmer/prototype/calibrate_multiple.py index 71c196fc..049b1ce6 100644 --- a/mesmer/prototype/calibrate_multiple.py +++ b/mesmer/prototype/calibrate_multiple.py @@ -3,7 +3,7 @@ import scipy.stats import xarray as xr -from .calibrate import AutoRegression1DOrderSelection, AutoRegression1D +from .calibrate import AutoRegression1D, AutoRegression1DOrderSelection from .utils import calculate_gaspari_cohn_correlation_matrices @@ -37,9 +37,7 @@ def _check_coords_match(obj, obj_other, check_coord): def _flatten(inp, dims_to_flatten): stack_coord_name = _get_stack_coord_name(inp) - inp_flat = inp.stack({stack_coord_name: dims_to_flatten}).dropna( - stack_coord_name - ) + inp_flat = inp.stack({stack_coord_name: dims_to_flatten}).dropna(stack_coord_name) return inp_flat, stack_coord_name @@ -193,20 +191,23 @@ def calibrate_auto_regressive_process_with_spatially_correlated_errors_multiple_ localisation_radii, ) - localised_empirical_covariance_matrix = _calculate_localised_empirical_covariance_matrix( - target, - localisation_radii, - gaspari_cohn_correlation_matrices, - max_cross_validation_iterations, - gridpoint_dim_name=gridpoint_dim_name, + localised_empirical_covariance_matrix = ( + _calculate_localised_empirical_covariance_matrix( + target, + localisation_radii, + gaspari_cohn_correlation_matrices, + max_cross_validation_iterations, + gridpoint_dim_name=gridpoint_dim_name, + ) ) - gridpoint_autoregression_coeffcients = np.hstack([v["lag_coefficients"] for v in gridpoint_autoregression_parameters.values()]) + gridpoint_autoregression_coeffcients = np.hstack( + [v["lag_coefficients"] for v in gridpoint_autoregression_parameters.values()] + ) localised_empirical_covariance_matrix_with_ar1_errors = ( - (1 - gridpoint_autoregression_coeffcients ** 2) - * localised_empirical_covariance_matrix - ) + 1 - gridpoint_autoregression_coeffcients**2 + ) * localised_empirical_covariance_matrix return localised_empirical_covariance_matrix_with_ar1_errors @@ -226,7 +227,9 @@ def _calculate_localised_empirical_covariance_matrix( number_iterations = min([number_samples, max_cross_validation_iterations]) # setup cross-validation stuff - index_cross_validation_out = np.zeros([number_iterations, number_samples], dtype=bool) + index_cross_validation_out = np.zeros( + [number_iterations, number_samples], dtype=bool + ) for i in range(number_iterations): index_cross_validation_out[i, i::max_cross_validation_iterations] = True @@ -239,32 +242,50 @@ def _calculate_localised_empirical_covariance_matrix( for i in range(number_iterations): # extract folds (no idea why these are called folds) - target_estimator = target_flattened.isel(**{stack_coord_name: ~index_cross_validation_out[i]}).values - target_cross_validation = target_flattened.isel(**{stack_coord_name: index_cross_validation_out[i]}).values + target_estimator = target_flattened.isel( + **{stack_coord_name: ~index_cross_validation_out[i]} + ).values + target_cross_validation = target_flattened.isel( + **{stack_coord_name: index_cross_validation_out[i]} + ).values # selecting relevant weights goes in here 
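            # (sketch, not implemented in this patch: given a hypothetical
            # `weights` array aligned with the stacked samples, it would be
            # subset with the same boolean masks as the target, e.g.
            #   weights_estimator = weights[~index_cross_validation_out[i]]
            #   weights_cross_validation = weights[index_cross_validation_out[i]]
            # and could then be passed to np.cov via its aweights argument and
            # to the np.average call below via its weights argument)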
empirical_covariance = np.cov(target_estimator, rowvar=False) # must be a better way to handle ensuring that the dimensions line up correctly (rather than # just cheating by using `.values`) - empirical_covariance_localised = empirical_covariance * gaspari_cohn_correlation_matrices[lr].values + empirical_covariance_localised = ( + empirical_covariance * gaspari_cohn_correlation_matrices[lr].values + ) # calculate likelihood of cross validation samples - log_likelihood_cross_validation_samples = scipy.stats.multivariate_normal.logpdf( - target_cross_validation, - mean=np.zeros(gaspari_cohn_correlation_matrices[lr].shape[0]), - cov=empirical_covariance_localised, - allow_singular=True, + log_likelihood_cross_validation_samples = ( + scipy.stats.multivariate_normal.logpdf( + target_cross_validation, + mean=np.zeros(gaspari_cohn_correlation_matrices[lr].shape[0]), + cov=empirical_covariance_localised, + allow_singular=True, + ) + ) + log_likelihood_cross_validation_samples_weighted_sum = ( + np.average( + log_likelihood_cross_validation_samples, + # weights=wgt_scen_eq_cv # TODO: weights handling + ) + * log_likelihood_cross_validation_samples.shape[0] ) - log_likelihood_cross_validation_samples_weighted_sum = np.average( - log_likelihood_cross_validation_samples, - # weights=wgt_scen_eq_cv # TODO: weights handling - ) * log_likelihood_cross_validation_samples.shape[0] # add to full sum over all folds - log_likelihood_cross_validation_sum += log_likelihood_cross_validation_samples_weighted_sum + log_likelihood_cross_validation_sum += ( + log_likelihood_cross_validation_samples_weighted_sum + ) - if log_likelihood_cross_validation_sum > log_likelihood_cross_validation_sum_max: - log_likelihood_cross_validation_sum_max = log_likelihood_cross_validation_sum + if ( + log_likelihood_cross_validation_sum + > log_likelihood_cross_validation_sum_max + ): + log_likelihood_cross_validation_sum_max = ( + log_likelihood_cross_validation_sum + ) else: # experience tells us that once we start selecting large localisation radii, performance # will not improve ==> break (reduces computational effort and number of singular matrices @@ -275,6 +296,8 @@ def _calculate_localised_empirical_covariance_matrix( print(f"Selected localisation radius: {lr}") empirical_covariance = np.cov(target_flattened.values, rowvar=False) - empirical_covariance_localised = empirical_covariance * gaspari_cohn_correlation_matrices[lr].values + empirical_covariance_localised = ( + empirical_covariance * gaspari_cohn_correlation_matrices[lr].values + ) return empirical_covariance_localised diff --git a/mesmer/prototype/utils.py b/mesmer/prototype/utils.py index 1f749cf9..397bb5ed 100644 --- a/mesmer/prototype/utils.py +++ b/mesmer/prototype/utils.py @@ -37,8 +37,7 @@ def calculate_gaspari_cohn_correlation_matrices( geodistance = calculate_geodistance_exact(latitudes, longitudes) gaspari_cohn_correlation_matrices = { - lr : calculate_gaspari_cohn_values(geodistance / lr) - for lr in localisation_radii + lr: calculate_gaspari_cohn_values(geodistance / lr) for lr in localisation_radii } return gaspari_cohn_correlation_matrices @@ -78,7 +77,9 @@ def calculate_geodistance_exact(latitudes, longitudes): lat = np.tile(latitudes[i], n_points - (i + 1)) lon = np.tile(longitudes[i], n_points - (i + 1)) - geodistance[i, i + 1 :] = geod.inv(lon, lat, longitudes.values[i + 1 :], latitudes.values[i + 1 :])[2] + geodistance[i, i + 1 :] = geod.inv( + lon, lat, longitudes.values[i + 1 :], latitudes.values[i + 1 :] + )[2] # convert m to km geodistance /= 
1000 @@ -87,9 +88,13 @@ def calculate_geodistance_exact(latitudes, longitudes): geodistance += np.transpose(geodistance) if latitudes.dims != longitudes.dims: - raise AssertionError(f"latitudes and longitudes have different dims: {latitudes.dims} vs. {longitudes.dims}") + raise AssertionError( + f"latitudes and longitudes have different dims: {latitudes.dims} vs. {longitudes.dims}" + ) - geodistance = xr.DataArray(geodistance, dims=list(latitudes.dims) * 2, coords=latitudes.coords) + geodistance = xr.DataArray( + geodistance, dims=list(latitudes.dims) * 2, coords=latitudes.coords + ) return geodistance @@ -115,7 +120,7 @@ def calculate_gaspari_cohn_values(inputs): sel_zero_to_one = (inputs_abs.values >= 0) & (inputs_abs.values < 1) r_s = inputs_abs.values[sel_zero_to_one] out[sel_zero_to_one] = ( - 1 - 5 / 3 * r_s ** 2 + 5 / 8 * r_s ** 3 + 1 / 2 * r_s ** 4 - 1 / 4 * r_s ** 5 + 1 - 5 / 3 * r_s**2 + 5 / 8 * r_s**3 + 1 / 2 * r_s**4 - 1 / 4 * r_s**5 ) sel_one_to_two = (inputs_abs.values >= 1) & (inputs_abs.values < 2) @@ -124,10 +129,10 @@ def calculate_gaspari_cohn_values(inputs): out[sel_one_to_two] = ( 4 - 5 * r_s - + 5 / 3 * r_s ** 2 - + 5 / 8 * r_s ** 3 - - 1 / 2 * r_s ** 4 - + 1 / 12 * r_s ** 5 + + 5 / 3 * r_s**2 + + 5 / 8 * r_s**3 + - 1 / 2 * r_s**4 + + 1 / 12 * r_s**5 - 2 / (3 * r_s) ) diff --git a/tests/integration/test_prototype.py b/tests/integration/test_prototype.py index e4e8c9f5..0f250896 100644 --- a/tests/integration/test_prototype.py +++ b/tests/integration/test_prototype.py @@ -4,9 +4,9 @@ import xarray as xr from statsmodels.tsa.arima_process import ArmaProcess +from mesmer.calibrate_mesmer.train_gv import train_gv from mesmer.calibrate_mesmer.train_lt import train_lt from mesmer.calibrate_mesmer.train_lv import train_lv -from mesmer.calibrate_mesmer.train_gv import train_gv from mesmer.prototype.calibrate import LinearRegression from mesmer.prototype.calibrate_multiple import ( calibrate_auto_regressive_process_multiple_scenarios_and_ensemble_members, @@ -110,7 +110,7 @@ def test_prototype_train_lt(): dims=pred_dims, coords=pred_coords, ) - emulator_tas_squared = emulator_tas ** 2 + emulator_tas_squared = emulator_tas**2 global_variability = xr.DataArray( np.array( [ @@ -320,7 +320,9 @@ def _do_legacy_run_train_lv( targs_legacy = {"tas": {}} for scenario, vals_scen in esm_tas_residual_local_variability.groupby("scenario"): targs_legacy["tas"][scenario] = ( - vals_scen.T.dropna(dim="time").transpose("ensemble_member", "time", "gridpoint").values + vals_scen.T.dropna(dim="time") + .transpose("ensemble_member", "time", "gridpoint") + .values ) gaspari_cohn_correlation_matrices = calculate_gaspari_cohn_correlation_matrices( @@ -390,9 +392,7 @@ def test_prototype_train_lv(): def _get_history_sample(): return np.concatenate( [ - ArmaProcess(ar, magnitude).generate_sample( - nsample=len(time_history) - ), + ArmaProcess(ar, magnitude).generate_sample(nsample=len(time_history)), np.nan * np.zeros(len(time_scenario)), ] ) @@ -401,9 +401,7 @@ def _get_scenario_sample(): return np.concatenate( [ np.nan * np.zeros(len(time_history)), - ArmaProcess(ar, magnitude).generate_sample( - nsample=len(time_scenario) - ), + ArmaProcess(ar, magnitude).generate_sample(nsample=len(time_scenario)), ] ) @@ -418,7 +416,7 @@ def _get_scenario_sample(): [ _get_history_sample(), _get_history_sample(), - ] + ], ], [ [ @@ -442,18 +440,20 @@ def _get_scenario_sample(): cfg=_MockConfig(), ) - res_updated = ( - 
calibrate_auto_regressive_process_with_spatially_correlated_errors_multiple_scenarios_and_ensemble_members( - esm_tas_residual_local_variability, - localisation_radii, - ) + res_updated = calibrate_auto_regressive_process_with_spatially_correlated_errors_multiple_scenarios_and_ensemble_members( + esm_tas_residual_local_variability, + localisation_radii, ) for key, comparison in ( - ("localised_empirical_covariance_matrix_with_ar1_errors", res_legacy["loc_ecov_AR1_innovs"]["tas"]), + ( + "localised_empirical_covariance_matrix_with_ar1_errors", + res_legacy["loc_ecov_AR1_innovs"]["tas"], + ), ): npt.assert_allclose(res_updated[key], comparison) + # things that aren't tested well: # - what happens if ensemble member and scenario don't actually make a coherent set # - units (should probably be using dataset rather than dataarray for inputs and outputs?) From fe1d1bcd61631b4ade407a9487a6c02c712f735b Mon Sep 17 00:00:00 2001 From: Mathias Hauser Date: Thu, 21 Sep 2023 18:18:31 +0200 Subject: [PATCH 18/23] fix: test_prototype_train_lv --- tests/integration/test_prototype.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/tests/integration/test_prototype.py b/tests/integration/test_prototype.py index 0f250896..e23b3418 100644 --- a/tests/integration/test_prototype.py +++ b/tests/integration/test_prototype.py @@ -445,13 +445,8 @@ def _get_scenario_sample(): localisation_radii, ) - for key, comparison in ( - ( - "localised_empirical_covariance_matrix_with_ar1_errors", - res_legacy["loc_ecov_AR1_innovs"]["tas"], - ), - ): - npt.assert_allclose(res_updated[key], comparison) + # check localised_empirical_covariance_matrix_with_ar1_errors + npt.assert_allclose(res_updated, res_legacy["loc_ecov_AR1_innovs"]["tas"]) # things that aren't tested well: From c854fa480618844c34c1de5024ff4a8baf91e577 Mon Sep 17 00:00:00 2001 From: Mathias Hauser Date: Thu, 21 Sep 2023 23:28:20 +0200 Subject: [PATCH 19/23] clean train_lt.py --- mesmer/calibrate_mesmer/train_lt.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/mesmer/calibrate_mesmer/train_lt.py b/mesmer/calibrate_mesmer/train_lt.py index 05e1dfe6..198d970e 100644 --- a/mesmer/calibrate_mesmer/train_lt.py +++ b/mesmer/calibrate_mesmer/train_lt.py @@ -89,6 +89,7 @@ def train_lt(preds, targs, esm, cfg, save_params=True): assumption is fulfilled or rewrite code such that no longer necessary) """ + targ_names = list(targs.keys()) targ_name = targ_names[0] # because same approach for each targ pred_names = list(preds.keys()) @@ -185,9 +186,6 @@ def train_lt(preds, targs, esm, cfg, save_params=True): else: raise NotImplementedError() - else: - raise NotImplementedError() - # save the local trend paramters if requested if save_params: save_mesmer_data( @@ -225,5 +223,5 @@ def train_lt(preds, targs, esm, cfg, save_params=True): if len(params_lv) > 0: return params_lt, params_lv - - raise NotImplementedError("This path is never used") + else: + return params_lt From c3bf5e094a22aa88cfcdbf4a37678b54d99b228a Mon Sep 17 00:00:00 2001 From: Mathias Hauser Date: Thu, 21 Sep 2023 23:54:54 +0200 Subject: [PATCH 20/23] remove prototype/utils.py after #298, #299, and #300 --- mesmer/prototype/calibrate_multiple.py | 12 ++- mesmer/prototype/utils.py | 141 ------------------------- tests/integration/test_prototype.py | 15 ++- 3 files changed, 17 insertions(+), 151 deletions(-) delete mode 100644 mesmer/prototype/utils.py diff --git a/mesmer/prototype/calibrate_multiple.py b/mesmer/prototype/calibrate_multiple.py index 
049b1ce6..a6d847e1 100644 --- a/mesmer/prototype/calibrate_multiple.py +++ b/mesmer/prototype/calibrate_multiple.py @@ -3,8 +3,11 @@ import scipy.stats import xarray as xr +from ..core.computation import ( + calc_gaspari_cohn_correlation_matrices, + calc_geodist_exact, +) from .calibrate import AutoRegression1D, AutoRegression1DOrderSelection -from .utils import calculate_gaspari_cohn_correlation_matrices def _get_predictor_dims(predictors): @@ -185,10 +188,9 @@ def calibrate_auto_regressive_process_with_spatially_correlated_errors_multiple_ for gridpoint, gridpoint_vals in target.groupby("gridpoint") } - gaspari_cohn_correlation_matrices = calculate_gaspari_cohn_correlation_matrices( - target.lat, - target.lon, - localisation_radii, + geodist = calc_geodist_exact(target.lon, target.lat) + gaspari_cohn_correlation_matrices = calc_gaspari_cohn_correlation_matrices( + geodist, localisation_radii ) localised_empirical_covariance_matrix = ( diff --git a/mesmer/prototype/utils.py b/mesmer/prototype/utils.py deleted file mode 100644 index 397bb5ed..00000000 --- a/mesmer/prototype/utils.py +++ /dev/null @@ -1,141 +0,0 @@ -import numpy as np -import pyproj -import xarray as xr - - -def calculate_gaspari_cohn_correlation_matrices( - latitudes, - longitudes, - localisation_radii, -): - """ - Calculate Gaspari-Cohn correlation matrices for a range of localisation radiis - - Parameters - ---------- - latitudes : :obj:`xr.DataArray` - Latitudes (one-dimensional) - - longitudes : :obj:`xr.DataArray` - Longitudes (one-dimensional) - - localisation_radii : list-like - Localisation radii to test (in metres) - - Returns - ------- - dict[float: :obj:`xr.DataArray`] - Gaspari-Cohn correlation matrix (values) for each localisation radius (keys) - - Notes - ----- - Values in ``localisation_radii`` should not exceed 10'000 by much because - it can lead to ``ValueError: the input matrix must be positive semidefinite`` - """ - # I wonder if xarray can apply a function to all pairs of points in arrays - # or something - geodistance = calculate_geodistance_exact(latitudes, longitudes) - - gaspari_cohn_correlation_matrices = { - lr: calculate_gaspari_cohn_values(geodistance / lr) for lr in localisation_radii - } - - return gaspari_cohn_correlation_matrices - - -def calculate_geodistance_exact(latitudes, longitudes): - """ - Calculate exact great circle distance based on WSG 84 - - Parameters - ---------- - latitudes : :obj:`xr.DataArray` - Latitudes (one-dimensional) - - longitudes : :obj:`xr.DataArray` - Longitudes (one-dimensional) - - Returns - ------- - :obj:`xr.DataArray` - 2D array of great circle distances between points represented by ``latitudes`` - and ``longitudes`` - """ - if longitudes.shape != latitudes.shape or longitudes.ndim != 1: - raise ValueError("lon and lat need to be 1D arrays of the same shape") - - geod = pyproj.Geod(ellps="WGS84") - - n_points = longitudes.shape[0] - - geodistance = np.zeros([n_points, n_points]) - - # calculate only the upper right half of the triangle first - for i in range(n_points): - - # need to duplicate gridpoint (required by geod.inv) - lat = np.tile(latitudes[i], n_points - (i + 1)) - lon = np.tile(longitudes[i], n_points - (i + 1)) - - geodistance[i, i + 1 :] = geod.inv( - lon, lat, longitudes.values[i + 1 :], latitudes.values[i + 1 :] - )[2] - - # convert m to km - geodistance /= 1000 - - # fill the lower left half of the triangle (in-place) - geodistance += np.transpose(geodistance) - - if latitudes.dims != longitudes.dims: - raise AssertionError( - f"latitudes 
-            f"latitudes and longitudes have different dims: {latitudes.dims} vs. {longitudes.dims}"
-        )
-
-    geodistance = xr.DataArray(
-        geodistance, dims=list(latitudes.dims) * 2, coords=latitudes.coords
-    )
-
-    return geodistance
-
-
-def calculate_gaspari_cohn_values(inputs):
-    """
-    Calculate smooth, exponentially decaying Gaspari-Cohn values
-
-    Parameters
-    ----------
-    inputs : :obj:`xr.DataArray`
-        Inputs at which to calculate the value of the smooth, exponentially decaying Gaspari-Cohn
-        correlation function (these could be e.g. normalised geographical distances)
-
-    Returns
-    -------
-    :obj:`xr.DataArray`
-        Gaspari-Cohn correlation function applied to each point in ``inputs``
-    """
-    inputs_abs = abs(inputs)
-    out = np.zeros_like(inputs)
-
-    sel_zero_to_one = (inputs_abs.values >= 0) & (inputs_abs.values < 1)
-    r_s = inputs_abs.values[sel_zero_to_one]
-    out[sel_zero_to_one] = (
-        1 - 5 / 3 * r_s**2 + 5 / 8 * r_s**3 + 1 / 2 * r_s**4 - 1 / 4 * r_s**5
-    )
-
-    sel_one_to_two = (inputs_abs.values >= 1) & (inputs_abs.values < 2)
-    r_s = inputs_abs.values[sel_one_to_two]
-
-    out[sel_one_to_two] = (
-        4
-        - 5 * r_s
-        + 5 / 3 * r_s**2
-        + 5 / 8 * r_s**3
-        - 1 / 2 * r_s**4
-        + 1 / 12 * r_s**5
-        - 2 / (3 * r_s)
-    )
-
-    out = xr.DataArray(out, dims=inputs.dims, coords=inputs.coords)
-
-    return out
diff --git a/tests/integration/test_prototype.py b/tests/integration/test_prototype.py
index e23b3418..efef96a5 100644
--- a/tests/integration/test_prototype.py
+++ b/tests/integration/test_prototype.py
@@ -7,13 +7,16 @@
 from mesmer.calibrate_mesmer.train_gv import train_gv
 from mesmer.calibrate_mesmer.train_lt import train_lt
 from mesmer.calibrate_mesmer.train_lv import train_lv
+from mesmer.core.computation import (
+    calc_gaspari_cohn_correlation_matrices,
+    calc_geodist_exact,
+)
 from mesmer.prototype.calibrate import LinearRegression
 from mesmer.prototype.calibrate_multiple import (
     calibrate_auto_regressive_process_multiple_scenarios_and_ensemble_members,
     calibrate_auto_regressive_process_with_spatially_correlated_errors_multiple_scenarios_and_ensemble_members,
     flatten_predictors_and_target,
 )
-from mesmer.prototype.utils import calculate_gaspari_cohn_correlation_matrices


 class _MockConfig:
@@ -325,11 +328,13 @@ def _do_legacy_run_train_lv(
         .values
     )

-    gaspari_cohn_correlation_matrices = calculate_gaspari_cohn_correlation_matrices(
-        latitudes=esm_tas_residual_local_variability.lat,
-        longitudes=esm_tas_residual_local_variability.lon,
-        localisation_radii=localisation_radii,
+    geodist = calc_geodist_exact(
+        esm_tas_residual_local_variability.lon, esm_tas_residual_local_variability.lat
+    )
+    gaspari_cohn_correlation_matrices = calc_gaspari_cohn_correlation_matrices(
+        geodist, localisation_radii
     )
+
     gaspari_cohn_correlation_matrices = {
         k: v.values for k, v in gaspari_cohn_correlation_matrices.items()
     }
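For reference, the Gaspari-Cohn correlation function implemented by the removed utils module (and imported from mesmer.core.computation in the patch above) is a compactly supported, piecewise fifth-order polynomial of the distance normalised by the localisation radius: it is 1 at zero distance, decays smoothly, and is exactly 0 once the normalised distance reaches 2. A minimal numpy-only sketch of that same formula, kept outside the patch series and using a hypothetical helper name, is:

import numpy as np

def gaspari_cohn_sketch(r):
    # r: distances already normalised by the localisation radius; same
    # piecewise polynomial as the removed calculate_gaspari_cohn_values
    r = np.abs(np.asarray(r, dtype=float))
    out = np.zeros_like(r)  # zero correlation for r >= 2 (compact support)

    near = r < 1
    s = r[near]
    out[near] = 1 - 5 / 3 * s**2 + 5 / 8 * s**3 + 1 / 2 * s**4 - 1 / 4 * s**5

    mid = (r >= 1) & (r < 2)
    s = r[mid]
    out[mid] = (
        4 - 5 * s + 5 / 3 * s**2 + 5 / 8 * s**3
        - 1 / 2 * s**4 + 1 / 12 * s**5 - 2 / (3 * s)
    )
    return out

print(gaspari_cohn_sketch([0.0, 0.5, 1.0, 1.5, 2.5]))  # decays from 1.0 to 0.0

The compact support is what localises the empirical covariance matrices; the removed docstring simply warns that very large localisation radii can trigger a positive-semidefinite error downstream.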
From 26f67619c37d2f4a4435151ab7766c3232919001 Mon Sep 17 00:00:00 2001
From: Mathias Hauser
Date: Mon, 25 Sep 2023 12:16:39 +0200
Subject: [PATCH 21/23] allow selected ar order to be None

---
 mesmer/prototype/calibrate_multiple.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/mesmer/prototype/calibrate_multiple.py b/mesmer/prototype/calibrate_multiple.py
index a6d847e1..3a6babd3 100644
--- a/mesmer/prototype/calibrate_multiple.py
+++ b/mesmer/prototype/calibrate_multiple.py
@@ -117,7 +117,10 @@ def _select_auto_regressive_process_order(
         orders = AutoRegression1DOrderSelection().calibrate(
             values, maxlag=maxlag, ic=ic
         )
-        keep_order = 0 if orders is None else orders[-1]
+
+        # orders can be None
+        keep_order = np.nan if orders is None else orders[-1]
+
         store.append(
             {
                 "scenario": scenario,

From d3eb99d59630335ba78fa5a1e2b4cbe85d712d9d Mon Sep 17 00:00:00 2001
From: Mathias Hauser
Date: Tue, 12 Dec 2023 16:27:39 +0100
Subject: [PATCH 22/23] fix for gaspari_cohn and geodist_exact

---
 mesmer/prototype/calibrate_multiple.py | 10 ++++------
 tests/integration/test_prototype.py    |  9 +++------
 2 files changed, 7 insertions(+), 12 deletions(-)

diff --git a/mesmer/prototype/calibrate_multiple.py b/mesmer/prototype/calibrate_multiple.py
index 3a6babd3..4b12f6b6 100644
--- a/mesmer/prototype/calibrate_multiple.py
+++ b/mesmer/prototype/calibrate_multiple.py
@@ -3,10 +3,8 @@
 import scipy.stats
 import xarray as xr

-from ..core.computation import (
-    calc_gaspari_cohn_correlation_matrices,
-    calc_geodist_exact,
-)
+import mesmer
+
 from .calibrate import AutoRegression1D, AutoRegression1DOrderSelection


@@ -191,8 +189,8 @@ def calibrate_auto_regressive_process_with_spatially_correlated_errors_multiple_
         for gridpoint, gridpoint_vals in target.groupby("gridpoint")
     }

-    geodist = calc_geodist_exact(target.lon, target.lat)
-    gaspari_cohn_correlation_matrices = calc_gaspari_cohn_correlation_matrices(
+    geodist = mesmer.geospatial.geodist_exact(target.lon, target.lat)
+    gaspari_cohn_correlation_matrices = mesmer.stats.gaspari_cohn_correlation_matrices(
         geodist, localisation_radii
     )

diff --git a/tests/integration/test_prototype.py b/tests/integration/test_prototype.py
index efef96a5..49fc6750 100644
--- a/tests/integration/test_prototype.py
+++ b/tests/integration/test_prototype.py
@@ -4,13 +4,10 @@
 import xarray as xr
 from statsmodels.tsa.arima_process import ArmaProcess

+import mesmer
 from mesmer.calibrate_mesmer.train_gv import train_gv
 from mesmer.calibrate_mesmer.train_lt import train_lt
 from mesmer.calibrate_mesmer.train_lv import train_lv
-from mesmer.core.computation import (
-    calc_gaspari_cohn_correlation_matrices,
-    calc_geodist_exact,
-)
 from mesmer.prototype.calibrate import LinearRegression
 from mesmer.prototype.calibrate_multiple import (
     calibrate_auto_regressive_process_multiple_scenarios_and_ensemble_members,
     calibrate_auto_regressive_process_with_spatially_correlated_errors_multiple_scenarios_and_ensemble_members,
     flatten_predictors_and_target,
 )
@@ -325,10 +325,10 @@ def _do_legacy_run_train_lv(
         .values
     )

-    geodist = calc_geodist_exact(
+    geodist = mesmer.geospatial.geodist_exact(
         esm_tas_residual_local_variability.lon, esm_tas_residual_local_variability.lat
     )
-    gaspari_cohn_correlation_matrices = calc_gaspari_cohn_correlation_matrices(
+    gaspari_cohn_correlation_matrices = mesmer.stats.gaspari_cohn_correlation_matrices(
         geodist, localisation_radii
     )
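Patch 22 above points the prototype at the public helpers instead of mesmer.core.computation. A small usage sketch of those two calls as they appear in the diff; the lon/lat values and radii below are made up for illustration, and the snippet assumes a mesmer version in which geodist_exact and gaspari_cohn_correlation_matrices are exposed under mesmer.geospatial and mesmer.stats as shown above:

import numpy as np
import xarray as xr

import mesmer

# three gridpoints with 1D lon/lat, mirroring the prototype's target.lon / target.lat
lon = xr.DataArray([0.0, 10.0, 20.0], dims="gridpoint")
lat = xr.DataArray([0.0, 5.0, -5.0], dims="gridpoint")

# pairwise great-circle distances, then one correlation matrix per localisation radius
geodist = mesmer.geospatial.geodist_exact(lon, lat)
localisation_radii = np.arange(700, 2000, 1000)
gaspari_cohn_correlation_matrices = mesmer.stats.gaspari_cohn_correlation_matrices(
    geodist, localisation_radii
)

for radius, corr in gaspari_cohn_correlation_matrices.items():
    print(radius, corr.shape)  # each entry is a (gridpoint x gridpoint) matrix

The legacy-comparison test only needs the raw arrays, which is why the diff unpacks the returned mapping with k: v.values.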
From 8f19cd5ac91b094e8a6ec12b2975a948db13eb44 Mon Sep 17 00:00:00 2001
From: Mathias Hauser
Date: Tue, 12 Dec 2023 17:33:36 +0100
Subject: [PATCH 23/23] small refactor

---
 tests/integration/test_prototype.py | 94 +++++++++++++----------------
 1 file changed, 42 insertions(+), 52 deletions(-)

diff --git a/tests/integration/test_prototype.py b/tests/integration/test_prototype.py
index 49fc6750..123de595 100644
--- a/tests/integration/test_prototype.py
+++ b/tests/integration/test_prototype.py
@@ -1,5 +1,4 @@
 import numpy as np
-import numpy.testing as npt
 import pytest
 import xarray as xr
 from statsmodels.tsa.arima_process import ArmaProcess
@@ -196,7 +195,7 @@ def test_prototype_train_lt():
         ("global_variability", res_legacy[1]["coef_gvtas"]["tas"]),
         ("intercept", res_legacy[0]["intercept"]["tas"]),
     ):
-        npt.assert_allclose(res_updated.sel(predictor=updated_name), legacy_vals)
+        np.testing.assert_allclose(res_updated.sel(predictor=updated_name), legacy_vals)


 def _do_legacy_run_train_gv(
@@ -233,9 +232,9 @@ def _do_legacy_run_train_gv(
     ),
 )
 def test_prototype_train_gv(ar):
-    time_history = range(1850, 2014 + 1)
-    time_scenario = range(2015, 2100 + 1)
-    time = list(time_history) + list(time_scenario)
+    time_history = np.arange(1850, 2014 + 1)
+    time_scenario = np.arange(2015, 2100 + 1)
+    time = np.concatenate((time_history, time_scenario))

     magnitude = np.array([0.1])

@@ -247,47 +246,38 @@ def test_prototype_train_gv(ar):
         scenario=scenarios,
         ensemble_member=["r1i1p1f1", "r2i1p1f1"],
     )
-    esm_tas_global_variability = xr.DataArray(
-        np.array(
+
+    def _get_history_sample():
+        return np.concatenate(
             [
-                [
-                    np.concatenate(
-                        [
-                            ArmaProcess(ar, magnitude).generate_sample(
-                                nsample=len(time_history)
-                            ),
-                            np.nan * np.zeros(len(time_scenario)),
-                        ]
-                    ),
-                    np.concatenate(
-                        [
-                            ArmaProcess(ar, magnitude).generate_sample(
-                                nsample=len(time_history)
-                            ),
-                            np.nan * np.zeros(len(time_scenario)),
-                        ]
-                    ),
-                ],
-                [
-                    np.concatenate(
-                        [
-                            np.nan * np.zeros(len(time_history)),
-                            ArmaProcess(ar, magnitude).generate_sample(
-                                nsample=len(time_scenario)
-                            ),
-                        ]
-                    ),
-                    np.concatenate(
-                        [
-                            np.nan * np.zeros(len(time_history)),
-                            ArmaProcess(ar, magnitude).generate_sample(
-                                nsample=len(time_scenario)
-                            ),
-                        ]
-                    ),
-                ],
+                ArmaProcess(ar, magnitude).generate_sample(nsample=time_history.size),
+                np.full(time_scenario.size, np.nan),
             ]
-        ),
+        )
+
+    def _get_scenario_sample():
+        return np.concatenate(
+            [
+                np.full(time_history.size, np.nan),
+                ArmaProcess(ar, magnitude).generate_sample(nsample=len(time_scenario)),
+            ]
+        )
+
+    data = np.array(
+        [
+            [
+                _get_history_sample(),
+                _get_history_sample(),
+            ],
+            [
+                _get_scenario_sample(),
+                _get_scenario_sample(),
+            ],
+        ],
+    )
+
+    esm_tas_global_variability = xr.DataArray(
+        data,
         dims=targ_dims,
         coords=targ_coords,
     )
@@ -309,7 +299,7 @@ def test_prototype_train_gv(ar):
         ("lag_coefficients", res_legacy["AR_coefs"]),
         ("standard_innovations", res_legacy["AR_std_innovs"]),
     ):
-        npt.assert_allclose(res_updated[key], comparison)
+        np.testing.assert_allclose(res_updated[key], comparison)


 def _do_legacy_run_train_lv(
@@ -371,9 +361,9 @@ def test_prototype_train_lv():
     localisation_radii = np.arange(700, 2000, 1000)

     # see how much code I can reuse for the AR1 calibration
-    time_history = range(1850, 2014 + 1)
-    time_scenario = range(2015, 2100 + 1)
-    time = list(time_history) + list(time_scenario)
+    time_history = np.arange(1850, 2014 + 1)
+    time_scenario = np.arange(2015, 2100 + 1)
+    time = np.concatenate((time_history, time_scenario))

     scenarios = ["hist", "ssp126"]
     # we wouldn't actually start like this, but we'd write a utils function
@@ -394,15 +384,15 @@ def test_prototype_train_lv():
     def _get_history_sample():
         return np.concatenate(
             [
-                ArmaProcess(ar, magnitude).generate_sample(nsample=len(time_history)),
-                np.nan * np.zeros(len(time_scenario)),
+                ArmaProcess(ar, magnitude).generate_sample(nsample=time_history.size),
+                np.full(time_scenario.size, np.nan),
             ]
         )

     def _get_scenario_sample():
         return np.concatenate(
             [
-                np.nan * np.zeros(len(time_history)),
+                np.full(time_history.size, np.nan),
                 ArmaProcess(ar, magnitude).generate_sample(nsample=len(time_scenario)),
             ]
         )
@@ -448,7 +438,7 @@ def _get_scenario_sample():
     )

     # check localised_empirical_covariance_matrix_with_ar1_errors
-    npt.assert_allclose(res_updated, res_legacy["loc_ecov_AR1_innovs"]["tas"])
+    np.testing.assert_allclose(res_updated, res_legacy["loc_ecov_AR1_innovs"]["tas"])


 # things that aren't tested well:
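The small refactor above swaps np.nan * np.zeros(len(...)) for np.full(..., np.nan) and len(x) for x.size when building the synthetic samples. A quick, standalone numpy check (nothing mesmer-specific) that the two spellings produce the same thing:

import numpy as np

time_scenario = np.arange(2015, 2100 + 1)

old_style = np.nan * np.zeros(len(time_scenario))  # spelling removed by the patch
new_style = np.full(time_scenario.size, np.nan)    # spelling introduced by the patch

assert old_style.shape == new_style.shape
assert np.isnan(old_style).all() and np.isnan(new_style).all()
assert len(time_scenario) == time_scenario.size

np.full states the intent (an all-NaN array of a given size) directly rather than relying on NaN propagation through multiplication, which is presumably why the refactor prefers it.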