From 39a1fa2c8801a2caa2125017119a8b6a796b774a Mon Sep 17 00:00:00 2001 From: lbvienna Date: Fri, 25 Aug 2023 19:02:47 +0200 Subject: [PATCH 1/9] ran linter --- src/elexmodel/handlers/data/CombinedData.py | 9 +- src/elexmodel/handlers/data/Featurizer.py | 215 +++++++------ src/elexmodel/models/BaseElectionModel.py | 66 ++-- tests/handlers/test_featurizer.py | 340 +++++++++++++------- 4 files changed, 380 insertions(+), 250 deletions(-) diff --git a/src/elexmodel/handlers/data/CombinedData.py b/src/elexmodel/handlers/data/CombinedData.py index 4cc2ab56..41bce8f3 100644 --- a/src/elexmodel/handlers/data/CombinedData.py +++ b/src/elexmodel/handlers/data/CombinedData.py @@ -54,7 +54,8 @@ def get_reporting_units(self, percent_reporting_threshold, features_to_normalize reporting_units[f"results_{estimand}"] - reporting_units[f"last_election_results_{estimand}"] ) / reporting_units[f"last_election_results_{estimand}"] - reporting_units["reporting"] = 1 + reporting_units["reporting"] = int(1) + reporting_units["expected"] = True return reporting_units @@ -68,7 +69,8 @@ def get_nonreporting_units(self, percent_reporting_threshold, features_to_normal drop=True ) - nonreporting_units["reporting"] = 0 + nonreporting_units["reporting"] = int(0) + nonreporting_units["expected"] = True return nonreporting_units @@ -121,7 +123,8 @@ def get_unexpected_units(self, percent_reporting_threshold, aggregates): self._get_district_from_geographic_unit_fips ) - unexpected_units["reporting"] = 1 + unexpected_units["reporting"] = int(1) + unexpected_units["expected"] = False return unexpected_units diff --git a/src/elexmodel/handlers/data/Featurizer.py b/src/elexmodel/handlers/data/Featurizer.py index f5a9a5b5..fd8ddd11 100644 --- a/src/elexmodel/handlers/data/Featurizer.py +++ b/src/elexmodel/handlers/data/Featurizer.py @@ -7,11 +7,13 @@ class Featurizer: Featurizer. 
Normalizes features, add intercept, expands fixed effects """ - def __init__(self, features, fixed_effects): + def __init__(self, features: list, fixed_effects: list): self.features = features + # fixed effects can be a list, in which case every value of a fixed effect gets its own column if isinstance(fixed_effects, list): self.fixed_effect_cols = fixed_effects self.fixed_effect_params = {fe: ["all"] for fe in fixed_effects} + # fixed effects can be a dictionary from fixed effect to values that get their own column (or the string all, if we want all values) else: self.fixed_effect_cols = list(fixed_effects.keys()) self.fixed_effect_params = {} @@ -21,132 +23,147 @@ def __init__(self, features, fixed_effects): else: self.fixed_effect_params[fe] = params + # we differentiate between expanded fixed effects and active fixed effect values + # expanded fixed effects are those fixed effect values that appear in any part of the + # data (fitting or heldout) excluding those that have been dropped to avoid + # multicollinearity when fitting. self.expanded_fixed_effects = [] - self.complete_features = None - self.column_means = None - - def compute_means_for_centering(self, *arg): - """ - Computes and saves the column mean of pandas dataframe passed as args. - This is used for centering. - """ - data = pd.concat(arg) - self.column_means = data[self.features].mean() - - def _center_features(self, df): - """ - Centers the features. This changes the interpretation of the intercept coefficient - from conditional mean given covariates = 0, to conditional mean given covariates are - their average value - """ - df[self.features] = df[self.features] - self.column_means + # complete features are features + expanded fixed effects + self.complete_features = [] - def _add_intercept(self, df): - df["intercept"] = 1 + # active fixed effects are those that appear in the fitting data (ie.
ones for which + # the model fitting computes a coefficient) but excluding those that we drop manually + # to avoid multicollinearity when fitting + self.active_fixed_effects = [] + # active features are features + active fixed effects + self.active_features = [] - def _expand_fixed_effects(self, df: pd.DataFrame, drop_first: bool) -> pd.DataFrame: + def _expand_fixed_effects(self, df: pd.DataFrame) -> pd.DataFrame: """ Convert fixed effect columns into dummy variables. """ + df = df.copy() + # we want to keep the original fixed effect columns since we may need them later for aggregation (ie. county fixed effect) original_fixed_effect_columns = df[self.fixed_effect_cols] - # set non-included values to other as needed - fe_df = df.copy() + # set non-included values to 'other' as needed since we don't want their values to get a dummy variable for fe, params in self.fixed_effect_params.items(): if "all" not in params: - fe_df[fe] = np.where(~fe_df[fe].isin(params), "other", fe_df[fe]) + df[fe] = np.where(~df[fe].isin(params), "other", df[fe]) expanded_fixed_effects = pd.get_dummies( - fe_df, columns=self.fixed_effect_cols, prefix=self.fixed_effect_cols, prefix_sep="_", dtype=np.int64 + df, columns=self.fixed_effect_cols, prefix=self.fixed_effect_cols, prefix_sep="_", dtype=np.int64 ) - # drop first column or "other" column if drop_first is true - cols_to_drop = [] - if drop_first: - for fixed_effect in self.fixed_effect_cols: - relevant_cols = [col for col in expanded_fixed_effects.columns if col.startswith(fixed_effect)] - if f"{fixed_effect}_other" in relevant_cols: - cols_to_drop.append(f"{fixed_effect}_other") - else: - cols_to_drop.append(relevant_cols[0]) + return pd.concat([original_fixed_effect_columns, expanded_fixed_effects], axis=1) - # we concatenate the dummy variables with the original fixed effects, since we need the original fixed - # effect columns for aggregation.
- return pd.concat([original_fixed_effect_columns, expanded_fixed_effects.drop(cols_to_drop, axis=1)], axis=1) - - def featurize_fitting_data(self, fitting_data, center_features=True, add_intercept=True): + def _get_categories_for_fe(self, list_: list, fe: str) -> list: """ - Featurize the data that the model is fitted on. - In our case fitting_data is either the reporting_units (when fitting a model for the point predictions) - or training_data (when fitting the model for the prediction intervals) + Return list of fixed effects values for a given fixed effect """ - # make copy of fitting_data, since we do not want to change the original data - new_fitting_data = fitting_data.copy() - self.center_features = center_features - self.add_intercept = add_intercept - - if self.center_features: - self._center_features(new_fitting_data) + return [x for x in list_ if x.startswith(fe)] - self.complete_features = [] - if self.add_intercept: + def prepare_data( + self, df: pd.DataFrame, center_features: bool = True, scale_features: bool = True, add_intercept: bool = True + ) -> pd.DataFrame: + """ + Prepares features. + Adds dummy variables for fixed effects. Also includes centering, scaling continuous covariates and adding intercept. + """ + df = df.copy() # create copy so we can do things to the values + if center_features: + df[self.features] -= df[self.features].mean() + if scale_features: + # this expects there to be some variation in the data, otherwise we are dividing by zero + df[self.features] /= df[self.features].std() + if add_intercept: self.complete_features += ["intercept"] - self._add_intercept(new_fitting_data) + self.active_features += ["intercept"] + df["intercept"] = 1 if len(self.fixed_effect_cols) > 0: - # drop_first is True for fitting_data (e.g. 
reporting_units) since we want to avoid the design matrix with - # expanded fixed effects to be linearly dependent - new_fitting_data = self._expand_fixed_effects(new_fitting_data, drop_first=True) - # we save the expanded fixed effects to be able to add fixed effects that are - # not in the heldout_data (nonreporting_units) as a zero column and to be able - # to specify the order of the expanded fixed effect when fitting the model - self.expanded_fixed_effects = [ + df = self._expand_fixed_effects(df) + + # we save the expanded fixed effects to be able to add a zero column for those + # fixed effect values if they are not in the heldout_data (nonreporting units). + # Also we can use this to guarantee the order of the fixed effect columns + # when fitting the model + all_expanded_fixed_effects = [ x - for x in new_fitting_data.columns + for x in df.columns if x.startswith(tuple([fixed_effect + "_" for fixed_effect in self.fixed_effect_cols])) ] + df_fitting = df[df.reporting.eq(True) & df.expected.eq(True)] + # get the indices of all expanded fixed effects in the fitting data (active fixed effects + the fixed effect we will drop for multicollinearity) + active_fixed_effect_boolean_df = df_fitting[all_expanded_fixed_effects].sum(axis=0) > 0 + # get the names of those fixed effects, since we will want to know which fixed effect was dropped + all_active_fixed_effects = np.asarray(all_expanded_fixed_effects)[active_fixed_effect_boolean_df] + + # if we add an intercept we need to drop a value/column per fixed effect in order to avoid multicollinearity.
+ # the intercept column is now a stand-in for the the dropped fixed effect value/column + if add_intercept: + active_fixed_effects = ( + [] + ) # fixed effects that exist in the fitting_data (excluding one dropped column to avoid multicolinearity) + intercept_column = ( + [] + ) # we need to save the fixed effect categories that the intercept is now standing in for + # we want to drop one value/column per fixed effect to avoid multicolinearity + for fe in self.fixed_effect_cols: + # grab the potentially active fixed effect names for this fixed effect + fe_fixed_effect_filter = self._get_categories_for_fe(all_active_fixed_effects, fe) + # drop the first potentially active fixed effect + active_fixed_effects.extend(fe_fixed_effect_filter[1:]) + # save the name of the fixed effect that we dropped + intercept_column.append(fe_fixed_effect_filter[0]) + + self.active_fixed_effects = active_fixed_effects + self.intercept_column = intercept_column + # expanded fixed effects do not include the ones that we dropped to avoid multicolinearity + self.expanded_fixed_effects = [x for x in all_expanded_fixed_effects if x not in intercept_column] + else: + self.active_fixed_effects = all_active_fixed_effects + self.expanded_fixed_effects = all_expanded_fixed_effects + # all features that the model will be fit on + # these are all the features + the expanded fixed effects (so all fixed effect values in the complete data excluding the ones dropped for multicolinearity) self.complete_features += self.features + self.expanded_fixed_effects + self.active_features += self.features + self.active_fixed_effects + df = df[self.complete_features] - return new_fitting_data[self.complete_features] + return df - def featurize_heldout_data(self, heldout_data): + def filter_to_active_features(self, df: pd.DataFrame) -> pd.DataFrame: """ - Featurize the data that the model will be applied on. 
- In our case the heldout_data is either the nonreporting_units - (when applying the model for the point predictions) - or conformalization_data/nonreporting_units - (when applying the model for the prediction intervals) + Get active features (ie. features + active fixed effects) """ - new_heldout_data = heldout_data.copy() + return df[self.active_features] - if self.center_features: - self._center_features(new_heldout_data) - - if self.add_intercept: - self._add_intercept(new_heldout_data) - - if len(self.fixed_effect_cols) > 0: - missing_expanded_fixed_effects = [] - new_heldout_data = self._expand_fixed_effects(new_heldout_data, drop_first=False) - # if all units from one fixed effect are reporting they will not appear in the heldout_data - # (e.g. nonreporting_units) and won't get a column when we expand the fixed effects - # on that dataframe. Therefore we add those columns with zero fixed effects manually. - # As an example, if we are running a county model using state fixed effects, and - # all of Delaware's counties are reporting, then no Delaware county will be in - # heldout_data (nonreporting_units), as a result there will be no column for Delaware - # in the expanded fixed effects of heldout_data (nonreporting_units). - for expanded_fixed_effect in self.expanded_fixed_effects: - if expanded_fixed_effect not in new_heldout_data.columns: - missing_expanded_fixed_effects.append(expanded_fixed_effect) - - missing_expanded_fixed_effects_df = pd.DataFrame( - np.zeros((new_heldout_data.shape[0], len(missing_expanded_fixed_effects))), - columns=missing_expanded_fixed_effects, - ) - # if we use this method to add the missing expanded fixed effects because doing it manually - # ie. new_heldout_data[expanded_fixed_effect] = 0 - # can throw a fragmentation warning when there are many missing fixed effects. 
- new_heldout_data = new_heldout_data.join(missing_expanded_fixed_effects_df) - - return new_heldout_data[self.complete_features] + def generate_holdout_data(self, df: pd.DataFrame) -> pd.DataFrame: + """ + Generate fixed effects for the holdout data (ie. data that we will predict on) + """ + df = df.copy() + + # if a unit has an inactive fixed effect value for some fixed effect category we need + # to insert 1 / (number of fixed effect values) into each active fixed effect value for that unit + # if we were to leave them as zero, then the model would apply the dropped fixed effect + # value coefficient (since this is now what the intercept stands in for) + # instead we want to apply all active fixed effect coefficients equally + + # get inactive fixed effects (ie expanded fixed effects that are not active) + # these are fixed effects that exist only in the holdout set (ie. we do not have a covariate for them) + inactive_fixed_effects = [x for x in self.expanded_fixed_effects if x not in self.active_fixed_effects] + for fe in self.fixed_effect_cols: + # active fixed effect values for this fixed effect + fe_active_fixed_effects = self._get_categories_for_fe(self.active_fixed_effects, fe) + # inactive fixed effect values for this fixed effect + fe_inactive_fixed_effects = self._get_categories_for_fe(inactive_fixed_effects, fe) + # get rows that have an inactive fixed effect + rows_w_inactive_fixed_effects = df[fe_inactive_fixed_effects].sum(axis=1) > 0 + + # set the values for active fixed effect in rows that have inactive fixed effect to be 1 / (n + 1) + # rows that have an inactive fixed effect value need to receive the average treatment of the fixed effects + # NOTE: aren't we now applying 1 * the dropped fixed effect and 1 / (n + 1) times the other fixed effects?
+ df.loc[rows_w_inactive_fixed_effects, fe_active_fixed_effects] = 1 / (len(fe_active_fixed_effects) + 1) + return self.filter_to_active_features(df) diff --git a/src/elexmodel/models/BaseElectionModel.py b/src/elexmodel/models/BaseElectionModel.py index f985dd78..967a1f28 100644 --- a/src/elexmodel/models/BaseElectionModel.py +++ b/src/elexmodel/models/BaseElectionModel.py @@ -5,6 +5,7 @@ import cvxpy import numpy as np +import pandas as pd from elexsolver.QuantileRegressionSolver import QuantileRegressionSolver from elexmodel.handlers.data.Featurizer import Featurizer @@ -23,7 +24,6 @@ def __init__(self, model_settings={}): self.fixed_effects = model_settings.get("fixed_effects", {}) self.lambda_ = model_settings.get("lambda_", 0) self.features_to_coefficients = {} - self.featurizer = Featurizer(self.features, self.fixed_effects) self.add_intercept = True self.seed = 4191 # set arbitrarily @@ -53,26 +53,28 @@ def fit_model(self, model, df_X, df_y, tau, weights, normalize_weights): LOG.warning("Warning: solution was inaccurate or solver broke. Re-running with normalize_weights=False.") model.fit(X, y, tau_value=tau, weights=weights, lambda_=self.lambda_, normalize_weights=False) - def get_unit_predictions(self, reporting_units, nonreporting_units, estimand): + def get_unit_predictions(self, reporting_units, nonreporting_units, estimand, **kwargs): """ Produces unit level predictions. Fits quantile regression to reporting data, applies it to nonreporting data. The features are specified in model_settings. """ - # compute the means of both reporting_units and nonreporting_units for centering (part of featurizing) - # we want them both, since together they are the subunit population - self.featurizer.compute_means_for_centering(reporting_units, nonreporting_units) - # reporting_units_features and nonreporting_units_features should have the same - # features. Specifically also the same fixed effect columns. 
- reporting_units_features = self.featurizer.featurize_fitting_data( - reporting_units, add_intercept=self.add_intercept + n_train = reporting_units.shape[0] + n_test = nonreporting_units.shape[0] + all_units = pd.concat([reporting_units, nonreporting_units], axis=0) + + featurizer = Featurizer(self.features, self.fixed_effects) + x_all = featurizer.prepare_data( + all_units, center_features=True, scale_features=False, add_intercept=self.add_intercept ) - nonreporting_units_features = self.featurizer.featurize_heldout_data(nonreporting_units) + + reporting_units_features = featurizer.filter_to_active_features(x_all[:n_train]) + nonreporting_units_features = featurizer.generate_holdout_data(x_all[n_train : n_train + n_test]) # noqa: E203 weights = reporting_units[f"last_election_results_{estimand}"] reporting_units_residuals = reporting_units[f"residuals_{estimand}"] self.fit_model(self.qr, reporting_units_features, reporting_units_residuals, 0.5, weights, True) - self.features_to_coefficients = dict(zip(self.featurizer.complete_features, self.qr.coefficients)) + self.features_to_coefficients = dict(zip(featurizer.complete_features, self.qr.coefficients)) preds = self.qr.predict(nonreporting_units_features) @@ -200,20 +202,28 @@ def get_unit_prediction_interval_bounds(self, reporting_units, nonreporting_unit # training/conformalization split for each alpha of one run reporting_units_shuffled = reporting_units.sample(frac=1, random_state=self.seed).reset_index(drop=True) + n_reporting_units = reporting_units.shape[0] + upper_bound = (1 + alpha) / 2 lower_bound = (1 - alpha) / 2 train_rows = math.floor(reporting_units.shape[0] * conf_frac) - train_data = reporting_units_shuffled[:train_rows].reset_index(drop=True) + train_data = reporting_units_shuffled[:train_rows] - # specifying self.features extracts the correct columns and makes sure they are in the correct - # order. Necessary when fitting and predicting on the model. 
# the fixed effects in train_data will be a subset of the fixed effect of reporting_units since all - # units from one fixed effect category might be in the conformalization data. Note that we are - # overwritting featurizer.expanded_fixed_effects by doing this (which is what we want), since we - # want the expanded_fixed_effects from train_data to be used by conformalization_data and nonreporting_data - # in this function. - train_data_features = self.featurizer.featurize_fitting_data(train_data, add_intercept=self.add_intercept) + # units from one fixed effect category might be in the conformalization data. + + # we need a new featurizer since otherwise we will continue to add intercepts to the features + interval_featurizer = Featurizer(self.features, self.fixed_effects) + # we need all units since we will apply the upper and lower models to the nonreporting_units also + # so we need to make sure that they have the correct fixed effects + all_units_shuffled = pd.concat([reporting_units_shuffled, nonreporting_units], axis=0) + x_all = interval_featurizer.prepare_data( + all_units_shuffled, center_features=True, scale_features=False, add_intercept=self.add_intercept + ) + # x_all starts with the shuffled reporting units, so the first train_rows are the same as train_data + train_data_features = interval_featurizer.filter_to_active_features(x_all[:train_rows]) + train_data_residuals = train_data[f"residuals_{estimand}"] train_data_weights = train_data[f"last_election_results_{estimand}"] @@ -226,9 +236,11 @@ def get_unit_prediction_interval_bounds(self, reporting_units, nonreporting_unit # apply to conformalization data. Conformalization bounds will later tell us how much to adjust lower/upper # bounds for nonreporting data. 
- conformalization_data = reporting_units_shuffled[train_rows:].reset_index(drop=True) - # conformalization features will be the same as the features in train_data - conformalization_data_features = self.featurizer.featurize_heldout_data(conformalization_data) + conformalization_data = reporting_units_shuffled[train_rows:] + + # x_all starts with reporting_units_shuffled, so the rows between train_rows and n_reporting_units are the + # conformalization set + conformalization_data_features = interval_featurizer.generate_holdout_data(x_all[train_rows:n_reporting_units]) # we are interested in f(X) - r # since later conformity scores care about deviation of bounds from residuals @@ -244,9 +256,10 @@ def get_unit_prediction_interval_bounds(self, reporting_units, nonreporting_unit conformalization_data["lower_bounds"] = conformalization_lower_bounds # apply lower/upper models to nonreporting data - # now the features of the nonreporting_units will be the same as the train_data features - # they might differ slightly from the features used when fitting the median prediction - nonreporting_units_features = self.featurizer.featurize_heldout_data(nonreporting_units) + # since nonreporting_units is the second dataframe in x_all, all units after n_reporting_units are nonreporting + # note: the features used may be different from the median predictions, but this guarantees that the features + # are the same across train_data, conformalization_data and nonreporting_units + nonreporting_units_features = interval_featurizer.generate_holdout_data(x_all[n_reporting_units:]) nonreporting_lower_bounds = lower_qr.predict(nonreporting_units_features) nonreporting_upper_bounds = upper_qr.predict(nonreporting_units_features) @@ -264,3 +277,6 @@ def get_coefficients(self): These coefficients are for the point prediciton only, not for the lower or upper intervals models.
""" return self.features_to_coefficients + + def get_national_summary_estimates(self, nat_sum_data_dict, called_states, base_to_add): + raise NotImplementedError() diff --git a/tests/handlers/test_featurizer.py b/tests/handlers/test_featurizer.py index 55239fa4..991aca48 100644 --- a/tests/handlers/test_featurizer.py +++ b/tests/handlers/test_featurizer.py @@ -7,73 +7,65 @@ from elexmodel.handlers.data.PreprocessedData import PreprocessedDataHandler -def compute_testing_mean_for_centering(): - """ " - Test whether computing the column mean for centering works. - """ - features = ["a", "b", "c"] - featurizer = Featurizer(features, {}) - - # test with one dataframe - df = pd.DataFrame({"a": [1, 1, 1, 1], "b": [2, 2, 2, 2], "c": [3, 3, 3, 3], "d": [1, 2, 3, 4]}) - - featurizer.compute_means_for_centering(df) - - assert featurizer.column_means.equals(pd.Series({"a": 1, "b": 2, "c": 3}, index=["a", "b", "c"], dtype=np.float64)) - - # test with two dataframes - df2 = pd.DataFrame({"a": [2, 2, 2, 2], "b": [3, 3, 3, 3], "c": [4, 4, 4, 4], "d": [1, 2, 3, 4]}) - - featurizer.compute_means_for_centering(df, df2) - assert featurizer.column_means.equals( - pd.Series({"a": 1.5, "b": 2.5, "c": 3.5}, index=["a", "b", "c"], dtype=np.float64) - ) - - def test_centering_features(): """ Test whether centering the features works """ - features = ["a", "b"] + features = ["a", "b", "c", "d"] featurizer = Featurizer(features, {}) - # test with one dataframe - df = pd.DataFrame({"a": [1, 2, 3], "b": [2, 4, 9]}) - - featurizer.compute_means_for_centering(df) - featurizer._center_features(df) - assert df.equals(pd.DataFrame({"a": [-1.0, 0.0, 1.0], "b": [-3.0, -1.0, 4.0]})) + df = pd.DataFrame({"a": [1, 1, 1, 1], "b": [2, 2, 2, 2], "c": [3, 3, np.nan, 3], "d": [1, 2, 3, 4]}) - # confirm that when the function is used properly we would not subtract the - # means twice - df = pd.DataFrame({"a": [1, 2, 3], "b": [2, 4, 9]}) - featurizer.featurize_fitting_data(df, center_features=True) - df2 = 
featurizer.featurize_fitting_data(df, center_features=True) - assert df2.equals(pd.DataFrame({"intercept": [1, 1, 1], "a": [-1.0, 0.0, 1.0], "b": [-3.0, -1.0, 4.0]})) + df_new = featurizer.prepare_data(df, center_features=True, scale_features=False, add_intercept=False) + df_expected_result = pd.DataFrame( + { + "a": [0.0, 0.0, 0.0, 0.0], + "b": [0.0, 0.0, 0.0, 0.0], + "c": [0.0, 0.0, np.nan, 0.0], + "d": [-1.5, -0.5, 0.5, 1.5], + } + ) + pd.testing.assert_frame_equal(df_new, df_expected_result) def test_adding_intercept(): + """ + Test adding intercept + """ features = ["a", "b", "c"] featurizer = Featurizer(features, {}) - # test with one dataframe df = pd.DataFrame({"a": [2, 2, 2, 2], "b": [3, 3, 3, 3], "c": [1, 2, 3, 4]}) - featurizer._add_intercept(df) + df_new = featurizer.prepare_data(df, center_features=False, scale_features=False, add_intercept=True) + + assert "intercept" in df_new.columns + assert "intercept" in featurizer.complete_features + assert "intercept" in featurizer.active_features + pd.testing.assert_series_equal(df_new.intercept, pd.Series([1, 1, 1, 1], name="intercept")) - assert "intercept" in df.columns - assert df.intercept.equals(pd.Series([1, 1, 1, 1])) +def test_scaling_features(): + """ + Test whether scaling features works + """ + features = ["a", "b", "c", "d"] + featurizer = Featurizer(features, {}) -def test_adding_intercept_complex(): - features = ["a", "b", "c"] - featurizer = Featurizer(features, []) - df = pd.DataFrame({"a": [2, 2, 2, 2], "b": [3, 3, 3, 3], "c": [1, 2, 3, 4]}) + # standard deviations here are 0.5, 1, 2 and inf + df = pd.DataFrame({"a": [1, 1, 1, 2], "b": [1, 1, 1, 3], "c": [1, 1, 1, 5], "d": [1, 1, 1, 1]}) - df2 = featurizer.featurize_fitting_data(df, center_features=False, add_intercept=True) - assert featurizer.add_intercept - assert "intercept" in df2.columns - assert "intercept" in featurizer.complete_features + df_new = featurizer.prepare_data(df, center_features=False, scale_features=True, 
add_intercept=False) + + df_expected_result = pd.DataFrame( + { + "a": [2.0, 2.0, 2.0, 4.0], + "b": [1.0, 1.0, 1.0, 3.0], + "c": [0.5, 0.5, 0.5, 2.5], + "d": [np.inf, np.inf, np.inf, np.inf], + } + ) + pd.testing.assert_frame_equal(df_new, df_expected_result) def test_column_names(): @@ -81,75 +73,148 @@ def test_column_names(): This function tests to make sure that the featurizer returns the right columns """ features = ["a", "b", "c"] - fixed_effects = {"fe_a": ["all"]} + fixed_effects = ["fe_a", "fe_b"] featurizer = Featurizer(features, fixed_effects) - df_fitting = pd.DataFrame( + split_fitting_heldout = 4 + # fe_a: "c" exists in fitting but not in heldout, "d" exists in heldout but not in fitting + # fe_b: "x", "7" and "y" exist in fitting but not in heldout, "z", "w" exist in heldout but not in fitting + df = pd.DataFrame( { - "x": [5, 3, 1, 5], - "a": [2, 2, 2, 2], - "b": [3, 3, 3, 3], - "c": [1, 2, 3, 4], - "fe_a": ["a", "a", "b", "c"], - "fe_b": ["1", "x", "7", "y"], + "a": [5, 3, 1, 5, 2, 2, 2, 2], + "b": [2, 2, 2, 2, 3, 3, 3, 3], + "c": [3, 3, 3, 3, 1, 2, 3, 4], + "d": [1, 2, 3, 4, 5, 3, 1, 5], + "fe_a": ["a", "a", "b", "c", "a", "a", "b", "d"], + "fe_b": ["1", "x", "7", "y", "1", "z", "z", "w"], + "reporting": [True, True, True, True, False, False, False, False], + "expected": [True, True, True, True, False, False, False, False], } ) - df_heldout = pd.DataFrame( + + df_new = featurizer.prepare_data(df, center_features=False, scale_features=False, add_intercept=True) + + df_fitting = featurizer.filter_to_active_features(df_new[:split_fitting_heldout]) + df_heldout = featurizer.generate_holdout_data(df_new[split_fitting_heldout:]) + assert (df_fitting.columns == df_heldout.columns).all() + + assert "a" in df_fitting.columns + assert "a" in df_heldout.columns + assert "a" in featurizer.features + assert "a" in featurizer.active_features + assert "a" in featurizer.complete_features + + assert "fe_a" in featurizer.fixed_effect_cols + assert "fe_a" in 
featurizer.fixed_effect_params.keys() + + # a is in fitting and in heldout BUT it's the first and therefore dropped to avoid multicolinearity + assert "fe_a_a" not in featurizer.expanded_fixed_effects + assert "fe_a_a" not in featurizer.active_fixed_effects + assert "fe_a_a" not in featurizer.active_features + assert "fe_a_a" not in featurizer.complete_features + assert "fe_a_a" not in df_fitting.columns + assert "fe_a_a" not in df_heldout.columns + + # b is in fitting and in heldout + assert "fe_a_b" in featurizer.expanded_fixed_effects + assert "fe_a_b" in featurizer.active_fixed_effects + assert "fe_a_b" in featurizer.active_features + assert "fe_a_b" in featurizer.complete_features + assert "fe_a_b" in df_fitting.columns + assert "fe_a_b" in df_heldout.columns + + # c is in fitting but not in heldout + assert "fe_a_c" in featurizer.expanded_fixed_effects + assert "fe_a_c" in featurizer.active_fixed_effects + assert "fe_a_c" in featurizer.active_features + assert "fe_a_c" in featurizer.complete_features + assert "fe_a_c" in df_fitting.columns + assert "fe_a_c" in df_heldout.columns # should still be in heldout since added manually + + # d is not in fitting but in heldout + assert "fe_a_d" in featurizer.expanded_fixed_effects + assert "fe_a_d" not in featurizer.active_fixed_effects + assert "fe_a_d" not in featurizer.active_features + assert "fe_a_d" in featurizer.complete_features + assert "fe_a_d" not in df_fitting.columns + assert "fe_a_d" not in df_heldout.columns + + +def test_generating_heldout_set(): + """ + This test makes sure the heldout set is as expected + """ + features = ["a", "b", "c"] + fixed_effects = ["fe_a", "fe_b"] + featurizer = Featurizer(features, fixed_effects) + + split_fitting_heldout = 4 + # fe_a: "c" exists in fitting but not in heldout, "d" exists in heldout but not in fitting + # fe_b: "x", "7" and "y" exist in fitting but not in heldout, "z", "w" exist in heldout but not in fitting + df = pd.DataFrame( { - "a": [2, 2, 2, 2], - "b": 
[3, 3, 3, 3], - "c": [1, 2, 3, 4], - "d": [5, 3, 1, 5], - "fe_a": ["a", "a", "b", "d"], - "fe_c": ["1", "a", "7", "y"], + "a": [5, 3, 1, 5, 2, 2, 2, 2], + "b": [2, 2, 2, 2, 3, 3, 3, 3], + "c": [3, 3, 3, 3, 1, 2, 3, 4], + "d": [1, 2, 3, 4, 5, 3, 1, 5], + "fe_a": ["a", "a", "b", "c", "a", "a", "b", "d"], + "fe_b": ["1", "x", "7", "y", "1", "z", "z", "w"], + "reporting": [True, True, True, True, False, False, False, False], + "expected": [True, True, True, True, False, False, False, False], } ) - featurizer.compute_means_for_centering(df_fitting, df_heldout) - # since only a, b and c are "features" specified above we would expect - # "x" from df_fitting and "d" from df_heldout to be dropped - # similarly we would expect the same from fe_b (since only fe_a is specified as a fixed effect) - df_fitting_features = featurizer.featurize_fitting_data(df_fitting) - df_heldout_features = featurizer.featurize_heldout_data(df_heldout) - - assert (df_fitting_features.columns == df_heldout_features.columns).all() - - assert "a" in df_fitting_features.columns - assert "b" in df_fitting_features.columns - assert "c" in df_fitting_features.columns - assert "fe_a_a" not in df_fitting_features.columns # since drop_first is true - assert "fe_a_b" in df_fitting_features.columns - assert "fe_a_c" in df_fitting_features.columns - assert "x" not in df_fitting_features.columns # not a feature - assert "fe_b_1" not in df_fitting_features.columns # not a fixed effect - assert "fe_b_x" not in df_fitting_features.columns # not a fixed effect - assert "fe_c_a" not in df_fitting_features.columns # not a fixed effect - - assert "a" in df_heldout_features.columns - assert "b" in df_heldout_features.columns - assert "c" in df_heldout_features.columns - assert "fe_a_a" not in df_heldout_features.columns - # drop_first is False for heldout, but "fe_a_a" is not in - # expanded_fixed_effects because dropped by fitting_data expansion - assert "fe_a_b" in df_heldout_features.columns - assert "fe_a_c" in 
df_heldout_features.columns # this column should have been addded manually - assert "fe_a_d" not in df_heldout_features.columns # not in expanded_fixed_effects since not fitting_datas - assert "fe_b_1" not in df_heldout_features.columns # not a fixed effect - assert "fe_b_x" not in df_heldout_features.columns # not a fixed effect - assert "fe_c_a" not in df_heldout_features.columns # not a fixed effect + df_new = featurizer.prepare_data(df, center_features=False, scale_features=False, add_intercept=True) + + df_heldout = featurizer.generate_holdout_data(df_new[split_fitting_heldout:]) + + "a" in df_heldout.columns + "b" in df_heldout.columns + "c" in df_heldout.columns + "d" not in df_heldout.columns # not specified in features + + "fe_a_a" not in df_heldout.columns # dropped to avoid multicolinearity + "fe_a_b" in df_heldout.columns + "fe_a_c" in df_heldout.columns + "fe_a_d" not in df_heldout.columns # not an active fixed effect + + assert df_heldout.loc[6, "fe_a_b"] == 1 # since row 7 has an inactive fixed effect + assert df_heldout.loc[7, "fe_a_b"] == 1 / 3 # since row 7 has an inactive fixed effect + assert df_heldout.loc[7, "fe_a_c"] == 1 / 3 # since row 7 has an inactive fixed effect + + "fe_b_1" not in df_heldout.columns # dropped to avoid multicolinearity + "fe_b_x" in df_heldout.columns + "fe_b_z" not in df_heldout.columns # inactive + + assert df_heldout.loc[6, "fe_a_b"] == 1 # since row 7 has an inactive fixed effect + assert df_heldout.loc[7, "fe_a_b"] == 1 / 3 # since row 7 has an inactive fixed effect + assert df_heldout.loc[7, "fe_a_c"] == 1 / 3 # since row 7 has an inactive fixed effect + + # element 4 has the dropped fixed effect value in fe_b and so should only have an intercept + assert df_heldout.loc[4, "intercept"] == 1 + assert df_heldout.loc[4, "fe_b_7"] == 0 + assert df_heldout.loc[4, "fe_b_x"] == 0 + assert df_heldout.loc[4, "fe_b_y"] == 0 + + # row 5 has an inactive fixed effect + assert df_heldout.loc[5, "intercept"] == 1 + assert 
df_heldout.loc[5, "fe_b_7"] == 1 / 4 + assert df_heldout.loc[5, "fe_b_x"] == 1 / 4 + assert df_heldout.loc[5, "fe_b_y"] == 1 / 4 def test_expanding_fixed_effects_basic(): fixed_effects = {"c1": ["all"]} featurizer = Featurizer([], fixed_effects) df = pd.DataFrame({"c1": ["a", "b", "b", "c"], "c2": ["w", "x", "y", "z"], "c3": [2, 4, 1, 9]}) - expanded = featurizer._expand_fixed_effects(df, drop_first=True) + expanded = featurizer._expand_fixed_effects(df) pd.testing.assert_frame_equal( expanded.sort_index(axis=1), pd.DataFrame( { "c2": ["w", "x", "y", "z"], "c3": [2, 4, 1, 9], + "c1_a": [1, 0, 0, 0], "c1_b": [0, 1, 1, 0], "c1_c": [0, 0, 0, 1], "c1": ["a", "b", "b", "c"], @@ -158,7 +223,7 @@ def test_expanding_fixed_effects_basic(): ) df = pd.DataFrame({"c1": ["a", "b", "b", "c"], "c2": ["w", "x", "y", "z"], "c3": [2, 4, 1, 9]}) - expanded = featurizer._expand_fixed_effects(df, drop_first=False) + expanded = featurizer._expand_fixed_effects(df) pd.testing.assert_frame_equal( expanded.sort_index(axis=1), pd.DataFrame( @@ -175,14 +240,16 @@ def test_expanding_fixed_effects_basic(): fixed_effects = {"c1": ["all"], "c2": ["all"]} featurizer = Featurizer([], fixed_effects) - expanded = featurizer._expand_fixed_effects(df, drop_first=True) + expanded = featurizer._expand_fixed_effects(df) pd.testing.assert_frame_equal( expanded.sort_index(axis=1), pd.DataFrame( { "c3": [2, 4, 1, 9], + "c1_a": [1, 0, 0, 0], "c1_b": [0, 1, 1, 0], "c1_c": [0, 0, 0, 1], + "c2_w": [1, 0, 0, 0], "c2_x": [0, 1, 0, 0], "c2_y": [0, 0, 1, 0], "c2_z": [0, 0, 0, 1], @@ -197,7 +264,7 @@ def test_expand_fixed_effects_selective(): fixed_effects = {"c1": ["a", "b"]} featurizer = Featurizer([], fixed_effects) df = pd.DataFrame({"c1": ["a", "b", "b", "c"], "c2": ["w", "x", "y", "z"], "c3": [2, 4, 1, 9]}) - expanded = featurizer._expand_fixed_effects(df, drop_first=True) + expanded = featurizer._expand_fixed_effects(df) pd.testing.assert_frame_equal( expanded.sort_index(axis=1), pd.DataFrame( @@ -206,39 
+273,48 @@ def test_expand_fixed_effects_selective(): "c3": [2, 4, 1, 9], "c1_a": [1, 0, 0, 0], "c1_b": [0, 1, 1, 0], + "c1_other": [0, 0, 0, 1], "c1": ["a", "b", "b", "c"], } ).sort_index(axis=1), ) - expanded = featurizer._expand_fixed_effects(df, drop_first=False) + fixed_effects = {"c1": ["a"], "c2": ["w", "x"]} + featurizer = Featurizer([], fixed_effects) + expanded = featurizer._expand_fixed_effects(df) pd.testing.assert_frame_equal( expanded.sort_index(axis=1), pd.DataFrame( { + "c1": ["a", "b", "b", "c"], "c2": ["w", "x", "y", "z"], "c3": [2, 4, 1, 9], "c1_a": [1, 0, 0, 0], - "c1_b": [0, 1, 1, 0], - "c1_other": [0, 0, 0, 1], - "c1": ["a", "b", "b", "c"], + "c1_other": [0, 1, 1, 1], + "c2_other": [0, 0, 1, 1], + "c2_w": [1, 0, 0, 0], + "c2_x": [0, 1, 0, 0], } ).sort_index(axis=1), ) - fixed_effects = {"c1": ["a"], "c2": ["w", "x"]} + fixed_effects = {"c1": ["all"], "c2": ["w", "x"]} featurizer = Featurizer([], fixed_effects) - expanded = featurizer._expand_fixed_effects(df, drop_first=True) + expanded = featurizer._expand_fixed_effects(df) + pd.testing.assert_frame_equal( expanded.sort_index(axis=1), pd.DataFrame( { + "c1": ["a", "b", "b", "c"], + "c2": ["w", "x", "y", "z"], "c3": [2, 4, 1, 9], "c1_a": [1, 0, 0, 0], + "c1_b": [0, 1, 1, 0], + "c1_c": [0, 0, 0, 1], + "c2_other": [0, 0, 1, 1], "c2_w": [1, 0, 0, 0], "c2_x": [0, 1, 0, 0], - "c1": ["a", "b", "b", "c"], - "c2": ["w", "x", "y", "z"], } ).sort_index(axis=1), ) @@ -276,10 +352,14 @@ def test_generate_fixed_effects(va_governor_county_data): nonreporting_data = combined_data_handler.get_nonreporting_units(99) featurizer = Featurizer([], {"county_classification": "all"}) - featurizer.compute_means_for_centering(reporting_data, nonreporting_data) - reporting_data_features = featurizer.featurize_fitting_data(reporting_data) - nonreporting_data_features = featurizer.featurize_heldout_data(nonreporting_data) + n_train = reporting_data.shape[0] + all_units = pd.concat([reporting_data, nonreporting_data], 
axis=0) + + x_all = featurizer.prepare_data(all_units, center_features=False, scale_features=False, add_intercept=True) + + reporting_data_features = featurizer.filter_to_active_features(x_all[:n_train]) + nonreporting_data_features = featurizer.generate_holdout_data(x_all[n_train:]) assert combined_data_handler.data.shape == (133, 32) @@ -292,6 +372,7 @@ def test_generate_fixed_effects(va_governor_county_data): assert "county_classification" in featurizer.fixed_effect_cols assert len(featurizer.expanded_fixed_effects) == 5 # 6 - 1 + assert len(featurizer.active_fixed_effects) == 5 combined_data_handler = CombinedDataHandler( va_governor_county_data, @@ -302,13 +383,17 @@ def test_generate_fixed_effects(va_governor_county_data): ) featurizer = Featurizer([], {"county_classification": ["all"], "county_fips": ["all"]}) - featurizer.compute_means_for_centering(reporting_data, nonreporting_data) reporting_data = combined_data_handler.get_reporting_units(99) nonreporting_data = combined_data_handler.get_nonreporting_units(99) - reporting_data_features = featurizer.featurize_fitting_data(reporting_data) - nonreporting_data_features = featurizer.featurize_heldout_data(nonreporting_data) + n_train = reporting_data.shape[0] + all_units = pd.concat([reporting_data, nonreporting_data], axis=0) + + x_all = featurizer.prepare_data(all_units, center_features=False, scale_features=False, add_intercept=True) + + reporting_data_features = featurizer.filter_to_active_features(x_all[:n_train]) + nonreporting_data_features = featurizer.generate_holdout_data(x_all[n_train:]) assert combined_data_handler.data.shape == (133, 32) @@ -359,10 +444,13 @@ def test_generate_fixed_effects_not_all_reporting(va_governor_county_data): nonreporting_data = combined_data_handler.get_nonreporting_units(99) featurizer = Featurizer([], {"county_fips": ["all"]}) - featurizer.compute_means_for_centering(reporting_data, nonreporting_data) + n_train = reporting_data.shape[0] + all_units = 
pd.concat([reporting_data, nonreporting_data], axis=0) - reporting_data_features = featurizer.featurize_fitting_data(reporting_data) - nonreporting_data_features = featurizer.featurize_heldout_data(nonreporting_data) + x_all = featurizer.prepare_data(all_units, center_features=False, scale_features=False, add_intercept=True) + + reporting_data_features = featurizer.filter_to_active_features(x_all[:n_train]) + nonreporting_data_features = featurizer.generate_holdout_data(x_all[n_train:]) assert combined_data_handler.data.shape == (133, 32) @@ -384,7 +472,8 @@ def test_generate_fixed_effects_not_all_reporting(va_governor_county_data): ) # not in here because not in featurizer.complete_features assert "county_fips" in featurizer.fixed_effect_cols - assert len(featurizer.expanded_fixed_effects) == n - 1 + assert len(featurizer.expanded_fixed_effects) == 133 - 1 + assert len(featurizer.active_fixed_effects) == n - 1 assert not reporting_data_features["county_fips_51009"].isnull().any() @@ -421,16 +510,21 @@ def test_generate_fixed_effects_mixed_reporting(va_governor_precinct_data): nonreporting_data = combined_data_handler.get_nonreporting_units(99) featurizer = Featurizer([], ["county_fips"]) - featurizer.compute_means_for_centering(reporting_data, nonreporting_data) - reporting_data_features = featurizer.featurize_fitting_data(reporting_data) - nonreporting_data_features = featurizer.featurize_heldout_data(nonreporting_data) + n_train = reporting_data.shape[0] + n_test = nonreporting_data.shape[0] + all_units = pd.concat([reporting_data, nonreporting_data], axis=0) + + x_all = featurizer.prepare_data(all_units, center_features=False, scale_features=False, add_intercept=True) + + reporting_data_features = featurizer.filter_to_active_features(x_all[:n_train]) + nonreporting_data_features = featurizer.generate_holdout_data(x_all[n_train:]) assert combined_data_handler.data.shape == (2360, 32) n_expected_columns = 7 # when n = 100 we get to county 51013 (minus dropped 
fixed effect, plus intercept) - assert reporting_data_features.shape == (n, n_expected_columns) - assert nonreporting_data_features.shape == (2360 - n, n_expected_columns) + assert reporting_data_features.shape == (n_train, n_expected_columns) # use n_train since dropping columns + assert nonreporting_data_features.shape == (n_test, n_expected_columns) assert "county_fips_51001" not in reporting_data_features.columns # dropped from get_dummies because first assert "county_fips_51001" not in nonreporting_data_features.columns # therefore not added manually @@ -451,4 +545,4 @@ def test_generate_fixed_effects_mixed_reporting(va_governor_precinct_data): ) # not in here because not in featurizer.complete_features assert "county_fips" in featurizer.fixed_effect_cols - assert len(featurizer.expanded_fixed_effects) == 7 - 1 + assert len(featurizer.expanded_fixed_effects) == 133 - 1 From 740aca28c3b22f8cea17ad24f891f81610674413 Mon Sep 17 00:00:00 2001 From: lbvienna Date: Fri, 25 Aug 2023 19:18:27 +0200 Subject: [PATCH 2/9] small bug fix --- src/elexmodel/handlers/data/Featurizer.py | 2 +- tests/handlers/test_featurizer.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/elexmodel/handlers/data/Featurizer.py b/src/elexmodel/handlers/data/Featurizer.py index fd8ddd11..f26b3301 100644 --- a/src/elexmodel/handlers/data/Featurizer.py +++ b/src/elexmodel/handlers/data/Featurizer.py @@ -93,7 +93,7 @@ def prepare_data( if x.startswith(tuple([fixed_effect + "_" for fixed_effect in self.fixed_effect_cols])) ] - df_fitting = df[(df.reporting is True) & (df.expected is True)] + df_fitting = df[(df.reporting) & (df.expected)] # get the indices of all expanded fixed effects in the fitting data (active fixed effects + the fixed effect we will drop for multicolinearity) active_fixed_effect_boolean_df = df_fitting[all_expanded_fixed_effects].sum(axis=0) > 0 # get the names of those fixed effects, since we we will want to know which fixed effect was dropped diff --git 
a/tests/handlers/test_featurizer.py b/tests/handlers/test_featurizer.py index 991aca48..8c3edda1 100644 --- a/tests/handlers/test_featurizer.py +++ b/tests/handlers/test_featurizer.py @@ -91,7 +91,6 @@ def test_column_names(): "expected": [True, True, True, True, False, False, False, False], } ) - df_new = featurizer.prepare_data(df, center_features=False, scale_features=False, add_intercept=True) df_fitting = featurizer.filter_to_active_features(df_new[:split_fitting_heldout]) From c811e9c2c76456cc692f44756a89fa7e59058352 Mon Sep 17 00:00:00 2001 From: lbvienna Date: Wed, 30 Aug 2023 18:41:52 +0200 Subject: [PATCH 3/9] updated comments --- src/elexmodel/handlers/data/Featurizer.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/elexmodel/handlers/data/Featurizer.py b/src/elexmodel/handlers/data/Featurizer.py index f26b3301..e7848dd7 100644 --- a/src/elexmodel/handlers/data/Featurizer.py +++ b/src/elexmodel/handlers/data/Featurizer.py @@ -67,7 +67,12 @@ def prepare_data( ) -> pd.DataFrame: """ Prepares features. - Adds dummy variables for fixed effects. Also includes centering, scaling continuous covariates and adding intercept. + Adds dummy variables for fixed effects, also determines which fixed effects are expanded and active. 
+ if center_features is true we subtract the features by their average column value, which sets the average column value to zero + this allows us to interpret the intercept as the mean response given all other covariates at their average value + if scale_features is true we divide the features by their standard deviation, which gives them all the same scale + this can improve the convergence of optimization algorithms + if add_intercept is true an intercept column is added to the features and one fixed effect value is dropped """ df = df.copy() # create copy so we can do things to the values if center_features: From 9345e370509dadc4e0241c10a0443578c8eeac1a Mon Sep 17 00:00:00 2001 From: lbvienna Date: Wed, 30 Aug 2023 20:15:25 +0200 Subject: [PATCH 4/9] linter --- src/elexmodel/models/BaseElectionModel.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/elexmodel/models/BaseElectionModel.py b/src/elexmodel/models/BaseElectionModel.py index 967a1f28..818de945 100644 --- a/src/elexmodel/models/BaseElectionModel.py +++ b/src/elexmodel/models/BaseElectionModel.py @@ -58,7 +58,7 @@ def get_unit_predictions(self, reporting_units, nonreporting_units, estimand, ** Produces unit level predictions. Fits quantile regression to reporting data, applies it to nonreporting data. The features are specified in model_settings. 
""" - n_train = reporting_units.shape[0] + self.n_train = reporting_units.shape[0] n_test = nonreporting_units.shape[0] all_units = pd.concat([reporting_units, nonreporting_units], axis=0) @@ -67,8 +67,10 @@ def get_unit_predictions(self, reporting_units, nonreporting_units, estimand, ** all_units, center_features=True, scale_features=False, add_intercept=self.add_intercept ) - reporting_units_features = featurizer.filter_to_active_features(x_all[:n_train]) - nonreporting_units_features = featurizer.generate_holdout_data(x_all[n_train : n_train + n_test]) # noqa: E203 + reporting_units_features = featurizer.filter_to_active_features(x_all[: self.n_train]) + nonreporting_units_features = featurizer.generate_holdout_data( + x_all[self.n_train : self.n_train + n_test] # noqa: E203 + ) weights = reporting_units[f"last_election_results_{estimand}"] reporting_units_residuals = reporting_units[f"residuals_{estimand}"] @@ -207,7 +209,7 @@ def get_unit_prediction_interval_bounds(self, reporting_units, nonreporting_unit upper_bound = (1 + alpha) / 2 lower_bound = (1 - alpha) / 2 - train_rows = math.floor(reporting_units.shape[0] * conf_frac) + train_rows = math.floor(self.n_train * conf_frac) train_data = reporting_units_shuffled[:train_rows] # the fixed effects in train_data will be a subset of the fixed effect of reporting_units since all @@ -238,9 +240,11 @@ def get_unit_prediction_interval_bounds(self, reporting_units, nonreporting_unit # bounds for nonreporting data. 
conformalization_data = reporting_units_shuffled[train_rows:] - # all_data starts with reporting_units_shuffled, so the rows between train_rows and n_reporting_units are the + # all_data starts with reporting_units_shuffled, so the rows between train_rows and n_train are the # conformalization set - conformalization_data_features = interval_featurizer.generate_holdout_data(x_all[train_rows:n_reporting_units]) + conformalization_data_features = interval_featurizer.generate_holdout_data( + x_all[train_rows : self.n_train] # noqa: E203 + ) # we are interested in f(X) - r # since later conformity scores care about deviation of bounds from residuals From 8387d41af37796ac65a4b8004b3d0f6849517c55 Mon Sep 17 00:00:00 2001 From: Leonard Bronner <6954332+lennybronner@users.noreply.github.com> Date: Wed, 30 Aug 2023 20:16:15 +0200 Subject: [PATCH 5/9] Update src/elexmodel/models/BaseElectionModel.py Co-authored-by: Jen Haskell --- src/elexmodel/models/BaseElectionModel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/elexmodel/models/BaseElectionModel.py b/src/elexmodel/models/BaseElectionModel.py index 818de945..5ad7c701 100644 --- a/src/elexmodel/models/BaseElectionModel.py +++ b/src/elexmodel/models/BaseElectionModel.py @@ -261,7 +261,7 @@ def get_unit_prediction_interval_bounds(self, reporting_units, nonreporting_unit # apply lower/upper models to nonreporting data # since nonreporting_units is the second dataframe in a_all, all units after n_reporting_units are nonreporting - # note: the features used may be different fromt the median predictions, but this guarantees that the features + # note: the features used may be different from the median predictions, but this guarantees that the features # are the same accross train_data, conformalization_data and nonreporting_units nonreporting_units_features = interval_featurizer.generate_holdout_data(x_all[n_reporting_units:]) nonreporting_lower_bounds = lower_qr.predict(nonreporting_units_features) From 
c80156d0afc12935a9ce7af82ee070c38461dc5c Mon Sep 17 00:00:00 2001 From: Leonard Bronner <6954332+lennybronner@users.noreply.github.com> Date: Wed, 30 Aug 2023 23:30:42 +0200 Subject: [PATCH 6/9] Update tests/handlers/test_featurizer.py Co-authored-by: Jen Haskell --- tests/handlers/test_featurizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/handlers/test_featurizer.py b/tests/handlers/test_featurizer.py index 8c3edda1..c84b1b8f 100644 --- a/tests/handlers/test_featurizer.py +++ b/tests/handlers/test_featurizer.py @@ -177,7 +177,7 @@ def test_generating_heldout_set(): "fe_a_c" in df_heldout.columns "fe_a_d" not in df_heldout.columns # not an active fixed effect - assert df_heldout.loc[6, "fe_a_b"] == 1 # since row 7 has an inactive fixed effect + assert df_heldout.loc[6, "fe_a_b"] == 1 # since row 6 has an active fixed effect assert df_heldout.loc[7, "fe_a_b"] == 1 / 3 # since row 7 has an inactive fixed effect assert df_heldout.loc[7, "fe_a_c"] == 1 / 3 # since row 7 has an inactive fixed effect From e86c6138c218beb5038dcbc7ddd1147000673f29 Mon Sep 17 00:00:00 2001 From: Leonard Bronner <6954332+lennybronner@users.noreply.github.com> Date: Wed, 30 Aug 2023 23:31:28 +0200 Subject: [PATCH 7/9] Update tests/handlers/test_featurizer.py Co-authored-by: Jen Haskell --- tests/handlers/test_featurizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/handlers/test_featurizer.py b/tests/handlers/test_featurizer.py index c84b1b8f..2b235195 100644 --- a/tests/handlers/test_featurizer.py +++ b/tests/handlers/test_featurizer.py @@ -185,7 +185,7 @@ def test_generating_heldout_set(): "fe_b_x" in df_heldout.columns "fe_b_z" not in df_heldout.columns # inactive - assert df_heldout.loc[6, "fe_a_b"] == 1 # since row 7 has an inactive fixed effect + assert df_heldout.loc[6, "fe_a_b"] == 1 # since row 6 has an active fixed effect assert df_heldout.loc[7, "fe_a_b"] == 1 / 3 # since row 7 has an inactive fixed effect assert 
df_heldout.loc[7, "fe_a_c"] == 1 / 3 # since row 7 has an inactive fixed effect From 8c85b7a72b3b9e0429da2c71a907e8b078407a9c Mon Sep 17 00:00:00 2001 From: lbvienna Date: Fri, 1 Sep 2023 20:04:47 +0200 Subject: [PATCH 8/9] updated comment --- src/elexmodel/handlers/data/Featurizer.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/elexmodel/handlers/data/Featurizer.py b/src/elexmodel/handlers/data/Featurizer.py index e7848dd7..e0b1a089 100644 --- a/src/elexmodel/handlers/data/Featurizer.py +++ b/src/elexmodel/handlers/data/Featurizer.py @@ -169,6 +169,15 @@ def generate_holdout_data(self, df: pd.DataFrame) -> pd.DataFrame: # set the values for active fixed effect in rows that have inactive fixed effect to be 1 / (n + 1) # rows that have an inactive fixed effect value need to receive the treat of the average fixed effects - # NOTE: aren't we now applying 1 * the dropped fixed effect and 1 / (n + 1) times the other fixed effects? df.loc[rows_w_inactive_fixed_effects, fe_active_fixed_effects] = 1 / (len(fe_active_fixed_effects) + 1) + # This is correct because even rows with active fixed effects have an interept columns, so the coefficient + # of the fixed effect value column is actually the *difference* between the dropped column (for which the intercept is + # the stand in and the fixed effect column. + # Another way to think about this is that for a fixed effect value that is present the fixed effect estimate is: + # if there are three fixed effects r, u and s where s is dropped. 
+ # beta_0 + beta_r * indic{r} + # beta_0 + beta_u * indic{u} + # and the fixed effect estimate for the dropped value is beta_0, so the average is: + # beta_0 + (beta_r / 3) + (beta_u / 3) + return self.filter_to_active_features(df) From 20d8ba98c987747fc4274ebfb9d8a31cdb5e3686 Mon Sep 17 00:00:00 2001 From: lbvienna Date: Fri, 1 Sep 2023 20:11:28 +0200 Subject: [PATCH 9/9] updated linter --- src/elexmodel/handlers/data/Featurizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/elexmodel/handlers/data/Featurizer.py b/src/elexmodel/handlers/data/Featurizer.py index e0b1a089..9937e00b 100644 --- a/src/elexmodel/handlers/data/Featurizer.py +++ b/src/elexmodel/handlers/data/Featurizer.py @@ -173,7 +173,7 @@ def generate_holdout_data(self, df: pd.DataFrame) -> pd.DataFrame: # This is correct because even rows with active fixed effects have an interept columns, so the coefficient # of the fixed effect value column is actually the *difference* between the dropped column (for which the intercept is # the stand in and the fixed effect column. - # Another way to think about this is that for a fixed effect value that is present the fixed effect estimate is: + # Another way to think about this is that for a fixed effect value that is present the fixed effect estimate is: # if there are three fixed effects r, u and s where s is dropped. # beta_0 + beta_r * indic{r} # beta_0 + beta_u * indic{u}