Merge pull request #69 from washingtonpost/updates-to-featurizer

Updates to Featurizer
washingtonpost · Sep 15, 2023 · 8bff4bc · 8bff4bc
2 parents 60f1e1f + 704d8d1
commit 8bff4bc
Show file tree

Hide file tree

Showing 5 changed files with 400 additions and 252 deletions.
diff --git a/src/elexmodel/handlers/data/CombinedData.py b/src/elexmodel/handlers/data/CombinedData.py
@@ -61,7 +61,8 @@ def get_reporting_units(self, percent_reporting_threshold, features_to_normalize
                 reporting_units[f"results_{estimand}"] - reporting_units[f"last_election_results_{estimand}"]
             ) / reporting_units[f"last_election_results_{estimand}"]
 
-        reporting_units["reporting"] = 1
+        reporting_units["reporting"] = int(1)
+        reporting_units["expected"] = True
 
         return reporting_units
 
@@ -75,7 +76,8 @@ def get_nonreporting_units(self, percent_reporting_threshold, features_to_normal
             drop=True
         )
 
-        nonreporting_units["reporting"] = 0
+        nonreporting_units["reporting"] = int(0)
+        nonreporting_units["expected"] = True
 
         return nonreporting_units
 
@@ -128,7 +130,8 @@ def get_unexpected_units(self, percent_reporting_threshold, aggregates):
                 self._get_district_from_geographic_unit_fips
             )
 
-        unexpected_units["reporting"] = 1
+        unexpected_units["reporting"] = int(1)
+        unexpected_units["expected"] = False
 
         return unexpected_units
 

diff --git a/src/elexmodel/handlers/data/Featurizer.py b/src/elexmodel/handlers/data/Featurizer.py
@@ -7,11 +7,13 @@ class Featurizer:
     Featurizer. Normalizes features, add intercept, expands fixed effects
     """
 
-    def __init__(self, features, fixed_effects):
+    def __init__(self, features: list, fixed_effects: list):
         self.features = features
+        # fixed effects can be a list, in which case every value of a fixed effect gets its own column
         if isinstance(fixed_effects, list):
             self.fixed_effect_cols = fixed_effects
             self.fixed_effect_params = {fe: ["all"] for fe in fixed_effects}
+        # fixed effects can be a dictionary from fixed effect to values that get their own column (or the string all, if we want all values)
         else:
             self.fixed_effect_cols = list(fixed_effects.keys())
             self.fixed_effect_params = {}
@@ -21,132 +23,161 @@ def __init__(self, features, fixed_effects):
                 else:
                     self.fixed_effect_params[fe] = params
 
+        # we differentiate between expanded fixed effects and active fixed effect values.
+        # expanded fixed effects are those fixed effect values that appear in any part of
+        # the data (fitting or heldout), excluding those that have been dropped to avoid
+        # multicollinearity when fitting.
         self.expanded_fixed_effects = []
-        self.complete_features = None
-        self.column_means = None
-
-    def compute_means_for_centering(self, *arg):
-        """
-        Computes and saves the column mean of pandas dataframe passed as args.
-        This is used for centering.
-        """
-        data = pd.concat(arg)
-        self.column_means = data[self.features].mean()
-
-    def _center_features(self, df):
-        """
-        Centers the features. This changes the interpretation of the intercept coefficient
-        from conditional mean given covariates = 0, to conditional mean given covariates are
-        their average value
-        """
-        df[self.features] = df[self.features] - self.column_means
+        # complete features are features + expanded fixed effects
+        self.complete_features = []
 
-    def _add_intercept(self, df):
-        df["intercept"] = 1
+        # active fixed effects are those that appear in the fitting data (ie. ones for which
+        # the model fitting computes a coefficient) but excluding those that we drop manually
+        # to avoid multicollinearity when fitting
+        self.active_fixed_effects = []
+        # active features are features + active fixed effects
+        self.active_features = []
 
-    def _expand_fixed_effects(self, df: pd.DataFrame, drop_first: bool) -> pd.DataFrame:
+    def _expand_fixed_effects(self, df: pd.DataFrame) -> pd.DataFrame:
         """
         Convert fixed effect columns into dummy variables.
         """
+        df = df.copy()
+        # we want to keep the original fixed effect columns since we may need them later for aggregation (ie. county fixed effect)
         original_fixed_effect_columns = df[self.fixed_effect_cols]
-        # set non-included values to other as needed
-        fe_df = df.copy()
+        # set non-included values to 'other' as needed since we don't want their values to get a dummy variable
         for fe, params in self.fixed_effect_params.items():
             if "all" not in params:
-                fe_df[fe] = np.where(~fe_df[fe].isin(params), "other", fe_df[fe])
+                df[fe] = np.where(~df[fe].isin(params), "other", df[fe])
 
         expanded_fixed_effects = pd.get_dummies(
-            fe_df, columns=self.fixed_effect_cols, prefix=self.fixed_effect_cols, prefix_sep="_", dtype=np.int64
+            df, columns=self.fixed_effect_cols, prefix=self.fixed_effect_cols, prefix_sep="_", dtype=np.int64
         )
 
-        # drop first column or "other" column if drop_first is true
-        cols_to_drop = []
-        if drop_first:
-            for fixed_effect in self.fixed_effect_cols:
-                relevant_cols = [col for col in expanded_fixed_effects.columns if col.startswith(fixed_effect)]
-                if f"{fixed_effect}_other" in relevant_cols:
-                    cols_to_drop.append(f"{fixed_effect}_other")
-                else:
-                    cols_to_drop.append(relevant_cols[0])
+        return pd.concat([original_fixed_effect_columns, expanded_fixed_effects], axis=1)
 
-        # we concatenate the dummy variables with the original fixed effects, since we need the original fixed
-        # effect columns for aggregation.
-        return pd.concat([original_fixed_effect_columns, expanded_fixed_effects.drop(cols_to_drop, axis=1)], axis=1)
-
-    def featurize_fitting_data(self, fitting_data, center_features=True, add_intercept=True):
+    def _get_categories_for_fe(self, list_: list, fe: str) -> list:
         """
-        Featurize the data that the model is fitted on.
-        In our case fitting_data is either the reporting_units (when fitting a model for the point predictions)
-        or training_data (when fitting the model for the prediction intervals)
+        Return list of fixed effects values for a given fixed effect
         """
-        # make copy of fitting_data, since we do not want to change the original data
-        new_fitting_data = fitting_data.copy()
-        self.center_features = center_features
-        self.add_intercept = add_intercept
-
-        if self.center_features:
-            self._center_features(new_fitting_data)
+        return [x for x in list_ if x.startswith(fe)]
 
-        self.complete_features = []
-        if self.add_intercept:
+    def prepare_data(
+        self, df: pd.DataFrame, center_features: bool = True, scale_features: bool = True, add_intercept: bool = True
+    ) -> pd.DataFrame:
+        """
+        Prepares features.
+        Adds dummy variables for fixed effects, also determines which fixed effects are expanded and active.
+        if center_features is true we subtract the features by their average column value, which sets the average column value to zero
+            this allows us to interpret the intercept as the mean response given all other covariates at their average value
+        if scale_features is true we divide the features by their standard deviation, which gives them all the same scale
+            this can improve the convergence of optimization algorithms
+        if add_intercept is true an intercept column is added to the features and one fixed effect value is dropped
+        """
+        df = df.copy()  # create copy so we can do things to the values
+        if center_features:
+            df[self.features] -= df[self.features].mean()
+        if scale_features:
+            # this expects there to be some variation in the data, otherwise we are dividing by zero
+            df[self.features] /= df[self.features].std()
+        if add_intercept:
             self.complete_features += ["intercept"]
-            self._add_intercept(new_fitting_data)
+            self.active_features += ["intercept"]
+            df["intercept"] = 1
 
         if len(self.fixed_effect_cols) > 0:
-            # drop_first is True for fitting_data (e.g. reporting_units) since we want to avoid the design matrix with
-            # expanded fixed effects to be linearly dependent
-            new_fitting_data = self._expand_fixed_effects(new_fitting_data, drop_first=True)
-            # we save the expanded fixed effects to be able to add fixed effects that are
-            # not in the heldout_data (nonreporting_units) as a zero column and to be able
-            # to specify the order of the expanded fixed effect when fitting the model
-            self.expanded_fixed_effects = [
+            df = self._expand_fixed_effects(df)
+
+            # we save the expanded fixed effects to be able to add a zero column for those
+            # fixed effect values if they are not in the heldout_data (nonreporting units).
+            # Also we can use this to guarantee the order of the fixed effect columns
+            # when fitting the model
+            all_expanded_fixed_effects = [
                 x
-                for x in new_fitting_data.columns
+                for x in df.columns
                 if x.startswith(tuple([fixed_effect + "_" for fixed_effect in self.fixed_effect_cols]))
             ]
 
+            df_fitting = df[(df.reporting) & (df.expected)]
+            # get a boolean mask of the expanded fixed effects that appear in the fitting data (active fixed effects + the fixed effect we will drop for multicollinearity)
+            active_fixed_effect_boolean_df = df_fitting[all_expanded_fixed_effects].sum(axis=0) > 0
+            # get the names of those fixed effects, since we will want to know which fixed effect was dropped
+            all_active_fixed_effects = np.asarray(all_expanded_fixed_effects)[active_fixed_effect_boolean_df]
+
+            # if we add an intercept we need to drop a value/column per fixed effect in order to avoid multicollinearity.
+            # the intercept column is now a stand-in for the dropped fixed effect value/column
+            if add_intercept:
+                active_fixed_effects = (
+                    []
+                )  # fixed effects that exist in the fitting_data (excluding one dropped column to avoid multicolinearity)
+                intercept_column = (
+                    []
+                )  # we need to save the fixed effect categories that the intercept is now standing in for
+                # we want to drop one value/column per fixed effect to avoid multicollinearity
+                for fe in self.fixed_effect_cols:
+                    # grab the potentially active fixed effect names for this fixed effect
+                    fe_fixed_effect_filter = self._get_categories_for_fe(all_active_fixed_effects, fe)
+                    # drop the first potentially active fixed effect
+                    active_fixed_effects.extend(fe_fixed_effect_filter[1:])
+                    # save the name of the fixed effect that we dropped
+                    intercept_column.append(fe_fixed_effect_filter[0])
+
+                self.active_fixed_effects = active_fixed_effects
+                self.intercept_column = intercept_column
+                # expanded fixed effects do not include the ones that we dropped to avoid multicollinearity
+                self.expanded_fixed_effects = [x for x in all_expanded_fixed_effects if x not in intercept_column]
+            else:
+                self.active_fixed_effects = all_active_fixed_effects
+                self.expanded_fixed_effects = all_expanded_fixed_effects
+
         # all features that the model will be fit on
+        # these are all the features + the expanded fixed effects (so all fixed effect values in the complete data excluding the ones dropped for multicollinearity)
         self.complete_features += self.features + self.expanded_fixed_effects
+        self.active_features += self.features + self.active_fixed_effects
+        df = df[self.complete_features]
 
-        return new_fitting_data[self.complete_features]
+        return df
 
-    def featurize_heldout_data(self, heldout_data):
+    def filter_to_active_features(self, df: pd.DataFrame) -> pd.DataFrame:
         """
-        Featurize the data that the model will be applied on.
-        In our case the heldout_data is either the nonreporting_units
-        (when applying the model for the point predictions)
-        or conformalization_data/nonreporting_units
-        (when applying the model for the prediction intervals)
+        Get active features (ie. features + active fixed effects)
         """
-        new_heldout_data = heldout_data.copy()
+        return df[self.active_features]
 
-        if self.center_features:
-            self._center_features(new_heldout_data)
-
-        if self.add_intercept:
-            self._add_intercept(new_heldout_data)
-
-        if len(self.fixed_effect_cols) > 0:
-            missing_expanded_fixed_effects = []
-            new_heldout_data = self._expand_fixed_effects(new_heldout_data, drop_first=False)
-            # if all units from one fixed effect are reporting they will not appear in the heldout_data
-            # (e.g. nonreporting_units) and won't get a column when we expand the fixed effects
-            # on that dataframe. Therefore we add those columns with zero fixed effects manually.
-            # As an example, if we are running a county model using state fixed effects, and
-            # all of Delaware's counties are reporting, then no Delaware county will be in
-            # heldout_data (nonreporting_units), as a result there will be no column for Delaware
-            # in the expanded fixed effects of heldout_data (nonreporting_units).
-            for expanded_fixed_effect in self.expanded_fixed_effects:
-                if expanded_fixed_effect not in new_heldout_data.columns:
-                    missing_expanded_fixed_effects.append(expanded_fixed_effect)
-
-            missing_expanded_fixed_effects_df = pd.DataFrame(
-                np.zeros((new_heldout_data.shape[0], len(missing_expanded_fixed_effects))),
-                columns=missing_expanded_fixed_effects,
-            )
-            # if we use this method to add the missing expanded fixed effects because doing it manually
-            # ie. new_heldout_data[expanded_fixed_effect] = 0
-            # can throw a fragmentation warning when there are many missing fixed effects.
-            new_heldout_data = new_heldout_data.join(missing_expanded_fixed_effects_df)
-
-        return new_heldout_data[self.complete_features]
+    def generate_holdout_data(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+        Generate fixed effects for the holdout data (ie. data that we will predict on)
+        """
+        df = df.copy()
+
+        # if a unit has an inactive fixed effect value for some fixed effect category we need
+        # to insert 1 / (number of active fixed effect values + 1) into each active fixed effect value for that unit.
+        # if we were to leave them as zero, then the model would apply the dropped fixed effect
+        # value coefficient (since this is now what the intercept stands in for).
+        # instead we want to apply all active fixed effect coefficients equally
+
+        # get inactive fixed effects (ie expanded fixed effects that are not active)
+        # these are fixed effects that exist only in the holdout set (ie. we do not have a covariate for them)
+        inactive_fixed_effects = [x for x in self.expanded_fixed_effects if x not in self.active_fixed_effects]
+        for fe in self.fixed_effect_cols:
+            # active fixed effect values for this fixed effect
+            fe_active_fixed_effects = self._get_categories_for_fe(self.active_fixed_effects, fe)
+            # inactive fixed effect values for this fixed effect
+            fe_inactive_fixed_effects = self._get_categories_for_fe(inactive_fixed_effects, fe)
+            # get rows that have an inactive fixed effect
+            rows_w_inactive_fixed_effects = df[fe_inactive_fixed_effects].sum(axis=1) > 0
+
+            # set the values for active fixed effects in rows that have an inactive fixed effect to be 1 / (n + 1)
+            # rows that have an inactive fixed effect value need to receive the treatment of the average fixed effect
+            df.loc[rows_w_inactive_fixed_effects, fe_active_fixed_effects] = 1 / (len(fe_active_fixed_effects) + 1)
+            # This is correct because even rows with active fixed effects have an intercept column, so the coefficient
+            # of a fixed effect value column is actually the *difference* between the dropped column (for which the
+            # intercept is the stand-in) and that fixed effect column.
+            # Another way to think about this: suppose there are three fixed effect values r, u and s, where s is dropped.
+            # The fixed effect estimates for the values that are present are:
+            # beta_0 + beta_r * indic{r}
+            # beta_0 + beta_u * indic{u}
+            # and the fixed effect estimate for the dropped value is beta_0, so the average is:
+            # beta_0 + (beta_r / 3) + (beta_u / 3)
+
+        return self.filter_to_active_features(df)
diff --git a/src/elexmodel/models/BaseElectionModel.py b/src/elexmodel/models/BaseElectionModel.py
@@ -171,3 +171,6 @@ def get_coefficients(self) -> dict:
         These coefficients are for the point prediciton only.
         """
         return self.features_to_coefficients
+
+    def get_national_summary_estimates(self, nat_sum_data_dict, called_states, base_to_add):
+        raise NotImplementedError()