From 39a1fa2c8801a2caa2125017119a8b6a796b774a Mon Sep 17 00:00:00 2001
From: lbvienna <lenny.bronner@gmail.com>
Date: Fri, 25 Aug 2023 19:02:47 +0200
Subject: [PATCH 1/9] ran linter

---
 src/elexmodel/handlers/data/CombinedData.py |   9 +-
 src/elexmodel/handlers/data/Featurizer.py   | 215 +++++++------
 src/elexmodel/models/BaseElectionModel.py   |  66 ++--
 tests/handlers/test_featurizer.py           | 340 +++++++++++++-------
 4 files changed, 380 insertions(+), 250 deletions(-)

diff --git a/src/elexmodel/handlers/data/CombinedData.py b/src/elexmodel/handlers/data/CombinedData.py
index 4cc2ab56..41bce8f3 100644
--- a/src/elexmodel/handlers/data/CombinedData.py
+++ b/src/elexmodel/handlers/data/CombinedData.py
@@ -54,7 +54,8 @@ def get_reporting_units(self, percent_reporting_threshold, features_to_normalize
                 reporting_units[f"results_{estimand}"] - reporting_units[f"last_election_results_{estimand}"]
             ) / reporting_units[f"last_election_results_{estimand}"]
 
-        reporting_units["reporting"] = 1
+        reporting_units["reporting"] = int(1)
+        reporting_units["expected"] = True
 
         return reporting_units
 
@@ -68,7 +69,8 @@ def get_nonreporting_units(self, percent_reporting_threshold, features_to_normal
             drop=True
         )
 
-        nonreporting_units["reporting"] = 0
+        nonreporting_units["reporting"] = int(0)
+        nonreporting_units["expected"] = True
 
         return nonreporting_units
 
@@ -121,7 +123,8 @@ def get_unexpected_units(self, percent_reporting_threshold, aggregates):
                 self._get_district_from_geographic_unit_fips
             )
 
-        unexpected_units["reporting"] = 1
+        unexpected_units["reporting"] = int(1)
+        unexpected_units["expected"] = False
 
         return unexpected_units
 
diff --git a/src/elexmodel/handlers/data/Featurizer.py b/src/elexmodel/handlers/data/Featurizer.py
index f5a9a5b5..fd8ddd11 100644
--- a/src/elexmodel/handlers/data/Featurizer.py
+++ b/src/elexmodel/handlers/data/Featurizer.py
@@ -7,11 +7,13 @@ class Featurizer:
     Featurizer. Normalizes features, add intercept, expands fixed effects
     """
 
-    def __init__(self, features, fixed_effects):
+    def __init__(self, features: list, fixed_effects: list):
         self.features = features
+        # fixed effects can be a list, in which case every value of a fixed effect gets its own column
         if isinstance(fixed_effects, list):
             self.fixed_effect_cols = fixed_effects
             self.fixed_effect_params = {fe: ["all"] for fe in fixed_effects}
+        # fixed effects can be a dictionary from fixed effect to values that get their own column (or the string all, if we want all values)
         else:
             self.fixed_effect_cols = list(fixed_effects.keys())
             self.fixed_effect_params = {}
@@ -21,132 +23,147 @@ def __init__(self, features, fixed_effects):
                 else:
                     self.fixed_effect_params[fe] = params
 
+        # we differentiate between expanded fixed effects and active fixed effect values.
+        # expanded fixed effects are those fixed effect values that appear in any part of
+        # the data (fitting or heldout) excluding those that have been dropped to avoid
+        # multicollinearity when fitting.
         self.expanded_fixed_effects = []
-        self.complete_features = None
-        self.column_means = None
-
-    def compute_means_for_centering(self, *arg):
-        """
-        Computes and saves the column mean of pandas dataframe passed as args.
-        This is used for centering.
-        """
-        data = pd.concat(arg)
-        self.column_means = data[self.features].mean()
-
-    def _center_features(self, df):
-        """
-        Centers the features. This changes the interpretation of the intercept coefficient
-        from conditional mean given covariates = 0, to conditional mean given covariates are
-        their average value
-        """
-        df[self.features] = df[self.features] - self.column_means
+        # complete features are features + expanded fixed effects
+        self.complete_features = []
 
-    def _add_intercept(self, df):
-        df["intercept"] = 1
+        # active fixed effects are those that appear in the fitting data (ie. ones for which
+        # the model fitting computes a coefficient) but excluding those that we drop manually
+        # to avoid multicollinearity when fitting
+        self.active_fixed_effects = []
+        # active features are features + active fixed effects
+        self.active_features = []
 
-    def _expand_fixed_effects(self, df: pd.DataFrame, drop_first: bool) -> pd.DataFrame:
+    def _expand_fixed_effects(self, df: pd.DataFrame) -> pd.DataFrame:
         """
         Convert fixed effect columns into dummy variables.
         """
+        df = df.copy()
+        # we want to keep the original fixed effect columns since we may need them later for aggregation (ie. county fixed effect)
         original_fixed_effect_columns = df[self.fixed_effect_cols]
-        # set non-included values to other as needed
-        fe_df = df.copy()
+        # set non-included values to 'other' as needed since we don't want their values to get a dummy variable
         for fe, params in self.fixed_effect_params.items():
             if "all" not in params:
-                fe_df[fe] = np.where(~fe_df[fe].isin(params), "other", fe_df[fe])
+                df[fe] = np.where(~df[fe].isin(params), "other", df[fe])
 
         expanded_fixed_effects = pd.get_dummies(
-            fe_df, columns=self.fixed_effect_cols, prefix=self.fixed_effect_cols, prefix_sep="_", dtype=np.int64
+            df, columns=self.fixed_effect_cols, prefix=self.fixed_effect_cols, prefix_sep="_", dtype=np.int64
         )
 
-        # drop first column or "other" column if drop_first is true
-        cols_to_drop = []
-        if drop_first:
-            for fixed_effect in self.fixed_effect_cols:
-                relevant_cols = [col for col in expanded_fixed_effects.columns if col.startswith(fixed_effect)]
-                if f"{fixed_effect}_other" in relevant_cols:
-                    cols_to_drop.append(f"{fixed_effect}_other")
-                else:
-                    cols_to_drop.append(relevant_cols[0])
+        return pd.concat([original_fixed_effect_columns, expanded_fixed_effects], axis=1)
 
-        # we concatenate the dummy variables with the original fixed effects, since we need the original fixed
-        # effect columns for aggregation.
-        return pd.concat([original_fixed_effect_columns, expanded_fixed_effects.drop(cols_to_drop, axis=1)], axis=1)
-
-    def featurize_fitting_data(self, fitting_data, center_features=True, add_intercept=True):
+    def _get_categories_for_fe(self, list_: list, fe: str) -> list:
         """
-        Featurize the data that the model is fitted on.
-        In our case fitting_data is either the reporting_units (when fitting a model for the point predictions)
-        or training_data (when fitting the model for the prediction intervals)
+        Return the entries of list_ that are expanded columns of fixed effect fe (ie. those that start with "<fe>_")
         """
-        # make copy of fitting_data, since we do not want to change the original data
-        new_fitting_data = fitting_data.copy()
-        self.center_features = center_features
-        self.add_intercept = add_intercept
-
-        if self.center_features:
-            self._center_features(new_fitting_data)
+        return [x for x in list_ if x.startswith(fe + "_")]
 
-        self.complete_features = []
-        if self.add_intercept:
+    def prepare_data(
+        self, df: pd.DataFrame, center_features: bool = True, scale_features: bool = True, add_intercept: bool = True
+    ) -> pd.DataFrame:
+        """
+        Prepares features. When fixed effects are used, df must carry the `reporting` and `expected` columns.
+        Adds dummy variables for fixed effects. Also includes centering, scaling continuous covariates and adding intercept.
+        """
+        df = df.copy()  # create copy so we can do things to the values
+        if center_features:
+            df[self.features] -= df[self.features].mean()
+        if scale_features:
+            # this expects there to be some variation in the data, otherwise we are dividing by zero
+            df[self.features] /= df[self.features].std()
+        if add_intercept:
             self.complete_features += ["intercept"]
-            self._add_intercept(new_fitting_data)
+            self.active_features += ["intercept"]
+            df["intercept"] = 1
 
         if len(self.fixed_effect_cols) > 0:
-            # drop_first is True for fitting_data (e.g. reporting_units) since we want to avoid the design matrix with
-            # expanded fixed effects to be linearly dependent
-            new_fitting_data = self._expand_fixed_effects(new_fitting_data, drop_first=True)
-            # we save the expanded fixed effects to be able to add fixed effects that are
-            # not in the heldout_data (nonreporting_units) as a zero column and to be able
-            # to specify the order of the expanded fixed effect when fitting the model
-            self.expanded_fixed_effects = [
+            df = self._expand_fixed_effects(df)
+
+            # we save the expanded fixed effects to be able to add a zero column for those
+            # fixed effect values if they are not in the heldout_data (nonreporting units).
+            # Also we can use this to guarantee the order of the fixed effect columns
+            # when fitting the model
+            all_expanded_fixed_effects = [
                 x
-                for x in new_fitting_data.columns
+                for x in df.columns
                 if x.startswith(tuple([fixed_effect + "_" for fixed_effect in self.fixed_effect_cols]))
             ]
 
+            # the fitting data are the units that are both reporting and expected. The comparison has to be
+            # elementwise: `series is True` is an identity check on the Series object and is always False
+            df_fitting = df[df.reporting.eq(True) & df.expected.eq(True)]
+            # get the indices of all expanded fixed effects in the fitting data (active fixed effects + the fixed effect we will drop for multicollinearity)
+            active_fixed_effect_boolean_df = df_fitting[all_expanded_fixed_effects].sum(axis=0) > 0
+            # get the names of those fixed effects as a plain list (it is concatenated with self.features below), since we will want to know which fixed effect was dropped
+            all_active_fixed_effects = np.asarray(all_expanded_fixed_effects)[active_fixed_effect_boolean_df].tolist()
+
+            # if we add an intercept we need to drop a value/column per fixed effect in order to avoid multicollinearity.
+            # the intercept column is now a stand-in for the dropped fixed effect value/column
+            if add_intercept:
+                # fixed effects that exist in the fitting_data (excluding one dropped column to avoid multicollinearity)
+                active_fixed_effects = []
+                # we need to save the fixed effect categories that the intercept is now standing in for
+                intercept_column = []
+                # we want to drop one value/column per fixed effect to avoid multicollinearity
+                for fe in self.fixed_effect_cols:
+                    # grab the potentially active fixed effect names for this fixed effect
+                    fe_fixed_effect_filter = self._get_categories_for_fe(all_active_fixed_effects, fe)
+                    # drop the first potentially active fixed effect
+                    active_fixed_effects.extend(fe_fixed_effect_filter[1:])
+                    # save the name of the fixed effect that we dropped
+                    intercept_column.append(fe_fixed_effect_filter[0])
+
+                self.active_fixed_effects = active_fixed_effects
+                self.intercept_column = intercept_column
+                # expanded fixed effects do not include the ones that we dropped to avoid multicollinearity
+                self.expanded_fixed_effects = [x for x in all_expanded_fixed_effects if x not in intercept_column]
+            else:
+                self.active_fixed_effects = all_active_fixed_effects
+                self.expanded_fixed_effects = all_expanded_fixed_effects
+
         # all features that the model will be fit on
+        # these are all the features + the expanded fixed effects (so all fixed effect values in the complete data excluding the ones dropped for multicollinearity)
         self.complete_features += self.features + self.expanded_fixed_effects
+        self.active_features += self.features + self.active_fixed_effects
+        df = df[self.complete_features]
 
-        return new_fitting_data[self.complete_features]
+        return df
 
-    def featurize_heldout_data(self, heldout_data):
+    def filter_to_active_features(self, df: pd.DataFrame) -> pd.DataFrame:
         """
-        Featurize the data that the model will be applied on.
-        In our case the heldout_data is either the nonreporting_units
-        (when applying the model for the point predictions)
-        or conformalization_data/nonreporting_units
-        (when applying the model for the prediction intervals)
+        Get active features (ie. features + active fixed effects)
         """
-        new_heldout_data = heldout_data.copy()
+        return df[self.active_features]
 
-        if self.center_features:
-            self._center_features(new_heldout_data)
-
-        if self.add_intercept:
-            self._add_intercept(new_heldout_data)
-
-        if len(self.fixed_effect_cols) > 0:
-            missing_expanded_fixed_effects = []
-            new_heldout_data = self._expand_fixed_effects(new_heldout_data, drop_first=False)
-            # if all units from one fixed effect are reporting they will not appear in the heldout_data
-            # (e.g. nonreporting_units) and won't get a column when we expand the fixed effects
-            # on that dataframe. Therefore we add those columns with zero fixed effects manually.
-            # As an example, if we are running a county model using state fixed effects, and
-            # all of Delaware's counties are reporting, then no Delaware county will be in
-            # heldout_data (nonreporting_units), as a result there will be no column for Delaware
-            # in the expanded fixed effects of heldout_data (nonreporting_units).
-            for expanded_fixed_effect in self.expanded_fixed_effects:
-                if expanded_fixed_effect not in new_heldout_data.columns:
-                    missing_expanded_fixed_effects.append(expanded_fixed_effect)
-
-            missing_expanded_fixed_effects_df = pd.DataFrame(
-                np.zeros((new_heldout_data.shape[0], len(missing_expanded_fixed_effects))),
-                columns=missing_expanded_fixed_effects,
-            )
-            # if we use this method to add the missing expanded fixed effects because doing it manually
-            # ie. new_heldout_data[expanded_fixed_effect] = 0
-            # can throw a fragmentation warning when there are many missing fixed effects.
-            new_heldout_data = new_heldout_data.join(missing_expanded_fixed_effects_df)
-
-        return new_heldout_data[self.complete_features]
+    def generate_holdout_data(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+        Generate fixed effects for the holdout data (ie. data that we will predict on)
+        """
+        df = df.copy()
+
+        # if a unit has an inactive fixed effect value for some fixed effect category we need
+        # to insert 1 / (number of active fixed effect values + 1) into each active fixed effect value for that unit.
+        # if we were to leave them as zero, then the model would apply the dropped fixed effect
+        # value coefficient (since this is now what the intercept stands in for).
+        # instead we want to apply all active fixed effect coefficients equally
+
+        # get inactive fixed effects (ie expanded fixed effects that are not active)
+        # these are fixed effects that exist only in the holdout set (ie. we do not have a covariate for them)
+        inactive_fixed_effects = [x for x in self.expanded_fixed_effects if x not in self.active_fixed_effects]
+        for fe in self.fixed_effect_cols:
+            # active fixed effect values for this fixed effect
+            fe_active_fixed_effects = self._get_categories_for_fe(self.active_fixed_effects, fe)
+            # inactive fixed effect values for this fixed effect
+            fe_inactive_fixed_effects = self._get_categories_for_fe(inactive_fixed_effects, fe)
+            # get rows that have an inactive fixed effect
+            rows_w_inactive_fixed_effects = df[fe_inactive_fixed_effects].sum(axis=1) > 0
+
+            # set the values for active fixed effects in rows that have an inactive fixed effect to be 1 / (n + 1)
+            # rows that have an inactive fixed effect value need to receive the treatment of the average fixed effects
+            # NOTE: aren't we now applying 1 * the dropped fixed effect and 1 / (n + 1) times the other fixed effects?
+            df.loc[rows_w_inactive_fixed_effects, fe_active_fixed_effects] = 1 / (len(fe_active_fixed_effects) + 1)
+        return self.filter_to_active_features(df)
diff --git a/src/elexmodel/models/BaseElectionModel.py b/src/elexmodel/models/BaseElectionModel.py
index f985dd78..967a1f28 100644
--- a/src/elexmodel/models/BaseElectionModel.py
+++ b/src/elexmodel/models/BaseElectionModel.py
@@ -5,6 +5,7 @@
 
 import cvxpy
 import numpy as np
+import pandas as pd
 from elexsolver.QuantileRegressionSolver import QuantileRegressionSolver
 
 from elexmodel.handlers.data.Featurizer import Featurizer
@@ -23,7 +24,6 @@ def __init__(self, model_settings={}):
         self.fixed_effects = model_settings.get("fixed_effects", {})
         self.lambda_ = model_settings.get("lambda_", 0)
         self.features_to_coefficients = {}
-        self.featurizer = Featurizer(self.features, self.fixed_effects)
         self.add_intercept = True
         self.seed = 4191  # set arbitrarily
 
@@ -53,26 +53,28 @@ def fit_model(self, model, df_X, df_y, tau, weights, normalize_weights):
             LOG.warning("Warning: solution was inaccurate or solver broke. Re-running with normalize_weights=False.")
             model.fit(X, y, tau_value=tau, weights=weights, lambda_=self.lambda_, normalize_weights=False)
 
-    def get_unit_predictions(self, reporting_units, nonreporting_units, estimand):
+    def get_unit_predictions(self, reporting_units, nonreporting_units, estimand, **kwargs):
         """
         Produces unit level predictions. Fits quantile regression to reporting data, applies
         it to nonreporting data. The features are specified in model_settings.
         """
-        # compute the means of both reporting_units and nonreporting_units for centering (part of featurizing)
-        # we want them both, since together they are the subunit population
-        self.featurizer.compute_means_for_centering(reporting_units, nonreporting_units)
-        # reporting_units_features and nonreporting_units_features should have the same
-        # features. Specifically also the same fixed effect columns.
-        reporting_units_features = self.featurizer.featurize_fitting_data(
-            reporting_units, add_intercept=self.add_intercept
+        n_train = reporting_units.shape[0]
+        n_test = nonreporting_units.shape[0]
+        all_units = pd.concat([reporting_units, nonreporting_units], axis=0)
+
+        featurizer = Featurizer(self.features, self.fixed_effects)
+        x_all = featurizer.prepare_data(
+            all_units, center_features=True, scale_features=False, add_intercept=self.add_intercept
         )
-        nonreporting_units_features = self.featurizer.featurize_heldout_data(nonreporting_units)
+
+        reporting_units_features = featurizer.filter_to_active_features(x_all[:n_train])
+        nonreporting_units_features = featurizer.generate_holdout_data(x_all[n_train : n_train + n_test])  # noqa: E203
 
         weights = reporting_units[f"last_election_results_{estimand}"]
         reporting_units_residuals = reporting_units[f"residuals_{estimand}"]
 
         self.fit_model(self.qr, reporting_units_features, reporting_units_residuals, 0.5, weights, True)
-        self.features_to_coefficients = dict(zip(self.featurizer.complete_features, self.qr.coefficients))
+        self.features_to_coefficients = dict(zip(featurizer.active_features, self.qr.coefficients))
 
         preds = self.qr.predict(nonreporting_units_features)
 
@@ -200,20 +202,28 @@ def get_unit_prediction_interval_bounds(self, reporting_units, nonreporting_unit
         # training/conformalization split for each alpha of one run
         reporting_units_shuffled = reporting_units.sample(frac=1, random_state=self.seed).reset_index(drop=True)
 
+        n_reporting_units = reporting_units.shape[0]
+
         upper_bound = (1 + alpha) / 2
         lower_bound = (1 - alpha) / 2
 
         train_rows = math.floor(reporting_units.shape[0] * conf_frac)
-        train_data = reporting_units_shuffled[:train_rows].reset_index(drop=True)
+        train_data = reporting_units_shuffled[:train_rows]
 
-        # specifying self.features extracts the correct columns and makes sure they are in the correct
-        # order. Necessary when fitting and predicting on the model.
         # the fixed effects in train_data will be a subset of the fixed effect of reporting_units since all
-        # units from one fixed effect category might be in the conformalization data. Note that we are
-        # overwritting featurizer.expanded_fixed_effects by doing this (which is what we want), since we
-        # want the expanded_fixed_effects from train_data to be used by conformalization_data and nonreporting_data
-        # in this function.
-        train_data_features = self.featurizer.featurize_fitting_data(train_data, add_intercept=self.add_intercept)
+        # units from one fixed effect category might be in the conformalization data.
+
+        # we need a new featurizer since otherwise we will continue to add intercepts to the features
+        interval_featurizer = Featurizer(self.features, self.fixed_effects)
+        # we need all units since we will apply the upper and lower models to the nonreporting_units also
+        # so we need to make sure that they have the correct fixed effects
+        all_units_shuffled = pd.concat([reporting_units_shuffled, nonreporting_units], axis=0)
+        x_all = interval_featurizer.prepare_data(
+            all_units_shuffled, center_features=True, scale_features=False, add_intercept=self.add_intercept
+        )
+        # x_all starts with the shuffled reporting units, so the first train_rows are the same as train_data
+        train_data_features = interval_featurizer.filter_to_active_features(x_all[:train_rows])
+
         train_data_residuals = train_data[f"residuals_{estimand}"]
         train_data_weights = train_data[f"last_election_results_{estimand}"]
 
@@ -226,9 +236,11 @@ def get_unit_prediction_interval_bounds(self, reporting_units, nonreporting_unit
 
         # apply to conformalization data. Conformalization bounds will later tell us how much to adjust lower/upper
         # bounds for nonreporting data.
-        conformalization_data = reporting_units_shuffled[train_rows:].reset_index(drop=True)
-        # conformalization features will be the same as the features in train_data
-        conformalization_data_features = self.featurizer.featurize_heldout_data(conformalization_data)
+        conformalization_data = reporting_units_shuffled[train_rows:]
+
+        # x_all starts with reporting_units_shuffled, so the rows between train_rows and n_reporting_units are the
+        # conformalization set
+        conformalization_data_features = interval_featurizer.generate_holdout_data(x_all[train_rows:n_reporting_units])
 
         # we are interested in f(X) - r
         # since later conformity scores care about deviation of bounds from residuals
@@ -244,9 +256,10 @@ def get_unit_prediction_interval_bounds(self, reporting_units, nonreporting_unit
         conformalization_data["lower_bounds"] = conformalization_lower_bounds
 
         # apply lower/upper models to nonreporting data
-        # now the features of the nonreporting_units will be the same as the train_data features
-        # they might differ slightly from the features used when fitting the median prediction
-        nonreporting_units_features = self.featurizer.featurize_heldout_data(nonreporting_units)
+        # since nonreporting_units is the second dataframe in x_all, all units after n_reporting_units are nonreporting
+        # note: the features used may be different from the median predictions, but this guarantees that the features
+        # are the same across train_data, conformalization_data and nonreporting_units
+        nonreporting_units_features = interval_featurizer.generate_holdout_data(x_all[n_reporting_units:])
         nonreporting_lower_bounds = lower_qr.predict(nonreporting_units_features)
         nonreporting_upper_bounds = upper_qr.predict(nonreporting_units_features)
 
@@ -264,3 +277,6 @@ def get_coefficients(self):
         These coefficients are for the point prediciton only, not for the lower or upper intervals models.
         """
         return self.features_to_coefficients
+
+    def get_national_summary_estimates(self, nat_sum_data_dict, called_states, base_to_add):
+        raise NotImplementedError()
diff --git a/tests/handlers/test_featurizer.py b/tests/handlers/test_featurizer.py
index 55239fa4..991aca48 100644
--- a/tests/handlers/test_featurizer.py
+++ b/tests/handlers/test_featurizer.py
@@ -7,73 +7,65 @@
 from elexmodel.handlers.data.PreprocessedData import PreprocessedDataHandler
 
 
-def compute_testing_mean_for_centering():
-    """ "
-    Test whether computing the column mean for centering works.
-    """
-    features = ["a", "b", "c"]
-    featurizer = Featurizer(features, {})
-
-    # test with one dataframe
-    df = pd.DataFrame({"a": [1, 1, 1, 1], "b": [2, 2, 2, 2], "c": [3, 3, 3, 3], "d": [1, 2, 3, 4]})
-
-    featurizer.compute_means_for_centering(df)
-
-    assert featurizer.column_means.equals(pd.Series({"a": 1, "b": 2, "c": 3}, index=["a", "b", "c"], dtype=np.float64))
-
-    # test with two dataframes
-    df2 = pd.DataFrame({"a": [2, 2, 2, 2], "b": [3, 3, 3, 3], "c": [4, 4, 4, 4], "d": [1, 2, 3, 4]})
-
-    featurizer.compute_means_for_centering(df, df2)
-    assert featurizer.column_means.equals(
-        pd.Series({"a": 1.5, "b": 2.5, "c": 3.5}, index=["a", "b", "c"], dtype=np.float64)
-    )
-
-
 def test_centering_features():
     """
     Test whether centering the features works
     """
-    features = ["a", "b"]
+    features = ["a", "b", "c", "d"]
     featurizer = Featurizer(features, {})
 
-    # test with one dataframe
-    df = pd.DataFrame({"a": [1, 2, 3], "b": [2, 4, 9]})
-
-    featurizer.compute_means_for_centering(df)
-    featurizer._center_features(df)
-    assert df.equals(pd.DataFrame({"a": [-1.0, 0.0, 1.0], "b": [-3.0, -1.0, 4.0]}))
+    df = pd.DataFrame({"a": [1, 1, 1, 1], "b": [2, 2, 2, 2], "c": [3, 3, np.nan, 3], "d": [1, 2, 3, 4]})
 
-    # confirm that when the function is used properly we would not subtract the
-    # means twice
-    df = pd.DataFrame({"a": [1, 2, 3], "b": [2, 4, 9]})
-    featurizer.featurize_fitting_data(df, center_features=True)
-    df2 = featurizer.featurize_fitting_data(df, center_features=True)
-    assert df2.equals(pd.DataFrame({"intercept": [1, 1, 1], "a": [-1.0, 0.0, 1.0], "b": [-3.0, -1.0, 4.0]}))
+    df_new = featurizer.prepare_data(df, center_features=True, scale_features=False, add_intercept=False)
+    df_expected_result = pd.DataFrame(
+        {
+            "a": [0.0, 0.0, 0.0, 0.0],
+            "b": [0.0, 0.0, 0.0, 0.0],
+            "c": [0.0, 0.0, np.nan, 0.0],
+            "d": [-1.5, -0.5, 0.5, 1.5],
+        }
+    )
+    pd.testing.assert_frame_equal(df_new, df_expected_result)
 
 
 def test_adding_intercept():
+    """
+    Test adding intercept
+    """
     features = ["a", "b", "c"]
     featurizer = Featurizer(features, {})
 
-    # test with one dataframe
     df = pd.DataFrame({"a": [2, 2, 2, 2], "b": [3, 3, 3, 3], "c": [1, 2, 3, 4]})
 
-    featurizer._add_intercept(df)
+    df_new = featurizer.prepare_data(df, center_features=False, scale_features=False, add_intercept=True)
+
+    assert "intercept" in df_new.columns
+    assert "intercept" in featurizer.complete_features
+    assert "intercept" in featurizer.active_features
+    pd.testing.assert_series_equal(df_new.intercept, pd.Series([1, 1, 1, 1], name="intercept"))
 
-    assert "intercept" in df.columns
-    assert df.intercept.equals(pd.Series([1, 1, 1, 1]))
 
+def test_scaling_features():
+    """
+    Test whether scaling features works
+    """
+    features = ["a", "b", "c", "d"]
+    featurizer = Featurizer(features, {})
 
-def test_adding_intercept_complex():
-    features = ["a", "b", "c"]
-    featurizer = Featurizer(features, [])
-    df = pd.DataFrame({"a": [2, 2, 2, 2], "b": [3, 3, 3, 3], "c": [1, 2, 3, 4]})
+    # standard deviations here are 0.5, 1, 2 and 0 (dividing by 0 is what turns column d into inf)
+    df = pd.DataFrame({"a": [1, 1, 1, 2], "b": [1, 1, 1, 3], "c": [1, 1, 1, 5], "d": [1, 1, 1, 1]})
 
-    df2 = featurizer.featurize_fitting_data(df, center_features=False, add_intercept=True)
-    assert featurizer.add_intercept
-    assert "intercept" in df2.columns
-    assert "intercept" in featurizer.complete_features
+    df_new = featurizer.prepare_data(df, center_features=False, scale_features=True, add_intercept=False)
+
+    df_expected_result = pd.DataFrame(
+        {
+            "a": [2.0, 2.0, 2.0, 4.0],
+            "b": [1.0, 1.0, 1.0, 3.0],
+            "c": [0.5, 0.5, 0.5, 2.5],
+            "d": [np.inf, np.inf, np.inf, np.inf],
+        }
+    )
+    pd.testing.assert_frame_equal(df_new, df_expected_result)
 
 
 def test_column_names():
@@ -81,75 +73,148 @@ def test_column_names():
     This function tests to make sure that the featurizer returns the right columns
     """
     features = ["a", "b", "c"]
-    fixed_effects = {"fe_a": ["all"]}
+    fixed_effects = ["fe_a", "fe_b"]
     featurizer = Featurizer(features, fixed_effects)
 
-    df_fitting = pd.DataFrame(
+    split_fitting_heldout = 4
+    # fe_a: "c" exists in fitting but not in heldout, "d" exists in heldout but not in fitting
+    # fe_b: "x", "7" and "y" exist in fitting but not in heldout, "z", "w" exist in heldout but not in fitting
+    df = pd.DataFrame(
         {
-            "x": [5, 3, 1, 5],
-            "a": [2, 2, 2, 2],
-            "b": [3, 3, 3, 3],
-            "c": [1, 2, 3, 4],
-            "fe_a": ["a", "a", "b", "c"],
-            "fe_b": ["1", "x", "7", "y"],
+            "a": [5, 3, 1, 5, 2, 2, 2, 2],
+            "b": [2, 2, 2, 2, 3, 3, 3, 3],
+            "c": [3, 3, 3, 3, 1, 2, 3, 4],
+            "d": [1, 2, 3, 4, 5, 3, 1, 5],
+            "fe_a": ["a", "a", "b", "c", "a", "a", "b", "d"],
+            "fe_b": ["1", "x", "7", "y", "1", "z", "z", "w"],
+            "reporting": [True, True, True, True, False, False, False, False],
+            "expected": [True, True, True, True, False, False, False, False],
         }
     )
-    df_heldout = pd.DataFrame(
+
+    df_new = featurizer.prepare_data(df, center_features=False, scale_features=False, add_intercept=True)
+
+    df_fitting = featurizer.filter_to_active_features(df_new[:split_fitting_heldout])
+    df_heldout = featurizer.generate_holdout_data(df_new[split_fitting_heldout:])
+    assert (df_fitting.columns == df_heldout.columns).all()
+
+    assert "a" in df_fitting.columns
+    assert "a" in df_heldout.columns
+    assert "a" in featurizer.features
+    assert "a" in featurizer.active_features
+    assert "a" in featurizer.complete_features
+
+    assert "fe_a" in featurizer.fixed_effect_cols
+    assert "fe_a" in featurizer.fixed_effect_params.keys()
+
+    # a is in fitting and in heldout BUT it's the first and therefore dropped to avoid multicollinearity
+    assert "fe_a_a" not in featurizer.expanded_fixed_effects
+    assert "fe_a_a" not in featurizer.active_fixed_effects
+    assert "fe_a_a" not in featurizer.active_features
+    assert "fe_a_a" not in featurizer.complete_features
+    assert "fe_a_a" not in df_fitting.columns
+    assert "fe_a_a" not in df_heldout.columns
+
+    # b is in fitting and in heldout
+    assert "fe_a_b" in featurizer.expanded_fixed_effects
+    assert "fe_a_b" in featurizer.active_fixed_effects
+    assert "fe_a_b" in featurizer.active_features
+    assert "fe_a_b" in featurizer.complete_features
+    assert "fe_a_b" in df_fitting.columns
+    assert "fe_a_b" in df_heldout.columns
+
+    # c is in fitting but not in heldout
+    assert "fe_a_c" in featurizer.expanded_fixed_effects
+    assert "fe_a_c" in featurizer.active_fixed_effects
+    assert "fe_a_c" in featurizer.active_features
+    assert "fe_a_c" in featurizer.complete_features
+    assert "fe_a_c" in df_fitting.columns
+    assert "fe_a_c" in df_heldout.columns  # should still be in heldout since added manually
+
+    # d is not in fitting but in heldout
+    assert "fe_a_d" in featurizer.expanded_fixed_effects
+    assert "fe_a_d" not in featurizer.active_fixed_effects
+    assert "fe_a_d" not in featurizer.active_features
+    assert "fe_a_d" in featurizer.complete_features
+    assert "fe_a_d" not in df_fitting.columns
+    assert "fe_a_d" not in df_heldout.columns
+
+
+def test_generating_heldout_set():
+    """
+    This test makes sure the heldout set is as expected
+    """
+    features = ["a", "b", "c"]
+    fixed_effects = ["fe_a", "fe_b"]
+    featurizer = Featurizer(features, fixed_effects)
+
+    split_fitting_heldout = 4
+    # fe_a: "c" exists in fitting but not in heldout, "d" exists in heldout but not in fitting
+    # fe_b: "x", "7" and "y" exist in fitting but not in heldout, "z", "w" exist in heldout but not in fitting
+    df = pd.DataFrame(
         {
-            "a": [2, 2, 2, 2],
-            "b": [3, 3, 3, 3],
-            "c": [1, 2, 3, 4],
-            "d": [5, 3, 1, 5],
-            "fe_a": ["a", "a", "b", "d"],
-            "fe_c": ["1", "a", "7", "y"],
+            "a": [5, 3, 1, 5, 2, 2, 2, 2],
+            "b": [2, 2, 2, 2, 3, 3, 3, 3],
+            "c": [3, 3, 3, 3, 1, 2, 3, 4],
+            "d": [1, 2, 3, 4, 5, 3, 1, 5],
+            "fe_a": ["a", "a", "b", "c", "a", "a", "b", "d"],
+            "fe_b": ["1", "x", "7", "y", "1", "z", "z", "w"],
+            "reporting": [True, True, True, True, False, False, False, False],
+            "expected": [True, True, True, True, False, False, False, False],
         }
     )
 
-    featurizer.compute_means_for_centering(df_fitting, df_heldout)
-    # since only a, b and c are "features" specified above we would expect
-    # "x" from df_fitting and "d" from df_heldout to be dropped
-    # similarly we would expect the same from fe_b (since only fe_a is specified as a fixed effect)
-    df_fitting_features = featurizer.featurize_fitting_data(df_fitting)
-    df_heldout_features = featurizer.featurize_heldout_data(df_heldout)
-
-    assert (df_fitting_features.columns == df_heldout_features.columns).all()
-
-    assert "a" in df_fitting_features.columns
-    assert "b" in df_fitting_features.columns
-    assert "c" in df_fitting_features.columns
-    assert "fe_a_a" not in df_fitting_features.columns  # since drop_first is true
-    assert "fe_a_b" in df_fitting_features.columns
-    assert "fe_a_c" in df_fitting_features.columns
-    assert "x" not in df_fitting_features.columns  # not a feature
-    assert "fe_b_1" not in df_fitting_features.columns  # not a fixed effect
-    assert "fe_b_x" not in df_fitting_features.columns  # not a fixed effect
-    assert "fe_c_a" not in df_fitting_features.columns  # not a fixed effect
-
-    assert "a" in df_heldout_features.columns
-    assert "b" in df_heldout_features.columns
-    assert "c" in df_heldout_features.columns
-    assert "fe_a_a" not in df_heldout_features.columns
-    # drop_first is False for heldout, but "fe_a_a" is not in
-    # expanded_fixed_effects because dropped by fitting_data expansion
-    assert "fe_a_b" in df_heldout_features.columns
-    assert "fe_a_c" in df_heldout_features.columns  # this column should have been addded manually
-    assert "fe_a_d" not in df_heldout_features.columns  # not in expanded_fixed_effects since not fitting_datas
-    assert "fe_b_1" not in df_heldout_features.columns  # not a fixed effect
-    assert "fe_b_x" not in df_heldout_features.columns  # not a fixed effect
-    assert "fe_c_a" not in df_heldout_features.columns  # not a fixed effect
+    df_new = featurizer.prepare_data(df, center_features=False, scale_features=False, add_intercept=True)
+
+    df_heldout = featurizer.generate_holdout_data(df_new[split_fitting_heldout:])
+
+    "a" in df_heldout.columns
+    "b" in df_heldout.columns
+    "c" in df_heldout.columns
+    "d" not in df_heldout.columns  # not specified in features
+
+    "fe_a_a" not in df_heldout.columns  # dropped to avoid multicolinearity
+    "fe_a_b" in df_heldout.columns
+    "fe_a_c" in df_heldout.columns
+    "fe_a_d" not in df_heldout.columns  # not an active fixed effect
+
+    assert df_heldout.loc[6, "fe_a_b"] == 1  # since row 7 has an inactive fixed effect
+    assert df_heldout.loc[7, "fe_a_b"] == 1 / 3  # since row 7 has an inactive fixed effect
+    assert df_heldout.loc[7, "fe_a_c"] == 1 / 3  # since row 7 has an inactive fixed effect
+
+    "fe_b_1" not in df_heldout.columns  # dropped to avoid multicolinearity
+    "fe_b_x" in df_heldout.columns
+    "fe_b_z" not in df_heldout.columns  # inactive
+
+    assert df_heldout.loc[6, "fe_a_b"] == 1  # since row 7 has an inactive fixed effect
+    assert df_heldout.loc[7, "fe_a_b"] == 1 / 3  # since row 7 has an inactive fixed effect
+    assert df_heldout.loc[7, "fe_a_c"] == 1 / 3  # since row 7 has an inactive fixed effect
+
+    # element 4 has the dropped fixed effect value in fe_b and so should only have an intercept
+    assert df_heldout.loc[4, "intercept"] == 1
+    assert df_heldout.loc[4, "fe_b_7"] == 0
+    assert df_heldout.loc[4, "fe_b_x"] == 0
+    assert df_heldout.loc[4, "fe_b_y"] == 0
+
+    # row 5 has an inactive fixed effect
+    assert df_heldout.loc[5, "intercept"] == 1
+    assert df_heldout.loc[5, "fe_b_7"] == 1 / 4
+    assert df_heldout.loc[5, "fe_b_x"] == 1 / 4
+    assert df_heldout.loc[5, "fe_b_y"] == 1 / 4
 
 
 def test_expanding_fixed_effects_basic():
     fixed_effects = {"c1": ["all"]}
     featurizer = Featurizer([], fixed_effects)
     df = pd.DataFrame({"c1": ["a", "b", "b", "c"], "c2": ["w", "x", "y", "z"], "c3": [2, 4, 1, 9]})
-    expanded = featurizer._expand_fixed_effects(df, drop_first=True)
+    expanded = featurizer._expand_fixed_effects(df)
     pd.testing.assert_frame_equal(
         expanded.sort_index(axis=1),
         pd.DataFrame(
             {
                 "c2": ["w", "x", "y", "z"],
                 "c3": [2, 4, 1, 9],
+                "c1_a": [1, 0, 0, 0],
                 "c1_b": [0, 1, 1, 0],
                 "c1_c": [0, 0, 0, 1],
                 "c1": ["a", "b", "b", "c"],
@@ -158,7 +223,7 @@ def test_expanding_fixed_effects_basic():
     )
 
     df = pd.DataFrame({"c1": ["a", "b", "b", "c"], "c2": ["w", "x", "y", "z"], "c3": [2, 4, 1, 9]})
-    expanded = featurizer._expand_fixed_effects(df, drop_first=False)
+    expanded = featurizer._expand_fixed_effects(df)
     pd.testing.assert_frame_equal(
         expanded.sort_index(axis=1),
         pd.DataFrame(
@@ -175,14 +240,16 @@ def test_expanding_fixed_effects_basic():
 
     fixed_effects = {"c1": ["all"], "c2": ["all"]}
     featurizer = Featurizer([], fixed_effects)
-    expanded = featurizer._expand_fixed_effects(df, drop_first=True)
+    expanded = featurizer._expand_fixed_effects(df)
     pd.testing.assert_frame_equal(
         expanded.sort_index(axis=1),
         pd.DataFrame(
             {
                 "c3": [2, 4, 1, 9],
+                "c1_a": [1, 0, 0, 0],
                 "c1_b": [0, 1, 1, 0],
                 "c1_c": [0, 0, 0, 1],
+                "c2_w": [1, 0, 0, 0],
                 "c2_x": [0, 1, 0, 0],
                 "c2_y": [0, 0, 1, 0],
                 "c2_z": [0, 0, 0, 1],
@@ -197,7 +264,7 @@ def test_expand_fixed_effects_selective():
     fixed_effects = {"c1": ["a", "b"]}
     featurizer = Featurizer([], fixed_effects)
     df = pd.DataFrame({"c1": ["a", "b", "b", "c"], "c2": ["w", "x", "y", "z"], "c3": [2, 4, 1, 9]})
-    expanded = featurizer._expand_fixed_effects(df, drop_first=True)
+    expanded = featurizer._expand_fixed_effects(df)
     pd.testing.assert_frame_equal(
         expanded.sort_index(axis=1),
         pd.DataFrame(
@@ -206,39 +273,48 @@ def test_expand_fixed_effects_selective():
                 "c3": [2, 4, 1, 9],
                 "c1_a": [1, 0, 0, 0],
                 "c1_b": [0, 1, 1, 0],
+                "c1_other": [0, 0, 0, 1],
                 "c1": ["a", "b", "b", "c"],
             }
         ).sort_index(axis=1),
     )
 
-    expanded = featurizer._expand_fixed_effects(df, drop_first=False)
+    fixed_effects = {"c1": ["a"], "c2": ["w", "x"]}
+    featurizer = Featurizer([], fixed_effects)
+    expanded = featurizer._expand_fixed_effects(df)
     pd.testing.assert_frame_equal(
         expanded.sort_index(axis=1),
         pd.DataFrame(
             {
+                "c1": ["a", "b", "b", "c"],
                 "c2": ["w", "x", "y", "z"],
                 "c3": [2, 4, 1, 9],
                 "c1_a": [1, 0, 0, 0],
-                "c1_b": [0, 1, 1, 0],
-                "c1_other": [0, 0, 0, 1],
-                "c1": ["a", "b", "b", "c"],
+                "c1_other": [0, 1, 1, 1],
+                "c2_other": [0, 0, 1, 1],
+                "c2_w": [1, 0, 0, 0],
+                "c2_x": [0, 1, 0, 0],
             }
         ).sort_index(axis=1),
     )
 
-    fixed_effects = {"c1": ["a"], "c2": ["w", "x"]}
+    fixed_effects = {"c1": ["all"], "c2": ["w", "x"]}
     featurizer = Featurizer([], fixed_effects)
-    expanded = featurizer._expand_fixed_effects(df, drop_first=True)
+    expanded = featurizer._expand_fixed_effects(df)
+
     pd.testing.assert_frame_equal(
         expanded.sort_index(axis=1),
         pd.DataFrame(
             {
+                "c1": ["a", "b", "b", "c"],
+                "c2": ["w", "x", "y", "z"],
                 "c3": [2, 4, 1, 9],
                 "c1_a": [1, 0, 0, 0],
+                "c1_b": [0, 1, 1, 0],
+                "c1_c": [0, 0, 0, 1],
+                "c2_other": [0, 0, 1, 1],
                 "c2_w": [1, 0, 0, 0],
                 "c2_x": [0, 1, 0, 0],
-                "c1": ["a", "b", "b", "c"],
-                "c2": ["w", "x", "y", "z"],
             }
         ).sort_index(axis=1),
     )
@@ -276,10 +352,14 @@ def test_generate_fixed_effects(va_governor_county_data):
     nonreporting_data = combined_data_handler.get_nonreporting_units(99)
 
     featurizer = Featurizer([], {"county_classification": "all"})
-    featurizer.compute_means_for_centering(reporting_data, nonreporting_data)
 
-    reporting_data_features = featurizer.featurize_fitting_data(reporting_data)
-    nonreporting_data_features = featurizer.featurize_heldout_data(nonreporting_data)
+    n_train = reporting_data.shape[0]
+    all_units = pd.concat([reporting_data, nonreporting_data], axis=0)
+
+    x_all = featurizer.prepare_data(all_units, center_features=False, scale_features=False, add_intercept=True)
+
+    reporting_data_features = featurizer.filter_to_active_features(x_all[:n_train])
+    nonreporting_data_features = featurizer.generate_holdout_data(x_all[n_train:])
 
     assert combined_data_handler.data.shape == (133, 32)
 
@@ -292,6 +372,7 @@ def test_generate_fixed_effects(va_governor_county_data):
 
     assert "county_classification" in featurizer.fixed_effect_cols
     assert len(featurizer.expanded_fixed_effects) == 5  # 6 - 1
+    assert len(featurizer.active_fixed_effects) == 5
 
     combined_data_handler = CombinedDataHandler(
         va_governor_county_data,
@@ -302,13 +383,17 @@ def test_generate_fixed_effects(va_governor_county_data):
     )
 
     featurizer = Featurizer([], {"county_classification": ["all"], "county_fips": ["all"]})
-    featurizer.compute_means_for_centering(reporting_data, nonreporting_data)
 
     reporting_data = combined_data_handler.get_reporting_units(99)
     nonreporting_data = combined_data_handler.get_nonreporting_units(99)
 
-    reporting_data_features = featurizer.featurize_fitting_data(reporting_data)
-    nonreporting_data_features = featurizer.featurize_heldout_data(nonreporting_data)
+    n_train = reporting_data.shape[0]
+    all_units = pd.concat([reporting_data, nonreporting_data], axis=0)
+
+    x_all = featurizer.prepare_data(all_units, center_features=False, scale_features=False, add_intercept=True)
+
+    reporting_data_features = featurizer.filter_to_active_features(x_all[:n_train])
+    nonreporting_data_features = featurizer.generate_holdout_data(x_all[n_train:])
 
     assert combined_data_handler.data.shape == (133, 32)
 
@@ -359,10 +444,13 @@ def test_generate_fixed_effects_not_all_reporting(va_governor_county_data):
     nonreporting_data = combined_data_handler.get_nonreporting_units(99)
 
     featurizer = Featurizer([], {"county_fips": ["all"]})
-    featurizer.compute_means_for_centering(reporting_data, nonreporting_data)
+    n_train = reporting_data.shape[0]
+    all_units = pd.concat([reporting_data, nonreporting_data], axis=0)
 
-    reporting_data_features = featurizer.featurize_fitting_data(reporting_data)
-    nonreporting_data_features = featurizer.featurize_heldout_data(nonreporting_data)
+    x_all = featurizer.prepare_data(all_units, center_features=False, scale_features=False, add_intercept=True)
+
+    reporting_data_features = featurizer.filter_to_active_features(x_all[:n_train])
+    nonreporting_data_features = featurizer.generate_holdout_data(x_all[n_train:])
 
     assert combined_data_handler.data.shape == (133, 32)
 
@@ -384,7 +472,8 @@ def test_generate_fixed_effects_not_all_reporting(va_governor_county_data):
     )  # not in here because not in featurizer.complete_features
 
     assert "county_fips" in featurizer.fixed_effect_cols
-    assert len(featurizer.expanded_fixed_effects) == n - 1
+    assert len(featurizer.expanded_fixed_effects) == 133 - 1
+    assert len(featurizer.active_fixed_effects) == n - 1
 
     assert not reporting_data_features["county_fips_51009"].isnull().any()
 
@@ -421,16 +510,21 @@ def test_generate_fixed_effects_mixed_reporting(va_governor_precinct_data):
     nonreporting_data = combined_data_handler.get_nonreporting_units(99)
 
     featurizer = Featurizer([], ["county_fips"])
-    featurizer.compute_means_for_centering(reporting_data, nonreporting_data)
 
-    reporting_data_features = featurizer.featurize_fitting_data(reporting_data)
-    nonreporting_data_features = featurizer.featurize_heldout_data(nonreporting_data)
+    n_train = reporting_data.shape[0]
+    n_test = nonreporting_data.shape[0]
+    all_units = pd.concat([reporting_data, nonreporting_data], axis=0)
+
+    x_all = featurizer.prepare_data(all_units, center_features=False, scale_features=False, add_intercept=True)
+
+    reporting_data_features = featurizer.filter_to_active_features(x_all[:n_train])
+    nonreporting_data_features = featurizer.generate_holdout_data(x_all[n_train:])
 
     assert combined_data_handler.data.shape == (2360, 32)
 
     n_expected_columns = 7  # when n = 100 we get to county 51013 (minus dropped fixed effect, plus intercept)
-    assert reporting_data_features.shape == (n, n_expected_columns)
-    assert nonreporting_data_features.shape == (2360 - n, n_expected_columns)
+    assert reporting_data_features.shape == (n_train, n_expected_columns)  # use n_train since dropping columns
+    assert nonreporting_data_features.shape == (n_test, n_expected_columns)
 
     assert "county_fips_51001" not in reporting_data_features.columns  # dropped from get_dummies because first
     assert "county_fips_51001" not in nonreporting_data_features.columns  # therefore not added manually
@@ -451,4 +545,4 @@ def test_generate_fixed_effects_mixed_reporting(va_governor_precinct_data):
     )  # not in here because not in featurizer.complete_features
 
     assert "county_fips" in featurizer.fixed_effect_cols
-    assert len(featurizer.expanded_fixed_effects) == 7 - 1
+    assert len(featurizer.expanded_fixed_effects) == 133 - 1

From 740aca28c3b22f8cea17ad24f891f81610674413 Mon Sep 17 00:00:00 2001
From: lbvienna <lenny.bronner@gmail.com>
Date: Fri, 25 Aug 2023 19:18:27 +0200
Subject: [PATCH 2/9] small bug fix

---
 src/elexmodel/handlers/data/Featurizer.py | 2 +-
 tests/handlers/test_featurizer.py         | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/elexmodel/handlers/data/Featurizer.py b/src/elexmodel/handlers/data/Featurizer.py
index fd8ddd11..f26b3301 100644
--- a/src/elexmodel/handlers/data/Featurizer.py
+++ b/src/elexmodel/handlers/data/Featurizer.py
@@ -93,7 +93,7 @@ def prepare_data(
                 if x.startswith(tuple([fixed_effect + "_" for fixed_effect in self.fixed_effect_cols]))
             ]
 
-            df_fitting = df[(df.reporting is True) & (df.expected is True)]
+            df_fitting = df[(df.reporting) & (df.expected)]
             # get the indices of all expanded fixed effects in the fitting data (active fixed effects + the fixed effect we will drop for multicolinearity)
             active_fixed_effect_boolean_df = df_fitting[all_expanded_fixed_effects].sum(axis=0) > 0
             # get the names of those fixed effects, since we we will want to know which fixed effect was dropped
diff --git a/tests/handlers/test_featurizer.py b/tests/handlers/test_featurizer.py
index 991aca48..8c3edda1 100644
--- a/tests/handlers/test_featurizer.py
+++ b/tests/handlers/test_featurizer.py
@@ -91,7 +91,6 @@ def test_column_names():
             "expected": [True, True, True, True, False, False, False, False],
         }
     )
-
     df_new = featurizer.prepare_data(df, center_features=False, scale_features=False, add_intercept=True)
 
     df_fitting = featurizer.filter_to_active_features(df_new[:split_fitting_heldout])

From c811e9c2c76456cc692f44756a89fa7e59058352 Mon Sep 17 00:00:00 2001
From: lbvienna <lenny.bronner@gmail.com>
Date: Wed, 30 Aug 2023 18:41:52 +0200
Subject: [PATCH 3/9] updated comments

---
 src/elexmodel/handlers/data/Featurizer.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/elexmodel/handlers/data/Featurizer.py b/src/elexmodel/handlers/data/Featurizer.py
index f26b3301..e7848dd7 100644
--- a/src/elexmodel/handlers/data/Featurizer.py
+++ b/src/elexmodel/handlers/data/Featurizer.py
@@ -67,7 +67,12 @@ def prepare_data(
     ) -> pd.DataFrame:
         """
         Prepares features.
-        Adds dummy variables for fixed effects. Also includes centering, scaling continuous covariates and adding intercept.
+        Adds dummy variables for fixed effects and determines which fixed effects are expanded and active.
+        If center_features is true, we subtract each feature's column mean from its values, so every column averages zero;
+            this allows us to interpret the intercept as the mean response given all other covariates at their average value.
+        If scale_features is true, we divide each feature by its standard deviation, which gives them all the same scale;
+            this can improve the convergence of optimization algorithms.
+        If add_intercept is true, an intercept column is added to the features and one fixed effect value is dropped.
         """
         df = df.copy()  # create copy so we can do things to the values
         if center_features:

From 9345e370509dadc4e0241c10a0443578c8eeac1a Mon Sep 17 00:00:00 2001
From: lbvienna <lenny.bronner@gmail.com>
Date: Wed, 30 Aug 2023 20:15:25 +0200
Subject: [PATCH 4/9] linter

---
 src/elexmodel/models/BaseElectionModel.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/src/elexmodel/models/BaseElectionModel.py b/src/elexmodel/models/BaseElectionModel.py
index 967a1f28..818de945 100644
--- a/src/elexmodel/models/BaseElectionModel.py
+++ b/src/elexmodel/models/BaseElectionModel.py
@@ -58,7 +58,7 @@ def get_unit_predictions(self, reporting_units, nonreporting_units, estimand, **
         Produces unit level predictions. Fits quantile regression to reporting data, applies
         it to nonreporting data. The features are specified in model_settings.
         """
-        n_train = reporting_units.shape[0]
+        self.n_train = reporting_units.shape[0]
         n_test = nonreporting_units.shape[0]
         all_units = pd.concat([reporting_units, nonreporting_units], axis=0)
 
@@ -67,8 +67,10 @@ def get_unit_predictions(self, reporting_units, nonreporting_units, estimand, **
             all_units, center_features=True, scale_features=False, add_intercept=self.add_intercept
         )
 
-        reporting_units_features = featurizer.filter_to_active_features(x_all[:n_train])
-        nonreporting_units_features = featurizer.generate_holdout_data(x_all[n_train : n_train + n_test])  # noqa: E203
+        reporting_units_features = featurizer.filter_to_active_features(x_all[: self.n_train])
+        nonreporting_units_features = featurizer.generate_holdout_data(
+            x_all[self.n_train : self.n_train + n_test]  # noqa: E203
+        )
 
         weights = reporting_units[f"last_election_results_{estimand}"]
         reporting_units_residuals = reporting_units[f"residuals_{estimand}"]
@@ -207,7 +209,7 @@ def get_unit_prediction_interval_bounds(self, reporting_units, nonreporting_unit
         upper_bound = (1 + alpha) / 2
         lower_bound = (1 - alpha) / 2
 
-        train_rows = math.floor(reporting_units.shape[0] * conf_frac)
+        train_rows = math.floor(self.n_train * conf_frac)
         train_data = reporting_units_shuffled[:train_rows]
 
         # the fixed effects in train_data will be a subset of the fixed effect of reporting_units since all
@@ -238,9 +240,11 @@ def get_unit_prediction_interval_bounds(self, reporting_units, nonreporting_unit
         # bounds for nonreporting data.
         conformalization_data = reporting_units_shuffled[train_rows:]
 
-        # all_data starts with reporting_units_shuffled, so the rows between train_rows and n_reporting_units are the
+        # all_data starts with reporting_units_shuffled, so the rows between train_rows and n_train are the
         # conformalization set
-        conformalization_data_features = interval_featurizer.generate_holdout_data(x_all[train_rows:n_reporting_units])
+        conformalization_data_features = interval_featurizer.generate_holdout_data(
+            x_all[train_rows : self.n_train]  # noqa: E203
+        )
 
         # we are interested in f(X) - r
         # since later conformity scores care about deviation of bounds from residuals

From 8387d41af37796ac65a4b8004b3d0f6849517c55 Mon Sep 17 00:00:00 2001
From: Leonard Bronner <6954332+lennybronner@users.noreply.github.com>
Date: Wed, 30 Aug 2023 20:16:15 +0200
Subject: [PATCH 5/9] Update src/elexmodel/models/BaseElectionModel.py

Co-authored-by: Jen Haskell <jchaskell@users.noreply.github.com>
---
 src/elexmodel/models/BaseElectionModel.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/elexmodel/models/BaseElectionModel.py b/src/elexmodel/models/BaseElectionModel.py
index 818de945..5ad7c701 100644
--- a/src/elexmodel/models/BaseElectionModel.py
+++ b/src/elexmodel/models/BaseElectionModel.py
@@ -261,7 +261,7 @@ def get_unit_prediction_interval_bounds(self, reporting_units, nonreporting_unit
 
         # apply lower/upper models to nonreporting data
         # since nonreporting_units is the second dataframe in a_all, all units after n_reporting_units are nonreporting
-        # note: the features used may be different fromt the median predictions, but this guarantees that the features
+        # note: the features used may be different from the median predictions, but this guarantees that the features
         # are the same accross train_data, conformalization_data and nonreporting_units
         nonreporting_units_features = interval_featurizer.generate_holdout_data(x_all[n_reporting_units:])
         nonreporting_lower_bounds = lower_qr.predict(nonreporting_units_features)

From c80156d0afc12935a9ce7af82ee070c38461dc5c Mon Sep 17 00:00:00 2001
From: Leonard Bronner <6954332+lennybronner@users.noreply.github.com>
Date: Wed, 30 Aug 2023 23:30:42 +0200
Subject: [PATCH 6/9] Update tests/handlers/test_featurizer.py

Co-authored-by: Jen Haskell <jchaskell@users.noreply.github.com>
---
 tests/handlers/test_featurizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/handlers/test_featurizer.py b/tests/handlers/test_featurizer.py
index 8c3edda1..c84b1b8f 100644
--- a/tests/handlers/test_featurizer.py
+++ b/tests/handlers/test_featurizer.py
@@ -177,7 +177,7 @@ def test_generating_heldout_set():
     "fe_a_c" in df_heldout.columns
     "fe_a_d" not in df_heldout.columns  # not an active fixed effect
 
-    assert df_heldout.loc[6, "fe_a_b"] == 1  # since row 7 has an inactive fixed effect
+    assert df_heldout.loc[6, "fe_a_b"] == 1  # since row 6 has an active fixed effect
     assert df_heldout.loc[7, "fe_a_b"] == 1 / 3  # since row 7 has an inactive fixed effect
     assert df_heldout.loc[7, "fe_a_c"] == 1 / 3  # since row 7 has an inactive fixed effect
 

From e86c6138c218beb5038dcbc7ddd1147000673f29 Mon Sep 17 00:00:00 2001
From: Leonard Bronner <6954332+lennybronner@users.noreply.github.com>
Date: Wed, 30 Aug 2023 23:31:28 +0200
Subject: [PATCH 7/9] Update tests/handlers/test_featurizer.py

Co-authored-by: Jen Haskell <jchaskell@users.noreply.github.com>
---
 tests/handlers/test_featurizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/handlers/test_featurizer.py b/tests/handlers/test_featurizer.py
index c84b1b8f..2b235195 100644
--- a/tests/handlers/test_featurizer.py
+++ b/tests/handlers/test_featurizer.py
@@ -185,7 +185,7 @@ def test_generating_heldout_set():
     "fe_b_x" in df_heldout.columns
     "fe_b_z" not in df_heldout.columns  # inactive
 
-    assert df_heldout.loc[6, "fe_a_b"] == 1  # since row 7 has an inactive fixed effect
+    assert df_heldout.loc[6, "fe_a_b"] == 1  # since row 6 has an active fixed effect
     assert df_heldout.loc[7, "fe_a_b"] == 1 / 3  # since row 7 has an inactive fixed effect
     assert df_heldout.loc[7, "fe_a_c"] == 1 / 3  # since row 7 has an inactive fixed effect
 

From 8c85b7a72b3b9e0429da2c71a907e8b078407a9c Mon Sep 17 00:00:00 2001
From: lbvienna <lenny.bronner@gmail.com>
Date: Fri, 1 Sep 2023 20:04:47 +0200
Subject: [PATCH 8/9] updated comment

---
 src/elexmodel/handlers/data/Featurizer.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/src/elexmodel/handlers/data/Featurizer.py b/src/elexmodel/handlers/data/Featurizer.py
index e7848dd7..e0b1a089 100644
--- a/src/elexmodel/handlers/data/Featurizer.py
+++ b/src/elexmodel/handlers/data/Featurizer.py
@@ -169,6 +169,15 @@ def generate_holdout_data(self, df: pd.DataFrame) -> pd.DataFrame:
 
             # set the values for active fixed effect in rows that have inactive fixed effect to be 1 / (n + 1)
             # rows that have an inactive fixed effect value need to receive the treat of the average fixed effects
-            # NOTE: aren't we now applying 1 * the dropped fixed effect and 1 / (n + 1) times the other fixed effects?
             df.loc[rows_w_inactive_fixed_effects, fe_active_fixed_effects] = 1 / (len(fe_active_fixed_effects) + 1)
+            # This is correct because even rows with active fixed effects have an interept columns, so the coefficient
+            # of the fixed effect value column is actually the *difference* between the dropped column (for which the intercept is
+            # the stand in and the fixed effect column.
+            # Another way to think about this is that for a fixed effect value that is present the fixed effect estimate is: 
+            # if there are three fixed effects r, u and s where s is dropped.
+            # beta_0 + beta_r * indic{r}
+            # beta_0 + beta_u * indic{u}
+            # and the fixed effect estimate for the dropped value is beta_0, so the average is:
+            # beta_0 + (beta_r / 3) + (beta_u / 3)
+
         return self.filter_to_active_features(df)

From 20d8ba98c987747fc4274ebfb9d8a31cdb5e3686 Mon Sep 17 00:00:00 2001
From: lbvienna <lenny.bronner@gmail.com>
Date: Fri, 1 Sep 2023 20:11:28 +0200
Subject: [PATCH 9/9] updated linter

---
 src/elexmodel/handlers/data/Featurizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/elexmodel/handlers/data/Featurizer.py b/src/elexmodel/handlers/data/Featurizer.py
index e0b1a089..9937e00b 100644
--- a/src/elexmodel/handlers/data/Featurizer.py
+++ b/src/elexmodel/handlers/data/Featurizer.py
@@ -173,7 +173,7 @@ def generate_holdout_data(self, df: pd.DataFrame) -> pd.DataFrame:
             # This is correct because even rows with active fixed effects have an interept columns, so the coefficient
             # of the fixed effect value column is actually the *difference* between the dropped column (for which the intercept is
             # the stand in and the fixed effect column.
-            # Another way to think about this is that for a fixed effect value that is present the fixed effect estimate is: 
+            # Another way to think about this is that for a fixed effect value that is present the fixed effect estimate is:
             # if there are three fixed effects r, u and s where s is dropped.
             # beta_0 + beta_r * indic{r}
             # beta_0 + beta_u * indic{u}