Skip to content

Commit

Permalink
Merge pull request #69 from washingtonpost/updates-to-featurizer
Browse files Browse the repository at this point in the history
Updates to Featurizer
  • Loading branch information
lennybronner authored Sep 15, 2023
2 parents 60f1e1f + 704d8d1 commit 8bff4bc
Show file tree
Hide file tree
Showing 5 changed files with 400 additions and 252 deletions.
9 changes: 6 additions & 3 deletions src/elexmodel/handlers/data/CombinedData.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,8 @@ def get_reporting_units(self, percent_reporting_threshold, features_to_normalize
reporting_units[f"results_{estimand}"] - reporting_units[f"last_election_results_{estimand}"]
) / reporting_units[f"last_election_results_{estimand}"]

reporting_units["reporting"] = 1
reporting_units["reporting"] = int(1)
reporting_units["expected"] = True

return reporting_units

Expand All @@ -75,7 +76,8 @@ def get_nonreporting_units(self, percent_reporting_threshold, features_to_normal
drop=True
)

nonreporting_units["reporting"] = 0
nonreporting_units["reporting"] = int(0)
nonreporting_units["expected"] = True

return nonreporting_units

Expand Down Expand Up @@ -128,7 +130,8 @@ def get_unexpected_units(self, percent_reporting_threshold, aggregates):
self._get_district_from_geographic_unit_fips
)

unexpected_units["reporting"] = 1
unexpected_units["reporting"] = int(1)
unexpected_units["expected"] = False

return unexpected_units

Expand Down
229 changes: 130 additions & 99 deletions src/elexmodel/handlers/data/Featurizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,13 @@ class Featurizer:
Featurizer. Normalizes features, adds intercept, expands fixed effects
"""

def __init__(self, features, fixed_effects):
def __init__(self, features: list, fixed_effects: list):
self.features = features
# fixed effects can be a list, in which case every value of a fixed effect gets its own column
if isinstance(fixed_effects, list):
self.fixed_effect_cols = fixed_effects
self.fixed_effect_params = {fe: ["all"] for fe in fixed_effects}
# fixed effects can be a dictionary from fixed effect to values that get their own column (or the string all, if we want all values)
else:
self.fixed_effect_cols = list(fixed_effects.keys())
self.fixed_effect_params = {}
Expand All @@ -21,132 +23,161 @@ def __init__(self, features, fixed_effects):
else:
self.fixed_effect_params[fe] = params

# we differentiate between expanded fixed effects and active fixed effect values
# expanded fixed effects are those fixed effect values that appear in any part of
# the data (fitting or heldout) excluding those that have been dropped to avoid
# multicollinearity when fitting.
self.expanded_fixed_effects = []
self.complete_features = None
self.column_means = None

def compute_means_for_centering(self, *arg):
"""
Computes and saves the column mean of pandas dataframe passed as args.
This is used for centering.
"""
data = pd.concat(arg)
self.column_means = data[self.features].mean()

def _center_features(self, df):
"""
Centers the features. This changes the interpretation of the intercept coefficient
from conditional mean given covariates = 0, to conditional mean given covariates are
their average value
"""
df[self.features] = df[self.features] - self.column_means
# complete features are features + expanded fixed effects
self.complete_features = []

def _add_intercept(self, df):
df["intercept"] = 1
# active fixed effects are those that appear in the fitting data (ie. ones for which
# the model fitting computes a coefficient) but excluding those that we drop manually
# to avoid multicollinearity when fitting
self.active_fixed_effects = []
# active features are features + active fixed effects
self.active_features = []

def _expand_fixed_effects(self, df: pd.DataFrame, drop_first: bool) -> pd.DataFrame:
def _expand_fixed_effects(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Convert fixed effect columns into dummy variables.
"""
df = df.copy()
# we want to keep the original fixed effect columns since we may need them later for aggregation (ie. county fixed effect)
original_fixed_effect_columns = df[self.fixed_effect_cols]
# set non-included values to other as needed
fe_df = df.copy()
# set non-included values to 'other' as needed since we don't want their values to get a dummy variable
for fe, params in self.fixed_effect_params.items():
if "all" not in params:
fe_df[fe] = np.where(~fe_df[fe].isin(params), "other", fe_df[fe])
df[fe] = np.where(~df[fe].isin(params), "other", df[fe])

expanded_fixed_effects = pd.get_dummies(
fe_df, columns=self.fixed_effect_cols, prefix=self.fixed_effect_cols, prefix_sep="_", dtype=np.int64
df, columns=self.fixed_effect_cols, prefix=self.fixed_effect_cols, prefix_sep="_", dtype=np.int64
)

# drop first column or "other" column if drop_first is true
cols_to_drop = []
if drop_first:
for fixed_effect in self.fixed_effect_cols:
relevant_cols = [col for col in expanded_fixed_effects.columns if col.startswith(fixed_effect)]
if f"{fixed_effect}_other" in relevant_cols:
cols_to_drop.append(f"{fixed_effect}_other")
else:
cols_to_drop.append(relevant_cols[0])
return pd.concat([original_fixed_effect_columns, expanded_fixed_effects], axis=1)

# we concatenate the dummy variables with the original fixed effects, since we need the original fixed
# effect columns for aggregation.
return pd.concat([original_fixed_effect_columns, expanded_fixed_effects.drop(cols_to_drop, axis=1)], axis=1)

def featurize_fitting_data(self, fitting_data, center_features=True, add_intercept=True):
def _get_categories_for_fe(self, list_: list, fe: str) -> list:
"""
Featurize the data that the model is fitted on.
In our case fitting_data is either the reporting_units (when fitting a model for the point predictions)
or training_data (when fitting the model for the prediction intervals)
Return list of fixed effects values for a given fixed effect
"""
# make copy of fitting_data, since we do not want to change the original data
new_fitting_data = fitting_data.copy()
self.center_features = center_features
self.add_intercept = add_intercept

if self.center_features:
self._center_features(new_fitting_data)
return [x for x in list_ if x.startswith(fe)]

self.complete_features = []
if self.add_intercept:
def prepare_data(
self, df: pd.DataFrame, center_features: bool = True, scale_features: bool = True, add_intercept: bool = True
) -> pd.DataFrame:
"""
Prepares features.
Adds dummy variables for fixed effects, also determines which fixed effects are expanded and active.
if center_features is true we subtract the features by their average column value, which sets the average column value to zero
this allows us to interpret the intercept as the mean response given all other covariates at their average value
if scale_features is true we divide the features by their standard deviation, which gives them all the same scale
this can improve the convergence of optimization algorithms
if add_intercept is true an intercept column is added to the features and one fixed effect value is dropped
"""
df = df.copy() # create copy so we can do things to the values
if center_features:
df[self.features] -= df[self.features].mean()
if scale_features:
# this expects there to be some variation in the data, otherwise we are dividing by zero
df[self.features] /= df[self.features].std()
if add_intercept:
self.complete_features += ["intercept"]
self._add_intercept(new_fitting_data)
self.active_features += ["intercept"]
df["intercept"] = 1

if len(self.fixed_effect_cols) > 0:
# drop_first is True for fitting_data (e.g. reporting_units) since we want to avoid the design matrix with
# expanded fixed effects to be linearly dependent
new_fitting_data = self._expand_fixed_effects(new_fitting_data, drop_first=True)
# we save the expanded fixed effects to be able to add fixed effects that are
# not in the heldout_data (nonreporting_units) as a zero column and to be able
# to specify the order of the expanded fixed effect when fitting the model
self.expanded_fixed_effects = [
df = self._expand_fixed_effects(df)

# we save the expanded fixed effects to be able to add a zero column for those
# fixed effect values if they are not in the heldout_data (nonreporting units).
# Also we can use this to guarantee the order of the fixed effect columns
# when fitting the model
all_expanded_fixed_effects = [
x
for x in new_fitting_data.columns
for x in df.columns
if x.startswith(tuple([fixed_effect + "_" for fixed_effect in self.fixed_effect_cols]))
]

df_fitting = df[(df.reporting) & (df.expected)]
# get the indices of all expanded fixed effects in the fitting data (active fixed effects + the fixed effect we will drop for multicollinearity)
active_fixed_effect_boolean_df = df_fitting[all_expanded_fixed_effects].sum(axis=0) > 0
# get the names of those fixed effects, since we will want to know which fixed effect was dropped
all_active_fixed_effects = np.asarray(all_expanded_fixed_effects)[active_fixed_effect_boolean_df]

# if we add an intercept we need to drop a value/column per fixed effect in order to avoid multicollinearity.
# the intercept column is now a stand-in for the dropped fixed effect value/column
if add_intercept:
active_fixed_effects = (
[]
) # fixed effects that exist in the fitting_data (excluding one dropped column to avoid multicolinearity)
intercept_column = (
[]
) # we need to save the fixed effect categories that the intercept is now standing in for
# we want to drop one value/column per fixed effect to avoid multicollinearity
for fe in self.fixed_effect_cols:
# grab the potentially active fixed effect names for this fixed effect
fe_fixed_effect_filter = self._get_categories_for_fe(all_active_fixed_effects, fe)
# drop the first potentially active fixed effect
active_fixed_effects.extend(fe_fixed_effect_filter[1:])
# save the name of the fixed effect that we dropped
intercept_column.append(fe_fixed_effect_filter[0])

self.active_fixed_effects = active_fixed_effects
self.intercept_column = intercept_column
# expanded fixed effects do not include the ones that we dropped to avoid multicollinearity
self.expanded_fixed_effects = [x for x in all_expanded_fixed_effects if x not in intercept_column]
else:
self.active_fixed_effects = all_active_fixed_effects
self.expanded_fixed_effects = all_expanded_fixed_effects

# all features that the model will be fit on
# these are all the features + the expanded fixed effects (so all fixed effect values in the complete data excluding the ones dropped for multicollinearity)
self.complete_features += self.features + self.expanded_fixed_effects
self.active_features += self.features + self.active_fixed_effects
df = df[self.complete_features]

return new_fitting_data[self.complete_features]
return df

def featurize_heldout_data(self, heldout_data):
def filter_to_active_features(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Featurize the data that the model will be applied on.
In our case the heldout_data is either the nonreporting_units
(when applying the model for the point predictions)
or conformalization_data/nonreporting_units
(when applying the model for the prediction intervals)
Get active features (ie. features + active fixed effects)
"""
new_heldout_data = heldout_data.copy()
return df[self.active_features]

if self.center_features:
self._center_features(new_heldout_data)

if self.add_intercept:
self._add_intercept(new_heldout_data)

if len(self.fixed_effect_cols) > 0:
missing_expanded_fixed_effects = []
new_heldout_data = self._expand_fixed_effects(new_heldout_data, drop_first=False)
# if all units from one fixed effect are reporting they will not appear in the heldout_data
# (e.g. nonreporting_units) and won't get a column when we expand the fixed effects
# on that dataframe. Therefore we add those columns with zero fixed effects manually.
# As an example, if we are running a county model using state fixed effects, and
# all of Delaware's counties are reporting, then no Delaware county will be in
# heldout_data (nonreporting_units), as a result there will be no column for Delaware
# in the expanded fixed effects of heldout_data (nonreporting_units).
for expanded_fixed_effect in self.expanded_fixed_effects:
if expanded_fixed_effect not in new_heldout_data.columns:
missing_expanded_fixed_effects.append(expanded_fixed_effect)

missing_expanded_fixed_effects_df = pd.DataFrame(
np.zeros((new_heldout_data.shape[0], len(missing_expanded_fixed_effects))),
columns=missing_expanded_fixed_effects,
)
# if we use this method to add the missing expanded fixed effects because doing it manually
# ie. new_heldout_data[expanded_fixed_effect] = 0
# can throw a fragmentation warning when there are many missing fixed effects.
new_heldout_data = new_heldout_data.join(missing_expanded_fixed_effects_df)

return new_heldout_data[self.complete_features]
def generate_holdout_data(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Generate fixed effects for the holdout data (ie. data that we will predict on)
"""
df = df.copy()

# if a unit has an inactive fixed effect value for some fixed effect category we need
# to insert 1 / (number of fixed effect values) into each active fixed effect value for that unit
# if we were to leave them as zero, then the model would apply the dropped fixed effect
# value coefficient (since this is now what the intercept stands in for)
# instead we want to apply all active fixed effect coefficients equally

# get inactive fixed effects (ie expanded fixed effects that are not active)
# these are fixed effects that exist only in the holdout set (ie. we do not have a covariate for them)
inactive_fixed_effects = [x for x in self.expanded_fixed_effects if x not in self.active_fixed_effects]
for fe in self.fixed_effect_cols:
# active fixed effect values for this fixed effect
fe_active_fixed_effects = self._get_categories_for_fe(self.active_fixed_effects, fe)
# inactive fixed effect values for this fixed effect
fe_inactive_fixed_effects = self._get_categories_for_fe(inactive_fixed_effects, fe)
# get rows that have an inactive fixed effect
rows_w_inactive_fixed_effects = df[fe_inactive_fixed_effects].sum(axis=1) > 0

# set the values for active fixed effect in rows that have inactive fixed effect to be 1 / (n + 1)
# rows that have an inactive fixed effect value need to receive the treatment of the average fixed effects
df.loc[rows_w_inactive_fixed_effects, fe_active_fixed_effects] = 1 / (len(fe_active_fixed_effects) + 1)
# This is correct because even rows with active fixed effects have an intercept column, so the coefficient
# of the fixed effect value column is actually the *difference* between the dropped column (for which the intercept is
# the stand-in) and the fixed effect column.
# Another way to think about this is that for a fixed effect value that is present the fixed effect estimate is:
# if there are three fixed effects r, u and s where s is dropped.
# beta_0 + beta_r * indic{r}
# beta_0 + beta_u * indic{u}
# and the fixed effect estimate for the dropped value is beta_0, so the average is:
# beta_0 + (beta_r / 3) + (beta_u / 3)

return self.filter_to_active_features(df)
3 changes: 3 additions & 0 deletions src/elexmodel/models/BaseElectionModel.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,3 +171,6 @@ def get_coefficients(self) -> dict:
These coefficients are for the point prediction only.
"""
return self.features_to_coefficients

def get_national_summary_estimates(self, nat_sum_data_dict, called_states, base_to_add):
raise NotImplementedError()
Loading

0 comments on commit 8bff4bc

Please sign in to comment.