Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updates to Featurizer #69

Merged
merged 11 commits into from
Sep 15, 2023
9 changes: 6 additions & 3 deletions src/elexmodel/handlers/data/CombinedData.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,8 @@ def get_reporting_units(self, percent_reporting_threshold, features_to_normalize
reporting_units[f"results_{estimand}"] - reporting_units[f"last_election_results_{estimand}"]
) / reporting_units[f"last_election_results_{estimand}"]

reporting_units["reporting"] = 1
reporting_units["reporting"] = int(1)
jchaskell marked this conversation as resolved.
Show resolved Hide resolved
reporting_units["expected"] = True
dmnapolitano marked this conversation as resolved.
Show resolved Hide resolved

return reporting_units

Expand All @@ -68,7 +69,8 @@ def get_nonreporting_units(self, percent_reporting_threshold, features_to_normal
drop=True
)

nonreporting_units["reporting"] = 0
nonreporting_units["reporting"] = int(0)
nonreporting_units["expected"] = True

return nonreporting_units

Expand Down Expand Up @@ -121,7 +123,8 @@ def get_unexpected_units(self, percent_reporting_threshold, aggregates):
self._get_district_from_geographic_unit_fips
)

unexpected_units["reporting"] = 1
unexpected_units["reporting"] = int(1)
unexpected_units["expected"] = False

return unexpected_units

Expand Down
215 changes: 116 additions & 99 deletions src/elexmodel/handlers/data/Featurizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,13 @@ class Featurizer:
Featurizer. Normalizes features, adds intercept, expands fixed effects
"""

def __init__(self, features, fixed_effects):
def __init__(self, features: list, fixed_effects: list):
lennybronner marked this conversation as resolved.
Show resolved Hide resolved
self.features = features
# fixed effects can be a list, in which case every value of a fixed effect gets its own column
if isinstance(fixed_effects, list):
self.fixed_effect_cols = fixed_effects
self.fixed_effect_params = {fe: ["all"] for fe in fixed_effects}
# fixed effects can be a dictionary from fixed effect to values that get their own column (or the string all, if we want all values)
else:
self.fixed_effect_cols = list(fixed_effects.keys())
self.fixed_effect_params = {}
Expand All @@ -21,132 +23,147 @@ def __init__(self, features, fixed_effects):
else:
self.fixed_effect_params[fe] = params

# we differentiate between expanded fixed effects and active fixed effect values
# expanded fixed effects are those fixed effect values that appear in any part of
# the data (fitting or heldout) excluding those that have been dropped to avoid
# multicollinearity when fitting.
self.expanded_fixed_effects = []
self.complete_features = None
self.column_means = None

def compute_means_for_centering(self, *arg):
"""
Computes and saves the column mean of pandas dataframe passed as args.
This is used for centering.
"""
data = pd.concat(arg)
self.column_means = data[self.features].mean()

def _center_features(self, df):
"""
Centers the features. This changes the interpretation of the intercept coefficient
from conditional mean given covariates = 0, to conditional mean given covariates are
their average value
"""
df[self.features] = df[self.features] - self.column_means
# complete features are features + expanded fixed effects
self.complete_features = []

def _add_intercept(self, df):
df["intercept"] = 1
# active fixed effects are those that appear in the fitting data (ie. ones for which
# the model fitting computes a coefficient) but excluding those that we drop manually
# to avoid multicollinearity when fitting
self.active_fixed_effects = []
# active features are features + active fixed effects
self.active_features = []

def _expand_fixed_effects(self, df: pd.DataFrame, drop_first: bool) -> pd.DataFrame:
def _expand_fixed_effects(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Convert fixed effect columns into dummy variables.
"""
df = df.copy()
# we want to keep the original fixed effect columns since we may need them later for aggregation (ie. county fixed effect)
original_fixed_effect_columns = df[self.fixed_effect_cols]
# set non-included values to other as needed
fe_df = df.copy()
# set non-included values to 'other' as needed since we don't want their values to get a dummy variable
for fe, params in self.fixed_effect_params.items():
if "all" not in params:
fe_df[fe] = np.where(~fe_df[fe].isin(params), "other", fe_df[fe])
df[fe] = np.where(~df[fe].isin(params), "other", df[fe])

expanded_fixed_effects = pd.get_dummies(
fe_df, columns=self.fixed_effect_cols, prefix=self.fixed_effect_cols, prefix_sep="_", dtype=np.int64
df, columns=self.fixed_effect_cols, prefix=self.fixed_effect_cols, prefix_sep="_", dtype=np.int64
)

# drop first column or "other" column if drop_first is true
cols_to_drop = []
if drop_first:
jchaskell marked this conversation as resolved.
Show resolved Hide resolved
for fixed_effect in self.fixed_effect_cols:
relevant_cols = [col for col in expanded_fixed_effects.columns if col.startswith(fixed_effect)]
if f"{fixed_effect}_other" in relevant_cols:
cols_to_drop.append(f"{fixed_effect}_other")
else:
cols_to_drop.append(relevant_cols[0])
return pd.concat([original_fixed_effect_columns, expanded_fixed_effects], axis=1)

# we concatenate the dummy variables with the original fixed effects, since we need the original fixed
# effect columns for aggregation.
return pd.concat([original_fixed_effect_columns, expanded_fixed_effects.drop(cols_to_drop, axis=1)], axis=1)

def featurize_fitting_data(self, fitting_data, center_features=True, add_intercept=True):
def _get_categories_for_fe(self, list_: list, fe: str) -> list:
"""
Featurize the data that the model is fitted on.
In our case fitting_data is either the reporting_units (when fitting a model for the point predictions)
or training_data (when fitting the model for the prediction intervals)
Return list of fixed effects values for a given fixed effect
"""
# make copy of fitting_data, since we do not want to change the original data
new_fitting_data = fitting_data.copy()
self.center_features = center_features
self.add_intercept = add_intercept

if self.center_features:
self._center_features(new_fitting_data)
return [x for x in list_ if x.startswith(fe)]

self.complete_features = []
if self.add_intercept:
def prepare_data(
self, df: pd.DataFrame, center_features: bool = True, scale_features: bool = True, add_intercept: bool = True
) -> pd.DataFrame:
"""
Prepares features.
jchaskell marked this conversation as resolved.
Show resolved Hide resolved
Adds dummy variables for fixed effects. Also includes centering, scaling continuous covariates and adding intercept.
"""
df = df.copy() # create copy so we can do things to the values
if center_features:
df[self.features] -= df[self.features].mean()
if scale_features:
# this expects there to be some variation in the data, otherwise we are dividing by zero
df[self.features] /= df[self.features].std()
if add_intercept:
self.complete_features += ["intercept"]
self._add_intercept(new_fitting_data)
self.active_features += ["intercept"]
df["intercept"] = 1

if len(self.fixed_effect_cols) > 0:
# drop_first is True for fitting_data (e.g. reporting_units) since we want to avoid the design matrix with
# expanded fixed effects to be linearly dependent
new_fitting_data = self._expand_fixed_effects(new_fitting_data, drop_first=True)
# we save the expanded fixed effects to be able to add fixed effects that are
# not in the heldout_data (nonreporting_units) as a zero column and to be able
# to specify the order of the expanded fixed effect when fitting the model
self.expanded_fixed_effects = [
df = self._expand_fixed_effects(df)

# we save the expanded fixed effects to be able to add a zero column for those
# fixed effect values if they are not in the heldout_data (nonreporting units).
# Also we can use this to guarantee the order of the fixed effect columns
# when fitting the model
all_expanded_fixed_effects = [
x
for x in new_fitting_data.columns
for x in df.columns
if x.startswith(tuple([fixed_effect + "_" for fixed_effect in self.fixed_effect_cols]))
]

df_fitting = df[(df.reporting) & (df.expected)]
# get the indices of all expanded fixed effects in the fitting data (active fixed effects + the fixed effect we will drop for multicollinearity)
active_fixed_effect_boolean_df = df_fitting[all_expanded_fixed_effects].sum(axis=0) > 0
# get the names of those fixed effects, since we will want to know which fixed effect was dropped
all_active_fixed_effects = np.asarray(all_expanded_fixed_effects)[active_fixed_effect_boolean_df]

# if we add an intercept we need to drop a value/column per fixed effect in order to avoid multicollinearity.
# the intercept column is now a stand-in for the dropped fixed effect value/column
if add_intercept:
active_fixed_effects = (
[]
) # fixed effects that exist in the fitting_data (excluding one dropped column to avoid multicolinearity)
intercept_column = (
[]
) # we need to save the fixed effect categories that the intercept is now standing in for
# we want to drop one value/column per fixed effect to avoid multicollinearity
for fe in self.fixed_effect_cols:
# grab the potentially active fixed effect names for this fixed effect
fe_fixed_effect_filter = self._get_categories_for_fe(all_active_fixed_effects, fe)
# drop the first potentially active fixed effect
active_fixed_effects.extend(fe_fixed_effect_filter[1:])
# save the name of the fixed effect that we dropped
intercept_column.append(fe_fixed_effect_filter[0])

self.active_fixed_effects = active_fixed_effects
self.intercept_column = intercept_column
# expanded fixed effects do not include the ones that we dropped to avoid multicollinearity
self.expanded_fixed_effects = [x for x in all_expanded_fixed_effects if x not in intercept_column]
else:
self.active_fixed_effects = all_active_fixed_effects
self.expanded_fixed_effects = all_expanded_fixed_effects

# all features that the model will be fit on
# these are all the features + the expanded fixed effects (so all fixed effect values in the complete data excluding the ones dropped for multicollinearity)
self.complete_features += self.features + self.expanded_fixed_effects
self.active_features += self.features + self.active_fixed_effects
df = df[self.complete_features]

return new_fitting_data[self.complete_features]
return df

def featurize_heldout_data(self, heldout_data):
def filter_to_active_features(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Featurize the data that the model will be applied on.
In our case the heldout_data is either the nonreporting_units
(when applying the model for the point predictions)
or conformalization_data/nonreporting_units
(when applying the model for the prediction intervals)
Get active features (ie. features + active fixed effects)
"""
new_heldout_data = heldout_data.copy()
return df[self.active_features]

if self.center_features:
self._center_features(new_heldout_data)

if self.add_intercept:
self._add_intercept(new_heldout_data)

if len(self.fixed_effect_cols) > 0:
missing_expanded_fixed_effects = []
new_heldout_data = self._expand_fixed_effects(new_heldout_data, drop_first=False)
# if all units from one fixed effect are reporting they will not appear in the heldout_data
# (e.g. nonreporting_units) and won't get a column when we expand the fixed effects
# on that dataframe. Therefore we add those columns with zero fixed effects manually.
# As an example, if we are running a county model using state fixed effects, and
# all of Delaware's counties are reporting, then no Delaware county will be in
# heldout_data (nonreporting_units), as a result there will be no column for Delaware
# in the expanded fixed effects of heldout_data (nonreporting_units).
for expanded_fixed_effect in self.expanded_fixed_effects:
if expanded_fixed_effect not in new_heldout_data.columns:
missing_expanded_fixed_effects.append(expanded_fixed_effect)

missing_expanded_fixed_effects_df = pd.DataFrame(
np.zeros((new_heldout_data.shape[0], len(missing_expanded_fixed_effects))),
columns=missing_expanded_fixed_effects,
)
# we use this method to add the missing expanded fixed effects because doing it manually
# ie. new_heldout_data[expanded_fixed_effect] = 0
# can throw a fragmentation warning when there are many missing fixed effects.
new_heldout_data = new_heldout_data.join(missing_expanded_fixed_effects_df)

return new_heldout_data[self.complete_features]
def generate_holdout_data(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Generate fixed effects for the holdout data (ie. data that we will predict on)
"""
df = df.copy()

# if a unit has an inactive fixed effect value for some fixed effect category we need
# to insert 1 / (number of fixed effect values) into each active fixed effect value for that unit
dmnapolitano marked this conversation as resolved.
Show resolved Hide resolved
# if we were to leave them as zero, then the model would apply the dropped fixed effect
# value coefficient (since this is now what the intercept stands in for)
# instead we want to apply all active fixed effect coefficients equally

# get inactive fixed effects (ie expanded fixed effects that are not active)
# these are fixed effects that exist only in the holdout set (ie. we do not have a covariate for them)
inactive_fixed_effects = [x for x in self.expanded_fixed_effects if x not in self.active_fixed_effects]
for fe in self.fixed_effect_cols:
# active fixed effect values for this fixed effect
fe_active_fixed_effects = self._get_categories_for_fe(self.active_fixed_effects, fe)
# inactive fixed effect values for this fixed effect
fe_inactive_fixed_effects = self._get_categories_for_fe(inactive_fixed_effects, fe)
# get rows that have an inactive fixed effect
rows_w_inactive_fixed_effects = df[fe_inactive_fixed_effects].sum(axis=1) > 0

# set the values for active fixed effect in rows that have inactive fixed effect to be 1 / (n + 1)
# rows that have an inactive fixed effect value need to receive the treatment of the average fixed effects
# NOTE: aren't we now applying 1 * the dropped fixed effect and 1 / (n + 1) times the other fixed effects?
df.loc[rows_w_inactive_fixed_effects, fe_active_fixed_effects] = 1 / (len(fe_active_fixed_effects) + 1)
return self.filter_to_active_features(df)
Loading