Skip to content

Commit

Permalink
Merge pull request #117 from washingtonpost/fit-separate-model-for-co…
Browse files Browse the repository at this point in the history
…ntest

Fit separate model for contest
  • Loading branch information
lennybronner authored Oct 27, 2024
2 parents 5c891c6 + e1637b7 commit 787a136
Show file tree
Hide file tree
Showing 4 changed files with 122 additions and 5 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ Some model types have specific model parameters that can be included.
| percent_expected_vote_error_bound | numeric | error tolerance on expected vote | `bootstrap` |
| z_unobserved_lower_bound | numeric | lower bound for turnout factor | `bootstrap` |
| z_unobserved_upper_bound | numeric | upper bound for turnout factor | `bootstrap` |
| states_for_separate_model | list | states with separate model fit | `bootstrap` |

This is the class and function that invokes the general function to generate estimates. You can install `elex-model` as a Python package and use this code snippet in other projects.

Expand Down
36 changes: 33 additions & 3 deletions src/elexmodel/handlers/data/Featurizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ class Featurizer:
Featurizer. Normalizes features, add intercept, expands fixed effects
"""

def __init__(self, features: list, fixed_effects: list):
def __init__(self, features: list, fixed_effects: list, states_for_separate_model: list = []):
self.features = features
# fixed effects can be a list, in which case every value of a fixed effect gets its own column
if isinstance(fixed_effects, list):
Expand Down Expand Up @@ -39,6 +39,8 @@ def __init__(self, features: list, fixed_effects: list):
# active features are features + active fixed effects
self.active_features = []

self.states_for_separate_model = states_for_separate_model

def _expand_fixed_effects(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Convert fixed effect columns into dummy variables.
Expand Down Expand Up @@ -81,6 +83,19 @@ def prepare_data(
if add_intercept is true an intercept column is added to the features and one fixed effect value is dropped
"""
df = df.copy() # create copy so we can do things to the values

# if a state is in the states for separate model, then we add separate feature columns for that state
# and we zero out the original feature column for those states
additional_state_features = []
for state in self.states_for_separate_model:
mask = df.postal_code == state
for feature in self.features:
state_feature = f"{feature}_{state}"
df[state_feature] = df[feature].where(mask, 0)
df.loc[mask, feature] = 0
additional_state_features.append(state_feature)

df = df.copy() # create copy so we can do things to the values
if center_features:
df[self.features] -= df[self.features].mean()
if scale_features:
Expand All @@ -91,6 +106,21 @@ def prepare_data(
self.active_features += ["intercept"]
df["intercept"] = 1

# if a state is in the states for separate model, then we add separate intercept columns for that state
# and we zero out the original intercept column for those states
for state in self.states_for_separate_model:
mask = df.postal_code == state
# if we have a postal code fixed effect then we do not want a separate intercept column for that state
# because the fixed effect takes the role of that column (ie. those would be linearly dependent)
# but we still want to zero out the original intercept for those states.
if "postal_code" not in self.fixed_effect_cols:
state_intercept = f"intercept_{state}"
df[state_intercept] = df["intercept"].where(mask, 0)
self.complete_features.append(state_intercept)
df.loc[mask, "intercept"] = 0

# if fixed effects are on then we have redundancy with the state specific intercepts

if len(self.fixed_effect_cols) > 0:
df = self._expand_fixed_effects(df)

Expand Down Expand Up @@ -140,8 +170,8 @@ def prepare_data(
# all features that the model will be fit on
# these are all the features + the expanded fixed effects
# (so all fixed effect values in the complete data excluding the ones dropped for multicolinearity)
self.complete_features += self.features + self.expanded_fixed_effects
self.active_features += self.features + self.active_fixed_effects
self.complete_features += self.features + additional_state_features + self.expanded_fixed_effects
self.active_features += self.features + additional_state_features + self.active_fixed_effects
df = df[self.complete_features]

return df
Expand Down
8 changes: 6 additions & 2 deletions src/elexmodel/models/BootstrapElectionModel.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,11 @@ def __init__(self, model_settings={}):
self.z_unobserved_upper_bound = model_settings.get("z_unobserved_upper_bound", 1.5)
self.z_unobserved_lower_bound = model_settings.get("z_unobserved_lower_bound", 0.5)

self.featurizer = Featurizer(self.features, self.fixed_effects)
self.states_for_separate_model = model_settings.get("states_for_separate_model", [])
self.featurizer = Featurizer(
self.features, self.fixed_effects, states_for_separate_model=self.states_for_separate_model
)

self.seed = model_settings.get("seed", 0)
self.rng = np.random.default_rng(seed=self.seed) # used for sampling
self.ran_bootstrap = False
Expand Down Expand Up @@ -719,7 +723,7 @@ def _get_strata(
# but like with fixed effects we drop one strata category and use the intercept instead so the
# example would be
# rural: 0, 0 urban: 1, 0 and rural: 0, 1
strata_featurizer = Featurizer([], self.strata)
strata_featurizer = Featurizer([], self.strata, states_for_separate_model=self.states_for_separate_model)
all_units = pd.concat([reporting_units, nonreporting_units], axis=0)

strata_all = strata_featurizer.prepare_data(
Expand Down
82 changes: 82 additions & 0 deletions tests/handlers/test_featurizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -540,3 +540,85 @@ def test_generate_fixed_effects_mixed_reporting(va_governor_precinct_data):

assert "county_fips" in featurizer.fixed_effect_cols
assert len(featurizer.expanded_fixed_effects) == 133 - 1


def test_separate_state_model():
    """
    Check that the featurizer builds the state-specific columns correctly.

    For every state listed in `states_for_separate_model`:
    - the shared feature / intercept columns must be zero on that state's rows,
    - the state-specific columns (`<feature>_<state>`, `intercept_<state>`) must carry
      the values on that state's rows and be zero everywhere else.

    All checks compare element-wise and then reduce with `.all()`. Comparing the result
    of `Series.all()` against 0 / 1 only inspects a single boolean, so it would pass
    when just one value is zero.
    """
    features = ["a", "b", "c"]
    fixed_effects = ["fe_a", "fe_b"]

    df = pd.DataFrame(
        {
            "postal_code": ["AA", "AA", "BB", "BB", "CC", "CC", "CC", "DD"],
            "a": [5, 3, 1, 5, 2, 2, 2, 2],
            "b": [2, 2, 2, 2, 3, 3, 3, 3],
            "c": [3, 3, 3, 3, 1, 2, 3, 4],
            "d": [1, 2, 3, 4, 5, 3, 1, 5],
            "fe_a": ["a", "a", "b", "c", "a", "a", "b", "d"],
            "fe_b": ["1", "x", "7", "y", "1", "z", "z", "w"],
            "reporting": [1, 1, 1, 1, 1, 0, 0, 0],
            "unit_category": ["expected"] * 8,
        }
    )

    # masks are built from the untouched input frame; prepare_data works on a copy
    is_bb = df.postal_code == "BB"
    is_cc = df.postal_code == "CC"

    # one state with a separate model
    states_for_separate_model = ["CC"]
    featurizer = Featurizer(features, fixed_effects, states_for_separate_model)
    df_new = featurizer.prepare_data(df, center_features=False, scale_features=False, add_intercept=True)

    assert (df_new.loc[~is_cc, "intercept"] == 1).all()
    assert (df_new.loc[is_cc, "intercept"] == 0).all()
    assert (df_new.loc[~is_cc, "intercept_CC"] == 0).all()
    assert (df_new.loc[is_cc, "intercept_CC"] == 1).all()

    for feature in features:
        assert (df_new.loc[~is_cc, feature] > 0).all()
        assert (df_new.loc[is_cc, feature] == 0).all()
        assert (df_new.loc[~is_cc, f"{feature}_CC"] == 0).all()
        assert (df_new.loc[is_cc, f"{feature}_CC"] > 0).all()

    # slightly more complicated, with two states
    states_for_separate_model = ["BB", "CC"]
    is_other = ~is_bb & ~is_cc
    featurizer = Featurizer(features, fixed_effects, states_for_separate_model)
    df_new = featurizer.prepare_data(df, center_features=False, scale_features=False, add_intercept=True)

    assert (df_new.loc[is_other, "intercept"] == 1).all()
    assert (df_new.loc[is_cc, "intercept"] == 0).all()
    assert (df_new.loc[is_bb, "intercept"] == 0).all()
    assert (df_new.loc[is_other, "intercept_CC"] == 0).all()
    assert (df_new.loc[is_other, "intercept_BB"] == 0).all()
    assert (df_new.loc[is_cc, "intercept_CC"] == 1).all()
    assert (df_new.loc[is_bb, "intercept_BB"] == 1).all()
    assert (df_new.loc[is_cc, "intercept_BB"] == 0).all()
    assert (df_new.loc[is_bb, "intercept_CC"] == 0).all()

    for feature in features:
        assert (df_new.loc[is_other, feature] > 0).all()
        assert (df_new.loc[is_cc, feature] == 0).all()
        assert (df_new.loc[is_bb, feature] == 0).all()
        assert (df_new.loc[is_other, f"{feature}_CC"] == 0).all()
        assert (df_new.loc[is_other, f"{feature}_BB"] == 0).all()
        assert (df_new.loc[is_cc, f"{feature}_CC"] > 0).all()
        assert (df_new.loc[is_bb, f"{feature}_BB"] > 0).all()
        assert (df_new.loc[is_cc, f"{feature}_BB"] == 0).all()
        assert (df_new.loc[is_bb, f"{feature}_CC"] == 0).all()

    # if postal code is in fixed effect, then don't add individual intercepts
    # (the postal code fixed effect already plays that role), but the shared
    # intercept must still be zeroed out for the separate-model states
    fixed_effects = ["fe_a", "fe_b", "postal_code"]
    featurizer = Featurizer(features, fixed_effects, states_for_separate_model)
    df_new = featurizer.prepare_data(df, center_features=False, scale_features=False, add_intercept=True)

    assert (df_new.loc[is_other, "intercept"] == 1).all()
    assert (df_new.loc[is_cc, "intercept"] == 0).all()
    assert (df_new.loc[is_bb, "intercept"] == 0).all()
    assert "intercept_BB" not in df_new.columns
    assert "intercept_CC" not in df_new.columns

0 comments on commit 787a136

Please sign in to comment.