Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fit separate model for contest #117

Merged
merged 6 commits into from
Oct 27, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ Some model types have specific model parameters that can be included.
| percent_expected_vote_error_bound | numeric | error tolerance on expected vote | `bootstrap` |
| z_unobserved_lower_bound | numeric | lower bound for turnout factor | `bootstrap` |
| z_unobserved_upper_bound | numeric | upper bound for turnout factor | `bootstrap` |
| states_for_separate_model | list | states with separate model fit | `bootstrap` |

This is the class and function that invokes the general function to generate estimates. You can install `elex-model` as a Python package and use this code snippet in other projects.

Expand Down
38 changes: 35 additions & 3 deletions src/elexmodel/handlers/data/Featurizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ class Featurizer:
Featurizer. Normalizes features, add intercept, expands fixed effects
"""

def __init__(self, features: list, fixed_effects: list):
def __init__(self, features: list, fixed_effects: list, states_for_separate_model: list = []):
self.features = features
# fixed effects can be a list, in which case every value of a fixed effect gets its own column
if isinstance(fixed_effects, list):
Expand Down Expand Up @@ -39,6 +39,8 @@ def __init__(self, features: list, fixed_effects: list):
# active features are features + active fixed effects
self.active_features = []

self.states_for_separate_model = states_for_separate_model

def _expand_fixed_effects(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Convert fixed effect columns into dummy variables.
Expand Down Expand Up @@ -81,6 +83,21 @@ def prepare_data(
if add_intercept is true an intercept column is added to the features and one fixed effect value is dropped
"""
df = df.copy() # create copy so we can do things to the values

# if a state is in the states for separate model, then we add separate feature columns for that state
# and we zero out the original feature column for those states
additional_state_features = []
# if 'BB' in self.states_for_separate_model and 'CC' in self.states_for_separate_model:
# import pdb; pdb.set_trace()
dmnapolitano marked this conversation as resolved.
Show resolved Hide resolved
for state in self.states_for_separate_model:
mask = df.postal_code == state
for feature in self.features:
state_feature = f"{feature}_{state}"
df[state_feature] = df[feature].where(mask, 0)
df.loc[mask, feature] = 0
additional_state_features.append(state_feature)

df = df.copy() # create copy so we can do things to the values
if center_features:
df[self.features] -= df[self.features].mean()
if scale_features:
Expand All @@ -91,6 +108,21 @@ def prepare_data(
self.active_features += ["intercept"]
df["intercept"] = 1

# if a state is in the states for separate model, then we add separate intercept columns for that state
# and we zero out the original intercept column for those states
for state in self.states_for_separate_model:
mask = df.postal_code == state
# if we have a postal code fixed effect then we do not want a separate intercept column for that state
# because the fixed effect takes the role of that column (ie. those would be linearly dependent)
# but we still want to zero out the original intercept for those states.
if "postal_code" not in self.fixed_effect_cols:
state_intercept = f"intercept_{state}"
df[state_intercept] = df["intercept"].where(mask, 0)
self.complete_features.append(state_intercept)
df.loc[mask, "intercept"] = 0

# if fixed effects are on, then they can be redundant with the state-specific intercepts

if len(self.fixed_effect_cols) > 0:
df = self._expand_fixed_effects(df)

Expand Down Expand Up @@ -140,8 +172,8 @@ def prepare_data(
# all features that the model will be fit on
# these are all the features + the expanded fixed effects
# (so all fixed effect values in the complete data excluding the ones dropped for multicolinearity)
self.complete_features += self.features + self.expanded_fixed_effects
self.active_features += self.features + self.active_fixed_effects
self.complete_features += self.features + additional_state_features + self.expanded_fixed_effects
self.active_features += self.features + additional_state_features + self.active_fixed_effects
df = df[self.complete_features]

return df
Expand Down
8 changes: 6 additions & 2 deletions src/elexmodel/models/BootstrapElectionModel.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,11 @@ def __init__(self, model_settings={}):
self.z_unobserved_upper_bound = model_settings.get("z_unobserved_upper_bound", 1.5)
self.z_unobserved_lower_bound = model_settings.get("z_unobserved_lower_bound", 0.5)

self.featurizer = Featurizer(self.features, self.fixed_effects)
self.states_for_separate_model = model_settings.get("states_for_separate_model", [])
self.featurizer = Featurizer(
self.features, self.fixed_effects, states_for_separate_model=self.states_for_separate_model
)

self.seed = model_settings.get("seed", 0)
self.rng = np.random.default_rng(seed=self.seed) # used for sampling
self.ran_bootstrap = False
Expand Down Expand Up @@ -661,7 +665,7 @@ def _get_strata(
# but like with fixed effects we drop one strata category and use the intercept instead so the
# example would be
# rural: 0, 0 urban: 1, 0 and rural: 0, 1
strata_featurizer = Featurizer([], self.strata)
strata_featurizer = Featurizer([], self.strata, states_for_separate_model=self.states_for_separate_model)
all_units = pd.concat([reporting_units, nonreporting_units], axis=0)

strata_all = strata_featurizer.prepare_data(
Expand Down
82 changes: 82 additions & 0 deletions tests/handlers/test_featurizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -540,3 +540,85 @@ def test_generate_fixed_effects_mixed_reporting(va_governor_precinct_data):

assert "county_fips" in featurizer.fixed_effect_cols
assert len(featurizer.expanded_fixed_effects) == 133 - 1


def test_separate_state_model():
    """
    Tests that the featurizer creates state-specific feature and intercept columns
    for every state in `states_for_separate_model` and zeroes out the shared
    feature and intercept columns for the units in those states.

    Every check compares element-wise first and only then reduces with `.all()`.
    Reducing first (`series.all() == 0`) would only verify that *some* value is
    zero rather than that *every* value is zero.
    """
    features = ["a", "b", "c"]
    fixed_effects = ["fe_a", "fe_b"]
    states_for_separate_model = ["CC"]

    featurizer = Featurizer(features, fixed_effects, states_for_separate_model)

    df = pd.DataFrame(
        {
            "postal_code": ["AA", "AA", "BB", "BB", "CC", "CC", "CC", "DD"],
            "a": [5, 3, 1, 5, 2, 2, 2, 2],
            "b": [2, 2, 2, 2, 3, 3, 3, 3],
            "c": [3, 3, 3, 3, 1, 2, 3, 4],
            "d": [1, 2, 3, 4, 5, 3, 1, 5],
            "fe_a": ["a", "a", "b", "c", "a", "a", "b", "d"],
            "fe_b": ["1", "x", "7", "y", "1", "z", "z", "w"],
            "reporting": [1, 1, 1, 1, 1, 0, 0, 0],
            "unit_category": ["expected"] * 8,
        }
    )

    # row masks are built on the input frame; the original test indexes df_new the same way
    is_bb = df.postal_code == "BB"
    is_cc = df.postal_code == "CC"

    # one state with a separate model
    df_new = featurizer.prepare_data(df, center_features=False, scale_features=False, add_intercept=True)

    assert (df_new.loc[~is_cc, "intercept"] == 1).all()
    assert (df_new.loc[is_cc, "intercept"] == 0).all()
    assert (df_new.loc[~is_cc, "intercept_CC"] == 0).all()
    assert (df_new.loc[is_cc, "intercept_CC"] == 1).all()

    # all raw feature values in the fixture are strictly positive, so "> 0" means "kept"
    for feature in features:
        assert (df_new.loc[~is_cc, feature] > 0).all()
        assert (df_new.loc[is_cc, feature] == 0).all()
        assert (df_new.loc[~is_cc, f"{feature}_CC"] == 0).all()
        assert (df_new.loc[is_cc, f"{feature}_CC"] > 0).all()

    # slightly more complicated, with two states
    states_for_separate_model = ["BB", "CC"]
    featurizer = Featurizer(features, fixed_effects, states_for_separate_model)
    df_new = featurizer.prepare_data(df, center_features=False, scale_features=False, add_intercept=True)

    in_neither = ~(is_bb | is_cc)

    assert (df_new.loc[in_neither, "intercept"] == 1).all()
    assert (df_new.loc[is_cc, "intercept"] == 0).all()
    assert (df_new.loc[is_bb, "intercept"] == 0).all()
    assert (df_new.loc[in_neither, "intercept_CC"] == 0).all()
    assert (df_new.loc[in_neither, "intercept_BB"] == 0).all()
    assert (df_new.loc[is_cc, "intercept_CC"] == 1).all()
    assert (df_new.loc[is_bb, "intercept_BB"] == 1).all()
    assert (df_new.loc[is_cc, "intercept_BB"] == 0).all()
    assert (df_new.loc[is_bb, "intercept_CC"] == 0).all()

    assert (df_new.loc[in_neither, "a"] > 0).all()
    assert (df_new.loc[is_cc, "a"] == 0).all()
    assert (df_new.loc[is_bb, "a"] == 0).all()
    assert (df_new.loc[in_neither, "a_CC"] == 0).all()
    assert (df_new.loc[in_neither, "a_BB"] == 0).all()
    assert (df_new.loc[is_cc, "a_CC"] > 0).all()
    assert (df_new.loc[is_bb, "a_BB"] > 0).all()
    assert (df_new.loc[is_cc, "a_BB"] == 0).all()
    assert (df_new.loc[is_bb, "a_CC"] == 0).all()

    # if postal code is in fixed effect, then don't add individual intercepts
    fixed_effects = ["fe_a", "fe_b", "postal_code"]
    featurizer = Featurizer(features, fixed_effects, states_for_separate_model)
    df_new = featurizer.prepare_data(df, center_features=False, scale_features=False, add_intercept=True)

    assert (df_new.loc[in_neither, "intercept"] == 1).all()
    assert (df_new.loc[is_cc, "intercept"] == 0).all()
    assert (df_new.loc[is_bb, "intercept"] == 0).all()
    assert "intercept_BB" not in df_new.columns
    assert "intercept_CC" not in df_new.columns
8 changes: 6 additions & 2 deletions tests/test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -876,5 +876,9 @@ def test_get_national_summary_votes_estimates(model_client, va_governor_county_d

current = model_client.get_national_summary_votes_estimates(None, 0, [0.99])

pd.testing.assert_frame_equal(current, model_client.results_handler.final_results["nat_sum_data"])
pd.testing.assert_frame_equal(expected_df, model_client.results_handler.final_results["nat_sum_data"])
pd.testing.assert_frame_equal(
current, model_client.results_handler.final_results["nat_sum_data"], check_dtype=False
)
pd.testing.assert_frame_equal(
expected_df, model_client.results_handler.final_results["nat_sum_data"], check_dtype=False
)
Loading