diff --git a/README.md b/README.md index 29b4a62d..2303c8cc 100644 --- a/README.md +++ b/README.md @@ -125,6 +125,7 @@ Some model types have specific model parameters that can be included. | percent_expected_vote_error_bound | numeric | error tolerance on expected vote | `bootstrap` | | z_unobserved_lower_bound | numeric | lower bound for turnout factor | `bootstrap` | | z_unobserved_upper_bound | numeric | upper bound for turnout factor | `bootstrap` | +| states_for_separate_model | list | states with separate model fit | `bootstrap` | This is the class and function that invokes the general function to generate estimates. You can install `elex-model` as a Python package and use this code snippet in other projects. diff --git a/src/elexmodel/handlers/data/Featurizer.py b/src/elexmodel/handlers/data/Featurizer.py index 6a08017c..9e661a61 100644 --- a/src/elexmodel/handlers/data/Featurizer.py +++ b/src/elexmodel/handlers/data/Featurizer.py @@ -7,7 +7,7 @@ class Featurizer: Featurizer. Normalizes features, add intercept, expands fixed effects """ - def __init__(self, features: list, fixed_effects: list): + def __init__(self, features: list, fixed_effects: list, states_for_separate_model: list = []): self.features = features # fixed effects can be a list, in which case every value of a fixed effect gets its own column if isinstance(fixed_effects, list): @@ -39,6 +39,8 @@ def __init__(self, features: list, fixed_effects: list): # active features are features + active fixed effects self.active_features = [] + self.states_for_separate_model = states_for_separate_model + def _expand_fixed_effects(self, df: pd.DataFrame) -> pd.DataFrame: """ Convert fixed effect columns into dummy variables. 
@@ -81,6 +83,19 @@ def prepare_data( if add_intercept is true an intercept column is added to the features and one fixed effect value is dropped """ df = df.copy() # create copy so we can do things to the values + + # if a state is in the states for separate model, then we add separate feature columns for that state + # and we zero out the original feature column for those states + additional_state_features = [] + for state in self.states_for_separate_model: + mask = df.postal_code == state + for feature in self.features: + state_feature = f"{feature}_{state}" + df[state_feature] = df[feature].where(mask, 0) + df.loc[mask, feature] = 0 + additional_state_features.append(state_feature) + + df = df.copy() # create copy so we can do things to the values if center_features: df[self.features] -= df[self.features].mean() if scale_features: @@ -91,6 +106,21 @@ def prepare_data( self.active_features += ["intercept"] df["intercept"] = 1 + # if a state is in the states for separate model, then we add separate intercept columns for that state + # and we zero out the original intercept column for those states + for state in self.states_for_separate_model: + mask = df.postal_code == state + # if we have a postal code fixed effect then we do not want a separate intercept column for that state + # because the fixed effect takes the role of that column (i.e. those would be linearly dependent) + # but we still want to zero out the original intercept for those states. 
+ if "postal_code" not in self.fixed_effect_cols: + state_intercept = f"intercept_{state}" + df[state_intercept] = df["intercept"].where(mask, 0) + self.complete_features.append(state_intercept) + df.loc[mask, "intercept"] = 0 + + # if fixed effects are on then we have redundancy with the state-specific intercepts + if len(self.fixed_effect_cols) > 0: df = self._expand_fixed_effects(df) @@ -140,8 +170,8 @@ def prepare_data( # all features that the model will be fit on # these are all the features + the expanded fixed effects # (so all fixed effect values in the complete data excluding the ones dropped for multicolinearity) - self.complete_features += self.features + self.expanded_fixed_effects - self.active_features += self.features + self.active_fixed_effects + self.complete_features += self.features + additional_state_features + self.expanded_fixed_effects + self.active_features += self.features + additional_state_features + self.active_fixed_effects df = df[self.complete_features] return df diff --git a/src/elexmodel/models/BootstrapElectionModel.py b/src/elexmodel/models/BootstrapElectionModel.py index 003fe182..e5969064 100644 --- a/src/elexmodel/models/BootstrapElectionModel.py +++ b/src/elexmodel/models/BootstrapElectionModel.py @@ -80,7 +80,11 @@ def __init__(self, model_settings={}): self.z_unobserved_upper_bound = model_settings.get("z_unobserved_upper_bound", 1.5) self.z_unobserved_lower_bound = model_settings.get("z_unobserved_lower_bound", 0.5) - self.featurizer = Featurizer(self.features, self.fixed_effects) + self.states_for_separate_model = model_settings.get("states_for_separate_model", []) + self.featurizer = Featurizer( + self.features, self.fixed_effects, states_for_separate_model=self.states_for_separate_model + ) + self.seed = model_settings.get("seed", 0) self.rng = np.random.default_rng(seed=self.seed) # used for sampling self.ran_bootstrap = False @@ -661,7 +665,7 @@ def _get_strata( # but like with fixed effects we drop one strata category 
and use the intercept instead so the # example would be # rural: 0, 0 urban: 1, 0 and rural: 0, 1 - strata_featurizer = Featurizer([], self.strata) + strata_featurizer = Featurizer([], self.strata, states_for_separate_model=self.states_for_separate_model) all_units = pd.concat([reporting_units, nonreporting_units], axis=0) strata_all = strata_featurizer.prepare_data( diff --git a/tests/handlers/test_featurizer.py b/tests/handlers/test_featurizer.py index 027284b4..448a72a8 100644 --- a/tests/handlers/test_featurizer.py +++ b/tests/handlers/test_featurizer.py @@ -540,3 +540,85 @@ def test_generate_fixed_effects_mixed_reporting(va_governor_precinct_data): assert "county_fips" in featurizer.fixed_effect_cols assert len(featurizer.expanded_fixed_effects) == 133 - 1 + + +def test_separate_state_model(): + """ + This function tests to make sure that the featurizer returns the right columns + """ + features = ["a", "b", "c"] + fixed_effects = ["fe_a", "fe_b"] + states_for_separate_model = ["CC"] + + featurizer = Featurizer(features, fixed_effects, states_for_separate_model) + + df = pd.DataFrame( + { + "postal_code": ["AA", "AA", "BB", "BB", "CC", "CC", "CC", "DD"], + "a": [5, 3, 1, 5, 2, 2, 2, 2], + "b": [2, 2, 2, 2, 3, 3, 3, 3], + "c": [3, 3, 3, 3, 1, 2, 3, 4], + "d": [1, 2, 3, 4, 5, 3, 1, 5], + "fe_a": ["a", "a", "b", "c", "a", "a", "b", "d"], + "fe_b": ["1", "x", "7", "y", "1", "z", "z", "w"], + "reporting": [1, 1, 1, 1, 1, 0, 0, 0], + "unit_category": ["expected"] * 8, + } + ) + + df_new = featurizer.prepare_data(df, center_features=False, scale_features=False, add_intercept=True) + assert df_new.loc[df.postal_code != "CC", "intercept"].all() == 1 + assert df_new.loc[df.postal_code == "CC", "intercept"].all() == 0 + assert df_new.loc[df.postal_code != "CC", "intercept_CC"].all() == 0 + assert df_new.loc[df.postal_code == "CC", "intercept_CC"].all() == 1 + + assert df_new.loc[df.postal_code != "CC", "a"].all() > 0 + assert df_new.loc[df.postal_code == "CC", "a"].all() 
== 0 + assert df_new.loc[df.postal_code != "CC", "a_CC"].all() == 0 + assert df_new.loc[df.postal_code == "CC", "a_CC"].all() > 0 + + assert df_new.loc[df.postal_code != "CC", "b"].all() > 0 + assert df_new.loc[df.postal_code == "CC", "b"].all() == 0 + assert df_new.loc[df.postal_code != "CC", "b_CC"].all() == 0 + assert df_new.loc[df.postal_code == "CC", "b_CC"].all() > 0 + + assert df_new.loc[df.postal_code != "CC", "c"].all() > 0 + assert df_new.loc[df.postal_code == "CC", "c"].all() == 0 + assert df_new.loc[df.postal_code != "CC", "c_CC"].all() == 0 + assert df_new.loc[df.postal_code == "CC", "c_CC"].all() > 0 + + # slightly more complicated, with two states + states_for_separate_model = ["BB", "CC"] + featurizer = Featurizer(features, fixed_effects, states_for_separate_model) + df_new = featurizer.prepare_data(df, center_features=False, scale_features=False, add_intercept=True) + + assert df_new.loc[(df.postal_code != "CC") & (df.postal_code != "BB"), "intercept"].all() == 1 + assert df_new.loc[df.postal_code == "CC", "intercept"].all() == 0 + assert df_new.loc[df.postal_code == "BB", "intercept"].all() == 0 + assert df_new.loc[(df.postal_code != "CC") & (df.postal_code != "BB"), "intercept_CC"].all() == 0 + assert df_new.loc[(df.postal_code != "CC") & (df.postal_code != "BB"), "intercept_BB"].all() == 0 + assert df_new.loc[df.postal_code == "CC", "intercept_CC"].all() == 1 + assert df_new.loc[df.postal_code == "BB", "intercept_BB"].all() == 1 + assert df_new.loc[df.postal_code == "CC", "intercept_BB"].all() == 0 + assert df_new.loc[df.postal_code == "BB", "intercept_CC"].all() == 0 + + assert df_new.loc[(df.postal_code != "CC") & (df.postal_code != "BB"), "a"].all() > 0 + assert df_new.loc[df.postal_code == "CC", "a"].all() == 0 + assert df_new.loc[df.postal_code == "BB", "a"].all() == 0 + assert df_new.loc[(df.postal_code != "CC") & (df.postal_code != "BB"), "a_CC"].all() == 0 + assert df_new.loc[(df.postal_code != "CC") & (df.postal_code != "BB"), 
"a_BB"].all() == 0 + assert df_new.loc[df.postal_code == "CC", "a_CC"].all() > 0 + assert df_new.loc[df.postal_code == "BB", "a_BB"].all() > 0 + assert df_new.loc[df.postal_code == "CC", "a_BB"].all() == 0 + assert df_new.loc[df.postal_code == "BB", "a_CC"].all() == 0 + + # if postal code is in fixed effect, then don't add individual intercepts + fixed_effects = ["fe_a", "fe_b", "postal_code"] + featurizer = Featurizer(features, fixed_effects, states_for_separate_model) + df_new = featurizer.prepare_data(df, center_features=False, scale_features=False, add_intercept=True) + + assert df_new.loc[(df.postal_code != "CC") & (df.postal_code != "BB"), "intercept"].all() == 1 + assert df_new.loc[df.postal_code == "CC", "intercept"].all() == 0 + assert df_new.loc[df.postal_code == "BB", "intercept"].all() == 0 + assert "intercept_BB" not in df_new.columns + assert "intercept_CC" not in df_new.columns diff --git a/tests/test_client.py b/tests/test_client.py index 3b821d28..8ecb690b 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -876,5 +876,9 @@ def test_get_national_summary_votes_estimates(model_client, va_governor_county_d current = model_client.get_national_summary_votes_estimates(None, 0, [0.99]) - pd.testing.assert_frame_equal(current, model_client.results_handler.final_results["nat_sum_data"]) - pd.testing.assert_frame_equal(expected_df, model_client.results_handler.final_results["nat_sum_data"]) + pd.testing.assert_frame_equal( + current, model_client.results_handler.final_results["nat_sum_data"], check_dtype=False + ) + pd.testing.assert_frame_equal( + expected_df, model_client.results_handler.final_results["nat_sum_data"], check_dtype=False + )