washingtonpost · lennybronner · Oct 27, 2024 · Oct 23, 2024 · Oct 23, 2024 · Oct 23, 2024
@@ -125,6 +125,7 @@ Some model types have specific model parameters that can be included.
 | percent_expected_vote_error_bound | numeric | error tolerance on expected vote | `bootstrap`     |
 | z_unobserved_lower_bound          | numeric | lower bound for turnout factor   | `bootstrap`     |
 | z_unobserved_upper_bound          | numeric | upper bound for turnout factor   | `bootstrap`     |
+| states_for_separate_model         | list    | states with separate model fit   | `bootstrap`     |
 
 This is the class and function that invokes the general function to generate estimates. You can install `elex-model` as a Python package and use this code snippet in other projects.
 

@@ -7,7 +7,7 @@ class Featurizer:
     Featurizer. Normalizes features, add intercept, expands fixed effects
     """
 
-    def __init__(self, features: list, fixed_effects: list):
+    def __init__(self, features: list, fixed_effects: list, states_for_separate_model: list = []):
         self.features = features
         # fixed effects can be a list, in which case every value of a fixed effect gets its own column
         if isinstance(fixed_effects, list):
@@ -39,6 +39,8 @@ def __init__(self, features: list, fixed_effects: list):
         # active features are features + active fixed effects
         self.active_features = []
 
+        self.states_for_separate_model = states_for_separate_model
+
     def _expand_fixed_effects(self, df: pd.DataFrame) -> pd.DataFrame:
         """
         Convert fixed effect columns into dummy variables.
@@ -81,6 +83,21 @@ def prepare_data(
         if add_intercept is true an intercept column is added to the features and one fixed effect value is dropped
         """
         df = df.copy()  # create copy so we can do things to the values
+
+        # if a state is in the states for separate model, then we add separate feature columns for that state
+        # and we zero out the original feature column for those states
+        additional_state_features = []
+        # NOTE(review): centering below is applied to df[self.features] only, so the state-specific columns
+        # created here are not centered, and the entries zeroed here get shifted by the column mean when it runs
+        for state in self.states_for_separate_model:
+            mask = df.postal_code == state
+            for feature in self.features:
+                state_feature = f"{feature}_{state}"
+                df[state_feature] = df[feature].where(mask, 0)
+                df.loc[mask, feature] = 0
+                additional_state_features.append(state_feature)
+
+        df = df.copy()  # create copy so we can do things to the values
         if center_features:
             df[self.features] -= df[self.features].mean()
         if scale_features:
@@ -91,6 +108,21 @@ def prepare_data(
             self.active_features += ["intercept"]
             df["intercept"] = 1
 
+            # if a state is in the states for separate model, then we add separate intercept columns for that state
+            # and we zero out the original intercept column for those states
+            for state in self.states_for_separate_model:
+                mask = df.postal_code == state
+                # if we have a postal code fixed effect then we do not want a separate intercept column for that state
+                # because the fixed effect takes the role of that column (ie. those would be linearly dependent)
+                # but we still want to zero out the original intercept for those states.
+                if "postal_code" not in self.fixed_effect_cols:
+                    state_intercept = f"intercept_{state}"
+                    df[state_intercept] = df["intercept"].where(mask, 0)
+                    self.complete_features.append(state_intercept)
+                df.loc[mask, "intercept"] = 0
+
+            # note: a postal_code fixed effect would be redundant with the state-specific intercepts (skipped above)
+
         if len(self.fixed_effect_cols) > 0:
             df = self._expand_fixed_effects(df)
 
@@ -140,8 +172,8 @@ def prepare_data(
         # all features that the model will be fit on
         # these are all the features + the expanded fixed effects
         # (so all fixed effect values in the complete data excluding the ones dropped for multicolinearity)
-        self.complete_features += self.features + self.expanded_fixed_effects
-        self.active_features += self.features + self.active_fixed_effects
+        self.complete_features += self.features + additional_state_features + self.expanded_fixed_effects
+        self.active_features += self.features + additional_state_features + self.active_fixed_effects
         df = df[self.complete_features]
 
         return df

@@ -80,7 +80,11 @@ def __init__(self, model_settings={}):
         self.z_unobserved_upper_bound = model_settings.get("z_unobserved_upper_bound", 1.5)
         self.z_unobserved_lower_bound = model_settings.get("z_unobserved_lower_bound", 0.5)
 
-        self.featurizer = Featurizer(self.features, self.fixed_effects)
+        self.states_for_separate_model = model_settings.get("states_for_separate_model", [])
+        self.featurizer = Featurizer(
+            self.features, self.fixed_effects, states_for_separate_model=self.states_for_separate_model
+        )
+
         self.seed = model_settings.get("seed", 0)
         self.rng = np.random.default_rng(seed=self.seed)  # used for sampling
         self.ran_bootstrap = False
@@ -661,7 +665,7 @@ def _get_strata(
         # but like with fixed effects we drop one strata category and use the intercept instead so the
         # example would be
         # rural: 0, 0 urban: 1, 0 and rural: 0, 1
-        strata_featurizer = Featurizer([], self.strata)
+        strata_featurizer = Featurizer([], self.strata, states_for_separate_model=self.states_for_separate_model)
         all_units = pd.concat([reporting_units, nonreporting_units], axis=0)
 
         strata_all = strata_featurizer.prepare_data(

@@ -540,3 +540,85 @@ def test_generate_fixed_effects_mixed_reporting(va_governor_precinct_data):
 
     assert "county_fips" in featurizer.fixed_effect_cols
     assert len(featurizer.expanded_fixed_effects) == 133 - 1
+
+
+def test_separate_state_model():
+    """
+    Each state in states_for_separate_model gets its own intercept and feature columns that are zero elsewhere
+    """
+    features = ["a", "b", "c"]
+    fixed_effects = ["fe_a", "fe_b"]
+    states_for_separate_model = ["CC"]
+
+    featurizer = Featurizer(features, fixed_effects, states_for_separate_model)
+
+    df = pd.DataFrame(
+        {
+            "postal_code": ["AA", "AA", "BB", "BB", "CC", "CC", "CC", "DD"],
+            "a": [5, 3, 1, 5, 2, 2, 2, 2],
+            "b": [2, 2, 2, 2, 3, 3, 3, 3],
+            "c": [3, 3, 3, 3, 1, 2, 3, 4],
+            "d": [1, 2, 3, 4, 5, 3, 1, 5],
+            "fe_a": ["a", "a", "b", "c", "a", "a", "b", "d"],
+            "fe_b": ["1", "x", "7", "y", "1", "z", "z", "w"],
+            "reporting": [1, 1, 1, 1, 1, 0, 0, 0],
+            "unit_category": ["expected"] * 8,
+        }
+    )
+    # compare elementwise, then reduce: `s.all() == 0` would already pass if just one value in s were zero
+    df_new = featurizer.prepare_data(df, center_features=False, scale_features=False, add_intercept=True)
+    assert (df_new.loc[df.postal_code != "CC", "intercept"] == 1).all()
+    assert (df_new.loc[df.postal_code == "CC", "intercept"] == 0).all()
+    assert (df_new.loc[df.postal_code != "CC", "intercept_CC"] == 0).all()
+    assert (df_new.loc[df.postal_code == "CC", "intercept_CC"] == 1).all()
+
+    assert (df_new.loc[df.postal_code != "CC", "a"] > 0).all()
+    assert (df_new.loc[df.postal_code == "CC", "a"] == 0).all()
+    assert (df_new.loc[df.postal_code != "CC", "a_CC"] == 0).all()
+    assert (df_new.loc[df.postal_code == "CC", "a_CC"] > 0).all()
+
+    assert (df_new.loc[df.postal_code != "CC", "b"] > 0).all()
+    assert (df_new.loc[df.postal_code == "CC", "b"] == 0).all()
+    assert (df_new.loc[df.postal_code != "CC", "b_CC"] == 0).all()
+    assert (df_new.loc[df.postal_code == "CC", "b_CC"] > 0).all()
+
+    assert (df_new.loc[df.postal_code != "CC", "c"] > 0).all()
+    assert (df_new.loc[df.postal_code == "CC", "c"] == 0).all()
+    assert (df_new.loc[df.postal_code != "CC", "c_CC"] == 0).all()
+    assert (df_new.loc[df.postal_code == "CC", "c_CC"] > 0).all()
+
+    # slightly more complicated, with two states
+    states_for_separate_model = ["BB", "CC"]
+    featurizer = Featurizer(features, fixed_effects, states_for_separate_model)
+    df_new = featurizer.prepare_data(df, center_features=False, scale_features=False, add_intercept=True)
+
+    assert (df_new.loc[(df.postal_code != "CC") & (df.postal_code != "BB"), "intercept"] == 1).all()
+    assert (df_new.loc[df.postal_code == "CC", "intercept"] == 0).all()
+    assert (df_new.loc[df.postal_code == "BB", "intercept"] == 0).all()
+    assert (df_new.loc[(df.postal_code != "CC") & (df.postal_code != "BB"), "intercept_CC"] == 0).all()
+    assert (df_new.loc[(df.postal_code != "CC") & (df.postal_code != "BB"), "intercept_BB"] == 0).all()
+    assert (df_new.loc[df.postal_code == "CC", "intercept_CC"] == 1).all()
+    assert (df_new.loc[df.postal_code == "BB", "intercept_BB"] == 1).all()
+    assert (df_new.loc[df.postal_code == "CC", "intercept_BB"] == 0).all()
+    assert (df_new.loc[df.postal_code == "BB", "intercept_CC"] == 0).all()
+
+    assert (df_new.loc[(df.postal_code != "CC") & (df.postal_code != "BB"), "a"] > 0).all()
+    assert (df_new.loc[df.postal_code == "CC", "a"] == 0).all()
+    assert (df_new.loc[df.postal_code == "BB", "a"] == 0).all()
+    assert (df_new.loc[(df.postal_code != "CC") & (df.postal_code != "BB"), "a_CC"] == 0).all()
+    assert (df_new.loc[(df.postal_code != "CC") & (df.postal_code != "BB"), "a_BB"] == 0).all()
+    assert (df_new.loc[df.postal_code == "CC", "a_CC"] > 0).all()
+    assert (df_new.loc[df.postal_code == "BB", "a_BB"] > 0).all()
+    assert (df_new.loc[df.postal_code == "CC", "a_BB"] == 0).all()
+    assert (df_new.loc[df.postal_code == "BB", "a_CC"] == 0).all()
+
+    # if postal code is in fixed effect, then don't add individual intercepts
+    fixed_effects = ["fe_a", "fe_b", "postal_code"]
+    featurizer = Featurizer(features, fixed_effects, states_for_separate_model)
+    df_new = featurizer.prepare_data(df, center_features=False, scale_features=False, add_intercept=True)
+
+    assert (df_new.loc[(df.postal_code != "CC") & (df.postal_code != "BB"), "intercept"] == 1).all()
+    assert (df_new.loc[df.postal_code == "CC", "intercept"] == 0).all()
+    assert (df_new.loc[df.postal_code == "BB", "intercept"] == 0).all()
+    assert "intercept_BB" not in df_new.columns
+    assert "intercept_CC" not in df_new.columns
@@ -876,5 +876,9 @@ def test_get_national_summary_votes_estimates(model_client, va_governor_county_d
 
     current = model_client.get_national_summary_votes_estimates(None, 0, [0.99])
 
-    pd.testing.assert_frame_equal(current, model_client.results_handler.final_results["nat_sum_data"])
-    pd.testing.assert_frame_equal(expected_df, model_client.results_handler.final_results["nat_sum_data"])
+    pd.testing.assert_frame_equal(
+        current, model_client.results_handler.final_results["nat_sum_data"], check_dtype=False
+    )
+    pd.testing.assert_frame_equal(
+        expected_df, model_client.results_handler.final_results["nat_sum_data"], check_dtype=False
+    )