From f08f0c4d36baed05b5bd999a2f2af83bb63114b0 Mon Sep 17 00:00:00 2001 From: lennybronner Date: Wed, 23 Oct 2024 17:22:54 -0400 Subject: [PATCH 1/6] additional model works --- README.md | 1 + src/elexmodel/handlers/data/Featurizer.py | 25 ++++++++++++++++- .../models/BootstrapElectionModel.py | 3 ++- tests/handlers/test_featurizer.py | 27 +++++++++++++++++++ 4 files changed, 54 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 29b4a62d..2303c8cc 100644 --- a/README.md +++ b/README.md @@ -125,6 +125,7 @@ Some model types have specific model parameters that can be included. | percent_expected_vote_error_bound | numeric | error tolerance on expected vote | `bootstrap` | | z_unobserved_lower_bound | numeric | lower bound for turnout factor | `bootstrap` | | z_unobserved_upper_bound | numeric | upper bound for turnout factor | `bootstrap` | +| states_for_separate_model | list | states with separate model fit | `bootstrap` | This is the class and function that invokes the general function to generate estimates. You can install `elex-model` as a Python package and use this code snippet in other projects. diff --git a/src/elexmodel/handlers/data/Featurizer.py b/src/elexmodel/handlers/data/Featurizer.py index 6a08017c..b69c851d 100644 --- a/src/elexmodel/handlers/data/Featurizer.py +++ b/src/elexmodel/handlers/data/Featurizer.py @@ -7,7 +7,7 @@ class Featurizer: Featurizer. 
Normalizes features, add intercept, expands fixed effects """ - def __init__(self, features: list, fixed_effects: list): + def __init__(self, features: list, fixed_effects: list, states_for_separate_model: list): self.features = features # fixed effects can be a list, in which case every value of a fixed effect gets its own column if isinstance(fixed_effects, list): @@ -39,6 +39,8 @@ def __init__(self, features: list, fixed_effects: list): # active features are features + active fixed effects self.active_features = [] + self.states_for_separate_model = states_for_separate_model + def _expand_fixed_effects(self, df: pd.DataFrame) -> pd.DataFrame: """ Convert fixed effect columns into dummy variables. @@ -80,6 +82,14 @@ def prepare_data( this can improve the convergence of optimization algorithms if add_intercept is true an intercept column is added to the features and one fixed effect value is dropped """ + # if a state is in the states for separate model, then we add separate feature columns for that state + # and we zero out the original feature column for those states + for state in self.states_for_separate_model: + mask = df.postal_code == state + for feature in self.features: + df[ f"{feature}_{state}"] = df[feature].where(mask, 0) + df.loc[mask, feature] = 0 + df = df.copy() # create copy so we can do things to the values if center_features: df[self.features] -= df[self.features].mean() @@ -91,6 +101,19 @@ def prepare_data( self.active_features += ["intercept"] df["intercept"] = 1 + # if a state is in the states for separate model, then we add separate intercept columns for that state + # and we zero out the original intercept column for those states + for state in self.states_for_separate_model: + mask = df.postal_code == state + # if we have a postal code fixed effect then we do not want a separate intercept column for that state + # because the fixed effect takes the role of that column (ie.
those would be linearly dependent) + # but we still want to zero out the original intercept for those states. + if not 'postal_code' in self.fixed_effect_cols: + df[f'intercept_{state}'] = df['intercept'].where(mask, 0) + df.loc[mask, 'intercept'] = 0 + + # if fixed effects are on then we have redundant with the state specific intercepts + if len(self.fixed_effect_cols) > 0: df = self._expand_fixed_effects(df) diff --git a/src/elexmodel/models/BootstrapElectionModel.py b/src/elexmodel/models/BootstrapElectionModel.py index 003fe182..44dad314 100644 --- a/src/elexmodel/models/BootstrapElectionModel.py +++ b/src/elexmodel/models/BootstrapElectionModel.py @@ -80,7 +80,8 @@ def __init__(self, model_settings={}): self.z_unobserved_upper_bound = model_settings.get("z_unobserved_upper_bound", 1.5) self.z_unobserved_lower_bound = model_settings.get("z_unobserved_lower_bound", 0.5) - self.featurizer = Featurizer(self.features, self.fixed_effects) + self.states_for_separate_model = model_settings.get("states_for_separate_model", []) + self.featurizer = Featurizer(self.features, self.fixed_effects, self.states_for_separate_model) self.seed = model_settings.get("seed", 0) self.rng = np.random.default_rng(seed=self.seed) # used for sampling self.ran_bootstrap = False diff --git a/tests/handlers/test_featurizer.py b/tests/handlers/test_featurizer.py index 027284b4..d6b68be3 100644 --- a/tests/handlers/test_featurizer.py +++ b/tests/handlers/test_featurizer.py @@ -540,3 +540,30 @@ def test_generate_fixed_effects_mixed_reporting(va_governor_precinct_data): assert "county_fips" in featurizer.fixed_effect_cols assert len(featurizer.expanded_fixed_effects) == 133 - 1 + +def test_separate_state_model(): + """ + This function tests to make sure that the featurizer returns the right columns + """ + features = ["a", "b", "c"] + fixed_effects = ["fe_a", "fe_b"] + states_for_separate_model = ['CC'] + + featurizer = Featurizer(features, fixed_effects, states_for_separate_model) + + 
split_fitting_heldout = 5 + df = pd.DataFrame( + { + "postal_code": ['AA', 'AA', 'BB', 'BB', 'CC', 'CC', 'CC', 'DD'], + "a": [5, 3, 1, 5, 2, 2, 2, 2], + "b": [2, 2, 2, 2, 3, 3, 3, 3], + "c": [3, 3, 3, 3, 1, 2, 3, 4], + "d": [1, 2, 3, 4, 5, 3, 1, 5], + "fe_a": ["a", "a", "b", "c", "a", "a", "b", "d"], + "fe_b": ["1", "x", "7", "y", "1", "z", "z", "w"], + "reporting": [1, 1, 1, 1, 1, 0, 0, 0], + "unit_category": ["expected"] * 8, + } + ) + df_new = featurizer.prepare_data(df, center_features=False, scale_features=False, add_intercept=True) + import pdb; pdb.set_trace() From 3a587ae5af0ec8d3bbb09e21707e5dcafca394f1 Mon Sep 17 00:00:00 2001 From: lennybronner Date: Wed, 23 Oct 2024 19:51:01 -0400 Subject: [PATCH 2/6] unit tests work now --- src/elexmodel/handlers/data/Featurizer.py | 21 +++++-- .../models/BootstrapElectionModel.py | 5 +- tests/handlers/test_featurizer.py | 58 ++++++++++++++++++- 3 files changed, 74 insertions(+), 10 deletions(-) diff --git a/src/elexmodel/handlers/data/Featurizer.py b/src/elexmodel/handlers/data/Featurizer.py index b69c851d..376270c9 100644 --- a/src/elexmodel/handlers/data/Featurizer.py +++ b/src/elexmodel/handlers/data/Featurizer.py @@ -7,7 +7,7 @@ class Featurizer: Featurizer. 
Normalizes features, add intercept, expands fixed effects """ - def __init__(self, features: list, fixed_effects: list, states_for_separate_model: list): + def __init__(self, features: list, fixed_effects: list, states_for_separate_model: list = []): self.features = features # fixed effects can be a list, in which case every value of a fixed effect gets its own column if isinstance(fixed_effects, list): @@ -82,13 +82,20 @@ def prepare_data( this can improve the convergence of optimization algorithms if add_intercept is true an intercept column is added to the features and one fixed effect value is dropped """ + df = df.copy() # create copy so we can do things to the values + # if a state is in the states for separate model, then we add separate feature columns for that state # and we zero out the original feature column for those states + additional_state_features = [] + # if 'BB' in self.states_for_separate_model and 'CC' in self.states_for_separate_model: + # import pdb; pdb.set_trace() for state in self.states_for_separate_model: mask = df.postal_code == state for feature in self.features: - df[ f"{feature}_{state}"] = df[feature].where(mask, 0) + state_feature = f"{feature}_{state}" + df[state_feature] = df[feature].where(mask, 0) df.loc[mask, feature] = 0 + additional_state_features.append(state_feature) df = df.copy() # create copy so we can do things to the values if center_features: @@ -108,8 +115,10 @@ def prepare_data( # if we have a postal code fixed effect then we do not want a separate intercept column for that state # because the fixed effect takes the role of that column (ie. those would be linearly dependent) # but we still want to zero out the original intercept for those states. 
- if not 'postal_code' in self.fixed_effect_cols: - df[f'intercept_{state}'] = df['intercept'].where(mask, 0) + if 'postal_code' not in self.fixed_effect_cols: + state_intercept = f'intercept_{state}' + df[state_intercept] = df['intercept'].where(mask, 0) + self.complete_features.append(state_intercept) df.loc[mask, 'intercept'] = 0 # if fixed effects are on then we have redundant with the state specific intercepts @@ -163,8 +172,8 @@ def prepare_data( # all features that the model will be fit on # these are all the features + the expanded fixed effects # (so all fixed effect values in the complete data excluding the ones dropped for multicolinearity) - self.complete_features += self.features + self.expanded_fixed_effects - self.active_features += self.features + self.active_fixed_effects + self.complete_features += self.features + additional_state_features + self.expanded_fixed_effects + self.active_features += self.features + additional_state_features + self.active_fixed_effects df = df[self.complete_features] return df diff --git a/src/elexmodel/models/BootstrapElectionModel.py b/src/elexmodel/models/BootstrapElectionModel.py index 44dad314..1f56d958 100644 --- a/src/elexmodel/models/BootstrapElectionModel.py +++ b/src/elexmodel/models/BootstrapElectionModel.py @@ -81,7 +81,8 @@ def __init__(self, model_settings={}): self.z_unobserved_lower_bound = model_settings.get("z_unobserved_lower_bound", 0.5) self.states_for_separate_model = model_settings.get("states_for_separate_model", []) - self.featurizer = Featurizer(self.features, self.fixed_effects, self.states_for_separate_model) + self.featurizer = Featurizer(self.features, self.fixed_effects, states_for_separate_model=self.states_for_separate_model) + self.seed = model_settings.get("seed", 0) self.rng = np.random.default_rng(seed=self.seed) # used for sampling self.ran_bootstrap = False @@ -662,7 +663,7 @@ def _get_strata( # but like with fixed effects we drop one strata category and use the intercept instead 
so the # example would be # rural: 0, 0 urban: 1, 0 and rural: 0, 1 - strata_featurizer = Featurizer([], self.strata) + strata_featurizer = Featurizer([], self.strata, states_for_separate_model=self.states_for_separate_model) all_units = pd.concat([reporting_units, nonreporting_units], axis=0) strata_all = strata_featurizer.prepare_data( diff --git a/tests/handlers/test_featurizer.py b/tests/handlers/test_featurizer.py index d6b68be3..6d5fa0a9 100644 --- a/tests/handlers/test_featurizer.py +++ b/tests/handlers/test_featurizer.py @@ -551,7 +551,6 @@ def test_separate_state_model(): featurizer = Featurizer(features, fixed_effects, states_for_separate_model) - split_fitting_heldout = 5 df = pd.DataFrame( { "postal_code": ['AA', 'AA', 'BB', 'BB', 'CC', 'CC', 'CC', 'DD'], @@ -565,5 +564,60 @@ def test_separate_state_model(): "unit_category": ["expected"] * 8, } ) + + df_new = featurizer.prepare_data(df, center_features=False, scale_features=False, add_intercept=True) + assert df_new.loc[df.postal_code != 'CC', 'intercept'].all() == 1 + assert df_new.loc[df.postal_code == 'CC', 'intercept'].all() == 0 + assert df_new.loc[df.postal_code != 'CC', 'intercept_CC'].all() == 0 + assert df_new.loc[df.postal_code == 'CC', 'intercept_CC'].all() == 1 + + assert df_new.loc[df.postal_code != 'CC', 'a'].all() > 0 + assert df_new.loc[df.postal_code == 'CC', 'a'].all() == 0 + assert df_new.loc[df.postal_code != 'CC', 'a_CC'].all() == 0 + assert df_new.loc[df.postal_code == 'CC', 'a_CC'].all() > 0 + + assert df_new.loc[df.postal_code != 'CC', 'b'].all() > 0 + assert df_new.loc[df.postal_code == 'CC', 'b'].all() == 0 + assert df_new.loc[df.postal_code != 'CC', 'b_CC'].all() == 0 + assert df_new.loc[df.postal_code == 'CC', 'b_CC'].all() > 0 + + assert df_new.loc[df.postal_code != 'CC', 'c'].all() > 0 + assert df_new.loc[df.postal_code == 'CC', 'c'].all() == 0 + assert df_new.loc[df.postal_code != 'CC', 'c_CC'].all() == 0 + assert df_new.loc[df.postal_code == 'CC', 'c_CC'].all() > 0 + + # 
slightly more complicated, with two states + states_for_separate_model = ['BB', 'CC'] + featurizer = Featurizer(features, fixed_effects, states_for_separate_model) df_new = featurizer.prepare_data(df, center_features=False, scale_features=False, add_intercept=True) - import pdb; pdb.set_trace() + + assert df_new.loc[(df.postal_code != 'CC') & (df.postal_code != 'BB'), 'intercept'].all() == 1 + assert df_new.loc[df.postal_code == 'CC', 'intercept'].all() == 0 + assert df_new.loc[df.postal_code == 'BB', 'intercept'].all() == 0 + assert df_new.loc[(df.postal_code != 'CC') & (df.postal_code != 'BB'), 'intercept_CC'].all() == 0 + assert df_new.loc[(df.postal_code != 'CC') & (df.postal_code != 'BB'), 'intercept_BB'].all() == 0 + assert df_new.loc[df.postal_code == 'CC', 'intercept_CC'].all() == 1 + assert df_new.loc[df.postal_code == 'BB', 'intercept_BB'].all() == 1 + assert df_new.loc[df.postal_code == 'CC', 'intercept_BB'].all() == 0 + assert df_new.loc[df.postal_code == 'BB', 'intercept_CC'].all() == 0 + + assert df_new.loc[(df.postal_code != 'CC') & (df.postal_code != 'BB'), 'a'].all() > 0 + assert df_new.loc[df.postal_code == 'CC', 'a'].all() == 0 + assert df_new.loc[df.postal_code == 'BB', 'a'].all() == 0 + assert df_new.loc[(df.postal_code != 'CC') & (df.postal_code != 'BB'), 'a_CC'].all() == 0 + assert df_new.loc[(df.postal_code != 'CC') & (df.postal_code != 'BB'), 'a_BB'].all() == 0 + assert df_new.loc[df.postal_code == 'CC', 'a_CC'].all() > 0 + assert df_new.loc[df.postal_code == 'BB', 'a_BB'].all() > 0 + assert df_new.loc[df.postal_code == 'CC', 'a_BB'].all() == 0 + assert df_new.loc[df.postal_code == 'BB', 'a_CC'].all() == 0 + + # if postal code is in fixed effect, then don't add indivdual intercepts + fixed_effects = ["fe_a", "fe_b", 'postal_code'] + featurizer = Featurizer(features, fixed_effects, states_for_separate_model) + df_new = featurizer.prepare_data(df, center_features=False, scale_features=False, add_intercept=True) + + assert 
df_new.loc[(df.postal_code != 'CC') & (df.postal_code != 'BB'), 'intercept'].all() == 1 + assert df_new.loc[df.postal_code == 'CC', 'intercept'].all() == 0 + assert df_new.loc[df.postal_code == 'BB', 'intercept'].all() == 0 + assert 'intercept_BB' not in df_new.columns + assert 'intercept_CC' not in df_new.columns From 6630908200d7bd0466e15551fc4b82b2f3747b5b Mon Sep 17 00:00:00 2001 From: lennybronner Date: Wed, 23 Oct 2024 19:56:55 -0400 Subject: [PATCH 3/6] linter --- src/elexmodel/handlers/data/Featurizer.py | 12 +-- .../models/BootstrapElectionModel.py | 6 +- tests/handlers/test_featurizer.py | 95 ++++++++++--------- 3 files changed, 58 insertions(+), 55 deletions(-) diff --git a/src/elexmodel/handlers/data/Featurizer.py b/src/elexmodel/handlers/data/Featurizer.py index 376270c9..f800fead 100644 --- a/src/elexmodel/handlers/data/Featurizer.py +++ b/src/elexmodel/handlers/data/Featurizer.py @@ -88,7 +88,7 @@ def prepare_data( # and we zero out the original feature column for those states additional_state_features = [] # if 'BB' in self.states_for_separate_model and 'CC' in self.states_for_separate_model: - # import pdb; pdb.set_trace() + # import pdb; pdb.set_trace() for state in self.states_for_separate_model: mask = df.postal_code == state for feature in self.features: @@ -96,7 +96,7 @@ def prepare_data( df[state_feature] = df[feature].where(mask, 0) df.loc[mask, feature] = 0 additional_state_features.append(state_feature) - + df = df.copy() # create copy so we can do things to the values if center_features: df[self.features] -= df[self.features].mean() @@ -115,11 +115,11 @@ def prepare_data( # if we have a postal code fixed effect then we do not want a separate intercept column for that state # because the fixed effect takes the role of that column (ie. those would be linearly dependent) # but we still want to zero out the original intercept for those states. 
- if 'postal_code' not in self.fixed_effect_cols: - state_intercept = f'intercept_{state}' - df[state_intercept] = df['intercept'].where(mask, 0) + if "postal_code" not in self.fixed_effect_cols: + state_intercept = f"intercept_{state}" + df[state_intercept] = df["intercept"].where(mask, 0) self.complete_features.append(state_intercept) - df.loc[mask, 'intercept'] = 0 + df.loc[mask, "intercept"] = 0 # if fixed effects are on then we have redundant with the state specific intercepts diff --git a/src/elexmodel/models/BootstrapElectionModel.py b/src/elexmodel/models/BootstrapElectionModel.py index 1f56d958..e5969064 100644 --- a/src/elexmodel/models/BootstrapElectionModel.py +++ b/src/elexmodel/models/BootstrapElectionModel.py @@ -81,8 +81,10 @@ def __init__(self, model_settings={}): self.z_unobserved_lower_bound = model_settings.get("z_unobserved_lower_bound", 0.5) self.states_for_separate_model = model_settings.get("states_for_separate_model", []) - self.featurizer = Featurizer(self.features, self.fixed_effects, states_for_separate_model=self.states_for_separate_model) - + self.featurizer = Featurizer( + self.features, self.fixed_effects, states_for_separate_model=self.states_for_separate_model + ) + self.seed = model_settings.get("seed", 0) self.rng = np.random.default_rng(seed=self.seed) # used for sampling self.ran_bootstrap = False diff --git a/tests/handlers/test_featurizer.py b/tests/handlers/test_featurizer.py index 6d5fa0a9..448a72a8 100644 --- a/tests/handlers/test_featurizer.py +++ b/tests/handlers/test_featurizer.py @@ -541,19 +541,20 @@ def test_generate_fixed_effects_mixed_reporting(va_governor_precinct_data): assert "county_fips" in featurizer.fixed_effect_cols assert len(featurizer.expanded_fixed_effects) == 133 - 1 + def test_separate_state_model(): """ This function tests to make sure that the featurizer returns the right columns """ features = ["a", "b", "c"] fixed_effects = ["fe_a", "fe_b"] - states_for_separate_model = ['CC'] + 
states_for_separate_model = ["CC"] featurizer = Featurizer(features, fixed_effects, states_for_separate_model) df = pd.DataFrame( { - "postal_code": ['AA', 'AA', 'BB', 'BB', 'CC', 'CC', 'CC', 'DD'], + "postal_code": ["AA", "AA", "BB", "BB", "CC", "CC", "CC", "DD"], "a": [5, 3, 1, 5, 2, 2, 2, 2], "b": [2, 2, 2, 2, 3, 3, 3, 3], "c": [3, 3, 3, 3, 1, 2, 3, 4], @@ -566,58 +567,58 @@ def test_separate_state_model(): ) df_new = featurizer.prepare_data(df, center_features=False, scale_features=False, add_intercept=True) - assert df_new.loc[df.postal_code != 'CC', 'intercept'].all() == 1 - assert df_new.loc[df.postal_code == 'CC', 'intercept'].all() == 0 - assert df_new.loc[df.postal_code != 'CC', 'intercept_CC'].all() == 0 - assert df_new.loc[df.postal_code == 'CC', 'intercept_CC'].all() == 1 - - assert df_new.loc[df.postal_code != 'CC', 'a'].all() > 0 - assert df_new.loc[df.postal_code == 'CC', 'a'].all() == 0 - assert df_new.loc[df.postal_code != 'CC', 'a_CC'].all() == 0 - assert df_new.loc[df.postal_code == 'CC', 'a_CC'].all() > 0 - - assert df_new.loc[df.postal_code != 'CC', 'b'].all() > 0 - assert df_new.loc[df.postal_code == 'CC', 'b'].all() == 0 - assert df_new.loc[df.postal_code != 'CC', 'b_CC'].all() == 0 - assert df_new.loc[df.postal_code == 'CC', 'b_CC'].all() > 0 - - assert df_new.loc[df.postal_code != 'CC', 'c'].all() > 0 - assert df_new.loc[df.postal_code == 'CC', 'c'].all() == 0 - assert df_new.loc[df.postal_code != 'CC', 'c_CC'].all() == 0 - assert df_new.loc[df.postal_code == 'CC', 'c_CC'].all() > 0 + assert df_new.loc[df.postal_code != "CC", "intercept"].all() == 1 + assert df_new.loc[df.postal_code == "CC", "intercept"].all() == 0 + assert df_new.loc[df.postal_code != "CC", "intercept_CC"].all() == 0 + assert df_new.loc[df.postal_code == "CC", "intercept_CC"].all() == 1 + + assert df_new.loc[df.postal_code != "CC", "a"].all() > 0 + assert df_new.loc[df.postal_code == "CC", "a"].all() == 0 + assert df_new.loc[df.postal_code != "CC", "a_CC"].all() == 0 + 
assert df_new.loc[df.postal_code == "CC", "a_CC"].all() > 0 + + assert df_new.loc[df.postal_code != "CC", "b"].all() > 0 + assert df_new.loc[df.postal_code == "CC", "b"].all() == 0 + assert df_new.loc[df.postal_code != "CC", "b_CC"].all() == 0 + assert df_new.loc[df.postal_code == "CC", "b_CC"].all() > 0 + + assert df_new.loc[df.postal_code != "CC", "c"].all() > 0 + assert df_new.loc[df.postal_code == "CC", "c"].all() == 0 + assert df_new.loc[df.postal_code != "CC", "c_CC"].all() == 0 + assert df_new.loc[df.postal_code == "CC", "c_CC"].all() > 0 # slightly more complicated, with two states - states_for_separate_model = ['BB', 'CC'] + states_for_separate_model = ["BB", "CC"] featurizer = Featurizer(features, fixed_effects, states_for_separate_model) df_new = featurizer.prepare_data(df, center_features=False, scale_features=False, add_intercept=True) - assert df_new.loc[(df.postal_code != 'CC') & (df.postal_code != 'BB'), 'intercept'].all() == 1 - assert df_new.loc[df.postal_code == 'CC', 'intercept'].all() == 0 - assert df_new.loc[df.postal_code == 'BB', 'intercept'].all() == 0 - assert df_new.loc[(df.postal_code != 'CC') & (df.postal_code != 'BB'), 'intercept_CC'].all() == 0 - assert df_new.loc[(df.postal_code != 'CC') & (df.postal_code != 'BB'), 'intercept_BB'].all() == 0 - assert df_new.loc[df.postal_code == 'CC', 'intercept_CC'].all() == 1 - assert df_new.loc[df.postal_code == 'BB', 'intercept_BB'].all() == 1 - assert df_new.loc[df.postal_code == 'CC', 'intercept_BB'].all() == 0 - assert df_new.loc[df.postal_code == 'BB', 'intercept_CC'].all() == 0 - - assert df_new.loc[(df.postal_code != 'CC') & (df.postal_code != 'BB'), 'a'].all() > 0 - assert df_new.loc[df.postal_code == 'CC', 'a'].all() == 0 - assert df_new.loc[df.postal_code == 'BB', 'a'].all() == 0 - assert df_new.loc[(df.postal_code != 'CC') & (df.postal_code != 'BB'), 'a_CC'].all() == 0 - assert df_new.loc[(df.postal_code != 'CC') & (df.postal_code != 'BB'), 'a_BB'].all() == 0 - assert 
df_new.loc[df.postal_code == 'CC', 'a_CC'].all() > 0 - assert df_new.loc[df.postal_code == 'BB', 'a_BB'].all() > 0 - assert df_new.loc[df.postal_code == 'CC', 'a_BB'].all() == 0 - assert df_new.loc[df.postal_code == 'BB', 'a_CC'].all() == 0 + assert df_new.loc[(df.postal_code != "CC") & (df.postal_code != "BB"), "intercept"].all() == 1 + assert df_new.loc[df.postal_code == "CC", "intercept"].all() == 0 + assert df_new.loc[df.postal_code == "BB", "intercept"].all() == 0 + assert df_new.loc[(df.postal_code != "CC") & (df.postal_code != "BB"), "intercept_CC"].all() == 0 + assert df_new.loc[(df.postal_code != "CC") & (df.postal_code != "BB"), "intercept_BB"].all() == 0 + assert df_new.loc[df.postal_code == "CC", "intercept_CC"].all() == 1 + assert df_new.loc[df.postal_code == "BB", "intercept_BB"].all() == 1 + assert df_new.loc[df.postal_code == "CC", "intercept_BB"].all() == 0 + assert df_new.loc[df.postal_code == "BB", "intercept_CC"].all() == 0 + + assert df_new.loc[(df.postal_code != "CC") & (df.postal_code != "BB"), "a"].all() > 0 + assert df_new.loc[df.postal_code == "CC", "a"].all() == 0 + assert df_new.loc[df.postal_code == "BB", "a"].all() == 0 + assert df_new.loc[(df.postal_code != "CC") & (df.postal_code != "BB"), "a_CC"].all() == 0 + assert df_new.loc[(df.postal_code != "CC") & (df.postal_code != "BB"), "a_BB"].all() == 0 + assert df_new.loc[df.postal_code == "CC", "a_CC"].all() > 0 + assert df_new.loc[df.postal_code == "BB", "a_BB"].all() > 0 + assert df_new.loc[df.postal_code == "CC", "a_BB"].all() == 0 + assert df_new.loc[df.postal_code == "BB", "a_CC"].all() == 0 # if postal code is in fixed effect, then don't add indivdual intercepts - fixed_effects = ["fe_a", "fe_b", 'postal_code'] + fixed_effects = ["fe_a", "fe_b", "postal_code"] featurizer = Featurizer(features, fixed_effects, states_for_separate_model) df_new = featurizer.prepare_data(df, center_features=False, scale_features=False, add_intercept=True) - assert df_new.loc[(df.postal_code != 'CC') & 
(df.postal_code != 'BB'), 'intercept'].all() == 1 - assert df_new.loc[df.postal_code == 'CC', 'intercept'].all() == 0 - assert df_new.loc[df.postal_code == 'BB', 'intercept'].all() == 0 - assert 'intercept_BB' not in df_new.columns - assert 'intercept_CC' not in df_new.columns + assert df_new.loc[(df.postal_code != "CC") & (df.postal_code != "BB"), "intercept"].all() == 1 + assert df_new.loc[df.postal_code == "CC", "intercept"].all() == 0 + assert df_new.loc[df.postal_code == "BB", "intercept"].all() == 0 + assert "intercept_BB" not in df_new.columns + assert "intercept_CC" not in df_new.columns From 1b15de2ba2da0b5df16167384299b5624aaf3b88 Mon Sep 17 00:00:00 2001 From: lennybronner Date: Wed, 23 Oct 2024 20:04:50 -0400 Subject: [PATCH 4/6] stopping dtype check --- tests/test_client.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_client.py b/tests/test_client.py index 3b821d28..80c6a479 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -876,5 +876,5 @@ def test_get_national_summary_votes_estimates(model_client, va_governor_county_d current = model_client.get_national_summary_votes_estimates(None, 0, [0.99]) - pd.testing.assert_frame_equal(current, model_client.results_handler.final_results["nat_sum_data"]) - pd.testing.assert_frame_equal(expected_df, model_client.results_handler.final_results["nat_sum_data"]) + pd.testing.assert_frame_equal(current, model_client.results_handler.final_results["nat_sum_data"], check_dtype=False) + pd.testing.assert_frame_equal(expected_df, model_client.results_handler.final_results["nat_sum_data"], check_dtype=False) From bec329d41726d4f12d56f6926cb2322f65229a31 Mon Sep 17 00:00:00 2001 From: lennybronner Date: Wed, 23 Oct 2024 20:05:11 -0400 Subject: [PATCH 5/6] linter --- tests/test_client.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/test_client.py b/tests/test_client.py index 80c6a479..8ecb690b 100644 --- a/tests/test_client.py +++ 
b/tests/test_client.py @@ -876,5 +876,9 @@ def test_get_national_summary_votes_estimates(model_client, va_governor_county_d current = model_client.get_national_summary_votes_estimates(None, 0, [0.99]) - pd.testing.assert_frame_equal(current, model_client.results_handler.final_results["nat_sum_data"], check_dtype=False) - pd.testing.assert_frame_equal(expected_df, model_client.results_handler.final_results["nat_sum_data"], check_dtype=False) + pd.testing.assert_frame_equal( + current, model_client.results_handler.final_results["nat_sum_data"], check_dtype=False + ) + pd.testing.assert_frame_equal( + expected_df, model_client.results_handler.final_results["nat_sum_data"], check_dtype=False + ) From e1637b758822268fdd56ea0247e9925aeb390fa9 Mon Sep 17 00:00:00 2001 From: lennybronner Date: Thu, 24 Oct 2024 23:58:43 -0400 Subject: [PATCH 6/6] removed leftover pdb --- src/elexmodel/handlers/data/Featurizer.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/elexmodel/handlers/data/Featurizer.py b/src/elexmodel/handlers/data/Featurizer.py index f800fead..9e661a61 100644 --- a/src/elexmodel/handlers/data/Featurizer.py +++ b/src/elexmodel/handlers/data/Featurizer.py @@ -87,8 +87,6 @@ def prepare_data( # if a state is in the states for separate model, then we add separate feature columns for that state # and we zero out the original feature column for those states additional_state_features = [] - # if 'BB' in self.states_for_separate_model and 'CC' in self.states_for_separate_model: - # import pdb; pdb.set_trace() for state in self.states_for_separate_model: mask = df.postal_code == state for feature in self.features: