From 44e6b90973f3a49cb651bbe66525a8edec280e09 Mon Sep 17 00:00:00 2001 From: lbvienna Date: Thu, 21 Sep 2023 18:36:20 -0400 Subject: [PATCH 01/14] added new features to combined data handler, estimandizer and updated conformal election model to use new elex-solver --- src/elexmodel/handlers/data/CombinedData.py | 28 +++++++++++++--- src/elexmodel/handlers/data/Estimandizer.py | 33 +++++++++++++++---- .../models/ConformalElectionModel.py | 27 +++++++-------- tests/handlers/test_combined_data.py | 25 +++++++++----- tests/handlers/test_featurizer.py | 9 +++-- tests/test_client.py | 1 - 6 files changed, 85 insertions(+), 38 deletions(-) diff --git a/src/elexmodel/handlers/data/CombinedData.py b/src/elexmodel/handlers/data/CombinedData.py index 76fc2672..d21f64a3 100644 --- a/src/elexmodel/handlers/data/CombinedData.py +++ b/src/elexmodel/handlers/data/CombinedData.py @@ -29,6 +29,7 @@ def __init__( self.current_data = current_data self.geographic_unit_type = geographic_unit_type data = preprocessed_data.merge(current_data, how="left", on=["postal_code", "geographic_unit_fips"]) + data = estimandizer.add_turnout_factor(data) # if unreporting is 'drop' then drop units that are not reporting (ie. units where results are na) # this is necessary if units will not be returning results in this election, # but we didn't know that (ie. townships) @@ -37,7 +38,7 @@ def __init__( # Drop the whole row if an estimand is not reporting data = data.dropna(axis=0, how="any", subset=result_cols) # if unreporting is 'zero' then we set the votes for non-reporting units to zero - # this is necessary if we are worried that there is no zero state for units (ie. some precincts) + # this is necessary if we are worried that there is no zero state for units (ie. some precinct states) elif handle_unreporting == "zero": indices_with_null_val = data[result_cols].isna().any(axis=1) data.update(data[result_cols].fillna(value=0)) @@ -45,13 +46,23 @@ def __init__( self.data = data - def get_reporting_units(self, percent_reporting_threshold, features_to_normalize=[], add_intercept=True): + def get_reporting_units( + self, + percent_reporting_threshold, + turnout_factor_lower=0.5, + turnout_factor_upper=1.5, + features_to_normalize=[], + add_intercept=True, + ): """ Get reporting data. These are units where the expected vote is greater than the percent reporting threshold. """ reporting_units = self.data[self.data.percent_expected_vote >= percent_reporting_threshold].reset_index( drop=True ) + # if turnout factor less than 0.5 or greater than 1.5 assume AP made a mistake and don't treat those as reporting units + reporting_units = reporting_units[reporting_units.turnout_factor > turnout_factor_lower] + reporting_units = reporting_units[reporting_units.turnout_factor < turnout_factor_upper] # residualize + normalize for estimand in self.estimands: @@ -64,16 +75,23 @@ def get_reporting_units(self, percent_reporting_threshold, features_to_normalize return reporting_units - def get_nonreporting_units(self, percent_reporting_threshold, features_to_normalize=[], add_intercept=True): + def get_nonreporting_units( + self, + percent_reporting_threshold, + turnout_factor_lower=0.5, + turnout_factor_upper=1.5, + features_to_normalize=[], + add_intercept=True, + ): """ Get nonreporting data. These are units where expected vote is less than the percent reporting threshold """ + # if turnout factor <= turnout_factor_lower or >= turnout_factor_upper assume the AP made a mistake and treat them as non-reporting units nonreporting_units = self.data.query( - "percent_expected_vote < @percent_reporting_threshold" + "(percent_expected_vote < @percent_reporting_threshold) | (turnout_factor <= @turnout_factor_lower) | (turnout_factor >= @turnout_factor_upper)" # ).reset_index( # not checking if results.isnull() anymore across multiple estimands drop=True ) - nonreporting_units["reporting"] = int(0) nonreporting_units["expected"] = True diff --git a/src/elexmodel/handlers/data/Estimandizer.py b/src/elexmodel/handlers/data/Estimandizer.py index 64250a28..bff3e623 100644 --- a/src/elexmodel/handlers/data/Estimandizer.py +++ b/src/elexmodel/handlers/data/Estimandizer.py @@ -1,4 +1,4 @@ -from numpy import nan +import numpy as np RESULTS_PREFIX = "results_" BASELINE_PREFIX = "baseline_" @@ -11,12 +11,15 @@ class Estimandizer: def add_estimand_results(self, data_df, estimands, historical): columns_to_return = [] + turnout_col = f"{RESULTS_PREFIX}turnout" + for estimand in estimands: results_col = f"{RESULTS_PREFIX}{estimand}" + additional_columns_added = [] if results_col not in data_df.columns: # will raise a KeyError if a function with the same name as `estimand` doesn't exist try: - data_df = globals()[estimand](data_df, RESULTS_PREFIX) + data_df, additional_columns_added = globals()[estimand](data_df, RESULTS_PREFIX) except KeyError as e: if historical: # A historical run is one where we pull in data from a past election @@ -27,12 +30,21 @@ def add_estimand_results(self, data_df, estimands, historical): # and that data handler expects a results_ column for every estimand specified. # Hence, this is the only special case in which we'd want to add # an empty results_ column. - data_df[results_col] = nan + data_df[results_col] = np.nan else: # If this is not a historical run, then this is a live election # so we are expecting that there will be actual results data raise e - columns_to_return.append(results_col) + + + columns_to_return.extend([results_col] + additional_columns_added) + + # always adding turnout since we will want to generate weights + # but if turnout is the estimand, then we only want to add it once + if turnout_col not in columns_to_return: + columns_to_return.append(turnout_col) + + data_df = self.add_weights(data_df, RESULTS_PREFIX) return data_df, columns_to_return @@ -49,7 +61,7 @@ def add_estimand_baselines(self, data_df, estimand_baselines, historical, includ baseline_col = f"{BASELINE_PREFIX}{pointer}" if baseline_col not in data_df.columns: - data_df = globals()[estimand](data_df, BASELINE_PREFIX) + data_df, __ = globals()[estimand](data_df, BASELINE_PREFIX) if not historical: data_df[f"last_election_results_{estimand}"] = data_df[baseline_col].copy() + 1 @@ -62,6 +74,15 @@ def add_estimand_baselines(self, data_df, estimand_baselines, historical, includ # we need to add the results from the historical election as well. data_df, ___ = self.add_estimand_results(data_df, estimand_baselines.keys(), historical) + data_df = self.add_weights(data_df, BASELINE_PREFIX) + return data_df + + def add_weights(self, data_df, col_prefix): + data_df[f"{col_prefix}weights"] = data_df[f"{col_prefix}turnout"] + return data_df + + def add_turnout_factor(self, data_df): + data_df["turnout_factor"] = np.nan_to_num(data_df.results_weights / data_df.baseline_weights) return data_df @@ -76,4 +97,4 @@ def party_vote_share_dem(data_df, col_prefix): lambda x: 0 if x[numer] == 0 or x[denom] == 0 else x[numer] / x[denom], axis=1 ) - return data_df + return data_df, [] diff --git a/src/elexmodel/models/ConformalElectionModel.py b/src/elexmodel/models/ConformalElectionModel.py index f0a81a7b..8c240c7e 100644 --- a/src/elexmodel/models/ConformalElectionModel.py +++ b/src/elexmodel/models/ConformalElectionModel.py @@ -22,7 +22,6 @@ class ConformalElectionModel(BaseElectionModel.BaseElectionModel, ABC): def __init__(self, model_settings: dict): super(ConformalElectionModel, self).__init__(model_settings) - self.qr = QuantileRegressionSolver(solver="ECOS") self.lambda_ = model_settings.get("lambda_", 0) @classmethod @@ -56,11 +55,10 @@ def fit_model( model.fit( X, y, - tau_value=tau, + taus=tau, weights=weights, lambda_=self.lambda_, fit_intercept=self.add_intercept, - normalize_weights=normalize_weights, ) except (UserWarning, cvxpy.error.SolverError): LOG.warning("Warning: solution was inaccurate or solver broke. Re-running with normalize_weights=False.") @@ -88,10 +86,11 @@ def get_unit_predictions( x_all[self.n_train : self.n_train + n_test] # noqa: E203 ) - self.fit_model(self.qr, reporting_units_features, reporting_units_residuals, 0.5, weights, True) - self.features_to_coefficients = dict(zip(featurizer.complete_features, self.qr.coefficients)) + qr = QuantileRegressionSolver() + self.fit_model(qr, reporting_units_features, reporting_units_residuals, 0.5, weights, True) + self.features_to_coefficients = dict(zip(featurizer.complete_features, qr.coefficients)) - preds = self.qr.predict(nonreporting_units_features) + preds = qr.predict(nonreporting_units_features.values).flatten() # multiply by total voters to get unnormalized residuals preds = preds * nonreporting_units[f"last_election_results_{estimand}"] @@ -150,10 +149,10 @@ def get_unit_prediction_interval_bounds( train_data_weights = train_data[f"last_election_results_{estimand}"] # fit lower and upper model to training data. ECOS solver is better than SCS. - lower_qr = QuantileRegressionSolver(solver="ECOS") + lower_qr = QuantileRegressionSolver() self.fit_model(lower_qr, train_data_features, train_data_residuals, lower_bound, train_data_weights, True) - upper_qr = QuantileRegressionSolver(solver="ECOS") + upper_qr = QuantileRegressionSolver() self.fit_model(upper_qr, train_data_features, train_data_residuals, upper_bound, train_data_weights, True) # apply to conformalization data. Conformalization bounds will later tell us how much to adjust lower/upper @@ -169,10 +168,12 @@ def get_unit_prediction_interval_bounds( # we are interested in f(X) - r # since later conformity scores care about deviation of bounds from residuals conformalization_lower_bounds = ( - lower_qr.predict(conformalization_data_features) - conformalization_data[f"residuals_{estimand}"].values + lower_qr.predict(conformalization_data_features.values).flatten() + - conformalization_data[f"residuals_{estimand}"].values ) - conformalization_upper_bounds = conformalization_data[f"residuals_{estimand}"].values - upper_qr.predict( - conformalization_data_features + conformalization_upper_bounds = ( + conformalization_data[f"residuals_{estimand}"].values + - upper_qr.predict(conformalization_data_features.values).flatten() ) # save conformalization bounds for later @@ -185,8 +186,8 @@ def get_unit_prediction_interval_bounds( # are the same accross train_data, conformalization_data and nonreporting_units nonreporting_units_features = interval_featurizer.generate_holdout_data(x_all[self.n_train :]) # noqa: E203 - nonreporting_lower_bounds = lower_qr.predict(nonreporting_units_features) - nonreporting_upper_bounds = upper_qr.predict(nonreporting_units_features) + nonreporting_lower_bounds = lower_qr.predict(nonreporting_units_features.values).flatten() + nonreporting_upper_bounds = upper_qr.predict(nonreporting_units_features.values).flatten() return PredictionIntervals(nonreporting_lower_bounds, nonreporting_upper_bounds, conformalization_data) diff --git a/tests/handlers/test_combined_data.py b/tests/handlers/test_combined_data.py index 784c491b..111932c2 100644 --- a/tests/handlers/test_combined_data.py +++ b/tests/handlers/test_combined_data.py @@ -7,16 +7,25 @@ def test_load(va_governor_county_data): + election_id = "2017-11-07_VA_G" + office_id = "G" + geographic_unit_type = "county" estimands = ["turnout"] + estimand_baselines = {"turnout": "turnout"} live_data_handler = MockLiveDataHandler( - "2017-11-07_VA_G", "G", "county", estimands=["turnout"], data=va_governor_county_data + election_id, office_id, geographic_unit_type, estimands=["turnout"], data=va_governor_county_data ) current_data = live_data_handler.data + preprocessed_data_handler = PreprocessedDataHandler( + election_id, office_id, geographic_unit_type, estimand_baselines=estimand_baselines, estimands=["turnout"], data=va_governor_county_data + ) + preprocessed_data = preprocessed_data_handler.data combined_data_handler = CombinedDataHandler( - va_governor_county_data, current_data, estimands, "county", handle_unreporting="drop" + preprocessed_data, current_data, estimands, "county", handle_unreporting="drop" ) - assert combined_data_handler.data.shape == (133, 29) + + assert combined_data_handler.data.shape == (133, 33) def test_zero_unreporting_missing_single_estimand_value(va_governor_county_data): @@ -37,7 +46,7 @@ def test_zero_unreporting_missing_single_estimand_value(va_governor_county_data) assert ( combined_data_handler.data["percent_expected_vote"].iloc[0] == 0 ) # percent expected vote with na result has been set to zero - assert combined_data_handler.data.shape == (133, 31) # didn't drop any + assert combined_data_handler.data.shape == (133, 35) # didn't drop any assert combined_data_handler.data["results_dem"].iloc[1] != 0 # didn't accidentally set other to zero @@ -58,7 +67,7 @@ def test_zero_unreporting_missing_multiple_estimands_value(va_governor_county_da assert combined_data_handler.data["results_dem"].iloc[0] == 0.0 assert combined_data_handler.data["results_turnout"].iloc[0] == 0.0 assert combined_data_handler.data["percent_expected_vote"].iloc[0] == 0.0 - assert combined_data_handler.data.shape == (133, 31) + assert combined_data_handler.data.shape == (133, 35) assert combined_data_handler.data["results_dem"].iloc[1] != 0 # didn't accidentally set other to zero assert combined_data_handler.data["results_turnout"].iloc[1] != 0 # didn't accidentally set other to zero @@ -79,7 +88,7 @@ def test_zero_unreporting_missing_percent_expected_vote_value(va_governor_county ) assert combined_data_handler.data["results_dem"].iloc[0] == 0.0 assert combined_data_handler.data["percent_expected_vote"].iloc[0] == 0.0 - assert combined_data_handler.data.shape == (133, 31) + assert combined_data_handler.data.shape == (133, 35) assert combined_data_handler.data["results_dem"].iloc[1] != 0 # didn't accidentally set other to zero @@ -98,7 +107,7 @@ def test_zero_unreporting_random_percent_expected_vote_value(va_governor_county_ ) assert combined_data_handler.data["results_dem"].iloc[0] == 0.0 # all values set to 0.0 assert combined_data_handler.data["percent_expected_vote"].iloc[0] == 0.0 - assert combined_data_handler.data.shape == (133, 31) + assert combined_data_handler.data.shape == (133, 35) assert combined_data_handler.data["results_dem"].iloc[1] != 0 # didn't accidentally set other to zero @@ -115,7 +124,7 @@ def test_drop_unreporting_missing_single_estimand_value(va_governor_county_data) combined_data_handler = CombinedDataHandler( va_governor_county_data, current_data, estimands, "county", handle_unreporting="drop" ) - assert combined_data_handler.data.shape == (132, 31) # dropped one + assert combined_data_handler.data.shape == (132, 35) # dropped one assert combined_data_handler.data["results_dem"].iloc[0] != 0 # didn't accidentally set other to zero diff --git a/tests/handlers/test_featurizer.py b/tests/handlers/test_featurizer.py index 2b235195..64e1fe16 100644 --- a/tests/handlers/test_featurizer.py +++ b/tests/handlers/test_featurizer.py @@ -360,8 +360,7 @@ def test_generate_fixed_effects(va_governor_county_data): reporting_data_features = featurizer.filter_to_active_features(x_all[:n_train]) nonreporting_data_features = featurizer.generate_holdout_data(x_all[n_train:]) - assert combined_data_handler.data.shape == (133, 32) - + assert combined_data_handler.data.shape == (133, 35) n_expected_columns = 6 # (6 - 1) fixed effects + 1 intercept assert reporting_data_features.shape == (133, n_expected_columns) assert nonreporting_data_features.shape == (0, n_expected_columns) @@ -394,7 +393,7 @@ def test_generate_fixed_effects(va_governor_county_data): reporting_data_features = featurizer.filter_to_active_features(x_all[:n_train]) nonreporting_data_features = featurizer.generate_holdout_data(x_all[n_train:]) - assert combined_data_handler.data.shape == (133, 32) + assert combined_data_handler.data.shape == (133, 35) n_expected_columns = 138 # (6 - 1) + (133 - 1) fixed effects + 1 intercept assert reporting_data_features.shape == (133, n_expected_columns) @@ -451,7 +450,7 @@ def test_generate_fixed_effects_not_all_reporting(va_governor_county_data): reporting_data_features = featurizer.filter_to_active_features(x_all[:n_train]) nonreporting_data_features = featurizer.generate_holdout_data(x_all[n_train:]) - assert combined_data_handler.data.shape == (133, 32) + assert combined_data_handler.data.shape == (133, 35) n_expected_columns = (n - 1) + 1 # minus 1 for dropped fixed effect, plus 1 for intercept assert reporting_data_features.shape == (n, n_expected_columns) @@ -519,7 +518,7 @@ def test_generate_fixed_effects_mixed_reporting(va_governor_precinct_data): reporting_data_features = featurizer.filter_to_active_features(x_all[:n_train]) nonreporting_data_features = featurizer.generate_holdout_data(x_all[n_train:]) - assert combined_data_handler.data.shape == (2360, 32) + assert combined_data_handler.data.shape == (2360, 35) n_expected_columns = 7 # when n = 100 we get to county 51013 (minus dropped fixed effect, plus intercept) assert reporting_data_features.shape == (n_train, n_expected_columns) # use n_train since dropping columns diff --git a/tests/test_client.py b/tests/test_client.py index 837d14b9..802d20b4 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -785,7 +785,6 @@ def test_estimandizer_input(model_client, va_governor_county_data, va_config): preprocessed_data = va_governor_county_data.copy() preprocessed_data["last_election_results_turnout"] = preprocessed_data["baseline_turnout"].copy() + 1 - try: model_client.get_estimates( data, From 57a2d0d65d9f62210adec9c84570d8d86732b6db Mon Sep 17 00:00:00 2001 From: lbvienna Date: Fri, 22 Sep 2023 10:05:59 -0400 Subject: [PATCH 02/14] unit tests pass now --- src/elexmodel/handlers/data/Estimandizer.py | 3 +- tests/conftest.py | 14 +-- tests/handlers/test_combined_data.py | 117 +++++++++++++++--- tests/handlers/test_estimandizer.py | 4 +- .../test_nonparametric_election_model.py | 6 +- 5 files changed, 114 insertions(+), 30 deletions(-) diff --git a/src/elexmodel/handlers/data/Estimandizer.py b/src/elexmodel/handlers/data/Estimandizer.py index bff3e623..1f5c43cb 100644 --- a/src/elexmodel/handlers/data/Estimandizer.py +++ b/src/elexmodel/handlers/data/Estimandizer.py @@ -36,14 +36,13 @@ def add_estimand_results(self, data_df, estimands, historical): # so we are expecting that there will be actual results data raise e - columns_to_return.extend([results_col] + additional_columns_added) # always adding turnout since we will want to generate weights # but if turnout is the estimand, then we only want to add it once if turnout_col not in columns_to_return: columns_to_return.append(turnout_col) - + data_df = self.add_weights(data_df, RESULTS_PREFIX) return data_df, columns_to_return diff --git a/tests/conftest.py b/tests/conftest.py index 1760a21b..871879e4 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -66,43 +66,43 @@ def conformal_election_model(): return ConformalElectionModel.ConformalElectionModel(model_settings) -@pytest.fixture(scope="session") +@pytest.fixture(scope="function") def va_config(get_fixture): path = os.path.join("config", "2017-11-07_VA_G.json") return get_fixture(path, load=True, pandas=False) -@pytest.fixture(scope="session") +@pytest.fixture(scope="function") def tx_primary_governor_config(get_fixture): path = os.path.join("config", "2018-03-06_TX_R.json") return get_fixture(path, load=True, pandas=False) -@pytest.fixture(scope="session") +@pytest.fixture(scope="function") def va_governor_precinct_data(get_fixture): path = os.path.join("data", "2017-11-07_VA_G", "G", "data_precinct.csv") return get_fixture(path, load=False, pandas=True) -@pytest.fixture(scope="module") +@pytest.fixture(scope="function") def va_governor_county_data(get_fixture): path = os.path.join("data", "2017-11-07_VA_G", "G", "data_county.csv") return get_fixture(path, load=False, pandas=True) -@pytest.fixture(scope="session") +@pytest.fixture(scope="function") def va_assembly_county_data(get_fixture): path = os.path.join("data", "2017-11-07_VA_G", "Y", "data_county-district.csv") return get_fixture(path, load=False, pandas=True) -@pytest.fixture(scope="session") +@pytest.fixture(scope="function") def va_assembly_precinct_data(get_fixture): path = os.path.join("data", "2017-11-07_VA_G", "Y", "data_precinct-district.csv") return get_fixture(path, load=False, pandas=True) -@pytest.fixture(scope="session") +@pytest.fixture(scope="function") def az_assembly_precinct_data(get_fixture): path = os.path.join("data", "2020-08-04_AZ_R", "S", "data_precinct.csv") return get_fixture(path, load=False, pandas=True) diff --git a/tests/handlers/test_combined_data.py b/tests/handlers/test_combined_data.py index 111932c2..1888a121 100644 --- a/tests/handlers/test_combined_data.py +++ b/tests/handlers/test_combined_data.py @@ -17,7 +17,12 @@ def test_load(va_governor_county_data): ) current_data = live_data_handler.data preprocessed_data_handler = PreprocessedDataHandler( - election_id, office_id, geographic_unit_type, estimand_baselines=estimand_baselines, estimands=["turnout"], data=va_governor_county_data + election_id, + office_id, + geographic_unit_type, + estimand_baselines=estimand_baselines, + estimands=estimands, + data=va_governor_county_data, ) preprocessed_data = preprocessed_data_handler.data @@ -32,21 +37,37 @@ def test_zero_unreporting_missing_single_estimand_value(va_governor_county_data) """ Set the value for one estimand (dem) as na to test unreporting = "zero" """ + election_id = "2017-11-07_VA_G" + office_id = "G" + geographic_unit_type = "county" + estimand_baselines = {"turnout": "turnout", "dem": "dem"} estimands = ["turnout", "dem"] - live_data_handler = MockLiveDataHandler("2017-11-07_VA_G", "G", "county", estimands, data=va_governor_county_data) + live_data_handler = MockLiveDataHandler( + election_id, office_id, geographic_unit_type, estimands, data=va_governor_county_data + ) current_data = live_data_handler.data current_data["percent_expected_vote"] = 100 current_data.loc[0, "results_dem"] = np.nan + preprocessed_data_handler = PreprocessedDataHandler( + election_id, + office_id, + geographic_unit_type, + estimand_baselines=estimand_baselines, + estimands=estimands, + data=va_governor_county_data, + ) + preprocessed_data = preprocessed_data_handler.data + combined_data_handler = CombinedDataHandler( - va_governor_county_data, current_data, estimands, "county", handle_unreporting="zero" + preprocessed_data, current_data, estimands, "county", handle_unreporting="zero" ) assert combined_data_handler.data["results_dem"].iloc[0] == 0.0 # value with na result has been set to zero assert combined_data_handler.data["results_turnout"].iloc[0] != 0 # has not been set to zero assert ( combined_data_handler.data["percent_expected_vote"].iloc[0] == 0 ) # percent expected vote with na result has been set to zero - assert combined_data_handler.data.shape == (133, 35) # didn't drop any + assert combined_data_handler.data.shape == (133, 36) # didn't drop any assert combined_data_handler.data["results_dem"].iloc[1] != 0 # didn't accidentally set other to zero @@ -54,20 +75,36 @@ def test_zero_unreporting_missing_multiple_estimands_value(va_governor_county_da """ Set the value for multiple estimands (dem, turnout) as na to test unreporting = "zero" """ + election_id = "2017-11-07_VA_G" + office_id = "G" + geographic_unit_type = "county" + estimand_baselines = {"turnout": "turnout", "dem": "dem"} estimands = ["turnout", "dem"] - live_data_handler = MockLiveDataHandler("2017-11-07_VA_G", "G", "county", estimands, data=va_governor_county_data) + live_data_handler = MockLiveDataHandler( + election_id, office_id, geographic_unit_type, estimands, data=va_governor_county_data + ) current_data = live_data_handler.data current_data["percent_expected_vote"] = 100 current_data.loc[0, "results_dem"] = np.nan current_data.loc[0, "results_turnout"] = np.nan + preprocessed_data_handler = PreprocessedDataHandler( + election_id, + office_id, + geographic_unit_type, + estimand_baselines=estimand_baselines, + estimands=estimands, + data=va_governor_county_data, + ) + preprocessed_data = preprocessed_data_handler.data + combined_data_handler = CombinedDataHandler( - va_governor_county_data, current_data, estimands, "county", handle_unreporting="zero" + preprocessed_data, current_data, estimands, "county", handle_unreporting="zero" ) assert combined_data_handler.data["results_dem"].iloc[0] == 0.0 assert combined_data_handler.data["results_turnout"].iloc[0] == 0.0 assert combined_data_handler.data["percent_expected_vote"].iloc[0] == 0.0 - assert combined_data_handler.data.shape == (133, 35) + assert combined_data_handler.data.shape == (133, 36) assert combined_data_handler.data["results_dem"].iloc[1] != 0 # didn't accidentally set other to zero assert combined_data_handler.data["results_turnout"].iloc[1] != 0 # didn't accidentally set other to zero @@ -76,19 +113,35 @@ def test_zero_unreporting_missing_percent_expected_vote_value(va_governor_county """ Set the value and percent reporting for one estimand (dem) as na to test unreporting = "zero" """ + election_id = "2017-11-07_VA_G" + office_id = "G" + geographic_unit_type = "county" + estimand_baselines = {"turnout": "turnout", "dem": "dem"} estimands = ["turnout", "dem"] - live_data_handler = MockLiveDataHandler("2017-11-07_VA_G", "G", "county", estimands, data=va_governor_county_data) + live_data_handler = MockLiveDataHandler( + election_id, office_id, geographic_unit_type, estimands, data=va_governor_county_data + ) current_data = live_data_handler.data current_data["percent_expected_vote"] = 100 current_data.loc[0, "percent_expected_vote"] = np.nan current_data.loc[0, "results_dem"] = np.nan + preprocessed_data_handler = PreprocessedDataHandler( + election_id, + office_id, + geographic_unit_type, + estimand_baselines=estimand_baselines, + estimands=estimands, + data=va_governor_county_data, + ) + preprocessed_data = preprocessed_data_handler.data + combined_data_handler = CombinedDataHandler( - va_governor_county_data, current_data, estimands, "county", handle_unreporting="zero" + preprocessed_data, current_data, estimands, "county", handle_unreporting="zero" ) assert combined_data_handler.data["results_dem"].iloc[0] == 0.0 assert combined_data_handler.data["percent_expected_vote"].iloc[0] == 0.0 - assert combined_data_handler.data.shape == (133, 35) + assert combined_data_handler.data.shape == (133, 36) assert combined_data_handler.data["results_dem"].iloc[1] != 0 # didn't accidentally set other to zero @@ -96,18 +149,34 @@ def test_zero_unreporting_random_percent_expected_vote_value(va_governor_county_ """ Set the value for one estimand (dem) as na to test unreporting = "zero" """ + election_id = "2017-11-07_VA_G" + office_id = "G" + geographic_unit_type = "county" + estimand_baselines = {"turnout": "turnout", "dem": "dem"} estimands = ["turnout", "dem"] - live_data_handler = MockLiveDataHandler("2017-11-07_VA_G", "G", "county", estimands, data=va_governor_county_data) + live_data_handler = MockLiveDataHandler( + election_id, office_id, geographic_unit_type, estimands, data=va_governor_county_data + ) current_data = live_data_handler.data current_data["percent_expected_vote"] = np.random.randint(1, 100, current_data.shape[0]) current_data.loc[0, "results_dem"] = np.nan + preprocessed_data_handler = PreprocessedDataHandler( + election_id, + office_id, + geographic_unit_type, + estimand_baselines=estimand_baselines, + estimands=estimands, + data=va_governor_county_data, + ) + preprocessed_data = preprocessed_data_handler.data + combined_data_handler = CombinedDataHandler( - va_governor_county_data, current_data, estimands, "county", handle_unreporting="zero" + preprocessed_data, current_data, estimands, "county", handle_unreporting="zero" ) assert combined_data_handler.data["results_dem"].iloc[0] == 0.0 # all values set to 0.0 assert combined_data_handler.data["percent_expected_vote"].iloc[0] == 0.0 - assert combined_data_handler.data.shape == (133, 35) + assert combined_data_handler.data.shape == (133, 36) assert combined_data_handler.data["results_dem"].iloc[1] != 0 # didn't accidentally set other to zero @@ -115,16 +184,32 @@ def test_drop_unreporting_missing_single_estimand_value(va_governor_county_data) """ Set the value for one estimand (dem) as na to test unreporting = "drop" """ + election_id = "2017-11-07_VA_G" + office_id = "G" + geographic_unit_type = "county" + estimand_baselines = {"turnout": "turnout", "dem": "dem"} estimands = ["turnout", "dem"] - live_data_handler = MockLiveDataHandler("2017-11-07_VA_G", "G", "county", estimands, data=va_governor_county_data) + live_data_handler = MockLiveDataHandler( + election_id, office_id, geographic_unit_type, estimands, data=va_governor_county_data + ) current_data = live_data_handler.data current_data["percent_expected_vote"] = 100 current_data.loc[0, "results_dem"] = np.nan + preprocessed_data_handler = PreprocessedDataHandler( + election_id, + office_id, + geographic_unit_type, + estimand_baselines=estimand_baselines, + estimands=estimands, + data=va_governor_county_data, + ) + preprocessed_data = preprocessed_data_handler.data + combined_data_handler = CombinedDataHandler( - va_governor_county_data, current_data, estimands, "county", handle_unreporting="drop" + preprocessed_data, current_data, estimands, "county", handle_unreporting="drop" ) - assert combined_data_handler.data.shape == (132, 35) # dropped one + assert combined_data_handler.data.shape == (132, 36) # dropped one assert combined_data_handler.data["results_dem"].iloc[0] != 0 # didn't accidentally set other to zero diff --git a/tests/handlers/test_estimandizer.py b/tests/handlers/test_estimandizer.py index 91be637a..5b476d09 100644 --- a/tests/handlers/test_estimandizer.py +++ b/tests/handlers/test_estimandizer.py @@ -13,7 +13,7 @@ def test_add_estimand_results_not_historical(va_governor_county_data): (output_df, result_columns) = estimandizer.add_estimand_results(va_data_copy, estimands, False) assert "results_party_vote_share_dem" in output_df.columns - assert result_columns == ["results_party_vote_share_dem"] + assert result_columns == ["results_party_vote_share_dem", "results_turnout"] def test_add_estimand_results_historical(va_governor_county_data): @@ -27,7 +27,7 @@ def test_add_estimand_results_historical(va_governor_county_data): (output_df, result_columns) = estimandizer.add_estimand_results(va_data_copy, estimands, True) assert "results_party_vote_share_dem" in output_df.columns - assert result_columns == ["results_party_vote_share_dem"] + assert result_columns == ["results_party_vote_share_dem", "results_turnout"] def test_add_estimand_baselines_not_historical(va_governor_county_data): diff --git a/tests/models/test_nonparametric_election_model.py b/tests/models/test_nonparametric_election_model.py index ac7b37e6..7688eea9 100644 --- a/tests/models/test_nonparametric_election_model.py +++ b/tests/models/test_nonparametric_election_model.py @@ -408,7 +408,7 @@ def test_fit_model(): """ model_settings = {} model = NonparametricElectionModel.NonparametricElectionModel(model_settings) - qr = QuantileRegressionSolver(solver="ECOS") + qr = QuantileRegressionSolver() df_X = pd.DataFrame({"a": [1, 1, 1, 1], "b": [1, 1, 1, 2]}) @@ -416,8 +416,8 @@ def test_fit_model(): weights = pd.DataFrame({"weights": [1, 1, 1, 1]}).weights model.fit_model(qr, df_X, df_y, 0.5, weights, True) - assert all(np.abs(qr.predict(df_X) - [8, 8, 8, 15]) <= TOL) - assert all(np.abs(qr.coefficients - [1, 7]) <= TOL) + np.testing.assert_allclose(qr.predict(df_X), [[8, 8, 8, 15]], rtol=TOL) + np.testing.assert_allclose(qr.coefficients, [[1, 7]], rtol=TOL) def test_get_unit_predictions(): From 9145e8053949fcf963b5ffe5203171d08d2b737b Mon Sep 17 00:00:00 2001 From: lbvienna Date: Fri, 22 Sep 2023 10:11:03 -0400 Subject: [PATCH 03/14] resetting index to avoid copy warning in pandas --- src/elexmodel/models/ConformalElectionModel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/elexmodel/models/ConformalElectionModel.py b/src/elexmodel/models/ConformalElectionModel.py index 8c240c7e..a0c8ae72 100644 --- a/src/elexmodel/models/ConformalElectionModel.py +++ b/src/elexmodel/models/ConformalElectionModel.py @@ -157,7 +157,7 @@ def get_unit_prediction_interval_bounds( # apply to conformalization data. Conformalization bounds will later tell us how much to adjust lower/upper # bounds for nonreporting data. - conformalization_data = reporting_units_shuffled[train_rows:] + conformalization_data = reporting_units_shuffled[train_rows:].reset_index(drop=True) # all_data starts with reporting_units_shuffled, so the rows between train_rows and n_train are the # conformalization set From b8605113567086a6d423b3b7495d84b6444f2de6 Mon Sep 17 00:00:00 2001 From: lbvienna Date: Fri, 22 Sep 2023 10:28:53 -0400 Subject: [PATCH 04/14] updated combined data unit tests --- tests/handlers/test_combined_data.py | 84 ++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) diff --git a/tests/handlers/test_combined_data.py b/tests/handlers/test_combined_data.py index 1888a121..674c988d 100644 --- a/tests/handlers/test_combined_data.py +++ b/tests/handlers/test_combined_data.py @@ -238,6 +238,90 @@ def test_get_reporting_data(va_governor_county_data): assert observed_data.reporting.sum() == 20 +def test_get_reporting_data_dropping_with_turnout_factor(va_governor_county_data): + election_id = "2017-11-07_VA_G" + office = "G" + geographic_unit_type = "county" + estimands = ["turnout"] + estimand_baseline = {"turnout": "turnout"} + + live_data_handler = MockLiveDataHandler( + election_id, office, geographic_unit_type, estimands, data=va_governor_county_data + ) + current_data = live_data_handler.get_n_fully_reported(n=20) + preprocessed_data_handler = PreprocessedDataHandler( + election_id, office, geographic_unit_type, estimands, estimand_baseline, data=va_governor_county_data + ) + + combined_data_handler = CombinedDataHandler( + preprocessed_data_handler.data, current_data, estimands, geographic_unit_type + ) + + turnout_factor_lower = 0.95 + turnout_factor_upper = 1.2 + reporting_units_above_turnout_factor_threshold = combined_data_handler.data[ + combined_data_handler.data.turnout_factor > turnout_factor_upper + ].shape[0] + reporting_units_below_turnout_factor_threshold = combined_data_handler.data[ + (combined_data_handler.data.percent_expected_vote == 100) + & (combined_data_handler.data.turnout_factor < turnout_factor_lower) + ].shape[0] + + observed_data = combined_data_handler.get_reporting_units( + 100, turnout_factor_lower=turnout_factor_lower, turnout_factor_upper=turnout_factor_upper + ) + + # 20 units should be reporting, but the additional ones are dropped to nonreporting because they are above/below threshold + # and so are subtracted from the reporting ones + assert observed_data.shape[0] == 20 - ( + reporting_units_above_turnout_factor_threshold + reporting_units_below_turnout_factor_threshold + ) + + +def test_get_nonreporting_adding_with_turnout_factor(va_governor_county_data): + election_id = "2017-11-07_VA_G" + office = "G" + geographic_unit_type = "county" + estimands = ["turnout"] + estimand_baseline = {"turnout": "turnout"} + + live_data_handler = MockLiveDataHandler( + election_id, office, geographic_unit_type, estimands, data=va_governor_county_data + ) + n = 20 + current_data = live_data_handler.get_n_fully_reported(n=n) + preprocessed_data_handler = PreprocessedDataHandler( + election_id, office, geographic_unit_type, estimands, estimand_baseline, data=va_governor_county_data + ) + + combined_data_handler = CombinedDataHandler( + preprocessed_data_handler.data, current_data, estimands, geographic_unit_type + ) + + turnout_factor_lower = 0.95 + turnout_factor_upper = 1.2 + + reporting_units_above_turnout_factor_threshold = combined_data_handler.data[ + combined_data_handler.data.turnout_factor > turnout_factor_upper + ].shape[0] + reporting_units_below_turnout_factor_threshold = combined_data_handler.data[ + (combined_data_handler.data.percent_expected_vote == 100) + & (combined_data_handler.data.turnout_factor < turnout_factor_lower) + ].shape[0] + + nonreporting_data = combined_data_handler.get_nonreporting_units( + 100, turnout_factor_lower=turnout_factor_lower, turnout_factor_upper=turnout_factor_upper + ) + + assert ( + nonreporting_data.shape[0] + == va_governor_county_data.shape[0] + - n + + reporting_units_above_turnout_factor_threshold + + reporting_units_below_turnout_factor_threshold + ) + + def test_get_unexpected_units_county_district(va_assembly_county_data): election_id = "2017-11-07_VA_G" office = "Y" From 440a7e06b1cabd0fe2607d78f203dc232808762a Mon Sep 17 00:00:00 2001 From: lbvienna Date: Fri, 22 Sep 2023 10:49:31 -0400 Subject: [PATCH 05/14] updated estimandizer unit tests --- src/elexmodel/handlers/data/Estimandizer.py | 8 ++++++- tests/handlers/test_estimandizer.py | 26 +++++++++++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/src/elexmodel/handlers/data/Estimandizer.py b/src/elexmodel/handlers/data/Estimandizer.py index 1f5c43cb..bfa8ec98 100644 --- a/src/elexmodel/handlers/data/Estimandizer.py +++ b/src/elexmodel/handlers/data/Estimandizer.py @@ -81,7 +81,13 @@ def add_weights(self, data_df, col_prefix): return data_df def add_turnout_factor(self, data_df): - data_df["turnout_factor"] = np.nan_to_num(data_df.results_weights / data_df.baseline_weights) + # posinf and neginf are also set to zero because dividing by zero can lead to nan/posinf/neginf depending + # on the type of the numeric in the numpy array. Assume that if baseline_weights is zero then turnout + # would be incredibly low in this election too (ie. this is effectively an empty precinct) and so setting + # the turnout factor to zero is fine + data_df["turnout_factor"] = np.nan_to_num( + data_df.results_weights / data_df.baseline_weights, nan=0, posinf=0, neginf=0 + ) return data_df diff --git a/tests/handlers/test_estimandizer.py b/tests/handlers/test_estimandizer.py index 5b476d09..23509c9f 100644 --- a/tests/handlers/test_estimandizer.py +++ b/tests/handlers/test_estimandizer.py @@ -1,3 +1,5 @@ +import pytest + from elexmodel.handlers.data.Estimandizer import Estimandizer @@ -13,6 +15,7 @@ def test_add_estimand_results_not_historical(va_governor_county_data): (output_df, result_columns) = estimandizer.add_estimand_results(va_data_copy, estimands, False) assert "results_party_vote_share_dem" in output_df.columns + assert "results_weights" in output_df.columns assert result_columns == ["results_party_vote_share_dem", "results_turnout"] @@ -27,6 +30,7 @@ def test_add_estimand_results_historical(va_governor_county_data): (output_df, result_columns) = estimandizer.add_estimand_results(va_data_copy, estimands, True) assert "results_party_vote_share_dem" in output_df.columns + assert "results_weights" in output_df.columns assert result_columns == ["results_party_vote_share_dem", "results_turnout"] @@ -34,6 +38,7 @@ def test_add_estimand_baselines_not_historical(va_governor_county_data): estimand_baselines = {"turnout": "turnout", "party_vote_share_dem": "party_vote_share_dem"} estimandizer = Estimandizer() output_df = estimandizer.add_estimand_baselines(va_governor_county_data.copy(), estimand_baselines, False) + assert "baseline_weights" in output_df.columns assert "baseline_party_vote_share_dem" in output_df.columns assert "last_election_results_party_vote_share_dem" in output_df.columns @@ -45,5 +50,26 @@ def test_add_estimand_baselines_historical(va_governor_county_data): va_governor_county_data.copy(), estimand_baselines, True, include_results_estimand=True ) assert "baseline_party_vote_share_dem" in output_df.columns + assert "baseline_weights" in output_df.columns assert "results_party_vote_share_dem" in output_df.columns assert "last_election_results_party_vote_share_dem" not in output_df.columns + + +def test_add_turnout_factor(va_governor_county_data): + estimands = ["party_vote_share_dem", "turnout"] + estimand_baselines = {"turnout": "turnout", "party_vote_share_dem": "party_vote_share_dem"} + estimandizer = Estimandizer() + output_df = estimandizer.add_estimand_baselines( + va_governor_county_data.copy(), estimand_baselines, False, include_results_estimand=False + ) + output_df, __ = estimandizer.add_estimand_results(output_df, estimands, False) + + # check that nan turns into 0 + output_df.loc[0, "baseline_weights"] = 0.0 + import pdb + + pdb.set_trace() + output_df = estimandizer.add_turnout_factor(output_df) + + assert "turnout_factor" in output_df.columns + assert 0 == pytest.approx(output_df.loc[0, "turnout_factor"]) From 1effc1116fe0be5954161d4605ee6eb7aa5d59ff Mon Sep 17 00:00:00 2001 From: lbvienna Date: Mon, 25 Sep 2023 11:49:08 -0400 Subject: [PATCH 06/14] updated elex-solver version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index f4507b08..6742c3bf 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import find_packages, setup -INSTALL_REQUIRES = ("click<8.1", "elex-solver<2", "pandas<1.5.0", "boto3<2", "python-dotenv==0.19.2", "scipy==1.10.1") +INSTALL_REQUIRES = ("click<8.1", "elex-solver<3", "pandas<1.5.0", "boto3<2", "python-dotenv==0.19.2", "scipy==1.10.1") THIS_FILE_DIR = os.path.dirname(__file__) From bf0ed06bbf390239c2ca170f4bdbb840615b1124 Mon Sep 17 00:00:00 2001 From: lbvienna Date: Mon, 25 Sep 2023 11:56:09 -0400 Subject: [PATCH 07/14] updated unit test --- tests/handlers/test_estimandizer.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/handlers/test_estimandizer.py b/tests/handlers/test_estimandizer.py index 23509c9f..d2321fcc 100644 --- a/tests/handlers/test_estimandizer.py +++ b/tests/handlers/test_estimandizer.py @@ -66,9 +66,6 @@ def test_add_turnout_factor(va_governor_county_data): # check that nan turns into 0 output_df.loc[0, "baseline_weights"] = 0.0 - import pdb - - pdb.set_trace() output_df = estimandizer.add_turnout_factor(output_df) assert "turnout_factor" in output_df.columns From bc81f0e50ac8cdfee2da3382ebd195111e935aed Mon Sep 17 00:00:00 2001 From: lbvienna Date: Mon, 25 Sep 2023 12:22:40 -0400 Subject: [PATCH 08/14] updated historical --- src/elexmodel/client.py | 4 ++-- src/elexmodel/handlers/data/Estimandizer.py | 8 +++++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/elexmodel/client.py b/src/elexmodel/client.py index 6cd8b308..db83feee 100644 --- a/src/elexmodel/client.py +++ b/src/elexmodel/client.py @@ -416,8 +416,8 @@ def _format_historical_current_data( historical=True, include_results_estimand=True, ) - - results_to_return = [f"results_{estimand}" for estimand in estimands] + # we always want to pass turnout so that we can generate results weights + results_to_return = list(set([f"results_{estimand}" for estimand in estimands] + ["results_turnout"])) geo_columns = set(["geographic_unit_fips", "postal_code"] + [a for a in self.aggregates if a != "unit"]) preprocessed_data = preprocessed_data_handler.data[list(geo_columns) + results_to_return].copy() historical_current_data = preprocessed_data.merge(formatted_data, on=["postal_code", "geographic_unit_fips"]) diff --git a/src/elexmodel/handlers/data/Estimandizer.py b/src/elexmodel/handlers/data/Estimandizer.py index bfa8ec98..431c7235 100644 --- a/src/elexmodel/handlers/data/Estimandizer.py +++ b/src/elexmodel/handlers/data/Estimandizer.py @@ -13,6 +13,8 @@ def add_estimand_results(self, data_df, estimands, historical): columns_to_return = [] turnout_col = f"{RESULTS_PREFIX}turnout" + data_df = self.add_weights(data_df, RESULTS_PREFIX) + for estimand in estimands: results_col = f"{RESULTS_PREFIX}{estimand}" additional_columns_added = [] @@ -31,6 +33,7 @@ def add_estimand_results(self, data_df, estimands, historical): # Hence, this is the only special case in which we'd want to add # an empty results_ column. data_df[results_col] = np.nan + data_df[turnout_col] = np.nan else: # If this is not a historical run, then this is a live election # so we are expecting that there will be actual results data @@ -43,14 +46,14 @@ def add_estimand_results(self, data_df, estimands, historical): if turnout_col not in columns_to_return: columns_to_return.append(turnout_col) - data_df = self.add_weights(data_df, RESULTS_PREFIX) - return data_df, columns_to_return def add_estimand_baselines(self, data_df, estimand_baselines, historical, include_results_estimand=False): # if we are in a historical election we are only reading preprocessed data to get # the historical election results of the currently reporting units. # so we don't care about the total voters or the baseline election. + + data_df = self.add_weights(data_df, BASELINE_PREFIX) for estimand, pointer in estimand_baselines.items(): if pointer is None: @@ -73,7 +76,6 @@ def add_estimand_baselines(self, data_df, estimand_baselines, historical, includ # we need to add the results from the historical election as well. data_df, ___ = self.add_estimand_results(data_df, estimand_baselines.keys(), historical) - data_df = self.add_weights(data_df, BASELINE_PREFIX) return data_df def add_weights(self, data_df, col_prefix): From 7662f0c0f2cc498b073f3a7d72b6f206ef34766b Mon Sep 17 00:00:00 2001 From: lbvienna Date: Mon, 25 Sep 2023 12:23:06 -0400 Subject: [PATCH 09/14] linter --- src/elexmodel/handlers/data/Estimandizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/elexmodel/handlers/data/Estimandizer.py b/src/elexmodel/handlers/data/Estimandizer.py index 431c7235..0359d148 100644 --- a/src/elexmodel/handlers/data/Estimandizer.py +++ b/src/elexmodel/handlers/data/Estimandizer.py @@ -52,7 +52,7 @@ def add_estimand_baselines(self, data_df, estimand_baselines, historical, includ # if we are in a historical election we are only reading preprocessed data to get # the historical election results of the currently reporting units. # so we don't care about the total voters or the baseline election. - + data_df = self.add_weights(data_df, BASELINE_PREFIX) for estimand, pointer in estimand_baselines.items(): From 23b9f7d2b13c22c522cc680ef2ca7cbb1440f6f9 Mon Sep 17 00:00:00 2001 From: lbvienna Date: Mon, 25 Sep 2023 12:40:44 -0400 Subject: [PATCH 10/14] move weights adding --- src/elexmodel/handlers/data/Estimandizer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/elexmodel/handlers/data/Estimandizer.py b/src/elexmodel/handlers/data/Estimandizer.py index 0359d148..9e45003a 100644 --- a/src/elexmodel/handlers/data/Estimandizer.py +++ b/src/elexmodel/handlers/data/Estimandizer.py @@ -13,8 +13,6 @@ def add_estimand_results(self, data_df, estimands, historical): columns_to_return = [] turnout_col = f"{RESULTS_PREFIX}turnout" - data_df = self.add_weights(data_df, RESULTS_PREFIX) - for estimand in estimands: results_col = f"{RESULTS_PREFIX}{estimand}" additional_columns_added = [] @@ -46,6 +44,8 @@ def add_estimand_results(self, data_df, estimands, historical): if turnout_col not in columns_to_return: columns_to_return.append(turnout_col) + data_df = self.add_weights(data_df, RESULTS_PREFIX) + return data_df, columns_to_return def add_estimand_baselines(self, data_df, estimand_baselines, historical, include_results_estimand=False): @@ -53,8 +53,6 @@ def add_estimand_baselines(self, data_df, estimand_baselines, historical, includ # the historical election results of the currently reporting units. # so we don't care about the total voters or the baseline election. - data_df = self.add_weights(data_df, BASELINE_PREFIX) - for estimand, pointer in estimand_baselines.items(): if pointer is None: # when we are creating a new estimand @@ -75,6 +73,8 @@ def add_estimand_baselines(self, data_df, estimand_baselines, historical, includ # Since this method is only called by the PreprocessedDataHandler, for historical runs, # we need to add the results from the historical election as well. data_df, ___ = self.add_estimand_results(data_df, estimand_baselines.keys(), historical) + + data_df = self.add_weights(data_df, BASELINE_PREFIX) return data_df From 82567c3b24d043ca30260914b478962e10bd0e61 Mon Sep 17 00:00:00 2001 From: lbvienna Date: Mon, 25 Sep 2023 12:47:49 -0400 Subject: [PATCH 11/14] linter --- src/elexmodel/handlers/data/Estimandizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/elexmodel/handlers/data/Estimandizer.py b/src/elexmodel/handlers/data/Estimandizer.py index 9e45003a..2807154f 100644 --- a/src/elexmodel/handlers/data/Estimandizer.py +++ b/src/elexmodel/handlers/data/Estimandizer.py @@ -73,7 +73,7 @@ def add_estimand_baselines(self, data_df, estimand_baselines, historical, includ # Since this method is only called by the PreprocessedDataHandler, for historical runs, # we need to add the results from the historical election as well. data_df, ___ = self.add_estimand_results(data_df, estimand_baselines.keys(), historical) - + data_df = self.add_weights(data_df, BASELINE_PREFIX) return data_df From 72dd9a2d16500b47da32e4ab7714efba8315de64 Mon Sep 17 00:00:00 2001 From: lbvienna Date: Mon, 25 Sep 2023 14:56:09 -0400 Subject: [PATCH 12/14] using nan to num in party_vote_share_dem --- src/elexmodel/handlers/data/Estimandizer.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/elexmodel/handlers/data/Estimandizer.py b/src/elexmodel/handlers/data/Estimandizer.py index 2807154f..6acc43b1 100644 --- a/src/elexmodel/handlers/data/Estimandizer.py +++ b/src/elexmodel/handlers/data/Estimandizer.py @@ -97,11 +97,5 @@ def add_turnout_factor(self, data_df): def party_vote_share_dem(data_df, col_prefix): - numer = f"{col_prefix}dem" - denom = f"{col_prefix}turnout" - - data_df[f"{col_prefix}party_vote_share_dem"] = data_df.apply( - lambda x: 0 if x[numer] == 0 or x[denom] == 0 else x[numer] / x[denom], axis=1 - ) - + data_df[f"{col_prefix}party_vote_share_dem"] = np.nan_to_num(data_df[f"{col_prefix}dem"] /data_df[f"{col_prefix}turnout"]) return data_df, [] From 88719c7e01822d98db30d5f42d140fb6b24f9d3f Mon Sep 17 00:00:00 2001 From: lbvienna Date: Mon, 25 Sep 2023 15:04:59 -0400 Subject: [PATCH 13/14] changed tests slightly --- tests/handlers/test_combined_data.py | 123 ++++++++------------------- 1 file changed, 37 insertions(+), 86 deletions(-) diff --git a/tests/handlers/test_combined_data.py b/tests/handlers/test_combined_data.py index 674c988d..b66de0ba 100644 --- a/tests/handlers/test_combined_data.py +++ b/tests/handlers/test_combined_data.py @@ -16,21 +16,13 @@ def test_load(va_governor_county_data): election_id, office_id, geographic_unit_type, estimands=["turnout"], data=va_governor_county_data ) current_data = live_data_handler.data - preprocessed_data_handler = PreprocessedDataHandler( - election_id, - office_id, - geographic_unit_type, - estimand_baselines=estimand_baselines, - estimands=estimands, - data=va_governor_county_data, - ) - preprocessed_data = preprocessed_data_handler.data + va_governor_county_data['baseline_weights'] = va_governor_county_data.baseline_turnout combined_data_handler = CombinedDataHandler( - preprocessed_data, current_data, estimands, "county", handle_unreporting="drop" + va_governor_county_data, current_data, estimands, "county", handle_unreporting="drop" ) - assert combined_data_handler.data.shape == (133, 33) + assert combined_data_handler.data.shape == (133, 32) def test_zero_unreporting_missing_single_estimand_value(va_governor_county_data): @@ -49,25 +41,17 @@ def test_zero_unreporting_missing_single_estimand_value(va_governor_county_data) current_data["percent_expected_vote"] = 100 current_data.loc[0, "results_dem"] = np.nan - preprocessed_data_handler = PreprocessedDataHandler( - election_id, - office_id, - geographic_unit_type, - estimand_baselines=estimand_baselines, - estimands=estimands, - data=va_governor_county_data, - ) - preprocessed_data = preprocessed_data_handler.data + va_governor_county_data['baseline_weights'] = va_governor_county_data.baseline_turnout combined_data_handler = CombinedDataHandler( - preprocessed_data, current_data, estimands, "county", handle_unreporting="zero" + va_governor_county_data, current_data, estimands, "county", handle_unreporting="zero" ) assert combined_data_handler.data["results_dem"].iloc[0] == 0.0 # value with na result has been set to zero assert combined_data_handler.data["results_turnout"].iloc[0] != 0 # has not been set to zero assert ( combined_data_handler.data["percent_expected_vote"].iloc[0] == 0 ) # percent expected vote with na result has been set to zero - assert combined_data_handler.data.shape == (133, 36) # didn't drop any + assert combined_data_handler.data.shape == (133, 34) # didn't drop any assert combined_data_handler.data["results_dem"].iloc[1] != 0 # didn't accidentally set other to zero @@ -88,23 +72,15 @@ def test_zero_unreporting_missing_multiple_estimands_value(va_governor_county_da current_data.loc[0, "results_dem"] = np.nan current_data.loc[0, "results_turnout"] = np.nan - preprocessed_data_handler = PreprocessedDataHandler( - election_id, - office_id, - geographic_unit_type, - estimand_baselines=estimand_baselines, - estimands=estimands, - data=va_governor_county_data, - ) - preprocessed_data = preprocessed_data_handler.data + va_governor_county_data['baseline_weights'] = va_governor_county_data.baseline_turnout combined_data_handler = CombinedDataHandler( - preprocessed_data, current_data, estimands, "county", handle_unreporting="zero" + va_governor_county_data, current_data, estimands, "county", handle_unreporting="zero" ) assert combined_data_handler.data["results_dem"].iloc[0] == 0.0 assert combined_data_handler.data["results_turnout"].iloc[0] == 0.0 assert combined_data_handler.data["percent_expected_vote"].iloc[0] == 0.0 - assert combined_data_handler.data.shape == (133, 36) + assert combined_data_handler.data.shape == (133, 34) assert combined_data_handler.data["results_dem"].iloc[1] != 0 # didn't accidentally set other to zero assert combined_data_handler.data["results_turnout"].iloc[1] != 0 # didn't accidentally set other to zero @@ -126,22 +102,14 @@ def test_zero_unreporting_missing_percent_expected_vote_value(va_governor_county current_data.loc[0, "percent_expected_vote"] = np.nan current_data.loc[0, "results_dem"] = np.nan - preprocessed_data_handler = PreprocessedDataHandler( - election_id, - office_id, - geographic_unit_type, - estimand_baselines=estimand_baselines, - estimands=estimands, - data=va_governor_county_data, - ) - preprocessed_data = preprocessed_data_handler.data + va_governor_county_data['baseline_weights'] = va_governor_county_data.baseline_turnout combined_data_handler = CombinedDataHandler( - preprocessed_data, current_data, estimands, "county", handle_unreporting="zero" + va_governor_county_data, current_data, estimands, "county", handle_unreporting="zero" ) assert combined_data_handler.data["results_dem"].iloc[0] == 0.0 assert combined_data_handler.data["percent_expected_vote"].iloc[0] == 0.0 - assert combined_data_handler.data.shape == (133, 36) + assert combined_data_handler.data.shape == (133, 34) assert combined_data_handler.data["results_dem"].iloc[1] != 0 # didn't accidentally set other to zero @@ -161,22 +129,14 @@ def test_zero_unreporting_random_percent_expected_vote_value(va_governor_county_ current_data["percent_expected_vote"] = np.random.randint(1, 100, current_data.shape[0]) current_data.loc[0, "results_dem"] = np.nan - preprocessed_data_handler = PreprocessedDataHandler( - election_id, - office_id, - geographic_unit_type, - estimand_baselines=estimand_baselines, - estimands=estimands, - data=va_governor_county_data, - ) - preprocessed_data = preprocessed_data_handler.data + va_governor_county_data['baseline_weights'] = va_governor_county_data.baseline_turnout combined_data_handler = CombinedDataHandler( - preprocessed_data, current_data, estimands, "county", handle_unreporting="zero" + va_governor_county_data, current_data, estimands, "county", handle_unreporting="zero" ) assert combined_data_handler.data["results_dem"].iloc[0] == 0.0 # all values set to 0.0 assert combined_data_handler.data["percent_expected_vote"].iloc[0] == 0.0 - assert combined_data_handler.data.shape == (133, 36) + assert combined_data_handler.data.shape == (133, 34) assert combined_data_handler.data["results_dem"].iloc[1] != 0 # didn't accidentally set other to zero @@ -196,20 +156,12 @@ def test_drop_unreporting_missing_single_estimand_value(va_governor_county_data) current_data["percent_expected_vote"] = 100 current_data.loc[0, "results_dem"] = np.nan - preprocessed_data_handler = PreprocessedDataHandler( - election_id, - office_id, - geographic_unit_type, - estimand_baselines=estimand_baselines, - estimands=estimands, - data=va_governor_county_data, - ) - preprocessed_data = preprocessed_data_handler.data + va_governor_county_data['baseline_weights'] = va_governor_county_data.baseline_turnout combined_data_handler = CombinedDataHandler( - preprocessed_data, current_data, estimands, "county", handle_unreporting="drop" + va_governor_county_data, current_data, estimands, "county", handle_unreporting="drop" ) - assert combined_data_handler.data.shape == (132, 36) # dropped one + assert combined_data_handler.data.shape == (132, 34) # dropped one assert combined_data_handler.data["results_dem"].iloc[0] != 0 # didn't accidentally set other to zero @@ -224,13 +176,13 @@ def test_get_reporting_data(va_governor_county_data): election_id, office, geographic_unit_type, estimands, data=va_governor_county_data ) current_data = live_data_handler.get_n_fully_reported(n=20) - preprocessed_data_handler = PreprocessedDataHandler( - election_id, office, geographic_unit_type, estimands, estimand_baseline, data=va_governor_county_data - ) + + va_governor_county_data['baseline_weights'] = va_governor_county_data.baseline_turnout + va_governor_county_data['last_election_results_turnout'] = va_governor_county_data.baseline_turnout + 1 # no fixed effects combined_data_handler = CombinedDataHandler( - preprocessed_data_handler.data, current_data, estimands, geographic_unit_type + va_governor_county_data, current_data, estimands, geographic_unit_type ) observed_data = combined_data_handler.get_reporting_units(100) assert observed_data.shape[0] == 20 @@ -249,12 +201,12 @@ def test_get_reporting_data_dropping_with_turnout_factor(va_governor_county_data election_id, office, geographic_unit_type, estimands, data=va_governor_county_data ) current_data = live_data_handler.get_n_fully_reported(n=20) - preprocessed_data_handler = PreprocessedDataHandler( - election_id, office, geographic_unit_type, estimands, estimand_baseline, data=va_governor_county_data - ) + + va_governor_county_data['baseline_weights'] = va_governor_county_data.baseline_turnout + va_governor_county_data['last_election_results_turnout'] = va_governor_county_data.baseline_turnout + 1 combined_data_handler = CombinedDataHandler( - preprocessed_data_handler.data, current_data, estimands, geographic_unit_type + va_governor_county_data, current_data, estimands, geographic_unit_type ) turnout_factor_lower = 0.95 @@ -290,12 +242,12 @@ def test_get_nonreporting_adding_with_turnout_factor(va_governor_county_data): ) n = 20 current_data = live_data_handler.get_n_fully_reported(n=n) - preprocessed_data_handler = PreprocessedDataHandler( - election_id, office, geographic_unit_type, estimands, estimand_baseline, data=va_governor_county_data - ) + + va_governor_county_data['baseline_weights'] = va_governor_county_data.baseline_turnout + va_governor_county_data['last_election_results_turnout'] = va_governor_county_data.baseline_turnout + 1 combined_data_handler = CombinedDataHandler( - preprocessed_data_handler.data, current_data, estimands, geographic_unit_type + va_governor_county_data, current_data, estimands, geographic_unit_type ) turnout_factor_lower = 0.95 @@ -339,12 +291,12 @@ def test_get_unexpected_units_county_district(va_assembly_county_data): unexpected_units=unexpected_units, ) current_data = live_data_handler.get_n_fully_reported(n=20) - preprocessed_data_handler = PreprocessedDataHandler( - election_id, office, geographic_unit_type, estimands, estimand_baseline, data=va_assembly_county_data - ) + + va_assembly_county_data['baseline_weights'] = va_assembly_county_data.baseline_turnout + va_assembly_county_data['last_election_results_turnout'] = va_assembly_county_data.baseline_turnout + 1 combined_data_handler = CombinedDataHandler( - preprocessed_data_handler.data, current_data, estimands, geographic_unit_type + va_assembly_county_data, current_data, estimands, geographic_unit_type ) unexpected_data = combined_data_handler.get_unexpected_units(100, ["county_fips", "district"]) assert unexpected_data.shape[0] == unexpected_units @@ -377,12 +329,11 @@ def test_get_unexpected_units_county(va_governor_county_data): extra_row["percent_expected_vote"] = 50 current_data = pd.concat([current_data, extra_row]) - preprocessed_data_handler = PreprocessedDataHandler( - election_id, office, geographic_unit_type, estimands, estimand_baseline, data=va_governor_county_data - ) + va_governor_county_data['baseline_weights'] = va_governor_county_data.baseline_turnout + va_governor_county_data['last_election_results_turnout'] = va_governor_county_data.baseline_turnout + 1 combined_data_handler = CombinedDataHandler( - preprocessed_data_handler.data, current_data, estimands, geographic_unit_type + va_governor_county_data, current_data, estimands, geographic_unit_type ) unexpected_data = combined_data_handler.get_unexpected_units(100, ["county_fips"]) assert unexpected_data.shape[0] == reporting_unexpected_units + 1 From eb5de999e0d08bfce5615fac6e0abca27131d99e Mon Sep 17 00:00:00 2001 From: lbvienna Date: Mon, 25 Sep 2023 15:12:57 -0400 Subject: [PATCH 14/14] linter --- src/elexmodel/handlers/data/Estimandizer.py | 4 +- tests/handlers/test_combined_data.py | 70 +++++++-------------- 2 files changed, 27 insertions(+), 47 deletions(-) diff --git a/src/elexmodel/handlers/data/Estimandizer.py b/src/elexmodel/handlers/data/Estimandizer.py index 6acc43b1..038bd623 100644 --- a/src/elexmodel/handlers/data/Estimandizer.py +++ b/src/elexmodel/handlers/data/Estimandizer.py @@ -97,5 +97,7 @@ def add_turnout_factor(self, data_df): def party_vote_share_dem(data_df, col_prefix): - data_df[f"{col_prefix}party_vote_share_dem"] = np.nan_to_num(data_df[f"{col_prefix}dem"] /data_df[f"{col_prefix}turnout"]) + data_df[f"{col_prefix}party_vote_share_dem"] = np.nan_to_num( + data_df[f"{col_prefix}dem"] / data_df[f"{col_prefix}turnout"] + ) return data_df, [] diff --git a/tests/handlers/test_combined_data.py b/tests/handlers/test_combined_data.py index b66de0ba..d1cd53aa 100644 --- a/tests/handlers/test_combined_data.py +++ b/tests/handlers/test_combined_data.py @@ -3,7 +3,6 @@ from elexmodel.handlers.data.CombinedData import CombinedDataHandler from elexmodel.handlers.data.LiveData import MockLiveDataHandler -from elexmodel.handlers.data.PreprocessedData import PreprocessedDataHandler def test_load(va_governor_county_data): @@ -11,12 +10,11 @@ def test_load(va_governor_county_data): office_id = "G" geographic_unit_type = "county" estimands = ["turnout"] - estimand_baselines = {"turnout": "turnout"} live_data_handler = MockLiveDataHandler( election_id, office_id, geographic_unit_type, estimands=["turnout"], data=va_governor_county_data ) current_data = live_data_handler.data - va_governor_county_data['baseline_weights'] = va_governor_county_data.baseline_turnout + va_governor_county_data["baseline_weights"] = va_governor_county_data.baseline_turnout combined_data_handler = CombinedDataHandler( va_governor_county_data, current_data, estimands, "county", handle_unreporting="drop" @@ -32,7 +30,6 @@ def test_zero_unreporting_missing_single_estimand_value(va_governor_county_data) election_id = "2017-11-07_VA_G" office_id = "G" geographic_unit_type = "county" - estimand_baselines = {"turnout": "turnout", "dem": "dem"} estimands = ["turnout", "dem"] live_data_handler = MockLiveDataHandler( election_id, office_id, geographic_unit_type, estimands, data=va_governor_county_data @@ -41,7 +38,7 @@ def test_zero_unreporting_missing_single_estimand_value(va_governor_county_data) current_data["percent_expected_vote"] = 100 current_data.loc[0, "results_dem"] = np.nan - va_governor_county_data['baseline_weights'] = va_governor_county_data.baseline_turnout + va_governor_county_data["baseline_weights"] = va_governor_county_data.baseline_turnout combined_data_handler = CombinedDataHandler( va_governor_county_data, current_data, estimands, "county", handle_unreporting="zero" @@ -62,7 +59,6 @@ def test_zero_unreporting_missing_multiple_estimands_value(va_governor_county_da election_id = "2017-11-07_VA_G" office_id = "G" geographic_unit_type = "county" - estimand_baselines = {"turnout": "turnout", "dem": "dem"} estimands = ["turnout", "dem"] live_data_handler = MockLiveDataHandler( election_id, office_id, geographic_unit_type, estimands, data=va_governor_county_data @@ -72,7 +68,7 @@ def test_zero_unreporting_missing_multiple_estimands_value(va_governor_county_da current_data.loc[0, "results_dem"] = np.nan current_data.loc[0, "results_turnout"] = np.nan - va_governor_county_data['baseline_weights'] = va_governor_county_data.baseline_turnout + va_governor_county_data["baseline_weights"] = va_governor_county_data.baseline_turnout combined_data_handler = CombinedDataHandler( va_governor_county_data, current_data, estimands, "county", handle_unreporting="zero" @@ -92,7 +88,6 @@ def test_zero_unreporting_missing_percent_expected_vote_value(va_governor_county election_id = "2017-11-07_VA_G" office_id = "G" geographic_unit_type = "county" - estimand_baselines = {"turnout": "turnout", "dem": "dem"} estimands = ["turnout", "dem"] live_data_handler = MockLiveDataHandler( election_id, office_id, geographic_unit_type, estimands, data=va_governor_county_data @@ -102,7 +97,7 @@ def test_zero_unreporting_missing_percent_expected_vote_value(va_governor_county current_data.loc[0, "percent_expected_vote"] = np.nan current_data.loc[0, "results_dem"] = np.nan - va_governor_county_data['baseline_weights'] = va_governor_county_data.baseline_turnout + va_governor_county_data["baseline_weights"] = va_governor_county_data.baseline_turnout combined_data_handler = CombinedDataHandler( va_governor_county_data, current_data, estimands, "county", handle_unreporting="zero" @@ -120,7 +115,6 @@ def test_zero_unreporting_random_percent_expected_vote_value(va_governor_county_ election_id = "2017-11-07_VA_G" office_id = "G" geographic_unit_type = "county" - estimand_baselines = {"turnout": "turnout", "dem": "dem"} estimands = ["turnout", "dem"] live_data_handler = MockLiveDataHandler( election_id, office_id, geographic_unit_type, estimands, data=va_governor_county_data @@ -129,7 +123,7 @@ def test_zero_unreporting_random_percent_expected_vote_value(va_governor_county_ current_data["percent_expected_vote"] = np.random.randint(1, 100, current_data.shape[0]) current_data.loc[0, "results_dem"] = np.nan - va_governor_county_data['baseline_weights'] = va_governor_county_data.baseline_turnout + va_governor_county_data["baseline_weights"] = va_governor_county_data.baseline_turnout combined_data_handler = CombinedDataHandler( va_governor_county_data, current_data, estimands, "county", handle_unreporting="zero" @@ -147,7 +141,6 @@ def test_drop_unreporting_missing_single_estimand_value(va_governor_county_data) election_id = "2017-11-07_VA_G" office_id = "G" geographic_unit_type = "county" - estimand_baselines = {"turnout": "turnout", "dem": "dem"} estimands = ["turnout", "dem"] live_data_handler = MockLiveDataHandler( election_id, office_id, geographic_unit_type, estimands, data=va_governor_county_data @@ -156,7 +149,7 @@ def test_drop_unreporting_missing_single_estimand_value(va_governor_county_data) current_data["percent_expected_vote"] = 100 current_data.loc[0, "results_dem"] = np.nan - va_governor_county_data['baseline_weights'] = va_governor_county_data.baseline_turnout + va_governor_county_data["baseline_weights"] = va_governor_county_data.baseline_turnout combined_data_handler = CombinedDataHandler( va_governor_county_data, current_data, estimands, "county", handle_unreporting="drop" @@ -170,20 +163,17 @@ def test_get_reporting_data(va_governor_county_data): office = "G" geographic_unit_type = "county" estimands = ["turnout"] - estimand_baseline = {"turnout": "turnout"} live_data_handler = MockLiveDataHandler( election_id, office, geographic_unit_type, estimands, data=va_governor_county_data ) current_data = live_data_handler.get_n_fully_reported(n=20) - va_governor_county_data['baseline_weights'] = va_governor_county_data.baseline_turnout - va_governor_county_data['last_election_results_turnout'] = va_governor_county_data.baseline_turnout + 1 + va_governor_county_data["baseline_weights"] = va_governor_county_data.baseline_turnout + va_governor_county_data["last_election_results_turnout"] = va_governor_county_data.baseline_turnout + 1 # no fixed effects - combined_data_handler = CombinedDataHandler( - va_governor_county_data, current_data, estimands, geographic_unit_type - ) + combined_data_handler = CombinedDataHandler(va_governor_county_data, current_data, estimands, geographic_unit_type) observed_data = combined_data_handler.get_reporting_units(100) assert observed_data.shape[0] == 20 assert observed_data.reporting.iloc[0] == 1 @@ -195,19 +185,16 @@ def test_get_reporting_data_dropping_with_turnout_factor(va_governor_county_data office = "G" geographic_unit_type = "county" estimands = ["turnout"] - estimand_baseline = {"turnout": "turnout"} live_data_handler = MockLiveDataHandler( election_id, office, geographic_unit_type, estimands, data=va_governor_county_data ) current_data = live_data_handler.get_n_fully_reported(n=20) - - va_governor_county_data['baseline_weights'] = va_governor_county_data.baseline_turnout - va_governor_county_data['last_election_results_turnout'] = va_governor_county_data.baseline_turnout + 1 - combined_data_handler = CombinedDataHandler( - va_governor_county_data, current_data, estimands, geographic_unit_type - ) + va_governor_county_data["baseline_weights"] = va_governor_county_data.baseline_turnout + va_governor_county_data["last_election_results_turnout"] = va_governor_county_data.baseline_turnout + 1 + + combined_data_handler = CombinedDataHandler(va_governor_county_data, current_data, estimands, geographic_unit_type) turnout_factor_lower = 0.95 turnout_factor_upper = 1.2 @@ -235,20 +222,17 @@ def test_get_nonreporting_adding_with_turnout_factor(va_governor_county_data): office = "G" geographic_unit_type = "county" estimands = ["turnout"] - estimand_baseline = {"turnout": "turnout"} live_data_handler = MockLiveDataHandler( election_id, office, geographic_unit_type, estimands, data=va_governor_county_data ) n = 20 current_data = live_data_handler.get_n_fully_reported(n=n) - - va_governor_county_data['baseline_weights'] = va_governor_county_data.baseline_turnout - va_governor_county_data['last_election_results_turnout'] = va_governor_county_data.baseline_turnout + 1 - combined_data_handler = CombinedDataHandler( - va_governor_county_data, current_data, estimands, geographic_unit_type - ) + va_governor_county_data["baseline_weights"] = va_governor_county_data.baseline_turnout + va_governor_county_data["last_election_results_turnout"] = va_governor_county_data.baseline_turnout + 1 + + combined_data_handler = CombinedDataHandler(va_governor_county_data, current_data, estimands, geographic_unit_type) turnout_factor_lower = 0.95 turnout_factor_upper = 1.2 @@ -280,7 +264,6 @@ def test_get_unexpected_units_county_district(va_assembly_county_data): geographic_unit_type = "county-district" estimands = ["turnout"] unexpected_units = 5 - estimand_baseline = {"turnout": "turnout"} live_data_handler = MockLiveDataHandler( election_id, @@ -291,13 +274,11 @@ def test_get_unexpected_units_county_district(va_assembly_county_data): unexpected_units=unexpected_units, ) current_data = live_data_handler.get_n_fully_reported(n=20) - - va_assembly_county_data['baseline_weights'] = va_assembly_county_data.baseline_turnout - va_assembly_county_data['last_election_results_turnout'] = va_assembly_county_data.baseline_turnout + 1 - combined_data_handler = CombinedDataHandler( - va_assembly_county_data, current_data, estimands, geographic_unit_type - ) + va_assembly_county_data["baseline_weights"] = va_assembly_county_data.baseline_turnout + va_assembly_county_data["last_election_results_turnout"] = va_assembly_county_data.baseline_turnout + 1 + + combined_data_handler = CombinedDataHandler(va_assembly_county_data, current_data, estimands, geographic_unit_type) unexpected_data = combined_data_handler.get_unexpected_units(100, ["county_fips", "district"]) assert unexpected_data.shape[0] == unexpected_units assert unexpected_data[unexpected_data.county_fips == ""].shape[0] == 0 @@ -312,7 +293,6 @@ def test_get_unexpected_units_county(va_governor_county_data): geographic_unit_type = "county" estimands = ["turnout"] reporting_unexpected_units = 5 - estimand_baseline = {"turnout": "turnout"} live_data_handler = MockLiveDataHandler( election_id, @@ -329,12 +309,10 @@ def test_get_unexpected_units_county(va_governor_county_data): extra_row["percent_expected_vote"] = 50 current_data = pd.concat([current_data, extra_row]) - va_governor_county_data['baseline_weights'] = va_governor_county_data.baseline_turnout - va_governor_county_data['last_election_results_turnout'] = va_governor_county_data.baseline_turnout + 1 + va_governor_county_data["baseline_weights"] = va_governor_county_data.baseline_turnout + va_governor_county_data["last_election_results_turnout"] = va_governor_county_data.baseline_turnout + 1 - combined_data_handler = CombinedDataHandler( - va_governor_county_data, current_data, estimands, geographic_unit_type - ) + combined_data_handler = CombinedDataHandler(va_governor_county_data, current_data, estimands, geographic_unit_type) unexpected_data = combined_data_handler.get_unexpected_units(100, ["county_fips"]) assert unexpected_data.shape[0] == reporting_unexpected_units + 1 assert unexpected_data[unexpected_data.county_fips == ""].shape[0] == 0