From f122e66427da231f8eaf47bfcc895aea26ab877d Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Thu, 12 Dec 2024 15:45:20 -0500 Subject: [PATCH] Resolving warnings and pylint complaints in VersionedDataHandler, also getting unit tests to pass again --- src/elexmodel/handlers/data/VersionedData.py | 32 +++++++++++------- tests/handlers/test_versioned_data.py | 35 +++++++++++--------- 2 files changed, 39 insertions(+), 28 deletions(-) diff --git a/src/elexmodel/handlers/data/VersionedData.py b/src/elexmodel/handlers/data/VersionedData.py index b76a55d..4575447 100644 --- a/src/elexmodel/handlers/data/VersionedData.py +++ b/src/elexmodel/handlers/data/VersionedData.py @@ -1,3 +1,4 @@ +import warnings from datetime import datetime import numpy as np @@ -64,10 +65,12 @@ def get_versioned_results(self, filepath=None): if self.election_id.startswith("2020-11-03_USA_G"): path = "elex-models-prod/2020-general/results/pres/current.csv" - elif self.election_id.startswith("2024-11-05_USA_G"): - path = f"{S3_FILE_PATH}/{self.election_id}/results/{self.office_id}/{self.geographic_unit_type}/current_counties.csv" else: - path = f"{S3_FILE_PATH}/{self.election_id}/results/{self.office_id}/{self.geographic_unit_type}/current.csv" + base_dir = f"{S3_FILE_PATH}/{self.election_id}/results/{self.office_id}/{self.geographic_unit_type}" + if self.election_id.startswith("2024-11-05_USA_G"): + path = base_dir + "/current_counties.csv" + else: + path = base_dir + "/current.csv" data = self.s3_client.get(path, self.sample) LOG.info("Loaded versioned results from S3") @@ -124,7 +127,8 @@ def compute_estimated_margin(df): casting="unsafe", ) - # check if perc_expected_vote_corr is monotone increasing (if not, give up and don't try to estimate a margin) + # check if perc_expected_vote_corr is monotone increasing + # (if not, give up and don't try to estimate a margin) if not np.all(np.diff(perc_expected_vote_corr) >= 0): return pd.DataFrame( { @@ -143,15 +147,18 @@ def compute_estimated_margin(df): # Compute batch_margin using NumPy # this is the difference in dem_votes - the difference in gop_votes divided by the difference in total votes # that is, this is the normalized margin in the batch of votes recorded between versions - batch_margin = ( - np.diff(results_dem, append=results_dem[-1]) - np.diff(results_gop, append=results_gop[-1]) - ) / np.diff(results_weights, append=results_weights[-1]) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", RuntimeWarning) + batch_margin = ( + np.diff(results_dem, append=results_dem[-1]) - np.diff(results_gop, append=results_gop[-1]) + ) / np.diff(results_weights, append=results_weights[-1]) # nan values in batch_margin are due to div-by-zero since there's no change in votes batch_margin[np.isnan(batch_margin)] = 0 # Set NaN margins to 0 df["batch_margin"] = batch_margin - # batch_margins should be between -1 and 1 (otherwise, there was a data entry issue and we will not use this unit) + # batch_margins should be between -1 and 1 + # (otherwise, there was a data entry issue and we will not use this unit) if np.abs(batch_margin).max() > 1: return pd.DataFrame( { @@ -208,7 +215,9 @@ def compute_estimated_margin(df): } ) - results = results.groupby("geographic_unit_fips").apply(compute_estimated_margin).reset_index() + results = ( + results.groupby("geographic_unit_fips").apply(compute_estimated_margin, include_groups=False).reset_index() + ) for error_type in sorted(set(results["error_type"])): if error_type == "none": @@ -222,9 +231,8 @@ def get_versioned_predictions(self, filepath=None): return pd.read_csv(filepath) if self.election_id.startswith("2020-11-03_USA_G"): - path = "elex-models-prod/2020-general/prediction/pres/current.csv" raise ValueError("No versioned predictions available for this election.") - else: - path = f"{S3_FILE_PATH}/{self.election_id}/predictions/{self.office_id}/{self.geographic_unit_type}/current.csv" + + path = f"{S3_FILE_PATH}/{self.election_id}/predictions/{self.office_id}/{self.geographic_unit_type}/current.csv" return self.s3_client.get(path, self.sample) diff --git a/tests/handlers/test_versioned_data.py b/tests/handlers/test_versioned_data.py index e64c758..d19892e 100644 --- a/tests/handlers/test_versioned_data.py +++ b/tests/handlers/test_versioned_data.py @@ -2,25 +2,28 @@ def test_versioned_data_without_errors(versioned_data_no_errors): - vdh = VersionedDataHandler("2024-11-05_USA_G", "S", "county", data=versioned_data_no_errors) - assert len(vdh.data) == 100 - assert (vdh.data["nearest_observed_vote"] == 99.0).all() - assert vdh.data["est_margin"].max().round(6) == 0.186405 - assert vdh.data["est_correction"].min() == -2.7755575615628914e-17 - assert (vdh.data["error_type"] == "none").all() + vdh = VersionedDataHandler("2024-11-05_USA_G", "S", "county") + results = vdh.compute_versioned_margin_estimate(data=versioned_data_no_errors) + assert len(results) == 100 + assert (results["nearest_observed_vote"] == 99.0).all() + assert results["est_margin"].max().round(6) == 0.186405 + assert results["est_correction"].min() == -2.7755575615628914e-17 + assert (results["error_type"] == "none").all() def test_versioned_data_with_non_monotone_ev(versioned_data_non_monotone): - vdh = VersionedDataHandler("2024-11-05_USA_G", "S", "county", data=versioned_data_non_monotone) - assert len(vdh.data) == 101 - assert len(vdh.data[vdh.data["est_margin"].isnull()]) == 101 - assert len(vdh.data[vdh.data["est_correction"].isnull()]) == 101 - assert (vdh.data["error_type"] == "non-monotone percent expected vote").all() + vdh = VersionedDataHandler("2024-11-05_USA_G", "S", "county") + results = vdh.compute_versioned_margin_estimate(data=versioned_data_non_monotone) + assert len(results) == 101 + assert len(results[results["est_margin"].isnull()]) == 101 + assert len(results[results["est_correction"].isnull()]) == 101 + assert (results["error_type"] == "non-monotone percent expected vote").all() def test_versioned_data_with_batch_margin_error(versioned_data_batch_margin): - vdh = VersionedDataHandler("2024-11-05_USA_G", "S", "county", data=versioned_data_batch_margin) - assert len(vdh.data) == 101 - assert len(vdh.data[vdh.data["est_margin"].isnull()]) == 101 - assert len(vdh.data[vdh.data["est_correction"].isnull()]) == 101 - assert (vdh.data["error_type"] == "batch_margin").all() + vdh = VersionedDataHandler("2024-11-05_USA_G", "S", "county") + results = vdh.compute_versioned_margin_estimate(data=versioned_data_batch_margin) + assert len(results) == 101 + assert len(results[results["est_margin"].isnull()]) == 101 + assert len(results[results["est_correction"].isnull()]) == 101 + assert (results["error_type"] == "batch_margin").all()