From f122e66427da231f8eaf47bfcc895aea26ab877d Mon Sep 17 00:00:00 2001
From: Diane Napolitano <diane.napolitano@washpost.com>
Date: Thu, 12 Dec 2024 15:45:20 -0500
Subject: [PATCH] Resolving warnings and pylint complaints in
 VersionedDataHandler, also getting unit tests to pass again

---
 src/elexmodel/handlers/data/VersionedData.py | 32 +++++++++++-------
 tests/handlers/test_versioned_data.py        | 35 +++++++++++---------
 2 files changed, 39 insertions(+), 28 deletions(-)

diff --git a/src/elexmodel/handlers/data/VersionedData.py b/src/elexmodel/handlers/data/VersionedData.py
index b76a55d..4575447 100644
--- a/src/elexmodel/handlers/data/VersionedData.py
+++ b/src/elexmodel/handlers/data/VersionedData.py
@@ -1,3 +1,4 @@
+import warnings
 from datetime import datetime
 
 import numpy as np
@@ -64,10 +65,12 @@ def get_versioned_results(self, filepath=None):
 
         if self.election_id.startswith("2020-11-03_USA_G"):
             path = "elex-models-prod/2020-general/results/pres/current.csv"
-        elif self.election_id.startswith("2024-11-05_USA_G"):
-            path = f"{S3_FILE_PATH}/{self.election_id}/results/{self.office_id}/{self.geographic_unit_type}/current_counties.csv"
         else:
-            path = f"{S3_FILE_PATH}/{self.election_id}/results/{self.office_id}/{self.geographic_unit_type}/current.csv"
+            base_dir = f"{S3_FILE_PATH}/{self.election_id}/results/{self.office_id}/{self.geographic_unit_type}"
+            if self.election_id.startswith("2024-11-05_USA_G"):
+                path = base_dir + "/current_counties.csv"
+            else:
+                path = base_dir + "/current.csv"
 
         data = self.s3_client.get(path, self.sample)
         LOG.info("Loaded versioned results from S3")
@@ -124,7 +127,8 @@ def compute_estimated_margin(df):
                 casting="unsafe",
             )
 
-            # check if perc_expected_vote_corr is monotone increasing (if not, give up and don't try to estimate a margin)
+            # check if perc_expected_vote_corr is monotone non-decreasing (ties between versions are allowed)
+            # (if not, give up and don't try to estimate a margin)
             if not np.all(np.diff(perc_expected_vote_corr) >= 0):
                 return pd.DataFrame(
                     {
@@ -143,15 +147,18 @@ def compute_estimated_margin(df):
             # Compute batch_margin using NumPy
             # this is the difference in dem_votes - the difference in gop_votes divided by the difference in total votes
             # that is, this is the normalized margin in the batch of votes recorded between versions
-            batch_margin = (
-                np.diff(results_dem, append=results_dem[-1]) - np.diff(results_gop, append=results_gop[-1])
-            ) / np.diff(results_weights, append=results_weights[-1])
+            with warnings.catch_warnings():
+                warnings.simplefilter("ignore", RuntimeWarning)
+                batch_margin = (
+                    np.diff(results_dem, append=results_dem[-1]) - np.diff(results_gop, append=results_gop[-1])
+                ) / np.diff(results_weights, append=results_weights[-1])
 
             # nan values in batch_margin are due to div-by-zero since there's no change in votes
             batch_margin[np.isnan(batch_margin)] = 0  # Set NaN margins to 0
             df["batch_margin"] = batch_margin
 
-            # batch_margins should be between -1 and 1 (otherwise, there was a data entry issue and we will not use this unit)
+            # batch_margins should be between -1 and 1
+            # (otherwise, there was a data entry issue and we will not use this unit)
             if np.abs(batch_margin).max() > 1:
                 return pd.DataFrame(
                     {
@@ -208,7 +215,9 @@ def compute_estimated_margin(df):
                 }
             )
 
-        results = results.groupby("geographic_unit_fips").apply(compute_estimated_margin).reset_index()
+        results = (
+            results.groupby("geographic_unit_fips").apply(compute_estimated_margin, include_groups=False).reset_index()
+        )
 
         for error_type in sorted(set(results["error_type"])):
             if error_type == "none":
@@ -222,9 +231,8 @@ def get_versioned_predictions(self, filepath=None):
             return pd.read_csv(filepath)
 
         if self.election_id.startswith("2020-11-03_USA_G"):
-            path = "elex-models-prod/2020-general/prediction/pres/current.csv"
             raise ValueError("No versioned predictions available for this election.")
-        else:
-            path = f"{S3_FILE_PATH}/{self.election_id}/predictions/{self.office_id}/{self.geographic_unit_type}/current.csv"
+
+        path = f"{S3_FILE_PATH}/{self.election_id}/predictions/{self.office_id}/{self.geographic_unit_type}/current.csv"
 
         return self.s3_client.get(path, self.sample)
diff --git a/tests/handlers/test_versioned_data.py b/tests/handlers/test_versioned_data.py
index e64c758..d19892e 100644
--- a/tests/handlers/test_versioned_data.py
+++ b/tests/handlers/test_versioned_data.py
@@ -2,25 +2,28 @@
 
 
 def test_versioned_data_without_errors(versioned_data_no_errors):
-    vdh = VersionedDataHandler("2024-11-05_USA_G", "S", "county", data=versioned_data_no_errors)
-    assert len(vdh.data) == 100
-    assert (vdh.data["nearest_observed_vote"] == 99.0).all()
-    assert vdh.data["est_margin"].max().round(6) == 0.186405
-    assert vdh.data["est_correction"].min() == -2.7755575615628914e-17
-    assert (vdh.data["error_type"] == "none").all()
+    vdh = VersionedDataHandler("2024-11-05_USA_G", "S", "county")
+    results = vdh.compute_versioned_margin_estimate(data=versioned_data_no_errors)
+    assert len(results) == 100
+    assert (results["nearest_observed_vote"] == 99.0).all()
+    assert results["est_margin"].max().round(6) == 0.186405
+    assert results["est_correction"].min() == -2.7755575615628914e-17
+    assert (results["error_type"] == "none").all()
 
 
 def test_versioned_data_with_non_monotone_ev(versioned_data_non_monotone):
-    vdh = VersionedDataHandler("2024-11-05_USA_G", "S", "county", data=versioned_data_non_monotone)
-    assert len(vdh.data) == 101
-    assert len(vdh.data[vdh.data["est_margin"].isnull()]) == 101
-    assert len(vdh.data[vdh.data["est_correction"].isnull()]) == 101
-    assert (vdh.data["error_type"] == "non-monotone percent expected vote").all()
+    vdh = VersionedDataHandler("2024-11-05_USA_G", "S", "county")
+    results = vdh.compute_versioned_margin_estimate(data=versioned_data_non_monotone)
+    assert len(results) == 101
+    assert len(results[results["est_margin"].isnull()]) == 101
+    assert len(results[results["est_correction"].isnull()]) == 101
+    assert (results["error_type"] == "non-monotone percent expected vote").all()
 
 
 def test_versioned_data_with_batch_margin_error(versioned_data_batch_margin):
-    vdh = VersionedDataHandler("2024-11-05_USA_G", "S", "county", data=versioned_data_batch_margin)
-    assert len(vdh.data) == 101
-    assert len(vdh.data[vdh.data["est_margin"].isnull()]) == 101
-    assert len(vdh.data[vdh.data["est_correction"].isnull()]) == 101
-    assert (vdh.data["error_type"] == "batch_margin").all()
+    vdh = VersionedDataHandler("2024-11-05_USA_G", "S", "county")
+    results = vdh.compute_versioned_margin_estimate(data=versioned_data_batch_margin)
+    assert len(results) == 101
+    assert len(results[results["est_margin"].isnull()]) == 101
+    assert len(results[results["est_correction"].isnull()]) == 101
+    assert (results["error_type"] == "batch_margin").all()