From 781412ff9794c88e5e60edbacc7137959bc750ae Mon Sep 17 00:00:00 2001 From: John Cherian Date: Tue, 5 Nov 2024 14:15:34 -0500 Subject: [PATCH 1/4] error handling when versioned results are bad is fixed + extra logging --- src/elexmodel/handlers/data/VersionedData.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/elexmodel/handlers/data/VersionedData.py b/src/elexmodel/handlers/data/VersionedData.py index 77a4a7e3..48c84588 100644 --- a/src/elexmodel/handlers/data/VersionedData.py +++ b/src/elexmodel/handlers/data/VersionedData.py @@ -122,11 +122,13 @@ def compute_estimated_margin(df): # check if perc_expected_vote_corr is monotone increasing (if not, give up and don't try to estimate a margin) if not np.all(np.diff(perc_expected_vote_corr) >= 0): + LOG.info(f"Non-monotonic percent_expected_vote in versioned data for {df.geographic_unit_fips.iloc[0]}") return pd.DataFrame( { "percent_expected_vote": np.arange(101), "nearest_observed_vote": np.nan * np.ones(101), "est_margin": np.nan * np.ones(101), + "est_correction": np.nan * np.ones(101), } ) @@ -147,11 +149,15 @@ def compute_estimated_margin(df): # batch_margins should be between -1 and 1 (otherwise, there was a data entry issue and we will not use this unit) if np.abs(batch_margin).max() > 1: + LOG.info( + f"Implausible batch margin {np.abs(batch_margin).max()} in versioned data for {df.geographic_unit_fips.iloc[0]}" + ) return pd.DataFrame( { "percent_expected_vote": np.arange(101), "nearest_observed_vote": np.nan * np.ones(101), "est_margin": np.nan * np.ones(101), + "est_correction": np.nan * np.ones(101), } ) From e2ba261b834189298f60da7f824cfc531138a01c Mon Sep 17 00:00:00 2001 From: John Cherian Date: Tue, 5 Nov 2024 14:16:08 -0500 Subject: [PATCH 2/4] linter --- src/elexmodel/handlers/data/VersionedData.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/elexmodel/handlers/data/VersionedData.py b/src/elexmodel/handlers/data/VersionedData.py index 48c84588..6900bfd2 100644 --- a/src/elexmodel/handlers/data/VersionedData.py +++ b/src/elexmodel/handlers/data/VersionedData.py @@ -122,7 +122,9 @@ def compute_estimated_margin(df): # check if perc_expected_vote_corr is monotone increasing (if not, give up and don't try to estimate a margin) if not np.all(np.diff(perc_expected_vote_corr) >= 0): - LOG.info(f"Non-monotonic percent_expected_vote in versioned data for {df.geographic_unit_fips.iloc[0]}") + LOG.info( + f"Non-monotonic percent_expected_vote in versioned data for {df.geographic_unit_fips.iloc[0]}." + ) return pd.DataFrame( { "percent_expected_vote": np.arange(101), @@ -150,7 +152,7 @@ def compute_estimated_margin(df): # batch_margins should be between -1 and 1 (otherwise, there was a data entry issue and we will not use this unit) if np.abs(batch_margin).max() > 1: LOG.info( - f"Implausible batch margin {np.abs(batch_margin).max()} in versioned data for {df.geographic_unit_fips.iloc[0]}" + f"Implausible batch margin {np.abs(batch_margin).max()} in versioned data for {df.geographic_unit_fips.iloc[0]}." ) return pd.DataFrame( { From 6aae2ba4f0b3def35f6a00c6f62d9d134797ae9f Mon Sep 17 00:00:00 2001 From: John Cherian Date: Tue, 5 Nov 2024 14:39:18 -0500 Subject: [PATCH 3/4] changed logging to release # of error types rather than fips IDs --- src/elexmodel/handlers/data/VersionedData.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/elexmodel/handlers/data/VersionedData.py b/src/elexmodel/handlers/data/VersionedData.py index 6900bfd2..231bfc29 100644 --- a/src/elexmodel/handlers/data/VersionedData.py +++ b/src/elexmodel/handlers/data/VersionedData.py @@ -122,15 +122,13 @@ def compute_estimated_margin(df): # check if perc_expected_vote_corr is monotone increasing (if not, give up and don't try to estimate a margin) if not np.all(np.diff(perc_expected_vote_corr) >= 0): - LOG.info( - f"Non-monotonic percent_expected_vote in versioned data for {df.geographic_unit_fips.iloc[0]}." - ) return pd.DataFrame( { "percent_expected_vote": np.arange(101), "nearest_observed_vote": np.nan * np.ones(101), "est_margin": np.nan * np.ones(101), "est_correction": np.nan * np.ones(101), + "error_type": "non-monotone percent expected vote", } ) @@ -151,15 +149,13 @@ def compute_estimated_margin(df): # batch_margins should be between -1 and 1 (otherwise, there was a data entry issue and we will not use this unit) if np.abs(batch_margin).max() > 1: - LOG.info( - f"Implausible batch margin {np.abs(batch_margin).max()} in versioned data for {df.geographic_unit_fips.iloc[0]}." - ) return pd.DataFrame( { "percent_expected_vote": np.arange(101), "nearest_observed_vote": np.nan * np.ones(101), "est_margin": np.nan * np.ones(101), "est_correction": np.nan * np.ones(101), + "error_type": "batch_margin", } ) @@ -204,10 +200,17 @@ def compute_estimated_margin(df): "nearest_observed_vote": percent_vote[np.clip(obs_indices + 1, 0, len(percent_vote) - 1)], "est_margin": est_margins, "est_correction": norm_margin[-1] - est_margins, + "error_type": "none", } ) results = results.groupby("geographic_unit_fips").apply(compute_estimated_margin).reset_index() + + for error_type in sorted(set(results["error_type"])): + if error_type == "none": + continue + category_error_type = results[results["error_type"] == error_type].geographic_unit_fips.unique() + LOG.info(f"# of versioned units with {error_type} error: {len(category_error_type)}") return results def get_versioned_predictions(self, filepath=None): From 838fd125f21c1f20cdf2b30138f5c10b72d57b6d Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Tue, 5 Nov 2024 15:06:08 -0500 Subject: [PATCH 4/4] Preparing for release --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 18915c73..f5d7f2f1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,8 @@ # Changelog +# 2.2.2 (11/5/2024) +- fix: missing `est_correction` column in `VersionedResults` `DataFrame` in the event of bad data [#131](https://github.com/washingtonpost/elex-live-model/pull/131) + # 2.2.1 (11/1/2024) - chore: downgrade botocore and s3transfer as per live team dependency [#128](https://github.com/washingtonpost/elex-live-model/pull/128)