From cd22a53db9a1955bec79a977c558bed563421156 Mon Sep 17 00:00:00 2001
From: lennybronner <lenny.bronner@gmail.com>
Date: Tue, 15 Oct 2024 18:44:45 -0400
Subject: [PATCH] rename blacklist -> blocklist + better printing of non-modeled units

---
 src/elexmodel/client.py                       | 16 ++--
 src/elexmodel/handlers/data/CombinedData.py   | 79 +++++++++++--------
 tests/models/test_bootstrap_election_model.py |  8 +-
 3 files changed, 57 insertions(+), 46 deletions(-)

diff --git a/src/elexmodel/client.py b/src/elexmodel/client.py
index 5194bd8f..bc194655 100644
--- a/src/elexmodel/client.py
+++ b/src/elexmodel/client.py
@@ -1,5 +1,6 @@
 import logging
 from collections import defaultdict
+import pprint
 
 import numpy as np
 import pandas as pd
@@ -299,16 +300,16 @@ def get_estimates(
         turnout_factor_lower = model_parameters.get("turnout_factor_lower", 0.5)
         turnout_factor_upper = model_parameters.get("turnout_factor_upper", 2.0)
         margin_change_threshold = model_parameters.get("margin_change_threshold", 0.3)
-        unit_blacklist = model_parameters.get("unit_blacklist", [])
-        postal_code_blacklist = model_parameters.get("postal_code_blacklist", [])
+        unit_blocklist = model_parameters.get("unit_blocklist", [])
+        postal_code_blocklist = model_parameters.get("postal_code_blocklist", [])
 
         (reporting_units, nonreporting_units, unexpected_units) = data.get_units(
             percent_reporting_threshold,
             turnout_factor_lower,
             turnout_factor_upper,
             margin_change_threshold,
-            unit_blacklist,
-            postal_code_blacklist,
+            unit_blocklist,
+            postal_code_blocklist,
             aggregates,
         )
 
@@ -338,7 +339,7 @@ def get_estimates(
         if APP_ENV != "local" and self.save_results:
             data.write_data(self.election_id, self.office)
 
-        non_modeled_units = unexpected_units[unexpected_units["unit_category"] == "non-modeled"]
+        non_modeled_units = unexpected_units[unexpected_units["unit_category"].str.startswith("non-modeled")]
         n_reporting_expected_units = reporting_units.shape[0]
         n_unexpected_units = len(unexpected_units[unexpected_units["unit_category"] == "unexpected"])
         n_nonreporting_units = nonreporting_units.shape[0]
@@ -351,8 +352,9 @@ def get_estimates(
             There are {n_nonreporting_units} nonreporting units."""
         )
         if len(non_modeled_units) > 0:
-            non_modeled_units = non_modeled_units.groupby("postal_code")["geographic_unit_fips"].apply(list).to_dict()
-            LOG.info(f"non-modeled units:\n{non_modeled_units}")
+            non_modeled_units = non_modeled_units.groupby("unit_category")[["postal_code", "geographic_unit_fips"]].apply(lambda x: list(x.itertuples(index=False, name=None))).to_dict()
+            non_modeled_units_pprint = pprint.pformat(non_modeled_units)
+            LOG.info(f"non-modeled units:\n{non_modeled_units_pprint}")
 
         if n_reporting_expected_units < minimum_reporting_units_max:
             raise ModelNotEnoughSubunitsException(
diff --git a/src/elexmodel/handlers/data/CombinedData.py b/src/elexmodel/handlers/data/CombinedData.py
index 54fa92d0..8b930234 100644
--- a/src/elexmodel/handlers/data/CombinedData.py
+++ b/src/elexmodel/handlers/data/CombinedData.py
@@ -54,8 +54,8 @@ def get_units(
         turnout_factor_lower,
         turnout_factor_upper,
         margin_change_threshold,
-        unit_blacklist,
-        postal_code_blacklist,
+        unit_blocklist,
+        postal_code_blocklist,
         aggregates,
     ):
         """
@@ -66,7 +66,7 @@ def get_units(
             - unexpected units (ie. units for which we have no covariates prepared)
             - units for which the baseline results is zero (ie. units that are tiny)
             - units with strange turnout factors (ie. units that are likely precinct mismatches)
-            - units that have been blacklisted
+            - units that have been blocklisted
         """
 
         # units where the expected vote is greater than the percent reporting threshold
@@ -81,8 +81,8 @@ def get_units(
             turnout_factor_lower,
             turnout_factor_upper,
             margin_change_threshold,
-            unit_blacklist,
-            postal_code_blacklist,
+            unit_blocklist,
+            postal_code_blocklist,
         )
 
         # remove these units from the reporting units
@@ -119,8 +119,6 @@ def get_units(
         nonreporting_units["unit_category"] = "expected"
 
         # finalize all unexpected/non-modeled units
-        unexpected_units["unit_category"] = "unexpected"
-        non_modeled_units["unit_category"] = "non-modeled"
         all_unexpected_units = pd.concat([unexpected_units, non_modeled_units]).reset_index(drop=True)
         all_unexpected_units["reporting"] = int(0)
 
@@ -164,6 +162,7 @@ def _get_unexpected_units(self, aggregates):
             .drop_duplicates(subset="geographic_unit_fips")
             .copy()
         )
+        unexpected_units["unit_category"] = "unexpected"
 
         # since we were not expecting them, we have don't have their county or district
         # from preprocessed data. so we have to add that back in.
@@ -184,43 +183,53 @@ def _get_non_modeled_units(
         turnout_factor_lower,
         turnout_factor_upper,
         margin_change_threshold,
-        unit_blacklist,
-        postal_code_blacklist,
+        unit_blocklist,
+        postal_code_blocklist,
     ):
         expected_geographic_units = self._get_expected_geographic_unit_fips().tolist()
         zero_baseline_units = self._get_units_with_baseline_of_zero()
-        if "margin" in self.estimands:
-            self.data["normalized_margin_change"] = (
-                self.data.baseline_normalized_margin - self.data.results_normalized_margin
-            ).abs()
+
+        units_blocklisted = self.data[
+            (self.data["geographic_unit_fips"].isin(unit_blocklist))
+            | (self.data["postal_code"].isin(postal_code_blocklist))
+        ].copy()
+        units_blocklisted["unit_category"] = "non-modeled: blocklisted"
+
+        units_with_zero_baseline = self.data[self.data["geographic_unit_fips"].isin(zero_baseline_units)].copy()
+        units_with_zero_baseline["unit_category"] = "non-modeled: zero baseline"
+
         units_with_strange_turnout_factor = (
             self.data[
-                (
-                    self.data.percent_expected_vote >= percent_reporting_threshold
-                )  # reporting units (otherwise nonreporting unit)
-                & (
-                    self.data["geographic_unit_fips"].isin(expected_geographic_units)
-                )  # and expected unit (otherwise unexpected unit)
+                (self.data.percent_expected_vote >= percent_reporting_threshold)
+                & (self.data["geographic_unit_fips"].isin(expected_geographic_units))
                 & (
-                    (self.data["geographic_unit_fips"].isin(zero_baseline_units))  # zero baseline
-                    | (self.data.turnout_factor <= turnout_factor_lower)  # or low turnout factor
-                    | (self.data.turnout_factor >= turnout_factor_upper)  # or high turnout factor
-                    | (
-                        ("margin" in self.estimands) and (self.data.normalized_margin_change > margin_change_threshold)
-                    )  # or large margin change if margin is an estimand (using and rather than & here is fine since first clause is a boolean and not a series)
+                    (self.data.turnout_factor <= turnout_factor_lower)
+                    | (self.data.turnout_factor >= turnout_factor_upper)
                 )
-                | (
-                    self.data["geographic_unit_fips"].isin(unit_blacklist)
-                )  # or blacklisted (doesn't need to be reporting/expected)
-                | (
-                    self.data["postal_code"].isin(postal_code_blacklist)
-                )  # or entire state is blacklisted (doesn't need to be reporting/expected)
             ]
-            .drop_duplicates(subset="geographic_unit_fips")
-            .copy()
-        )
+        ).copy()
+        units_with_strange_turnout_factor["unit_category"] = "non-modeled: strange turnout factor"
+
+        non_modeled_units_list = [units_blocklisted, units_with_zero_baseline, units_with_strange_turnout_factor]
+
+        if "margin" in self.estimands:
+            # TODO: instead of the fixed threshold below, fit a simple model on all reporting units
+            # (whether modeled or not); units far from that model should also be non-modeled
+
+            self.data["normalized_margin_change"] = (
+                self.data.baseline_normalized_margin - self.data.results_normalized_margin
+            ).abs()
 
-        return units_with_strange_turnout_factor
+            units_with_strange_margin_change = self.data[
+                (self.data.percent_expected_vote >= percent_reporting_threshold)
+                & (self.data["geographic_unit_fips"].isin(expected_geographic_units))
+                & (self.data.normalized_margin_change > margin_change_threshold)
+            ].copy()
+            units_with_strange_margin_change["unit_category"] = "non-modeled: strange margin change"
+            non_modeled_units_list.append(units_with_strange_margin_change)
+        
+        non_modeled_units = pd.concat(non_modeled_units_list).reset_index(drop=True).drop_duplicates(subset='geographic_unit_fips')
+        return non_modeled_units
 
     def write_data(self, election_id, office):
         s3_client = s3.S3CsvUtil(TARGET_BUCKET)
diff --git a/tests/models/test_bootstrap_election_model.py b/tests/models/test_bootstrap_election_model.py
index 430ffdb0..6d0651c0 100644
--- a/tests/models/test_bootstrap_election_model.py
+++ b/tests/models/test_bootstrap_election_model.py
@@ -498,8 +498,8 @@ def test_compute_bootstrap_errors(bootstrap_election_model, va_governor_county_d
         turnout_factor_lower=turnout_factor_lower,
         turnout_factor_upper=turnout_factor_upper,
         margin_change_threshold=0.4,
-        unit_blacklist=[],
-        postal_code_blacklist=[],
+        unit_blocklist=[],
+        postal_code_blocklist=[],
         aggregates=["postal_code"],
     )
 
@@ -544,8 +544,8 @@ def test_get_unit_predictions(bootstrap_election_model, va_governor_county_data)
         turnout_factor_lower=turnout_factor_lower,
         turnout_factor_upper=turnout_factor_upper,
         margin_change_threshold=0.4,
-        unit_blacklist=[],
-        postal_code_blacklist=[],
+        unit_blocklist=[],
+        postal_code_blocklist=[],
         aggregates=["postal_code"],
     )