From cd22a53db9a1955bec79a977c558bed563421156 Mon Sep 17 00:00:00 2001 From: lennybronner Date: Tue, 15 Oct 2024 18:44:45 -0400 Subject: [PATCH] renamed + better printing of non-modeled units --- src/elexmodel/client.py | 16 ++-- src/elexmodel/handlers/data/CombinedData.py | 79 +++++++++++-------- tests/models/test_bootstrap_election_model.py | 8 +- 3 files changed, 57 insertions(+), 46 deletions(-) diff --git a/src/elexmodel/client.py b/src/elexmodel/client.py index 5194bd8f..bc194655 100644 --- a/src/elexmodel/client.py +++ b/src/elexmodel/client.py @@ -1,5 +1,6 @@ import logging from collections import defaultdict +import pprint import numpy as np import pandas as pd @@ -299,16 +300,16 @@ def get_estimates( turnout_factor_lower = model_parameters.get("turnout_factor_lower", 0.5) turnout_factor_upper = model_parameters.get("turnout_factor_upper", 2.0) margin_change_threshold = model_parameters.get("margin_change_threshold", 0.3) - unit_blacklist = model_parameters.get("unit_blacklist", []) - postal_code_blacklist = model_parameters.get("postal_code_blacklist", []) + unit_blocklist = model_parameters.get("unit_blocklist", []) + postal_code_blocklist = model_parameters.get("postal_code_blocklist", []) (reporting_units, nonreporting_units, unexpected_units) = data.get_units( percent_reporting_threshold, turnout_factor_lower, turnout_factor_upper, margin_change_threshold, - unit_blacklist, - postal_code_blacklist, + unit_blocklist, + postal_code_blocklist, aggregates, ) @@ -338,7 +339,7 @@ def get_estimates( if APP_ENV != "local" and self.save_results: data.write_data(self.election_id, self.office) - non_modeled_units = unexpected_units[unexpected_units["unit_category"] == "non-modeled"] + non_modeled_units = unexpected_units[unexpected_units["unit_category"].str.startswith("non-modeled")] n_reporting_expected_units = reporting_units.shape[0] n_unexpected_units = len(unexpected_units[unexpected_units["unit_category"] == "unexpected"]) n_nonreporting_units = nonreporting_units.shape[0] @@ -351,8 +352,9 @@ def get_estimates( There are {n_nonreporting_units} nonreporting units.""" ) if len(non_modeled_units) > 0: - non_modeled_units = non_modeled_units.groupby("postal_code")["geographic_unit_fips"].apply(list).to_dict() - LOG.info(f"non-modeled units:\n{non_modeled_units}") + non_modeled_units = non_modeled_units.groupby("unit_category")[["postal_code", "geographic_unit_fips"]].apply(lambda x: list(x.itertuples(index=False, name=None))).to_dict() + non_modeled_units_pprint = pprint.pformat(non_modeled_units) + LOG.info(f"non-modeled units:\n{non_modeled_units_pprint}") if n_reporting_expected_units < minimum_reporting_units_max: raise ModelNotEnoughSubunitsException( diff --git a/src/elexmodel/handlers/data/CombinedData.py b/src/elexmodel/handlers/data/CombinedData.py index 54fa92d0..8b930234 100644 --- a/src/elexmodel/handlers/data/CombinedData.py +++ b/src/elexmodel/handlers/data/CombinedData.py @@ -54,8 +54,8 @@ def get_units( turnout_factor_lower, turnout_factor_upper, margin_change_threshold, - unit_blacklist, - postal_code_blacklist, + unit_blocklist, + postal_code_blocklist, aggregates, ): """ @@ -66,7 +66,7 @@ def get_units( - unexpected units (ie. units for which we have no covariates prepared) - units for which the baseline results is zero (ie. units that are tiny) - units with strange turnout factors (ie. units that are likely precinct mismatches) - - units that have been blacklisted + - units that have been blocklisted """ # units where the expected vote is greater than the percent reporting threshold @@ -81,8 +81,8 @@ def get_units( turnout_factor_lower, turnout_factor_upper, margin_change_threshold, - unit_blacklist, - postal_code_blacklist, + unit_blocklist, + postal_code_blocklist, ) # remove these units from the reporting units @@ -119,8 +119,6 @@ def get_units( nonreporting_units["unit_category"] = "expected" # finalize all unexpected/non-modeled units - unexpected_units["unit_category"] = "unexpected" - non_modeled_units["unit_category"] = "non-modeled" all_unexpected_units = pd.concat([unexpected_units, non_modeled_units]).reset_index(drop=True) all_unexpected_units["reporting"] = int(0) @@ -164,6 +162,7 @@ def _get_unexpected_units(self, aggregates): .drop_duplicates(subset="geographic_unit_fips") .copy() ) + unexpected_units["unit_category"] = "unexpected" # since we were not expecting them, we have don't have their county or district # from preprocessed data. so we have to add that back in. @@ -184,43 +183,53 @@ def _get_non_modeled_units( turnout_factor_lower, turnout_factor_upper, margin_change_threshold, - unit_blacklist, - postal_code_blacklist, + unit_blocklist, + postal_code_blocklist, ): expected_geographic_units = self._get_expected_geographic_unit_fips().tolist() zero_baseline_units = self._get_units_with_baseline_of_zero() - if "margin" in self.estimands: - self.data["normalized_margin_change"] = ( - self.data.baseline_normalized_margin - self.data.results_normalized_margin - ).abs() + + units_blocklisted = self.data[ + (self.data["geographic_unit_fips"].isin(unit_blocklist)) + | (self.data["postal_code"].isin(postal_code_blocklist)) + ].copy() + units_blocklisted["unit_category"] = "non-modeled: blocklisted" + + units_with_zero_baseline = self.data[self.data["geographic_unit_fips"].isin(zero_baseline_units)].copy() + units_with_zero_baseline["unit_category"] = "non-modeled: zero baseline" + units_with_strange_turnout_factor = ( self.data[ - ( - self.data.percent_expected_vote >= percent_reporting_threshold - ) # reporting units (otherwise nonreporting unit) - & ( - self.data["geographic_unit_fips"].isin(expected_geographic_units) - ) # and expected unit (otherwise unexpected unit) + (self.data.percent_expected_vote >= percent_reporting_threshold) + & (self.data["geographic_unit_fips"].isin(expected_geographic_units)) & ( - (self.data["geographic_unit_fips"].isin(zero_baseline_units)) # zero baseline - | (self.data.turnout_factor <= turnout_factor_lower) # or low turnout factor - | (self.data.turnout_factor >= turnout_factor_upper) # or high turnout factor - | ( - ("margin" in self.estimands) and (self.data.normalized_margin_change > margin_change_threshold) - ) # or large margin change if margin is an estimand (using and rather than & here is fine since first clause is a boolean and not a series) + (self.data.turnout_factor <= turnout_factor_lower) + | (self.data.turnout_factor >= turnout_factor_upper) ) - | ( - self.data["geographic_unit_fips"].isin(unit_blacklist) - ) # or blacklisted (doesn't need to be reporting/expected) - | ( - self.data["postal_code"].isin(postal_code_blacklist) - ) # or entire state is blacklisted (doesn't need to be reporting/expected) ] - .drop_duplicates(subset="geographic_unit_fips") - .copy() - ) + ).copy() + units_with_strange_turnout_factor["unit_category"] = "non-modeled: strange turnout factor" + + non_modeled_units_list = [units_blocklisted, units_with_zero_baseline, units_with_strange_turnout_factor] + + if "margin" in self.estimands: + # instead fit stupid model based on all reporting units (whether modeled or not) + # units that are far from that model should also be non-modeled + + self.data["normalized_margin_change"] = ( + self.data.baseline_normalized_margin - self.data.results_normalized_margin + ).abs() - return units_with_strange_turnout_factor + units_with_strange_margin_change = self.data[ + (self.data.percent_expected_vote >= percent_reporting_threshold) + & (self.data["geographic_unit_fips"].isin(expected_geographic_units)) + & (self.data.normalized_margin_change > margin_change_threshold) + ].copy() + units_with_strange_margin_change["unit_category"] = "non-modeled: strange margin change" + non_modeled_units_list.append(units_with_strange_margin_change) + + non_modeled_units = pd.concat(non_modeled_units_list).reset_index(drop=True).drop_duplicates(subset='geographic_unit_fips') + return non_modeled_units def write_data(self, election_id, office): s3_client = s3.S3CsvUtil(TARGET_BUCKET) diff --git a/tests/models/test_bootstrap_election_model.py b/tests/models/test_bootstrap_election_model.py index 430ffdb0..6d0651c0 100644 --- a/tests/models/test_bootstrap_election_model.py +++ b/tests/models/test_bootstrap_election_model.py @@ -498,8 +498,8 @@ def test_compute_bootstrap_errors(bootstrap_election_model, va_governor_county_d turnout_factor_lower=turnout_factor_lower, turnout_factor_upper=turnout_factor_upper, margin_change_threshold=0.4, - unit_blacklist=[], - postal_code_blacklist=[], + unit_blocklist=[], + postal_code_blocklist=[], aggregates=["postal_code"], ) @@ -544,8 +544,8 @@ def test_get_unit_predictions(bootstrap_election_model, va_governor_county_data) turnout_factor_lower=turnout_factor_lower, turnout_factor_upper=turnout_factor_upper, margin_change_threshold=0.4, - unit_blacklist=[], - postal_code_blacklist=[], + unit_blocklist=[], + postal_code_blocklist=[], aggregates=["postal_code"], )