Skip to content

Commit

Permalink
renamed + better printing of non-modeled units
Browse files Browse the repository at this point in the history
  • Loading branch information
lennybronner committed Oct 15, 2024
1 parent f71b7b6 commit cd22a53
Show file tree
Hide file tree
Showing 3 changed files with 57 additions and 46 deletions.
16 changes: 9 additions & 7 deletions src/elexmodel/client.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import logging
from collections import defaultdict
import pprint

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -299,16 +300,16 @@ def get_estimates(
turnout_factor_lower = model_parameters.get("turnout_factor_lower", 0.5)
turnout_factor_upper = model_parameters.get("turnout_factor_upper", 2.0)
margin_change_threshold = model_parameters.get("margin_change_threshold", 0.3)
unit_blacklist = model_parameters.get("unit_blacklist", [])
postal_code_blacklist = model_parameters.get("postal_code_blacklist", [])
unit_blocklist = model_parameters.get("unit_blocklist", [])
postal_code_blocklist = model_parameters.get("postal_code_blocklist", [])

(reporting_units, nonreporting_units, unexpected_units) = data.get_units(
percent_reporting_threshold,
turnout_factor_lower,
turnout_factor_upper,
margin_change_threshold,
unit_blacklist,
postal_code_blacklist,
unit_blocklist,
postal_code_blocklist,
aggregates,
)

Expand Down Expand Up @@ -338,7 +339,7 @@ def get_estimates(
if APP_ENV != "local" and self.save_results:
data.write_data(self.election_id, self.office)

non_modeled_units = unexpected_units[unexpected_units["unit_category"] == "non-modeled"]
non_modeled_units = unexpected_units[unexpected_units["unit_category"].str.startswith("non-modeled")]
n_reporting_expected_units = reporting_units.shape[0]
n_unexpected_units = len(unexpected_units[unexpected_units["unit_category"] == "unexpected"])
n_nonreporting_units = nonreporting_units.shape[0]
Expand All @@ -351,8 +352,9 @@ def get_estimates(
There are {n_nonreporting_units} nonreporting units."""
)
if len(non_modeled_units) > 0:
non_modeled_units = non_modeled_units.groupby("postal_code")["geographic_unit_fips"].apply(list).to_dict()
LOG.info(f"non-modeled units:\n{non_modeled_units}")
non_modeled_units = non_modeled_units.groupby("unit_category")[["postal_code", "geographic_unit_fips"]].apply(lambda x: list(x.itertuples(index=False, name=None))).to_dict()
non_modeled_units_pprint = pprint.pformat(non_modeled_units)
LOG.info(f"non-modeled units:\n{non_modeled_units_pprint}")

if n_reporting_expected_units < minimum_reporting_units_max:
raise ModelNotEnoughSubunitsException(
Expand Down
79 changes: 44 additions & 35 deletions src/elexmodel/handlers/data/CombinedData.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,8 @@ def get_units(
turnout_factor_lower,
turnout_factor_upper,
margin_change_threshold,
unit_blacklist,
postal_code_blacklist,
unit_blocklist,
postal_code_blocklist,
aggregates,
):
"""
Expand All @@ -66,7 +66,7 @@ def get_units(
- unexpected units (ie. units for which we have no covariates prepared)
- units for which the baseline results are zero (ie. units that are tiny)
- units with strange turnout factors (ie. units that are likely precinct mismatches)
- units that have been blacklisted
- units that have been blocklisted
"""

# units where the expected vote is greater than the percent reporting threshold
Expand All @@ -81,8 +81,8 @@ def get_units(
turnout_factor_lower,
turnout_factor_upper,
margin_change_threshold,
unit_blacklist,
postal_code_blacklist,
unit_blocklist,
postal_code_blocklist,
)

# remove these units from the reporting units
Expand Down Expand Up @@ -119,8 +119,6 @@ def get_units(
nonreporting_units["unit_category"] = "expected"

# finalize all unexpected/non-modeled units
unexpected_units["unit_category"] = "unexpected"
non_modeled_units["unit_category"] = "non-modeled"
all_unexpected_units = pd.concat([unexpected_units, non_modeled_units]).reset_index(drop=True)
all_unexpected_units["reporting"] = int(0)

Expand Down Expand Up @@ -164,6 +162,7 @@ def _get_unexpected_units(self, aggregates):
.drop_duplicates(subset="geographic_unit_fips")
.copy()
)
unexpected_units["unit_category"] = "unexpected"

# since we were not expecting them, we don't have their county or district
# from preprocessed data. so we have to add that back in.
Expand All @@ -184,43 +183,53 @@ def _get_non_modeled_units(
turnout_factor_lower,
turnout_factor_upper,
margin_change_threshold,
unit_blacklist,
postal_code_blacklist,
unit_blocklist,
postal_code_blocklist,
):
expected_geographic_units = self._get_expected_geographic_unit_fips().tolist()
zero_baseline_units = self._get_units_with_baseline_of_zero()
if "margin" in self.estimands:
self.data["normalized_margin_change"] = (
self.data.baseline_normalized_margin - self.data.results_normalized_margin
).abs()

units_blocklisted = self.data[
(self.data["geographic_unit_fips"].isin(unit_blocklist))
| (self.data["postal_code"].isin(postal_code_blocklist))
].copy()
units_blocklisted["unit_category"] = "non-modeled: blocklisted"

units_with_zero_baseline = self.data[self.data["geographic_unit_fips"].isin(zero_baseline_units)].copy()
units_with_zero_baseline["unit_category"] = "non-modeled: zero baseline"

units_with_strange_turnout_factor = (
self.data[
(
self.data.percent_expected_vote >= percent_reporting_threshold
) # reporting units (otherwise nonreporting unit)
& (
self.data["geographic_unit_fips"].isin(expected_geographic_units)
) # and expected unit (otherwise unexpected unit)
(self.data.percent_expected_vote >= percent_reporting_threshold)
& (self.data["geographic_unit_fips"].isin(expected_geographic_units))
& (
(self.data["geographic_unit_fips"].isin(zero_baseline_units)) # zero baseline
| (self.data.turnout_factor <= turnout_factor_lower) # or low turnout factor
| (self.data.turnout_factor >= turnout_factor_upper) # or high turnout factor
| (
("margin" in self.estimands) and (self.data.normalized_margin_change > margin_change_threshold)
) # or large margin change if margin is an estimand (using and rather than & here is fine since first clause is a boolean and not a series)
(self.data.turnout_factor <= turnout_factor_lower)
| (self.data.turnout_factor >= turnout_factor_upper)
)
| (
self.data["geographic_unit_fips"].isin(unit_blacklist)
) # or blacklisted (doesn't need to be reporting/expected)
| (
self.data["postal_code"].isin(postal_code_blacklist)
) # or entire state is blacklisted (doesn't need to be reporting/expected)
]
.drop_duplicates(subset="geographic_unit_fips")
.copy()
)
).copy()
units_with_strange_turnout_factor["unit_category"] = "non-modeled: strange turnout factor"

non_modeled_units_list = [units_blocklisted, units_with_zero_baseline, units_with_strange_turnout_factor]

if "margin" in self.estimands:
# instead fit stupid model based on all reporting units (whether modeled or not)
# units that are far from that model should also be non-modeled

self.data["normalized_margin_change"] = (
self.data.baseline_normalized_margin - self.data.results_normalized_margin
).abs()

return units_with_strange_turnout_factor
units_with_strange_margin_change = self.data[
(self.data.percent_expected_vote >= percent_reporting_threshold)
& (self.data["geographic_unit_fips"].isin(expected_geographic_units))
& (self.data.normalized_margin_change > margin_change_threshold)
].copy()
units_with_strange_margin_change["unit_category"] = "non-modeled: strange margin change"
non_modeled_units_list.append(units_with_strange_margin_change)

non_modeled_units = pd.concat(non_modeled_units_list).reset_index(drop=True).drop_duplicates(subset='geographic_unit_fips')
return non_modeled_units

def write_data(self, election_id, office):
s3_client = s3.S3CsvUtil(TARGET_BUCKET)
Expand Down
8 changes: 4 additions & 4 deletions tests/models/test_bootstrap_election_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -498,8 +498,8 @@ def test_compute_bootstrap_errors(bootstrap_election_model, va_governor_county_d
turnout_factor_lower=turnout_factor_lower,
turnout_factor_upper=turnout_factor_upper,
margin_change_threshold=0.4,
unit_blacklist=[],
postal_code_blacklist=[],
unit_blocklist=[],
postal_code_blocklist=[],
aggregates=["postal_code"],
)

Expand Down Expand Up @@ -544,8 +544,8 @@ def test_get_unit_predictions(bootstrap_election_model, va_governor_county_data)
turnout_factor_lower=turnout_factor_lower,
turnout_factor_upper=turnout_factor_upper,
margin_change_threshold=0.4,
unit_blacklist=[],
postal_code_blacklist=[],
unit_blocklist=[],
postal_code_blocklist=[],
aggregates=["postal_code"],
)

Expand Down

0 comments on commit cd22a53

Please sign in to comment.