Merge pull request #77 from washingtonpost/elex-2771-new-ols-qr

Elex 2771 new ols qr

lennybronner authored Sep 25, 2023
2 parents f359935 + eb5de99 commit 2043a30
Showing 11 changed files with 257 additions and 86 deletions.
2 changes: 1 addition & 1 deletion setup.py
@@ -3,7 +3,7 @@

from setuptools import find_packages, setup

-INSTALL_REQUIRES = ("click<8.1", "elex-solver<2", "pandas<1.5.0", "boto3<2", "python-dotenv==0.19.2", "scipy==1.10.1")
+INSTALL_REQUIRES = ("click<8.1", "elex-solver<3", "pandas<1.5.0", "boto3<2", "python-dotenv==0.19.2", "scipy==1.10.1")

THIS_FILE_DIR = os.path.dirname(__file__)

4 changes: 2 additions & 2 deletions src/elexmodel/client.py
@@ -416,8 +416,8 @@ def _format_historical_current_data(
historical=True,
include_results_estimand=True,
)

-results_to_return = [f"results_{estimand}" for estimand in estimands]
+# we always want to pass turnout so that we can generate results weights
+results_to_return = list(set([f"results_{estimand}" for estimand in estimands] + ["results_turnout"]))
geo_columns = set(["geographic_unit_fips", "postal_code"] + [a for a in self.aggregates if a != "unit"])
preprocessed_data = preprocessed_data_handler.data[list(geo_columns) + results_to_return].copy()
historical_current_data = preprocessed_data.merge(formatted_data, on=["postal_code", "geographic_unit_fips"])
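A quick sketch (not part of the diff, toy values) of why the new line builds the column list through a set: unioning with "results_turnout" cannot duplicate the column when turnout is itself an estimand.

```python
estimands = ["turnout", "dem"]

# set union deduplicates "results_turnout" when turnout is already an estimand
results_to_return = list(set([f"results_{estimand}" for estimand in estimands] + ["results_turnout"]))
print(sorted(results_to_return))  # ['results_dem', 'results_turnout']
```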
28 changes: 23 additions & 5 deletions src/elexmodel/handlers/data/CombinedData.py
@@ -29,6 +29,7 @@ def __init__(
self.current_data = current_data
self.geographic_unit_type = geographic_unit_type
data = preprocessed_data.merge(current_data, how="left", on=["postal_code", "geographic_unit_fips"])
+data = estimandizer.add_turnout_factor(data)
# if unreporting is 'drop' then drop units that are not reporting (ie. units where results are na)
# this is necessary if units will not be returning results in this election,
# but we didn't know that (ie. townships)
@@ -37,21 +38,31 @@
# Drop the whole row if an estimand is not reporting
data = data.dropna(axis=0, how="any", subset=result_cols)
# if unreporting is 'zero' then we set the votes for non-reporting units to zero
-# this is necessary if we are worried that there is no zero state for units (ie. some precincts)
+# this is necessary if we are worried that there is no zero state for units (ie. some precinct states)
elif handle_unreporting == "zero":
indices_with_null_val = data[result_cols].isna().any(axis=1)
data.update(data[result_cols].fillna(value=0))
data.loc[indices_with_null_val, "percent_expected_vote"] = 0

self.data = data

-def get_reporting_units(self, percent_reporting_threshold, features_to_normalize=[], add_intercept=True):
+def get_reporting_units(
+    self,
+    percent_reporting_threshold,
+    turnout_factor_lower=0.5,
+    turnout_factor_upper=1.5,
+    features_to_normalize=[],
+    add_intercept=True,
+):
"""
Get reporting data. These are units where the expected vote is greater than or equal to the percent reporting threshold.
"""
reporting_units = self.data[self.data.percent_expected_vote >= percent_reporting_threshold].reset_index(
drop=True
)
+# if the turnout factor is less than turnout_factor_lower or greater than turnout_factor_upper,
+# assume the AP made a mistake and don't treat those as reporting units
+reporting_units = reporting_units[reporting_units.turnout_factor > turnout_factor_lower]
+reporting_units = reporting_units[reporting_units.turnout_factor < turnout_factor_upper]

# residualize + normalize
for estimand in self.estimands:
@@ -64,16 +75,23 @@ def get_reporting_units(self, percent_reporting_threshold, features_to_normalize

return reporting_units
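A minimal sketch of the new turnout-factor gate with made-up units: anything outside the open interval (turnout_factor_lower, turnout_factor_upper) is dropped from the reporting set, on the assumption that a factor far from 1 reflects an AP data error rather than a genuine turnout swing.

```python
import pandas as pd

# hypothetical units; turnout_factor = results_weights / baseline_weights
units = pd.DataFrame({
    "geographic_unit_fips": ["00001", "00002", "00003"],
    "percent_expected_vote": [100, 100, 100],
    "turnout_factor": [0.9, 0.2, 1.8],
})
lower, upper = 0.5, 1.5

# same two-step filter as get_reporting_units above
reporting = units[units.turnout_factor > lower]
reporting = reporting[reporting.turnout_factor < upper]
print(reporting.geographic_unit_fips.tolist())  # ['00001']
```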

-def get_nonreporting_units(self, percent_reporting_threshold, features_to_normalize=[], add_intercept=True):
+def get_nonreporting_units(
+    self,
+    percent_reporting_threshold,
+    turnout_factor_lower=0.5,
+    turnout_factor_upper=1.5,
+    features_to_normalize=[],
+    add_intercept=True,
+):
"""
Get nonreporting data. These are units where the expected vote is less than the percent reporting threshold, or whose turnout factor falls outside the allowed bounds.
"""
+# if turnout factor <= turnout_factor_lower or >= turnout_factor_upper assume the AP made a mistake and treat them as non-reporting units
nonreporting_units = self.data.query(
"percent_expected_vote < @percent_reporting_threshold"
"(percent_expected_vote < @percent_reporting_threshold) | (turnout_factor <= @turnout_factor_lower) | (turnout_factor >= @turnout_factor_upper)" #
).reset_index( # not checking if results.isnull() anymore across multiple estimands
drop=True
)

nonreporting_units["reporting"] = int(0)
nonreporting_units["expected"] = True

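For reference, a small sketch of how the @-prefixed names in the query string above resolve: pandas pulls them from the enclosing Python scope, so the thresholds passed into get_nonreporting_units drive the filter directly. Toy values:

```python
import pandas as pd

df = pd.DataFrame({"percent_expected_vote": [10, 99], "turnout_factor": [1.0, 0.3]})
percent_reporting_threshold = 50
turnout_factor_lower, turnout_factor_upper = 0.5, 1.5

# @name reaches back into the local Python scope
out = df.query(
    "(percent_expected_vote < @percent_reporting_threshold)"
    " | (turnout_factor <= @turnout_factor_lower)"
    " | (turnout_factor >= @turnout_factor_upper)"
)
print(len(out))  # 2: one unit under the threshold, one with an implausible turnout factor
```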
48 changes: 36 additions & 12 deletions src/elexmodel/handlers/data/Estimandizer.py
@@ -1,4 +1,4 @@
-from numpy import nan
+import numpy as np

RESULTS_PREFIX = "results_"
BASELINE_PREFIX = "baseline_"
@@ -11,12 +11,15 @@ class Estimandizer:

def add_estimand_results(self, data_df, estimands, historical):
columns_to_return = []
+turnout_col = f"{RESULTS_PREFIX}turnout"

for estimand in estimands:
results_col = f"{RESULTS_PREFIX}{estimand}"
+additional_columns_added = []
if results_col not in data_df.columns:
# will raise a KeyError if a function with the same name as `estimand` doesn't exist
try:
-data_df = globals()[estimand](data_df, RESULTS_PREFIX)
+data_df, additional_columns_added = globals()[estimand](data_df, RESULTS_PREFIX)
except KeyError as e:
if historical:
# A historical run is one where we pull in data from a past election
@@ -27,12 +30,21 @@ def add_estimand_results(self, data_df, estimands, historical):
# and that data handler expects a results_ column for every estimand specified.
# Hence, this is the only special case in which we'd want to add
# an empty results_ column.
-data_df[results_col] = nan
+data_df[results_col] = np.nan
+data_df[turnout_col] = np.nan
else:
# If this is not a historical run, then this is a live election
# so we are expecting that there will be actual results data
raise e
-columns_to_return.append(results_col)
+columns_to_return.extend([results_col] + additional_columns_added)

+# always adding turnout since we will want to generate weights
+# but if turnout is the estimand, then we only want to add it once
+if turnout_col not in columns_to_return:
+    columns_to_return.append(turnout_col)

+data_df = self.add_weights(data_df, RESULTS_PREFIX)

return data_df, columns_to_return

@@ -49,7 +61,7 @@ def add_estimand_baselines(self, data_df, estimand_baselines, historical, includ
baseline_col = f"{BASELINE_PREFIX}{pointer}"

if baseline_col not in data_df.columns:
-data_df = globals()[estimand](data_df, BASELINE_PREFIX)
+data_df, __ = globals()[estimand](data_df, BASELINE_PREFIX)

if not historical:
data_df[f"last_election_results_{estimand}"] = data_df[baseline_col].copy() + 1
@@ -62,18 +74,30 @@ def add_estimand_baselines(self, data_df, estimand_baselines, historical, includ
# we need to add the results from the historical election as well.
data_df, ___ = self.add_estimand_results(data_df, estimand_baselines.keys(), historical)

+data_df = self.add_weights(data_df, BASELINE_PREFIX)

return data_df

+def add_weights(self, data_df, col_prefix):
+    data_df[f"{col_prefix}weights"] = data_df[f"{col_prefix}turnout"]
+    return data_df
+
+def add_turnout_factor(self, data_df):
+    # posinf and neginf are also set to zero because dividing by zero can lead to nan/posinf/neginf depending
+    # on the type of the numeric in the numpy array. Assume that if baseline_weights is zero then turnout
+    # would be incredibly low in this election too (ie. this is effectively an empty precinct) and so setting
+    # the turnout factor to zero is fine
+    data_df["turnout_factor"] = np.nan_to_num(
+        data_df.results_weights / data_df.baseline_weights, nan=0, posinf=0, neginf=0
+    )
+    return data_df
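A sketch of the guard in add_turnout_factor with toy numbers: a zero baseline produces nan (0/0) or inf (x/0), and np.nan_to_num maps all of those to 0, matching the comment's assumption that a zero-baseline unit is effectively empty.

```python
import numpy as np

results_weights = np.array([100.0, 0.0, 50.0])
baseline_weights = np.array([80.0, 0.0, 0.0])

# 100/80 -> 1.25, 0/0 -> nan -> 0, 50/0 -> inf -> 0
with np.errstate(divide="ignore", invalid="ignore"):
    turnout_factor = np.nan_to_num(results_weights / baseline_weights, nan=0, posinf=0, neginf=0)
print(turnout_factor)  # [1.25 0.   0.  ]
```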


# custom estimands


def party_vote_share_dem(data_df, col_prefix):
-    numer = f"{col_prefix}dem"
-    denom = f"{col_prefix}turnout"
-
-    data_df[f"{col_prefix}party_vote_share_dem"] = data_df.apply(
-        lambda x: 0 if x[numer] == 0 or x[denom] == 0 else x[numer] / x[denom], axis=1
-    )
+    data_df[f"{col_prefix}party_vote_share_dem"] = np.nan_to_num(
+        data_df[f"{col_prefix}dem"] / data_df[f"{col_prefix}turnout"]
+    )

-    return data_df
+    return data_df, []
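The party_vote_share_dem rewrite trades a row-wise apply for one vectorized division; a sketch with toy columns showing both paths give the same shares here (np.nan_to_num stands in for the old explicit zero checks):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"results_dem": [40.0, 0.0], "results_turnout": [100.0, 0.0]})

# old row-wise version
old = df.apply(lambda x: 0 if x.results_dem == 0 or x.results_turnout == 0 else x.results_dem / x.results_turnout, axis=1)

# new vectorized version: 0/0 -> nan -> 0
new = np.nan_to_num(df.results_dem / df.results_turnout)
print(old.tolist(), new.tolist())  # [0.4, 0.0] [0.4, 0.0]
```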
29 changes: 15 additions & 14 deletions src/elexmodel/models/ConformalElectionModel.py
@@ -22,7 +22,6 @@
class ConformalElectionModel(BaseElectionModel.BaseElectionModel, ABC):
def __init__(self, model_settings: dict):
super(ConformalElectionModel, self).__init__(model_settings)
-self.qr = QuantileRegressionSolver(solver="ECOS")
self.lambda_ = model_settings.get("lambda_", 0)

@classmethod
@@ -56,11 +55,10 @@ def fit_model(
model.fit(
X,
y,
-tau_value=tau,
+taus=tau,
weights=weights,
lambda_=self.lambda_,
fit_intercept=self.add_intercept,
-normalize_weights=normalize_weights,
)
except (UserWarning, cvxpy.error.SolverError):
LOG.warning("Warning: solution was inaccurate or solver broke. Re-running with normalize_weights=False.")
@@ -88,10 +86,11 @@ def get_unit_predictions(
x_all[self.n_train : self.n_train + n_test] # noqa: E203
)

-self.fit_model(self.qr, reporting_units_features, reporting_units_residuals, 0.5, weights, True)
-self.features_to_coefficients = dict(zip(featurizer.complete_features, self.qr.coefficients))
+qr = QuantileRegressionSolver()
+self.fit_model(qr, reporting_units_features, reporting_units_residuals, 0.5, weights, True)
+self.features_to_coefficients = dict(zip(featurizer.complete_features, qr.coefficients))

-preds = self.qr.predict(nonreporting_units_features)
+preds = qr.predict(nonreporting_units_features.values).flatten()

# multiply by total voters to get unnormalized residuals
preds = preds * nonreporting_units[f"last_election_results_{estimand}"]
@@ -150,15 +149,15 @@ def get_unit_prediction_interval_bounds(
train_data_weights = train_data[f"last_election_results_{estimand}"]

# fit lower and upper model to training data. ECOS solver is better than SCS.
-lower_qr = QuantileRegressionSolver(solver="ECOS")
+lower_qr = QuantileRegressionSolver()
self.fit_model(lower_qr, train_data_features, train_data_residuals, lower_bound, train_data_weights, True)

-upper_qr = QuantileRegressionSolver(solver="ECOS")
+upper_qr = QuantileRegressionSolver()
self.fit_model(upper_qr, train_data_features, train_data_residuals, upper_bound, train_data_weights, True)

# apply to conformalization data. Conformalization bounds will later tell us how much to adjust lower/upper
# bounds for nonreporting data.
-conformalization_data = reporting_units_shuffled[train_rows:]
+conformalization_data = reporting_units_shuffled[train_rows:].reset_index(drop=True)

# all_data starts with reporting_units_shuffled, so the rows between train_rows and n_train are the
# conformalization set
@@ -169,10 +168,12 @@ def get_unit_prediction_interval_bounds(
# we are interested in f(X) - r
# since later conformity scores care about deviation of bounds from residuals
conformalization_lower_bounds = (
-    lower_qr.predict(conformalization_data_features) - conformalization_data[f"residuals_{estimand}"].values
+    lower_qr.predict(conformalization_data_features.values).flatten()
+    - conformalization_data[f"residuals_{estimand}"].values
)
-conformalization_upper_bounds = conformalization_data[f"residuals_{estimand}"].values - upper_qr.predict(
-    conformalization_data_features
+conformalization_upper_bounds = (
+    conformalization_data[f"residuals_{estimand}"].values
+    - upper_qr.predict(conformalization_data_features.values).flatten()
)

# save conformalization bounds for later
@@ -185,8 +186,8 @@ def get_unit_prediction_interval_bounds(
# are the same across train_data, conformalization_data and nonreporting_units
nonreporting_units_features = interval_featurizer.generate_holdout_data(x_all[self.n_train :]) # noqa: E203

-nonreporting_lower_bounds = lower_qr.predict(nonreporting_units_features)
-nonreporting_upper_bounds = upper_qr.predict(nonreporting_units_features)
+nonreporting_lower_bounds = lower_qr.predict(nonreporting_units_features.values).flatten()
+nonreporting_upper_bounds = upper_qr.predict(nonreporting_units_features.values).flatten()

return PredictionIntervals(nonreporting_lower_bounds, nonreporting_upper_bounds, conformalization_data)

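Taken together, the changes in this file track the elex-solver 2.x API: no solver= constructor argument, taus= instead of tau_value=, no normalize_weights= on fit, and predictions flattened before use. A hedged sketch of that usage, inferred only from the calls in this diff (the import path and intercept handling are assumptions):

```python
import numpy as np
from elexsolver.QuantileRegressionSolver import QuantileRegressionSolver  # assumed import path

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 3))
y = X @ np.array([1.0, -2.0, 0.5]) + rng.normal(size=100)

qr = QuantileRegressionSolver()  # no solver="ECOS" argument anymore
qr.fit(X, y, taus=0.5, weights=np.ones(100), lambda_=0, fit_intercept=True)

# predict output is flattened before downstream use, as in get_unit_predictions
preds = qr.predict(X).flatten()
print(preds.shape, qr.coefficients)
```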
14 changes: 7 additions & 7 deletions tests/conftest.py
@@ -66,43 +66,43 @@ def conformal_election_model():
return ConformalElectionModel.ConformalElectionModel(model_settings)


@pytest.fixture(scope="session")
@pytest.fixture(scope="function")
def va_config(get_fixture):
path = os.path.join("config", "2017-11-07_VA_G.json")
return get_fixture(path, load=True, pandas=False)


@pytest.fixture(scope="session")
@pytest.fixture(scope="function")
def tx_primary_governor_config(get_fixture):
path = os.path.join("config", "2018-03-06_TX_R.json")
return get_fixture(path, load=True, pandas=False)


@pytest.fixture(scope="session")
@pytest.fixture(scope="function")
def va_governor_precinct_data(get_fixture):
path = os.path.join("data", "2017-11-07_VA_G", "G", "data_precinct.csv")
return get_fixture(path, load=False, pandas=True)


@pytest.fixture(scope="module")
@pytest.fixture(scope="function")
def va_governor_county_data(get_fixture):
path = os.path.join("data", "2017-11-07_VA_G", "G", "data_county.csv")
return get_fixture(path, load=False, pandas=True)


@pytest.fixture(scope="session")
@pytest.fixture(scope="function")
def va_assembly_county_data(get_fixture):
path = os.path.join("data", "2017-11-07_VA_G", "Y", "data_county-district.csv")
return get_fixture(path, load=False, pandas=True)


@pytest.fixture(scope="session")
@pytest.fixture(scope="function")
def va_assembly_precinct_data(get_fixture):
path = os.path.join("data", "2017-11-07_VA_G", "Y", "data_precinct-district.csv")
return get_fixture(path, load=False, pandas=True)


@pytest.fixture(scope="session")
@pytest.fixture(scope="function")
def az_assembly_precinct_data(get_fixture):
path = os.path.join("data", "2020-08-04_AZ_R", "S", "data_precinct.csv")
return get_fixture(path, load=False, pandas=True)
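All of these fixtures move from scope="session" (or "module") to scope="function", so every test now receives a fresh copy instead of a shared object; a plausible reason, though the diff doesn't say so, is that the new estimandizer steps mutate these DataFrames in place. A minimal sketch of the scope semantics:

```python
import pytest

@pytest.fixture(scope="function")
def counter():
    # rebuilt for every test that requests it
    return {"n": 0}

def test_a(counter):
    counter["n"] += 1
    assert counter["n"] == 1

def test_b(counter):
    # with scope="session" this would see test_a's mutation and fail
    counter["n"] += 1
    assert counter["n"] == 1
```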
(diffs for the remaining 5 changed files not loaded)