Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Elex 2771 new ols qr #77

Merged
merged 14 commits into from
Sep 25, 2023
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from setuptools import find_packages, setup

INSTALL_REQUIRES = ("click<8.1", "elex-solver<2", "pandas<1.5.0", "boto3<2", "python-dotenv==0.19.2", "scipy==1.10.1")
INSTALL_REQUIRES = ("click<8.1", "elex-solver<3", "pandas<1.5.0", "boto3<2", "python-dotenv==0.19.2", "scipy==1.10.1")

THIS_FILE_DIR = os.path.dirname(__file__)

Expand Down
4 changes: 2 additions & 2 deletions src/elexmodel/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -416,8 +416,8 @@ def _format_historical_current_data(
historical=True,
include_results_estimand=True,
)

results_to_return = [f"results_{estimand}" for estimand in estimands]
# we always want to pass turnout so that we can generate results weights
results_to_return = list(set([f"results_{estimand}" for estimand in estimands] + ["results_turnout"]))
geo_columns = set(["geographic_unit_fips", "postal_code"] + [a for a in self.aggregates if a != "unit"])
preprocessed_data = preprocessed_data_handler.data[list(geo_columns) + results_to_return].copy()
historical_current_data = preprocessed_data.merge(formatted_data, on=["postal_code", "geographic_unit_fips"])
Expand Down
28 changes: 23 additions & 5 deletions src/elexmodel/handlers/data/CombinedData.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ def __init__(
self.current_data = current_data
self.geographic_unit_type = geographic_unit_type
data = preprocessed_data.merge(current_data, how="left", on=["postal_code", "geographic_unit_fips"])
data = estimandizer.add_turnout_factor(data)
# if unreporting is 'drop' then drop units that are not reporting (ie. units where results are na)
# this is necessary if units will not be returning results in this election,
# but we didn't know that (ie. townships)
Expand All @@ -37,21 +38,31 @@ def __init__(
# Drop the whole row if an estimand is not reporting
data = data.dropna(axis=0, how="any", subset=result_cols)
# if unreporting is 'zero' then we set the votes for non-reporting units to zero
# this is necessary if we are worried that there is no zero state for units (ie. some precincts)
# this is necessary if we are worried that there is no zero state for units (ie. some precinct states)
elif handle_unreporting == "zero":
indices_with_null_val = data[result_cols].isna().any(axis=1)
data.update(data[result_cols].fillna(value=0))
data.loc[indices_with_null_val, "percent_expected_vote"] = 0

self.data = data

def get_reporting_units(self, percent_reporting_threshold, features_to_normalize=[], add_intercept=True):
def get_reporting_units(
self,
percent_reporting_threshold,
turnout_factor_lower=0.5,
turnout_factor_upper=1.5,
features_to_normalize=[],
add_intercept=True,
):
"""
Get reporting data. These are units where the expected vote is greater than the percent reporting threshold.
"""
reporting_units = self.data[self.data.percent_expected_vote >= percent_reporting_threshold].reset_index(
drop=True
)
# if turnout factor less than 0.5 or greater than 1.5 assume AP made a mistake and don't treat those as reporting units
reporting_units = reporting_units[reporting_units.turnout_factor > turnout_factor_lower]
reporting_units = reporting_units[reporting_units.turnout_factor < turnout_factor_upper]

# residualize + normalize
for estimand in self.estimands:
Expand All @@ -64,16 +75,23 @@ def get_reporting_units(self, percent_reporting_threshold, features_to_normalize

return reporting_units

def get_nonreporting_units(self, percent_reporting_threshold, features_to_normalize=[], add_intercept=True):
def get_nonreporting_units(
self,
percent_reporting_threshold,
turnout_factor_lower=0.5,
turnout_factor_upper=1.5,
features_to_normalize=[],
add_intercept=True,
):
"""
Get nonreporting data. These are units where expected vote is less than the percent reporting threshold
"""
# if turnout factor <= turnout_factor_lower or >= turnout_factor_upper assume the AP made a mistake and treat them as non-reporting units
nonreporting_units = self.data.query(
"percent_expected_vote < @percent_reporting_threshold"
"(percent_expected_vote < @percent_reporting_threshold) | (turnout_factor <= @turnout_factor_lower) | (turnout_factor >= @turnout_factor_upper)" #
).reset_index( # not checking if results.isnull() anymore across multiple estimands
drop=True
)

nonreporting_units["reporting"] = int(0)
nonreporting_units["expected"] = True

Expand Down
48 changes: 36 additions & 12 deletions src/elexmodel/handlers/data/Estimandizer.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from numpy import nan
import numpy as np

RESULTS_PREFIX = "results_"
BASELINE_PREFIX = "baseline_"
Expand All @@ -11,12 +11,15 @@ class Estimandizer:

def add_estimand_results(self, data_df, estimands, historical):
columns_to_return = []
turnout_col = f"{RESULTS_PREFIX}turnout"

for estimand in estimands:
results_col = f"{RESULTS_PREFIX}{estimand}"
additional_columns_added = []
if results_col not in data_df.columns:
# will raise a KeyError if a function with the same name as `estimand` doesn't exist
try:
data_df = globals()[estimand](data_df, RESULTS_PREFIX)
data_df, additional_columns_added = globals()[estimand](data_df, RESULTS_PREFIX)
dmnapolitano marked this conversation as resolved.
Show resolved Hide resolved
except KeyError as e:
if historical:
# A historical run is one where we pull in data from a past election
Expand All @@ -27,12 +30,21 @@ def add_estimand_results(self, data_df, estimands, historical):
# and that data handler expects a results_ column for every estimand specified.
# Hence, this is the only special case in which we'd want to add
# an empty results_ column.
data_df[results_col] = nan
data_df[results_col] = np.nan
data_df[turnout_col] = np.nan
else:
# If this is not a historical run, then this is a live election
# so we are expecting that there will be actual results data
raise e
columns_to_return.append(results_col)

columns_to_return.extend([results_col] + additional_columns_added)

# always adding turnout since we will want to generate weights
# but if turnout is the estimand, then we only want to add it once
if turnout_col not in columns_to_return:
columns_to_return.append(turnout_col)

data_df = self.add_weights(data_df, RESULTS_PREFIX)

return data_df, columns_to_return

Expand All @@ -49,7 +61,7 @@ def add_estimand_baselines(self, data_df, estimand_baselines, historical, includ
baseline_col = f"{BASELINE_PREFIX}{pointer}"

if baseline_col not in data_df.columns:
data_df = globals()[estimand](data_df, BASELINE_PREFIX)
data_df, __ = globals()[estimand](data_df, BASELINE_PREFIX)

if not historical:
data_df[f"last_election_results_{estimand}"] = data_df[baseline_col].copy() + 1
Expand All @@ -62,18 +74,30 @@ def add_estimand_baselines(self, data_df, estimand_baselines, historical, includ
# we need to add the results from the historical election as well.
data_df, ___ = self.add_estimand_results(data_df, estimand_baselines.keys(), historical)

data_df = self.add_weights(data_df, BASELINE_PREFIX)

return data_df

def add_weights(self, data_df, col_prefix):
data_df[f"{col_prefix}weights"] = data_df[f"{col_prefix}turnout"]
return data_df

def add_turnout_factor(self, data_df):
# posinf and neginf are also set to zero because dividing by zero can lead to nan/posinf/neginf depending
# on the type of the numeric in the numpy array. Assume that if baseline_weights is zero then turnout
# would be incredibly low in this election too (ie. this is effectively an empty precinct) and so setting
# the turnout factor to zero is fine
data_df["turnout_factor"] = np.nan_to_num(
dmnapolitano marked this conversation as resolved.
Show resolved Hide resolved
data_df.results_weights / data_df.baseline_weights, nan=0, posinf=0, neginf=0
)
return data_df


# custom estimands


def party_vote_share_dem(data_df, col_prefix):
numer = f"{col_prefix}dem"
denom = f"{col_prefix}turnout"

data_df[f"{col_prefix}party_vote_share_dem"] = data_df.apply(
lambda x: 0 if x[numer] == 0 or x[denom] == 0 else x[numer] / x[denom], axis=1
data_df[f"{col_prefix}party_vote_share_dem"] = np.nan_to_num(
data_df[f"{col_prefix}dem"] / data_df[f"{col_prefix}turnout"]
)

return data_df
return data_df, []
29 changes: 15 additions & 14 deletions src/elexmodel/models/ConformalElectionModel.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
class ConformalElectionModel(BaseElectionModel.BaseElectionModel, ABC):
def __init__(self, model_settings: dict):
super(ConformalElectionModel, self).__init__(model_settings)
self.qr = QuantileRegressionSolver(solver="ECOS")
self.lambda_ = model_settings.get("lambda_", 0)

@classmethod
Expand Down Expand Up @@ -56,11 +55,10 @@ def fit_model(
model.fit(
X,
y,
tau_value=tau,
taus=tau,
weights=weights,
lambda_=self.lambda_,
fit_intercept=self.add_intercept,
normalize_weights=normalize_weights,
)
except (UserWarning, cvxpy.error.SolverError):
LOG.warning("Warning: solution was inaccurate or solver broke. Re-running with normalize_weights=False.")
Expand Down Expand Up @@ -88,10 +86,11 @@ def get_unit_predictions(
x_all[self.n_train : self.n_train + n_test] # noqa: E203
)

self.fit_model(self.qr, reporting_units_features, reporting_units_residuals, 0.5, weights, True)
self.features_to_coefficients = dict(zip(featurizer.complete_features, self.qr.coefficients))
qr = QuantileRegressionSolver()
self.fit_model(qr, reporting_units_features, reporting_units_residuals, 0.5, weights, True)
self.features_to_coefficients = dict(zip(featurizer.complete_features, qr.coefficients))

preds = self.qr.predict(nonreporting_units_features)
preds = qr.predict(nonreporting_units_features.values).flatten()

# multiply by total voters to get unnormalized residuals
preds = preds * nonreporting_units[f"last_election_results_{estimand}"]
Expand Down Expand Up @@ -150,15 +149,15 @@ def get_unit_prediction_interval_bounds(
train_data_weights = train_data[f"last_election_results_{estimand}"]

# fit lower and upper model to training data. ECOS solver is better than SCS.
lower_qr = QuantileRegressionSolver(solver="ECOS")
lower_qr = QuantileRegressionSolver()
self.fit_model(lower_qr, train_data_features, train_data_residuals, lower_bound, train_data_weights, True)

upper_qr = QuantileRegressionSolver(solver="ECOS")
upper_qr = QuantileRegressionSolver()
self.fit_model(upper_qr, train_data_features, train_data_residuals, upper_bound, train_data_weights, True)

# apply to conformalization data. Conformalization bounds will later tell us how much to adjust lower/upper
# bounds for nonreporting data.
conformalization_data = reporting_units_shuffled[train_rows:]
conformalization_data = reporting_units_shuffled[train_rows:].reset_index(drop=True)

# all_data starts with reporting_units_shuffled, so the rows between train_rows and n_train are the
# conformalization set
Expand All @@ -169,10 +168,12 @@ def get_unit_prediction_interval_bounds(
# we are interested in f(X) - r
# since later conformity scores care about deviation of bounds from residuals
conformalization_lower_bounds = (
lower_qr.predict(conformalization_data_features) - conformalization_data[f"residuals_{estimand}"].values
lower_qr.predict(conformalization_data_features.values).flatten()
- conformalization_data[f"residuals_{estimand}"].values
)
conformalization_upper_bounds = conformalization_data[f"residuals_{estimand}"].values - upper_qr.predict(
conformalization_data_features
conformalization_upper_bounds = (
conformalization_data[f"residuals_{estimand}"].values
- upper_qr.predict(conformalization_data_features.values).flatten()
)

# save conformalization bounds for later
Expand All @@ -185,8 +186,8 @@ def get_unit_prediction_interval_bounds(
# are the same accross train_data, conformalization_data and nonreporting_units
nonreporting_units_features = interval_featurizer.generate_holdout_data(x_all[self.n_train :]) # noqa: E203

nonreporting_lower_bounds = lower_qr.predict(nonreporting_units_features)
nonreporting_upper_bounds = upper_qr.predict(nonreporting_units_features)
nonreporting_lower_bounds = lower_qr.predict(nonreporting_units_features.values).flatten()
nonreporting_upper_bounds = upper_qr.predict(nonreporting_units_features.values).flatten()
dmnapolitano marked this conversation as resolved.
Show resolved Hide resolved

return PredictionIntervals(nonreporting_lower_bounds, nonreporting_upper_bounds, conformalization_data)

Expand Down
14 changes: 7 additions & 7 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,43 +66,43 @@ def conformal_election_model():
return ConformalElectionModel.ConformalElectionModel(model_settings)


@pytest.fixture(scope="session")
@pytest.fixture(scope="function")
def va_config(get_fixture):
path = os.path.join("config", "2017-11-07_VA_G.json")
return get_fixture(path, load=True, pandas=False)


@pytest.fixture(scope="session")
@pytest.fixture(scope="function")
def tx_primary_governor_config(get_fixture):
path = os.path.join("config", "2018-03-06_TX_R.json")
return get_fixture(path, load=True, pandas=False)


@pytest.fixture(scope="session")
@pytest.fixture(scope="function")
def va_governor_precinct_data(get_fixture):
path = os.path.join("data", "2017-11-07_VA_G", "G", "data_precinct.csv")
return get_fixture(path, load=False, pandas=True)


@pytest.fixture(scope="module")
@pytest.fixture(scope="function")
def va_governor_county_data(get_fixture):
path = os.path.join("data", "2017-11-07_VA_G", "G", "data_county.csv")
return get_fixture(path, load=False, pandas=True)


@pytest.fixture(scope="session")
@pytest.fixture(scope="function")
def va_assembly_county_data(get_fixture):
path = os.path.join("data", "2017-11-07_VA_G", "Y", "data_county-district.csv")
return get_fixture(path, load=False, pandas=True)


@pytest.fixture(scope="session")
@pytest.fixture(scope="function")
def va_assembly_precinct_data(get_fixture):
path = os.path.join("data", "2017-11-07_VA_G", "Y", "data_precinct-district.csv")
return get_fixture(path, load=False, pandas=True)


@pytest.fixture(scope="session")
@pytest.fixture(scope="function")
def az_assembly_precinct_data(get_fixture):
path = os.path.join("data", "2020-08-04_AZ_R", "S", "data_precinct.csv")
return get_fixture(path, load=False, pandas=True)
Expand Down
Loading