Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ELEX-2763: Estimandizer update #73

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions src/elexmodel/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -391,6 +391,23 @@ def _format_historical_current_data(
"""
Formats data for historical model run
"""

"""
dmnapolitano marked this conversation as resolved.
Show resolved Hide resolved
What does the historical model client do?
- If we are running the election in 2024 and 100 counties are reporting, we want to see what
our model error would have been in 2020 with these counties reporting
- To do that we need to merge the 2020 results onto the 2024 reporting counties

- So for a 2020 run (via the cli), this means we have the 2020 data and we pick 100 random reporting counties
in the MockLiveDataHandler
- In this function we get the 2016 results and merge them onto the 100 counties reporting in 2020


running election id: 2020-11-03_USA_G --historical
-> historical election id: 2016-11-08_USA_G, 2012, ...

"""

formatted_data = current_data[["postal_code", "geographic_unit_fips", "percent_expected_vote"]]
print(f"Getting data for historical election: {historical_election_id}")
preprocessed_data_handler = PreprocessedDataHandler(
Expand All @@ -401,6 +418,7 @@ def _format_historical_current_data(
estimand_baselines,
s3_client=s3.S3CsvUtil(TARGET_BUCKET),
historical=True,
include_results_estimand=True,
)

results_to_return = [f"results_{estimand}" for estimand in estimands]
Expand Down
4 changes: 1 addition & 3 deletions src/elexmodel/handlers/data/CombinedData.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,7 @@ def __init__(
self.estimands = estimands

estimandizer = Estimandizer()
(current_data, _) = estimandizer.check_and_create_estimands(
current_data.copy(), self.estimands, False, current_data=True
)
(current_data, _) = estimandizer.check_and_create_estimands(current_data.copy(), self.estimands, False)

# if we're running this for a past election, drop results columns from preprocessed data
# so we use results_{estimand} numbers from current_data
Expand Down
53 changes: 23 additions & 30 deletions src/elexmodel/handlers/data/Estimandizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,22 +14,14 @@ class Estimandizer:
Estimandizer. Generate estimands explicitly.
"""

def check_and_create_estimands(self, data_df, estimands, historical, current_data=False):
def check_and_create_estimands(self, data_df, estimands, historical):
dmnapolitano marked this conversation as resolved.
Show resolved Hide resolved
columns_to_return = []

for estimand in estimands:
results_col = f"{RESULTS_PREFIX}{estimand}"
baseline_col = f"{BASELINE_PREFIX}{estimand}"
target_col = results_col if current_data else baseline_col

if target_col not in data_df.columns:
if estimand in data_df.columns:
data_df[target_col] = data_df[estimand].copy()
else:
# will raise a KeyError if a function with the same name as `estimand` doesn't exist
data_df = globals()[estimand](data_df)
if target_col == baseline_col:
data_df[results_col] = data_df[baseline_col].copy()

if results_col not in data_df.columns:
# will raise a KeyError if a function with the same name as `estimand` doesn't exist
data_df = globals()[estimand](data_df, RESULTS_PREFIX)

if historical:
dmnapolitano marked this conversation as resolved.
Show resolved Hide resolved
data_df[results_col] = nan
Expand All @@ -45,44 +37,45 @@ def check_and_create_estimands(self, data_df, estimands, historical, current_dat
if not historical and len(results_column_names) == 0:
raise EstimandException("This is not a test election, it is missing results data")

return (data_df, columns_to_return)
return data_df, columns_to_return

def add_estimand_baselines(self, data_df, estimand_baselines, historical):
def add_estimand_baselines(self, data_df, estimand_baselines, historical, include_results_estimand=False):
# if we are in a historical election we are only reading preprocessed data to get
# the historical election results of the currently reporting units.
# so we don't care about the total voters or the baseline election.

for estimand, pointer in estimand_baselines.items():
if pointer is None:
# should only happen when we're going to create a new estimand
# when we are creating a new estimand
pointer = estimand

baseline_col = f"{BASELINE_PREFIX}{pointer}"

if baseline_col not in data_df.columns:
# will raise a KeyError if a function with the same name as `pointer` doesn't exist
data_df = globals()[pointer](data_df)
results_col = f"{RESULTS_PREFIX}{estimand}"
data_df[results_col] = data_df[baseline_col].copy()
data_df = globals()[estimand](data_df, BASELINE_PREFIX)

if not historical:
# Adding one to prevent zero division
data_df[f"last_election_results_{estimand}"] = data_df[baseline_col].copy() + 1

if include_results_estimand:
data_df, ___ = self.check_and_create_estimands(data_df, estimand_baselines.keys(), historical)

return data_df


# custom estimands


def party_vote_share_dem(data_df):
# should only happen when we're replaying an election
if f"{BASELINE_PREFIX}dem" not in data_df.columns and f"{BASELINE_PREFIX}turnout" not in data_df.columns:
data_df[f"{RESULTS_PREFIX}party_vote_share_dem"] = (
data_df[f"{RESULTS_PREFIX}dem"] / data_df[f"{RESULTS_PREFIX}turnout"]
)
def party_vote_share_dem(data_df, col_prefix):
if f"{col_prefix}dem" in data_df.columns and f"{col_prefix}turnout" in data_df.columns:
dmnapolitano marked this conversation as resolved.
Show resolved Hide resolved
numer = f"{col_prefix}dem"
denom = f"{col_prefix}turnout"
else:
data_df[f"{BASELINE_PREFIX}party_vote_share_dem"] = (
data_df[f"{BASELINE_PREFIX}dem"] / data_df[f"{BASELINE_PREFIX}turnout"]
)
numer = "dem"
denom = "total"

data_df[f"{col_prefix}party_vote_share_dem"] = data_df.apply(
lambda x: 0 if x[numer] == 0 or x[denom] == 0 else x[numer] / x[denom], axis=1
)

return data_df
10 changes: 9 additions & 1 deletion src/elexmodel/handlers/data/PreprocessedData.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ def __init__(
s3_client=None,
historical=False,
data=None,
include_results_estimand=False,
):
"""
Initialize preprocessed data. If not present, download from s3.
Expand All @@ -36,6 +37,7 @@ def __init__(
self.s3_client = s3_client
self.estimand_baselines = estimand_baselines
self.historical = historical
self.include_results_estimand = include_results_estimand
self.estimandizer = Estimandizer()

self.local_file_path = self.get_preprocessed_data_path()
Expand Down Expand Up @@ -83,8 +85,14 @@ def load_data(self, preprocessed_data):
Load preprocessed csv data as df
"""
LOG.info("Loading preprocessed data: %s, %s, %s", self.election_id, self.office, self.geographic_unit_type)
data = self.estimandizer.add_estimand_baselines(
preprocessed_data,
self.estimand_baselines,
self.historical,
include_results_estimand=self.include_results_estimand,
)

return self.estimandizer.add_estimand_baselines(preprocessed_data, self.estimand_baselines, self.historical)
return data

def save_data(self, preprocessed_data):
if not Path(self.local_file_path).parent.exists():
Expand Down
8 changes: 4 additions & 4 deletions tests/handlers/test_estimandizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ def test_check_and_create_estimands_not_historical(va_governor_county_data):
estimandizer = Estimandizer()
(output_df, result_columns) = estimandizer.check_and_create_estimands(va_data_copy, estimands, False)

assert "baseline_party_vote_share_dem" in output_df.columns
assert "results_party_vote_share_dem" in output_df.columns
assert result_columns == ["results_party_vote_share_dem"]

Expand All @@ -27,24 +26,25 @@ def test_check_and_create_estimands_historical(va_governor_county_data):
estimandizer = Estimandizer()
(output_df, result_columns) = estimandizer.check_and_create_estimands(va_data_copy, estimands, True)

assert "baseline_party_vote_share_dem" in output_df.columns
assert "results_party_vote_share_dem" in output_df.columns
assert result_columns == ["results_party_vote_share_dem"]
assert output_df["results_party_vote_share_dem"].isnull().all()
dmnapolitano marked this conversation as resolved.
Show resolved Hide resolved


def test_add_estimand_baselines_not_historical(va_governor_county_data):
estimand_baselines = {"turnout": "turnout", "party_vote_share_dem": "party_vote_share_dem"}
estimandizer = Estimandizer()
output_df = estimandizer.add_estimand_baselines(va_governor_county_data.copy(), estimand_baselines, False)
assert "baseline_party_vote_share_dem" in output_df.columns
assert "results_party_vote_share_dem" in output_df.columns
assert "last_election_results_party_vote_share_dem" in output_df.columns


def test_add_estimand_baselines_historical(va_governor_county_data):
estimand_baselines = {"turnout": "turnout", "party_vote_share_dem": "party_vote_share_dem"}
estimandizer = Estimandizer()
output_df = estimandizer.add_estimand_baselines(va_governor_county_data.copy(), estimand_baselines, True)
output_df = estimandizer.add_estimand_baselines(
va_governor_county_data.copy(), estimand_baselines, True, include_results_estimand=True
)
assert "baseline_party_vote_share_dem" in output_df.columns
assert "results_party_vote_share_dem" in output_df.columns
assert "last_election_results_party_vote_share_dem" not in output_df.columns
Loading