diff --git a/src/elexmodel/client.py b/src/elexmodel/client.py index 6194b7f0..be11a3fb 100644 --- a/src/elexmodel/client.py +++ b/src/elexmodel/client.py @@ -391,6 +391,23 @@ def _format_historical_current_data( """ Formats data for historical model run """ + + """ + What does the historical model client do? + - If we are running the election in 2024 and 100 counties are reporting, we want to see what + our model error would have been in 2020 with these counties reporting + - To do that we need to merge the 2020 results onto the 2024 reporting counties + + - So for 2020 (cli) this means -> we have 2020 data and we pick 100 random counties reporting + in the MockLiveDataHandler + - in this function we get the 2016 results and merge that to the 100 reporting counties in 2020 + + + running election id: 2020-11-03_USA_G --historical + -> historical election id: 2016-11-08_USA_G, 2012, ... + + """ + formatted_data = current_data[["postal_code", "geographic_unit_fips", "percent_expected_vote"]] print(f"Getting data for historical election: {historical_election_id}") preprocessed_data_handler = PreprocessedDataHandler( @@ -401,6 +418,7 @@ def _format_historical_current_data( estimand_baselines, s3_client=s3.S3CsvUtil(TARGET_BUCKET), historical=True, + include_results_estimand=True, ) results_to_return = [f"results_{estimand}" for estimand in estimands] diff --git a/src/elexmodel/handlers/data/CombinedData.py b/src/elexmodel/handlers/data/CombinedData.py index 951dff97..76fc2672 100644 --- a/src/elexmodel/handlers/data/CombinedData.py +++ b/src/elexmodel/handlers/data/CombinedData.py @@ -19,9 +19,7 @@ def __init__( self.estimands = estimands estimandizer = Estimandizer() - (current_data, _) = estimandizer.check_and_create_estimands( - current_data.copy(), self.estimands, False, current_data=True - ) + (current_data, _) = estimandizer.add_estimand_results(current_data.copy(), self.estimands, False) # if we're running this for a past election, drop results columns from 
preprocessed data # so we use results_{estimand} numbers from current_data diff --git a/src/elexmodel/handlers/data/Estimandizer.py b/src/elexmodel/handlers/data/Estimandizer.py index 30e6a00f..0724c53c 100644 --- a/src/elexmodel/handlers/data/Estimandizer.py +++ b/src/elexmodel/handlers/data/Estimandizer.py @@ -14,22 +14,14 @@ class Estimandizer: Estimandizer. Generate estimands explicitly. """ - def check_and_create_estimands(self, data_df, estimands, historical, current_data=False): + def add_estimand_results(self, data_df, estimands, historical): columns_to_return = [] - for estimand in estimands: results_col = f"{RESULTS_PREFIX}{estimand}" - baseline_col = f"{BASELINE_PREFIX}{estimand}" - target_col = results_col if current_data else baseline_col - - if target_col not in data_df.columns: - if estimand in data_df.columns: - data_df[target_col] = data_df[estimand].copy() - else: - # will raise a KeyError if a function with the same name as `estimand` doesn't exist - data_df = globals()[estimand](data_df) - if target_col == baseline_col: - data_df[results_col] = data_df[baseline_col].copy() + + if results_col not in data_df.columns: + # will raise a KeyError if a function with the same name as `estimand` doesn't exist + data_df = globals()[estimand](data_df, RESULTS_PREFIX) if historical: data_df[results_col] = nan @@ -45,44 +37,45 @@ def check_and_create_estimands(self, data_df, estimands, historical, current_dat if not historical and len(results_column_names) == 0: raise EstimandException("This is not a test election, it is missing results data") - return (data_df, columns_to_return) + return data_df, columns_to_return - def add_estimand_baselines(self, data_df, estimand_baselines, historical): + def add_estimand_baselines(self, data_df, estimand_baselines, historical, include_results_estimand=False): # if we are in a historical election we are only reading preprocessed data to get # the historical election results of the currently reporting units. 
# so we don't care about the total voters or the baseline election. for estimand, pointer in estimand_baselines.items(): if pointer is None: - # should only happen when we're going to create a new estimand + # when we are creating a new estimand pointer = estimand baseline_col = f"{BASELINE_PREFIX}{pointer}" if baseline_col not in data_df.columns: - # will raise a KeyError if a function with the same name as `pointer` doesn't exist - data_df = globals()[pointer](data_df) - results_col = f"{RESULTS_PREFIX}{estimand}" - data_df[results_col] = data_df[baseline_col].copy() + data_df = globals()[pointer](data_df, BASELINE_PREFIX) if not historical: - # Adding one to prevent zero divison data_df[f"last_election_results_{estimand}"] = data_df[baseline_col].copy() + 1 + if include_results_estimand: + data_df, _ = self.add_estimand_results(data_df, estimand_baselines.keys(), historical) + return data_df # custom estimands -def party_vote_share_dem(data_df): - # should only happen when we're replaying an election - if f"{BASELINE_PREFIX}dem" not in data_df.columns and f"{BASELINE_PREFIX}turnout" not in data_df.columns: - data_df[f"{RESULTS_PREFIX}party_vote_share_dem"] = ( - data_df[f"{RESULTS_PREFIX}dem"] / data_df[f"{RESULTS_PREFIX}turnout"] - ) +def party_vote_share_dem(data_df, col_prefix): + if f"{col_prefix}dem" in data_df.columns and f"{col_prefix}turnout" in data_df.columns: + numer = f"{col_prefix}dem" + denom = f"{col_prefix}turnout" else: - data_df[f"{BASELINE_PREFIX}party_vote_share_dem"] = ( - data_df[f"{BASELINE_PREFIX}dem"] / data_df[f"{BASELINE_PREFIX}turnout"] - ) + numer = "dem" + denom = "total" + + data_df[f"{col_prefix}party_vote_share_dem"] = data_df.apply( + lambda x: 0 if x[numer] == 0 or x[denom] == 0 else x[numer] / x[denom], axis=1 + ) + return data_df diff --git a/src/elexmodel/handlers/data/LiveData.py b/src/elexmodel/handlers/data/LiveData.py index c9c598f3..940f9ca7 100644 --- a/src/elexmodel/handlers/data/LiveData.py +++
b/src/elexmodel/handlers/data/LiveData.py @@ -84,7 +84,7 @@ def get_live_data_file_path(self): def load_data(self, data): columns_to_return = ["postal_code", "geographic_unit_fips"] - (data, more_columns) = self.estimandizer.check_and_create_estimands(data, self.estimands, self.historical) + (data, more_columns) = self.estimandizer.add_estimand_results(data, self.estimands, self.historical) columns_to_return += more_columns self.shuffle_dataframe = data[self.shuffle_columns].copy() diff --git a/src/elexmodel/handlers/data/PreprocessedData.py b/src/elexmodel/handlers/data/PreprocessedData.py index a2ed3902..d57639c3 100644 --- a/src/elexmodel/handlers/data/PreprocessedData.py +++ b/src/elexmodel/handlers/data/PreprocessedData.py @@ -25,6 +25,7 @@ def __init__( s3_client=None, historical=False, data=None, + include_results_estimand=False, ): """ Initialize preprocessed data. If not present, download from s3. @@ -36,6 +37,7 @@ def __init__( self.s3_client = s3_client self.estimand_baselines = estimand_baselines self.historical = historical + self.include_results_estimand = include_results_estimand self.estimandizer = Estimandizer() self.local_file_path = self.get_preprocessed_data_path() @@ -83,8 +85,14 @@ def load_data(self, preprocessed_data): Load preprocessed csv data as df """ LOG.info("Loading preprocessed data: %s, %s, %s", self.election_id, self.office, self.geographic_unit_type) + data = self.estimandizer.add_estimand_baselines( + preprocessed_data, + self.estimand_baselines, + self.historical, + include_results_estimand=self.include_results_estimand, + ) - return self.estimandizer.add_estimand_baselines(preprocessed_data, self.estimand_baselines, self.historical) + return data def save_data(self, preprocessed_data): if not Path(self.local_file_path).parent.exists(): diff --git a/tests/handlers/test_estimandizer.py b/tests/handlers/test_estimandizer.py index 84312e46..3d258715 100644 ---
a/tests/handlers/test_estimandizer.py +++ b/tests/handlers/test_estimandizer.py @@ -1,35 +1,34 @@ from elexmodel.handlers.data.Estimandizer import Estimandizer -def test_check_and_create_estimands_not_historical(va_governor_county_data): +def test_add_estimand_results_not_historical(va_governor_county_data): """ - Tests the check_and_create_estimands() method. + Tests the add_estimand_results() method. """ va_data_copy = va_governor_county_data.copy() estimands = ["party_vote_share_dem"] estimandizer = Estimandizer() - (output_df, result_columns) = estimandizer.check_and_create_estimands(va_data_copy, estimands, False) + (output_df, result_columns) = estimandizer.add_estimand_results(va_data_copy, estimands, False) - assert "baseline_party_vote_share_dem" in output_df.columns assert "results_party_vote_share_dem" in output_df.columns assert result_columns == ["results_party_vote_share_dem"] -def test_check_and_create_estimands_historical(va_governor_county_data): +def test_add_estimand_results_historical(va_governor_county_data): """ - Tests the check_and_create_estimands() method with historical elections. + Tests the add_estimand_results() method with historical elections. 
""" va_data_copy = va_governor_county_data.copy() estimands = ["party_vote_share_dem"] estimandizer = Estimandizer() - (output_df, result_columns) = estimandizer.check_and_create_estimands(va_data_copy, estimands, True) + (output_df, result_columns) = estimandizer.add_estimand_results(va_data_copy, estimands, True) - assert "baseline_party_vote_share_dem" in output_df.columns assert "results_party_vote_share_dem" in output_df.columns assert result_columns == ["results_party_vote_share_dem"] + assert output_df["results_party_vote_share_dem"].isnull().all() def test_add_estimand_baselines_not_historical(va_governor_county_data): @@ -37,14 +36,15 @@ def test_add_estimand_baselines_not_historical(va_governor_county_data): estimandizer = Estimandizer() output_df = estimandizer.add_estimand_baselines(va_governor_county_data.copy(), estimand_baselines, False) assert "baseline_party_vote_share_dem" in output_df.columns - assert "results_party_vote_share_dem" in output_df.columns assert "last_election_results_party_vote_share_dem" in output_df.columns def test_add_estimand_baselines_historical(va_governor_county_data): estimand_baselines = {"turnout": "turnout", "party_vote_share_dem": "party_vote_share_dem"} estimandizer = Estimandizer() - output_df = estimandizer.add_estimand_baselines(va_governor_county_data.copy(), estimand_baselines, True) + output_df = estimandizer.add_estimand_baselines( + va_governor_county_data.copy(), estimand_baselines, True, include_results_estimand=True + ) assert "baseline_party_vote_share_dem" in output_df.columns assert "results_party_vote_share_dem" in output_df.columns assert "last_election_results_party_vote_share_dem" not in output_df.columns