washingtonpost · dmnapolitano · Sep 14, 2023 · Sep 14, 2023 · Sep 15, 2023 · Sep 15, 2023
@@ -391,6 +391,23 @@ def _format_historical_current_data(
         """
         Formats data for historical model run
         """
+
+        """
+        What does the historical model client do?
+            - If we are running the 2024 election and 100 counties are reporting, we want to see what
+                our model error would have been in 2020 had only those same counties been reporting.
+            - To do that, we need to merge the 2020 results onto the 2024 reporting counties.
+
+            - When replaying 2020 from the CLI, this means: we have the 2020 data and pick 100 random
+              counties as reporting in the MockLiveDataHandler.
+            - In this function, we get the 2016 results and merge them onto those 100 reporting 2020 counties.
+
+
+        Example: running election id 2020-11-03_USA_G with --historical
+            -> historical election ids: 2016-11-08_USA_G, 2012, ...
+
+        """
+
         formatted_data = current_data[["postal_code", "geographic_unit_fips", "percent_expected_vote"]]
         print(f"Getting data for historical election: {historical_election_id}")
         preprocessed_data_handler = PreprocessedDataHandler(
@@ -401,6 +418,7 @@ def _format_historical_current_data(
             estimand_baselines,
             s3_client=s3.S3CsvUtil(TARGET_BUCKET),
             historical=True,
+            include_results_estimand=True,
         )
 
         results_to_return = [f"results_{estimand}" for estimand in estimands]

@@ -19,9 +19,7 @@ def __init__(
         self.estimands = estimands
 
         estimandizer = Estimandizer()
-        (current_data, _) = estimandizer.check_and_create_estimands(
-            current_data.copy(), self.estimands, False, current_data=True
-        )
+        (current_data, _) = estimandizer.check_and_create_estimands(current_data.copy(), self.estimands, False)
 
         # if we're running this for a past election, drop results columns from preprocessed data
         # so we use results_{estimand} numbers from current_data

@@ -14,22 +14,14 @@ class Estimandizer:
     Estimandizer. Generate estimands explicitly.
     """
 
-    def check_and_create_estimands(self, data_df, estimands, historical, current_data=False):
+    def check_and_create_estimands(self, data_df, estimands, historical):
         columns_to_return = []
-
         for estimand in estimands:
             results_col = f"{RESULTS_PREFIX}{estimand}"
-            baseline_col = f"{BASELINE_PREFIX}{estimand}"
-            target_col = results_col if current_data else baseline_col
-
-            if target_col not in data_df.columns:
-                if estimand in data_df.columns:
-                    data_df[target_col] = data_df[estimand].copy()
-                else:
-                    # will raise a KeyError if a function with the same name as `estimand` doesn't exist
-                    data_df = globals()[estimand](data_df)
-                if target_col == baseline_col:
-                    data_df[results_col] = data_df[baseline_col].copy()
+
+            if results_col not in data_df.columns:
+                # will raise a KeyError if a function with the same name as `estimand` doesn't exist
+                data_df = globals()[estimand](data_df, RESULTS_PREFIX)
 
             if historical:
                 data_df[results_col] = nan
@@ -45,44 +37,45 @@ def check_and_create_estimands(self, data_df, estimands, historical, current_dat
         if not historical and len(results_column_names) == 0:
             raise EstimandException("This is not a test election, it is missing results data")
 
-        return (data_df, columns_to_return)
+        return data_df, columns_to_return
 
-    def add_estimand_baselines(self, data_df, estimand_baselines, historical):
+    def add_estimand_baselines(self, data_df, estimand_baselines, historical, include_results_estimand=False):
         # if we are in a historical election we are only reading preprocessed data to get
         # the historical election results of the currently reporting units.
         # so we don't care about the total voters or the baseline election.
 
         for estimand, pointer in estimand_baselines.items():
             if pointer is None:
-                # should only happen when we're going to create a new estimand
+                # no baseline pointer given (we are creating a new estimand): use the estimand's own name
                 pointer = estimand
 
             baseline_col = f"{BASELINE_PREFIX}{pointer}"
 
             if baseline_col not in data_df.columns:
-                # will raise a KeyError if a function with the same name as `pointer` doesn't exist
-                data_df = globals()[pointer](data_df)
-                results_col = f"{RESULTS_PREFIX}{estimand}"
-                data_df[results_col] = data_df[baseline_col].copy()
+                data_df = globals()[pointer](data_df, BASELINE_PREFIX)  # KeyError if no function is named after pointer
 
             if not historical:
-                # Adding one to prevent zero divison
                 data_df[f"last_election_results_{estimand}"] = data_df[baseline_col].copy() + 1
 
+        if include_results_estimand:
+            data_df, ___ = self.check_and_create_estimands(data_df, estimand_baselines.keys(), historical)
+
         return data_df
 
 
 # custom estimands
 
 
-def party_vote_share_dem(data_df):
-    # should only happen when we're replaying an election
-    if f"{BASELINE_PREFIX}dem" not in data_df.columns and f"{BASELINE_PREFIX}turnout" not in data_df.columns:
-        data_df[f"{RESULTS_PREFIX}party_vote_share_dem"] = (
-            data_df[f"{RESULTS_PREFIX}dem"] / data_df[f"{RESULTS_PREFIX}turnout"]
-        )
+def party_vote_share_dem(data_df, col_prefix):
+    if f"{col_prefix}dem" in data_df.columns and f"{col_prefix}turnout" in data_df.columns:
+        numer = f"{col_prefix}dem"
+        denom = f"{col_prefix}turnout"
     else:
-        data_df[f"{BASELINE_PREFIX}party_vote_share_dem"] = (
-            data_df[f"{BASELINE_PREFIX}dem"] / data_df[f"{BASELINE_PREFIX}turnout"]
-        )
+        numer = "dem"
+        denom = "total"
+
+    data_df[f"{col_prefix}party_vote_share_dem"] = data_df.apply(
+        lambda x: 0 if x[numer] == 0 or x[denom] == 0 else x[numer] / x[denom], axis=1
+    )
+
     return data_df
@@ -25,6 +25,7 @@ def __init__(
         s3_client=None,
         historical=False,
         data=None,
+        include_results_estimand=False,
     ):
         """
         Initialize preprocessed data. If not present, download from s3.
@@ -36,6 +37,7 @@ def __init__(
         self.s3_client = s3_client
         self.estimand_baselines = estimand_baselines
         self.historical = historical
+        self.include_results_estimand = include_results_estimand
         self.estimandizer = Estimandizer()
 
         self.local_file_path = self.get_preprocessed_data_path()
@@ -83,8 +85,14 @@ def load_data(self, preprocessed_data):
         Load preprocessed csv data as df
         """
         LOG.info("Loading preprocessed data: %s, %s, %s", self.election_id, self.office, self.geographic_unit_type)
+        data = self.estimandizer.add_estimand_baselines(
+            preprocessed_data,
+            self.estimand_baselines,
+            self.historical,
+            include_results_estimand=self.include_results_estimand,
+        )
 
-        return self.estimandizer.add_estimand_baselines(preprocessed_data, self.estimand_baselines, self.historical)
+        return data
 
     def save_data(self, preprocessed_data):
         if not Path(self.local_file_path).parent.exists():

@@ -12,7 +12,6 @@ def test_check_and_create_estimands_not_historical(va_governor_county_data):
     estimandizer = Estimandizer()
     (output_df, result_columns) = estimandizer.check_and_create_estimands(va_data_copy, estimands, False)
 
-    assert "baseline_party_vote_share_dem" in output_df.columns
     assert "results_party_vote_share_dem" in output_df.columns
     assert result_columns == ["results_party_vote_share_dem"]
 
@@ -27,24 +26,25 @@ def test_check_and_create_estimands_historical(va_governor_county_data):
     estimandizer = Estimandizer()
     (output_df, result_columns) = estimandizer.check_and_create_estimands(va_data_copy, estimands, True)
 
-    assert "baseline_party_vote_share_dem" in output_df.columns
     assert "results_party_vote_share_dem" in output_df.columns
     assert result_columns == ["results_party_vote_share_dem"]
+    assert output_df["results_party_vote_share_dem"].isnull().all()
 
 
 def test_add_estimand_baselines_not_historical(va_governor_county_data):
     estimand_baselines = {"turnout": "turnout", "party_vote_share_dem": "party_vote_share_dem"}
     estimandizer = Estimandizer()
     output_df = estimandizer.add_estimand_baselines(va_governor_county_data.copy(), estimand_baselines, False)
     assert "baseline_party_vote_share_dem" in output_df.columns
-    assert "results_party_vote_share_dem" in output_df.columns
     assert "last_election_results_party_vote_share_dem" in output_df.columns
 
 
 def test_add_estimand_baselines_historical(va_governor_county_data):
     estimand_baselines = {"turnout": "turnout", "party_vote_share_dem": "party_vote_share_dem"}
     estimandizer = Estimandizer()
-    output_df = estimandizer.add_estimand_baselines(va_governor_county_data.copy(), estimand_baselines, True)
+    output_df = estimandizer.add_estimand_baselines(
+        va_governor_county_data.copy(), estimand_baselines, True, include_results_estimand=True
+    )
     assert "baseline_party_vote_share_dem" in output_df.columns
     assert "results_party_vote_share_dem" in output_df.columns
     assert "last_election_results_party_vote_share_dem" not in output_df.columns