From 44e6b90973f3a49cb651bbe66525a8edec280e09 Mon Sep 17 00:00:00 2001
From: lbvienna <lenny.bronner@gmail.com>
Date: Thu, 21 Sep 2023 18:36:20 -0400
Subject: [PATCH 01/14] add turnout weights and turnout-factor filtering to
 combined data handler and estimandizer; update conformal election model to
 use new elex-solver

---
 src/elexmodel/handlers/data/CombinedData.py   | 28 +++++++++++++---
 src/elexmodel/handlers/data/Estimandizer.py   | 33 +++++++++++++++----
 .../models/ConformalElectionModel.py          | 27 +++++++--------
 tests/handlers/test_combined_data.py          | 25 +++++++++-----
 tests/handlers/test_featurizer.py             |  9 +++--
 tests/test_client.py                          |  1 -
 6 files changed, 85 insertions(+), 38 deletions(-)

diff --git a/src/elexmodel/handlers/data/CombinedData.py b/src/elexmodel/handlers/data/CombinedData.py
index 76fc2672..d21f64a3 100644
--- a/src/elexmodel/handlers/data/CombinedData.py
+++ b/src/elexmodel/handlers/data/CombinedData.py
@@ -29,6 +29,7 @@ def __init__(
         self.current_data = current_data
         self.geographic_unit_type = geographic_unit_type
         data = preprocessed_data.merge(current_data, how="left", on=["postal_code", "geographic_unit_fips"])
+        data = estimandizer.add_turnout_factor(data)
         # if unreporting is 'drop' then drop units that are not reporting (ie. units where results are na)
         # this is necessary if units will not be returning results in this election,
         # but we didn't know that (ie. townships)
@@ -37,7 +38,7 @@ def __init__(
             # Drop the whole row if an estimand is not reporting
             data = data.dropna(axis=0, how="any", subset=result_cols)
         # if unreporting is 'zero' then we set the votes for non-reporting units to zero
-        # this is necessary if we are worried that there is no zero state for units (ie. some precincts)
+        # this is necessary if we are worried that there is no zero state for units (ie. some precinct states)
         elif handle_unreporting == "zero":
             indices_with_null_val = data[result_cols].isna().any(axis=1)
             data.update(data[result_cols].fillna(value=0))
@@ -45,13 +46,23 @@ def __init__(
 
         self.data = data
 
-    def get_reporting_units(self, percent_reporting_threshold, features_to_normalize=[], add_intercept=True):
+    def get_reporting_units(
+        self,
+        percent_reporting_threshold,
+        turnout_factor_lower=0.5,
+        turnout_factor_upper=1.5,
+        features_to_normalize=[],
+        add_intercept=True,
+    ):
         """
         Get reporting data. These are units where the expected vote is greater than the percent reporting threshold.
         """
         reporting_units = self.data[self.data.percent_expected_vote >= percent_reporting_threshold].reset_index(
             drop=True
         )
+        # if turnout factor is <= turnout_factor_lower or >= turnout_factor_upper assume the AP made a mistake and don't treat those as reporting units
+        reporting_units = reporting_units[reporting_units.turnout_factor > turnout_factor_lower]
+        reporting_units = reporting_units[reporting_units.turnout_factor < turnout_factor_upper]
 
         # residualize + normalize
         for estimand in self.estimands:
@@ -64,16 +75,23 @@ def get_reporting_units(self, percent_reporting_threshold, features_to_normalize
 
         return reporting_units
 
-    def get_nonreporting_units(self, percent_reporting_threshold, features_to_normalize=[], add_intercept=True):
+    def get_nonreporting_units(
+        self,
+        percent_reporting_threshold,
+        turnout_factor_lower=0.5,
+        turnout_factor_upper=1.5,
+        features_to_normalize=[],
+        add_intercept=True,
+    ):
         """
         Get nonreporting data. These are units where expected vote is less than the percent reporting threshold
         """
+        # if turnout factor <= turnout_factor_lower or >= turnout_factor_upper assume the AP made a mistake and treat them as non-reporting units
         nonreporting_units = self.data.query(
-            "percent_expected_vote < @percent_reporting_threshold"
+            "(percent_expected_vote < @percent_reporting_threshold) | (turnout_factor <= @turnout_factor_lower) | (turnout_factor >= @turnout_factor_upper)"  #
         ).reset_index(  # not checking if results.isnull() anymore across multiple estimands
             drop=True
         )
-
         nonreporting_units["reporting"] = int(0)
         nonreporting_units["expected"] = True
 
diff --git a/src/elexmodel/handlers/data/Estimandizer.py b/src/elexmodel/handlers/data/Estimandizer.py
index 64250a28..bff3e623 100644
--- a/src/elexmodel/handlers/data/Estimandizer.py
+++ b/src/elexmodel/handlers/data/Estimandizer.py
@@ -1,4 +1,4 @@
-from numpy import nan
+import numpy as np
 
 RESULTS_PREFIX = "results_"
 BASELINE_PREFIX = "baseline_"
@@ -11,12 +11,15 @@ class Estimandizer:
 
     def add_estimand_results(self, data_df, estimands, historical):
         columns_to_return = []
+        turnout_col = f"{RESULTS_PREFIX}turnout"
+
         for estimand in estimands:
             results_col = f"{RESULTS_PREFIX}{estimand}"
+            additional_columns_added = []
             if results_col not in data_df.columns:
                 # will raise a KeyError if a function with the same name as `estimand` doesn't exist
                 try:
-                    data_df = globals()[estimand](data_df, RESULTS_PREFIX)
+                    data_df, additional_columns_added = globals()[estimand](data_df, RESULTS_PREFIX)
                 except KeyError as e:
                     if historical:
                         # A historical run is one where we pull in data from a past election
@@ -27,12 +30,21 @@ def add_estimand_results(self, data_df, estimands, historical):
                         # and that data handler expects a results_ column for every estimand specified.
                         # Hence, this is the only special case in which we'd want to add
                         # an empty results_ column.
-                        data_df[results_col] = nan
+                        data_df[results_col] = np.nan
                     else:
                         # If this is not a historical run, then this is a live election
                         # so we are expecting that there will be actual results data
                         raise e
-            columns_to_return.append(results_col)
+
+
+            columns_to_return.extend([results_col] + additional_columns_added)
+
+        # always adding turnout since we will want to generate weights
+        # but if turnout is the estimand, then we only want to add it once
+        if turnout_col not in columns_to_return:
+            columns_to_return.append(turnout_col)
+        
+        data_df = self.add_weights(data_df, RESULTS_PREFIX)
 
         return data_df, columns_to_return
 
@@ -49,7 +61,7 @@ def add_estimand_baselines(self, data_df, estimand_baselines, historical, includ
             baseline_col = f"{BASELINE_PREFIX}{pointer}"
 
             if baseline_col not in data_df.columns:
-                data_df = globals()[estimand](data_df, BASELINE_PREFIX)
+                data_df, __ = globals()[estimand](data_df, BASELINE_PREFIX)
 
             if not historical:
                 data_df[f"last_election_results_{estimand}"] = data_df[baseline_col].copy() + 1
@@ -62,6 +74,15 @@ def add_estimand_baselines(self, data_df, estimand_baselines, historical, includ
             # we need to add the results from the historical election as well.
             data_df, ___ = self.add_estimand_results(data_df, estimand_baselines.keys(), historical)
 
+        data_df = self.add_weights(data_df, BASELINE_PREFIX)
+        return data_df
+
+    def add_weights(self, data_df, col_prefix):
+        data_df[f"{col_prefix}weights"] = data_df[f"{col_prefix}turnout"]
+        return data_df
+
+    def add_turnout_factor(self, data_df):
+        data_df["turnout_factor"] = np.nan_to_num(data_df.results_weights / data_df.baseline_weights)
         return data_df
 
 
@@ -76,4 +97,4 @@ def party_vote_share_dem(data_df, col_prefix):
         lambda x: 0 if x[numer] == 0 or x[denom] == 0 else x[numer] / x[denom], axis=1
     )
 
-    return data_df
+    return data_df, []
diff --git a/src/elexmodel/models/ConformalElectionModel.py b/src/elexmodel/models/ConformalElectionModel.py
index f0a81a7b..8c240c7e 100644
--- a/src/elexmodel/models/ConformalElectionModel.py
+++ b/src/elexmodel/models/ConformalElectionModel.py
@@ -22,7 +22,6 @@
 class ConformalElectionModel(BaseElectionModel.BaseElectionModel, ABC):
     def __init__(self, model_settings: dict):
         super(ConformalElectionModel, self).__init__(model_settings)
-        self.qr = QuantileRegressionSolver(solver="ECOS")
         self.lambda_ = model_settings.get("lambda_", 0)
 
     @classmethod
@@ -56,11 +55,10 @@ def fit_model(
             model.fit(
                 X,
                 y,
-                tau_value=tau,
+                taus=tau,
                 weights=weights,
                 lambda_=self.lambda_,
                 fit_intercept=self.add_intercept,
-                normalize_weights=normalize_weights,
             )
         except (UserWarning, cvxpy.error.SolverError):
             LOG.warning("Warning: solution was inaccurate or solver broke. Re-running with normalize_weights=False.")
@@ -88,10 +86,11 @@ def get_unit_predictions(
             x_all[self.n_train : self.n_train + n_test]  # noqa: E203
         )
 
-        self.fit_model(self.qr, reporting_units_features, reporting_units_residuals, 0.5, weights, True)
-        self.features_to_coefficients = dict(zip(featurizer.complete_features, self.qr.coefficients))
+        qr = QuantileRegressionSolver()
+        self.fit_model(qr, reporting_units_features, reporting_units_residuals, 0.5, weights, True)
+        self.features_to_coefficients = dict(zip(featurizer.complete_features, qr.coefficients))
 
-        preds = self.qr.predict(nonreporting_units_features)
+        preds = qr.predict(nonreporting_units_features.values).flatten()
 
         # multiply by total voters to get unnormalized residuals
         preds = preds * nonreporting_units[f"last_election_results_{estimand}"]
@@ -150,10 +149,10 @@ def get_unit_prediction_interval_bounds(
         train_data_weights = train_data[f"last_election_results_{estimand}"]
 
         # fit lower and upper model to training data. ECOS solver is better than SCS.
-        lower_qr = QuantileRegressionSolver(solver="ECOS")
+        lower_qr = QuantileRegressionSolver()
         self.fit_model(lower_qr, train_data_features, train_data_residuals, lower_bound, train_data_weights, True)
 
-        upper_qr = QuantileRegressionSolver(solver="ECOS")
+        upper_qr = QuantileRegressionSolver()
         self.fit_model(upper_qr, train_data_features, train_data_residuals, upper_bound, train_data_weights, True)
 
         # apply to conformalization data. Conformalization bounds will later tell us how much to adjust lower/upper
@@ -169,10 +168,12 @@ def get_unit_prediction_interval_bounds(
         # we are interested in f(X) - r
         # since later conformity scores care about deviation of bounds from residuals
         conformalization_lower_bounds = (
-            lower_qr.predict(conformalization_data_features) - conformalization_data[f"residuals_{estimand}"].values
+            lower_qr.predict(conformalization_data_features.values).flatten()
+            - conformalization_data[f"residuals_{estimand}"].values
         )
-        conformalization_upper_bounds = conformalization_data[f"residuals_{estimand}"].values - upper_qr.predict(
-            conformalization_data_features
+        conformalization_upper_bounds = (
+            conformalization_data[f"residuals_{estimand}"].values
+            - upper_qr.predict(conformalization_data_features.values).flatten()
         )
 
         # save conformalization bounds for later
@@ -185,8 +186,8 @@ def get_unit_prediction_interval_bounds(
         # are the same accross train_data, conformalization_data and nonreporting_units
         nonreporting_units_features = interval_featurizer.generate_holdout_data(x_all[self.n_train :])  # noqa: E203
 
-        nonreporting_lower_bounds = lower_qr.predict(nonreporting_units_features)
-        nonreporting_upper_bounds = upper_qr.predict(nonreporting_units_features)
+        nonreporting_lower_bounds = lower_qr.predict(nonreporting_units_features.values).flatten()
+        nonreporting_upper_bounds = upper_qr.predict(nonreporting_units_features.values).flatten()
 
         return PredictionIntervals(nonreporting_lower_bounds, nonreporting_upper_bounds, conformalization_data)
 
diff --git a/tests/handlers/test_combined_data.py b/tests/handlers/test_combined_data.py
index 784c491b..111932c2 100644
--- a/tests/handlers/test_combined_data.py
+++ b/tests/handlers/test_combined_data.py
@@ -7,16 +7,25 @@
 
 
 def test_load(va_governor_county_data):
+    election_id = "2017-11-07_VA_G"
+    office_id = "G"
+    geographic_unit_type = "county"
     estimands = ["turnout"]
+    estimand_baselines = {"turnout": "turnout"}
     live_data_handler = MockLiveDataHandler(
-        "2017-11-07_VA_G", "G", "county", estimands=["turnout"], data=va_governor_county_data
+        election_id, office_id, geographic_unit_type, estimands=["turnout"], data=va_governor_county_data
     )
     current_data = live_data_handler.data
+    preprocessed_data_handler = PreprocessedDataHandler(
+        election_id, office_id, geographic_unit_type, estimand_baselines=estimand_baselines, estimands=["turnout"], data=va_governor_county_data
+    )
+    preprocessed_data = preprocessed_data_handler.data
 
     combined_data_handler = CombinedDataHandler(
-        va_governor_county_data, current_data, estimands, "county", handle_unreporting="drop"
+        preprocessed_data, current_data, estimands, "county", handle_unreporting="drop"
     )
-    assert combined_data_handler.data.shape == (133, 29)
+
+    assert combined_data_handler.data.shape == (133, 33)
 
 
 def test_zero_unreporting_missing_single_estimand_value(va_governor_county_data):
@@ -37,7 +46,7 @@ def test_zero_unreporting_missing_single_estimand_value(va_governor_county_data)
     assert (
         combined_data_handler.data["percent_expected_vote"].iloc[0] == 0
     )  # percent expected vote with na result has been set to zero
-    assert combined_data_handler.data.shape == (133, 31)  # didn't drop any
+    assert combined_data_handler.data.shape == (133, 35)  # didn't drop any
     assert combined_data_handler.data["results_dem"].iloc[1] != 0  # didn't accidentally set other to zero
 
 
@@ -58,7 +67,7 @@ def test_zero_unreporting_missing_multiple_estimands_value(va_governor_county_da
     assert combined_data_handler.data["results_dem"].iloc[0] == 0.0
     assert combined_data_handler.data["results_turnout"].iloc[0] == 0.0
     assert combined_data_handler.data["percent_expected_vote"].iloc[0] == 0.0
-    assert combined_data_handler.data.shape == (133, 31)
+    assert combined_data_handler.data.shape == (133, 35)
     assert combined_data_handler.data["results_dem"].iloc[1] != 0  # didn't accidentally set other to zero
     assert combined_data_handler.data["results_turnout"].iloc[1] != 0  # didn't accidentally set other to zero
 
@@ -79,7 +88,7 @@ def test_zero_unreporting_missing_percent_expected_vote_value(va_governor_county
     )
     assert combined_data_handler.data["results_dem"].iloc[0] == 0.0
     assert combined_data_handler.data["percent_expected_vote"].iloc[0] == 0.0
-    assert combined_data_handler.data.shape == (133, 31)
+    assert combined_data_handler.data.shape == (133, 35)
     assert combined_data_handler.data["results_dem"].iloc[1] != 0  # didn't accidentally set other to zero
 
 
@@ -98,7 +107,7 @@ def test_zero_unreporting_random_percent_expected_vote_value(va_governor_county_
     )
     assert combined_data_handler.data["results_dem"].iloc[0] == 0.0  # all values set to 0.0
     assert combined_data_handler.data["percent_expected_vote"].iloc[0] == 0.0
-    assert combined_data_handler.data.shape == (133, 31)
+    assert combined_data_handler.data.shape == (133, 35)
     assert combined_data_handler.data["results_dem"].iloc[1] != 0  # didn't accidentally set other to zero
 
 
@@ -115,7 +124,7 @@ def test_drop_unreporting_missing_single_estimand_value(va_governor_county_data)
     combined_data_handler = CombinedDataHandler(
         va_governor_county_data, current_data, estimands, "county", handle_unreporting="drop"
     )
-    assert combined_data_handler.data.shape == (132, 31)  # dropped one
+    assert combined_data_handler.data.shape == (132, 35)  # dropped one
     assert combined_data_handler.data["results_dem"].iloc[0] != 0  # didn't accidentally set other to zero
 
 
diff --git a/tests/handlers/test_featurizer.py b/tests/handlers/test_featurizer.py
index 2b235195..64e1fe16 100644
--- a/tests/handlers/test_featurizer.py
+++ b/tests/handlers/test_featurizer.py
@@ -360,8 +360,7 @@ def test_generate_fixed_effects(va_governor_county_data):
     reporting_data_features = featurizer.filter_to_active_features(x_all[:n_train])
     nonreporting_data_features = featurizer.generate_holdout_data(x_all[n_train:])
 
-    assert combined_data_handler.data.shape == (133, 32)
-
+    assert combined_data_handler.data.shape == (133, 35)
     n_expected_columns = 6  # (6 - 1) fixed effects + 1 intercept
     assert reporting_data_features.shape == (133, n_expected_columns)
     assert nonreporting_data_features.shape == (0, n_expected_columns)
@@ -394,7 +393,7 @@ def test_generate_fixed_effects(va_governor_county_data):
     reporting_data_features = featurizer.filter_to_active_features(x_all[:n_train])
     nonreporting_data_features = featurizer.generate_holdout_data(x_all[n_train:])
 
-    assert combined_data_handler.data.shape == (133, 32)
+    assert combined_data_handler.data.shape == (133, 35)
 
     n_expected_columns = 138  # (6 - 1) + (133 - 1) fixed effects + 1 intercept
     assert reporting_data_features.shape == (133, n_expected_columns)
@@ -451,7 +450,7 @@ def test_generate_fixed_effects_not_all_reporting(va_governor_county_data):
     reporting_data_features = featurizer.filter_to_active_features(x_all[:n_train])
     nonreporting_data_features = featurizer.generate_holdout_data(x_all[n_train:])
 
-    assert combined_data_handler.data.shape == (133, 32)
+    assert combined_data_handler.data.shape == (133, 35)
 
     n_expected_columns = (n - 1) + 1  # minus 1 for dropped fixed effect, plus 1 for intercept
     assert reporting_data_features.shape == (n, n_expected_columns)
@@ -519,7 +518,7 @@ def test_generate_fixed_effects_mixed_reporting(va_governor_precinct_data):
     reporting_data_features = featurizer.filter_to_active_features(x_all[:n_train])
     nonreporting_data_features = featurizer.generate_holdout_data(x_all[n_train:])
 
-    assert combined_data_handler.data.shape == (2360, 32)
+    assert combined_data_handler.data.shape == (2360, 35)
 
     n_expected_columns = 7  # when n = 100 we get to county 51013 (minus dropped fixed effect, plus intercept)
     assert reporting_data_features.shape == (n_train, n_expected_columns)  # use n_train since dropping columns
diff --git a/tests/test_client.py b/tests/test_client.py
index 837d14b9..802d20b4 100644
--- a/tests/test_client.py
+++ b/tests/test_client.py
@@ -785,7 +785,6 @@ def test_estimandizer_input(model_client, va_governor_county_data, va_config):
 
     preprocessed_data = va_governor_county_data.copy()
     preprocessed_data["last_election_results_turnout"] = preprocessed_data["baseline_turnout"].copy() + 1
-
     try:
         model_client.get_estimates(
             data,

From 57a2d0d65d9f62210adec9c84570d8d86732b6db Mon Sep 17 00:00:00 2001
From: lbvienna <lenny.bronner@gmail.com>
Date: Fri, 22 Sep 2023 10:05:59 -0400
Subject: [PATCH 02/14] unit tests pass now

---
 src/elexmodel/handlers/data/Estimandizer.py   |   3 +-
 tests/conftest.py                             |  14 +--
 tests/handlers/test_combined_data.py          | 117 +++++++++++++++---
 tests/handlers/test_estimandizer.py           |   4 +-
 .../test_nonparametric_election_model.py      |   6 +-
 5 files changed, 114 insertions(+), 30 deletions(-)

diff --git a/src/elexmodel/handlers/data/Estimandizer.py b/src/elexmodel/handlers/data/Estimandizer.py
index bff3e623..1f5c43cb 100644
--- a/src/elexmodel/handlers/data/Estimandizer.py
+++ b/src/elexmodel/handlers/data/Estimandizer.py
@@ -36,14 +36,13 @@ def add_estimand_results(self, data_df, estimands, historical):
                         # so we are expecting that there will be actual results data
                         raise e
 
-
             columns_to_return.extend([results_col] + additional_columns_added)
 
         # always adding turnout since we will want to generate weights
         # but if turnout is the estimand, then we only want to add it once
         if turnout_col not in columns_to_return:
             columns_to_return.append(turnout_col)
-        
+
         data_df = self.add_weights(data_df, RESULTS_PREFIX)
 
         return data_df, columns_to_return
diff --git a/tests/conftest.py b/tests/conftest.py
index 1760a21b..871879e4 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -66,43 +66,43 @@ def conformal_election_model():
     return ConformalElectionModel.ConformalElectionModel(model_settings)
 
 
-@pytest.fixture(scope="session")
+@pytest.fixture(scope="function")
 def va_config(get_fixture):
     path = os.path.join("config", "2017-11-07_VA_G.json")
     return get_fixture(path, load=True, pandas=False)
 
 
-@pytest.fixture(scope="session")
+@pytest.fixture(scope="function")
 def tx_primary_governor_config(get_fixture):
     path = os.path.join("config", "2018-03-06_TX_R.json")
     return get_fixture(path, load=True, pandas=False)
 
 
-@pytest.fixture(scope="session")
+@pytest.fixture(scope="function")
 def va_governor_precinct_data(get_fixture):
     path = os.path.join("data", "2017-11-07_VA_G", "G", "data_precinct.csv")
     return get_fixture(path, load=False, pandas=True)
 
 
-@pytest.fixture(scope="module")
+@pytest.fixture(scope="function")
 def va_governor_county_data(get_fixture):
     path = os.path.join("data", "2017-11-07_VA_G", "G", "data_county.csv")
     return get_fixture(path, load=False, pandas=True)
 
 
-@pytest.fixture(scope="session")
+@pytest.fixture(scope="function")
 def va_assembly_county_data(get_fixture):
     path = os.path.join("data", "2017-11-07_VA_G", "Y", "data_county-district.csv")
     return get_fixture(path, load=False, pandas=True)
 
 
-@pytest.fixture(scope="session")
+@pytest.fixture(scope="function")
 def va_assembly_precinct_data(get_fixture):
     path = os.path.join("data", "2017-11-07_VA_G", "Y", "data_precinct-district.csv")
     return get_fixture(path, load=False, pandas=True)
 
 
-@pytest.fixture(scope="session")
+@pytest.fixture(scope="function")
 def az_assembly_precinct_data(get_fixture):
     path = os.path.join("data", "2020-08-04_AZ_R", "S", "data_precinct.csv")
     return get_fixture(path, load=False, pandas=True)
diff --git a/tests/handlers/test_combined_data.py b/tests/handlers/test_combined_data.py
index 111932c2..1888a121 100644
--- a/tests/handlers/test_combined_data.py
+++ b/tests/handlers/test_combined_data.py
@@ -17,7 +17,12 @@ def test_load(va_governor_county_data):
     )
     current_data = live_data_handler.data
     preprocessed_data_handler = PreprocessedDataHandler(
-        election_id, office_id, geographic_unit_type, estimand_baselines=estimand_baselines, estimands=["turnout"], data=va_governor_county_data
+        election_id,
+        office_id,
+        geographic_unit_type,
+        estimand_baselines=estimand_baselines,
+        estimands=estimands,
+        data=va_governor_county_data,
     )
     preprocessed_data = preprocessed_data_handler.data
 
@@ -32,21 +37,37 @@ def test_zero_unreporting_missing_single_estimand_value(va_governor_county_data)
     """
     Set the value for one estimand (dem) as na to test unreporting = "zero"
     """
+    election_id = "2017-11-07_VA_G"
+    office_id = "G"
+    geographic_unit_type = "county"
+    estimand_baselines = {"turnout": "turnout", "dem": "dem"}
     estimands = ["turnout", "dem"]
-    live_data_handler = MockLiveDataHandler("2017-11-07_VA_G", "G", "county", estimands, data=va_governor_county_data)
+    live_data_handler = MockLiveDataHandler(
+        election_id, office_id, geographic_unit_type, estimands, data=va_governor_county_data
+    )
     current_data = live_data_handler.data
     current_data["percent_expected_vote"] = 100
     current_data.loc[0, "results_dem"] = np.nan
 
+    preprocessed_data_handler = PreprocessedDataHandler(
+        election_id,
+        office_id,
+        geographic_unit_type,
+        estimand_baselines=estimand_baselines,
+        estimands=estimands,
+        data=va_governor_county_data,
+    )
+    preprocessed_data = preprocessed_data_handler.data
+
     combined_data_handler = CombinedDataHandler(
-        va_governor_county_data, current_data, estimands, "county", handle_unreporting="zero"
+        preprocessed_data, current_data, estimands, "county", handle_unreporting="zero"
     )
     assert combined_data_handler.data["results_dem"].iloc[0] == 0.0  # value with na result has been set to zero
     assert combined_data_handler.data["results_turnout"].iloc[0] != 0  # has not been set to zero
     assert (
         combined_data_handler.data["percent_expected_vote"].iloc[0] == 0
     )  # percent expected vote with na result has been set to zero
-    assert combined_data_handler.data.shape == (133, 35)  # didn't drop any
+    assert combined_data_handler.data.shape == (133, 36)  # didn't drop any
     assert combined_data_handler.data["results_dem"].iloc[1] != 0  # didn't accidentally set other to zero
 
 
@@ -54,20 +75,36 @@ def test_zero_unreporting_missing_multiple_estimands_value(va_governor_county_da
     """
     Set the value for multiple estimands (dem, turnout) as na to test unreporting = "zero"
     """
+    election_id = "2017-11-07_VA_G"
+    office_id = "G"
+    geographic_unit_type = "county"
+    estimand_baselines = {"turnout": "turnout", "dem": "dem"}
     estimands = ["turnout", "dem"]
-    live_data_handler = MockLiveDataHandler("2017-11-07_VA_G", "G", "county", estimands, data=va_governor_county_data)
+    live_data_handler = MockLiveDataHandler(
+        election_id, office_id, geographic_unit_type, estimands, data=va_governor_county_data
+    )
     current_data = live_data_handler.data
     current_data["percent_expected_vote"] = 100
     current_data.loc[0, "results_dem"] = np.nan
     current_data.loc[0, "results_turnout"] = np.nan
 
+    preprocessed_data_handler = PreprocessedDataHandler(
+        election_id,
+        office_id,
+        geographic_unit_type,
+        estimand_baselines=estimand_baselines,
+        estimands=estimands,
+        data=va_governor_county_data,
+    )
+    preprocessed_data = preprocessed_data_handler.data
+
     combined_data_handler = CombinedDataHandler(
-        va_governor_county_data, current_data, estimands, "county", handle_unreporting="zero"
+        preprocessed_data, current_data, estimands, "county", handle_unreporting="zero"
     )
     assert combined_data_handler.data["results_dem"].iloc[0] == 0.0
     assert combined_data_handler.data["results_turnout"].iloc[0] == 0.0
     assert combined_data_handler.data["percent_expected_vote"].iloc[0] == 0.0
-    assert combined_data_handler.data.shape == (133, 35)
+    assert combined_data_handler.data.shape == (133, 36)
     assert combined_data_handler.data["results_dem"].iloc[1] != 0  # didn't accidentally set other to zero
     assert combined_data_handler.data["results_turnout"].iloc[1] != 0  # didn't accidentally set other to zero
 
@@ -76,19 +113,35 @@ def test_zero_unreporting_missing_percent_expected_vote_value(va_governor_county
     """
     Set the value and percent reporting for one estimand (dem) as na to test unreporting = "zero"
     """
+    election_id = "2017-11-07_VA_G"
+    office_id = "G"
+    geographic_unit_type = "county"
+    estimand_baselines = {"turnout": "turnout", "dem": "dem"}
     estimands = ["turnout", "dem"]
-    live_data_handler = MockLiveDataHandler("2017-11-07_VA_G", "G", "county", estimands, data=va_governor_county_data)
+    live_data_handler = MockLiveDataHandler(
+        election_id, office_id, geographic_unit_type, estimands, data=va_governor_county_data
+    )
     current_data = live_data_handler.data
     current_data["percent_expected_vote"] = 100
     current_data.loc[0, "percent_expected_vote"] = np.nan
     current_data.loc[0, "results_dem"] = np.nan
 
+    preprocessed_data_handler = PreprocessedDataHandler(
+        election_id,
+        office_id,
+        geographic_unit_type,
+        estimand_baselines=estimand_baselines,
+        estimands=estimands,
+        data=va_governor_county_data,
+    )
+    preprocessed_data = preprocessed_data_handler.data
+
     combined_data_handler = CombinedDataHandler(
-        va_governor_county_data, current_data, estimands, "county", handle_unreporting="zero"
+        preprocessed_data, current_data, estimands, "county", handle_unreporting="zero"
     )
     assert combined_data_handler.data["results_dem"].iloc[0] == 0.0
     assert combined_data_handler.data["percent_expected_vote"].iloc[0] == 0.0
-    assert combined_data_handler.data.shape == (133, 35)
+    assert combined_data_handler.data.shape == (133, 36)
     assert combined_data_handler.data["results_dem"].iloc[1] != 0  # didn't accidentally set other to zero
 
 
@@ -96,18 +149,34 @@ def test_zero_unreporting_random_percent_expected_vote_value(va_governor_county_
     """
     Set the value for one estimand (dem) as na to test unreporting = "zero"
     """
+    election_id = "2017-11-07_VA_G"
+    office_id = "G"
+    geographic_unit_type = "county"
+    estimand_baselines = {"turnout": "turnout", "dem": "dem"}
     estimands = ["turnout", "dem"]
-    live_data_handler = MockLiveDataHandler("2017-11-07_VA_G", "G", "county", estimands, data=va_governor_county_data)
+    live_data_handler = MockLiveDataHandler(
+        election_id, office_id, geographic_unit_type, estimands, data=va_governor_county_data
+    )
     current_data = live_data_handler.data
     current_data["percent_expected_vote"] = np.random.randint(1, 100, current_data.shape[0])
     current_data.loc[0, "results_dem"] = np.nan
 
+    preprocessed_data_handler = PreprocessedDataHandler(
+        election_id,
+        office_id,
+        geographic_unit_type,
+        estimand_baselines=estimand_baselines,
+        estimands=estimands,
+        data=va_governor_county_data,
+    )
+    preprocessed_data = preprocessed_data_handler.data
+
     combined_data_handler = CombinedDataHandler(
-        va_governor_county_data, current_data, estimands, "county", handle_unreporting="zero"
+        preprocessed_data, current_data, estimands, "county", handle_unreporting="zero"
     )
     assert combined_data_handler.data["results_dem"].iloc[0] == 0.0  # all values set to 0.0
     assert combined_data_handler.data["percent_expected_vote"].iloc[0] == 0.0
-    assert combined_data_handler.data.shape == (133, 35)
+    assert combined_data_handler.data.shape == (133, 36)
     assert combined_data_handler.data["results_dem"].iloc[1] != 0  # didn't accidentally set other to zero
 
 
@@ -115,16 +184,32 @@ def test_drop_unreporting_missing_single_estimand_value(va_governor_county_data)
     """
     Set the value for one estimand (dem) as na to test unreporting = "drop"
     """
+    election_id = "2017-11-07_VA_G"
+    office_id = "G"
+    geographic_unit_type = "county"
+    estimand_baselines = {"turnout": "turnout", "dem": "dem"}
     estimands = ["turnout", "dem"]
-    live_data_handler = MockLiveDataHandler("2017-11-07_VA_G", "G", "county", estimands, data=va_governor_county_data)
+    live_data_handler = MockLiveDataHandler(
+        election_id, office_id, geographic_unit_type, estimands, data=va_governor_county_data
+    )
     current_data = live_data_handler.data
     current_data["percent_expected_vote"] = 100
     current_data.loc[0, "results_dem"] = np.nan
 
+    preprocessed_data_handler = PreprocessedDataHandler(
+        election_id,
+        office_id,
+        geographic_unit_type,
+        estimand_baselines=estimand_baselines,
+        estimands=estimands,
+        data=va_governor_county_data,
+    )
+    preprocessed_data = preprocessed_data_handler.data
+
     combined_data_handler = CombinedDataHandler(
-        va_governor_county_data, current_data, estimands, "county", handle_unreporting="drop"
+        preprocessed_data, current_data, estimands, "county", handle_unreporting="drop"
     )
-    assert combined_data_handler.data.shape == (132, 35)  # dropped one
+    assert combined_data_handler.data.shape == (132, 36)  # dropped one
     assert combined_data_handler.data["results_dem"].iloc[0] != 0  # didn't accidentally set other to zero
 
 
diff --git a/tests/handlers/test_estimandizer.py b/tests/handlers/test_estimandizer.py
index 91be637a..5b476d09 100644
--- a/tests/handlers/test_estimandizer.py
+++ b/tests/handlers/test_estimandizer.py
@@ -13,7 +13,7 @@ def test_add_estimand_results_not_historical(va_governor_county_data):
     (output_df, result_columns) = estimandizer.add_estimand_results(va_data_copy, estimands, False)
 
     assert "results_party_vote_share_dem" in output_df.columns
-    assert result_columns == ["results_party_vote_share_dem"]
+    assert result_columns == ["results_party_vote_share_dem", "results_turnout"]
 
 
 def test_add_estimand_results_historical(va_governor_county_data):
@@ -27,7 +27,7 @@ def test_add_estimand_results_historical(va_governor_county_data):
     (output_df, result_columns) = estimandizer.add_estimand_results(va_data_copy, estimands, True)
 
     assert "results_party_vote_share_dem" in output_df.columns
-    assert result_columns == ["results_party_vote_share_dem"]
+    assert result_columns == ["results_party_vote_share_dem", "results_turnout"]
 
 
 def test_add_estimand_baselines_not_historical(va_governor_county_data):
diff --git a/tests/models/test_nonparametric_election_model.py b/tests/models/test_nonparametric_election_model.py
index ac7b37e6..7688eea9 100644
--- a/tests/models/test_nonparametric_election_model.py
+++ b/tests/models/test_nonparametric_election_model.py
@@ -408,7 +408,7 @@ def test_fit_model():
     """
     model_settings = {}
     model = NonparametricElectionModel.NonparametricElectionModel(model_settings)
-    qr = QuantileRegressionSolver(solver="ECOS")
+    qr = QuantileRegressionSolver()
 
     df_X = pd.DataFrame({"a": [1, 1, 1, 1], "b": [1, 1, 1, 2]})
 
@@ -416,8 +416,8 @@ def test_fit_model():
     weights = pd.DataFrame({"weights": [1, 1, 1, 1]}).weights
     model.fit_model(qr, df_X, df_y, 0.5, weights, True)
 
-    assert all(np.abs(qr.predict(df_X) - [8, 8, 8, 15]) <= TOL)
-    assert all(np.abs(qr.coefficients - [1, 7]) <= TOL)
+    np.testing.assert_allclose(qr.predict(df_X), [[8, 8, 8, 15]], rtol=TOL)
+    np.testing.assert_allclose(qr.coefficients, [[1, 7]], rtol=TOL)
 
 
 def test_get_unit_predictions():

From 9145e8053949fcf963b5ffe5203171d08d2b737b Mon Sep 17 00:00:00 2001
From: lbvienna <lenny.bronner@gmail.com>
Date: Fri, 22 Sep 2023 10:11:03 -0400
Subject: [PATCH 03/14] resetting index to avoid copy warning in pandas

---
 src/elexmodel/models/ConformalElectionModel.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/elexmodel/models/ConformalElectionModel.py b/src/elexmodel/models/ConformalElectionModel.py
index 8c240c7e..a0c8ae72 100644
--- a/src/elexmodel/models/ConformalElectionModel.py
+++ b/src/elexmodel/models/ConformalElectionModel.py
@@ -157,7 +157,7 @@ def get_unit_prediction_interval_bounds(
 
         # apply to conformalization data. Conformalization bounds will later tell us how much to adjust lower/upper
         # bounds for nonreporting data.
-        conformalization_data = reporting_units_shuffled[train_rows:]
+        conformalization_data = reporting_units_shuffled[train_rows:].reset_index(drop=True)
 
         # all_data starts with reporting_units_shuffled, so the rows between train_rows and n_train are the
         # conformalization set

From b8605113567086a6d423b3b7495d84b6444f2de6 Mon Sep 17 00:00:00 2001
From: lbvienna <lenny.bronner@gmail.com>
Date: Fri, 22 Sep 2023 10:28:53 -0400
Subject: [PATCH 04/14] updated combined data unit tests

---
 tests/handlers/test_combined_data.py | 84 ++++++++++++++++++++++++++++
 1 file changed, 84 insertions(+)

diff --git a/tests/handlers/test_combined_data.py b/tests/handlers/test_combined_data.py
index 1888a121..674c988d 100644
--- a/tests/handlers/test_combined_data.py
+++ b/tests/handlers/test_combined_data.py
@@ -238,6 +238,90 @@ def test_get_reporting_data(va_governor_county_data):
     assert observed_data.reporting.sum() == 20
 
 
+def test_get_reporting_data_dropping_with_turnout_factor(va_governor_county_data):
+    election_id = "2017-11-07_VA_G"
+    office = "G"
+    geographic_unit_type = "county"
+    estimands = ["turnout"]
+    estimand_baseline = {"turnout": "turnout"}
+
+    live_data_handler = MockLiveDataHandler(
+        election_id, office, geographic_unit_type, estimands, data=va_governor_county_data
+    )
+    current_data = live_data_handler.get_n_fully_reported(n=20)
+    preprocessed_data_handler = PreprocessedDataHandler(
+        election_id, office, geographic_unit_type, estimands, estimand_baseline, data=va_governor_county_data
+    )
+
+    combined_data_handler = CombinedDataHandler(
+        preprocessed_data_handler.data, current_data, estimands, geographic_unit_type
+    )
+
+    turnout_factor_lower = 0.95
+    turnout_factor_upper = 1.2
+    reporting_units_above_turnout_factor_threshold = combined_data_handler.data[
+        combined_data_handler.data.turnout_factor > turnout_factor_upper
+    ].shape[0]
+    reporting_units_below_turnout_factor_threshold = combined_data_handler.data[
+        (combined_data_handler.data.percent_expected_vote == 100)
+        & (combined_data_handler.data.turnout_factor < turnout_factor_lower)
+    ].shape[0]
+
+    observed_data = combined_data_handler.get_reporting_units(
+        100, turnout_factor_lower=turnout_factor_lower, turnout_factor_upper=turnout_factor_upper
+    )
+
+    # 20 units are fully reporting, but any whose turnout factor falls outside the thresholds are moved to
+    # nonreporting, so they are subtracted from the expected number of reporting units
+    assert observed_data.shape[0] == 20 - (
+        reporting_units_above_turnout_factor_threshold + reporting_units_below_turnout_factor_threshold
+    )
+
+
+def test_get_nonreporting_adding_with_turnout_factor(va_governor_county_data):
+    election_id = "2017-11-07_VA_G"
+    office = "G"
+    geographic_unit_type = "county"
+    estimands = ["turnout"]
+    estimand_baseline = {"turnout": "turnout"}
+
+    live_data_handler = MockLiveDataHandler(
+        election_id, office, geographic_unit_type, estimands, data=va_governor_county_data
+    )
+    n = 20
+    current_data = live_data_handler.get_n_fully_reported(n=n)
+    preprocessed_data_handler = PreprocessedDataHandler(
+        election_id, office, geographic_unit_type, estimands, estimand_baseline, data=va_governor_county_data
+    )
+
+    combined_data_handler = CombinedDataHandler(
+        preprocessed_data_handler.data, current_data, estimands, geographic_unit_type
+    )
+
+    turnout_factor_lower = 0.95
+    turnout_factor_upper = 1.2
+
+    reporting_units_above_turnout_factor_threshold = combined_data_handler.data[
+        combined_data_handler.data.turnout_factor > turnout_factor_upper
+    ].shape[0]
+    reporting_units_below_turnout_factor_threshold = combined_data_handler.data[
+        (combined_data_handler.data.percent_expected_vote == 100)
+        & (combined_data_handler.data.turnout_factor < turnout_factor_lower)
+    ].shape[0]
+
+    nonreporting_data = combined_data_handler.get_nonreporting_units(
+        100, turnout_factor_lower=turnout_factor_lower, turnout_factor_upper=turnout_factor_upper
+    )
+
+    assert (
+        nonreporting_data.shape[0]
+        == va_governor_county_data.shape[0]
+        - n
+        + reporting_units_above_turnout_factor_threshold
+        + reporting_units_below_turnout_factor_threshold
+    )
+
+
 def test_get_unexpected_units_county_district(va_assembly_county_data):
     election_id = "2017-11-07_VA_G"
     office = "Y"

From 440a7e06b1cabd0fe2607d78f203dc232808762a Mon Sep 17 00:00:00 2001
From: lbvienna <lenny.bronner@gmail.com>
Date: Fri, 22 Sep 2023 10:49:31 -0400
Subject: [PATCH 05/14] updated estimandizer unit tests

---
 src/elexmodel/handlers/data/Estimandizer.py |  8 ++++++-
 tests/handlers/test_estimandizer.py         | 26 +++++++++++++++++++++
 2 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/src/elexmodel/handlers/data/Estimandizer.py b/src/elexmodel/handlers/data/Estimandizer.py
index 1f5c43cb..bfa8ec98 100644
--- a/src/elexmodel/handlers/data/Estimandizer.py
+++ b/src/elexmodel/handlers/data/Estimandizer.py
@@ -81,7 +81,13 @@ def add_weights(self, data_df, col_prefix):
         return data_df
 
     def add_turnout_factor(self, data_df):
-        data_df["turnout_factor"] = np.nan_to_num(data_df.results_weights / data_df.baseline_weights)
+        # posinf and neginf are also set to zero because dividing by zero can produce nan, posinf or neginf
+        # depending on the numeric type held in the numpy array. We assume that if baseline_weights is zero then
+        # turnout will be incredibly low in this election too (i.e. this is effectively an empty precinct), so
+        # setting the turnout factor to zero is fine
+        data_df["turnout_factor"] = np.nan_to_num(
+            data_df.results_weights / data_df.baseline_weights, nan=0, posinf=0, neginf=0
+        )
         return data_df
 
 
diff --git a/tests/handlers/test_estimandizer.py b/tests/handlers/test_estimandizer.py
index 5b476d09..23509c9f 100644
--- a/tests/handlers/test_estimandizer.py
+++ b/tests/handlers/test_estimandizer.py
@@ -1,3 +1,5 @@
+import pytest
+
 from elexmodel.handlers.data.Estimandizer import Estimandizer
 
 
@@ -13,6 +15,7 @@ def test_add_estimand_results_not_historical(va_governor_county_data):
     (output_df, result_columns) = estimandizer.add_estimand_results(va_data_copy, estimands, False)
 
     assert "results_party_vote_share_dem" in output_df.columns
+    assert "results_weights" in output_df.columns
     assert result_columns == ["results_party_vote_share_dem", "results_turnout"]
 
 
@@ -27,6 +30,7 @@ def test_add_estimand_results_historical(va_governor_county_data):
     (output_df, result_columns) = estimandizer.add_estimand_results(va_data_copy, estimands, True)
 
     assert "results_party_vote_share_dem" in output_df.columns
+    assert "results_weights" in output_df.columns
     assert result_columns == ["results_party_vote_share_dem", "results_turnout"]
 
 
@@ -34,6 +38,7 @@ def test_add_estimand_baselines_not_historical(va_governor_county_data):
     estimand_baselines = {"turnout": "turnout", "party_vote_share_dem": "party_vote_share_dem"}
     estimandizer = Estimandizer()
     output_df = estimandizer.add_estimand_baselines(va_governor_county_data.copy(), estimand_baselines, False)
+    assert "baseline_weights" in output_df.columns
     assert "baseline_party_vote_share_dem" in output_df.columns
     assert "last_election_results_party_vote_share_dem" in output_df.columns
 
@@ -45,5 +50,26 @@ def test_add_estimand_baselines_historical(va_governor_county_data):
         va_governor_county_data.copy(), estimand_baselines, True, include_results_estimand=True
     )
     assert "baseline_party_vote_share_dem" in output_df.columns
+    assert "baseline_weights" in output_df.columns
     assert "results_party_vote_share_dem" in output_df.columns
     assert "last_election_results_party_vote_share_dem" not in output_df.columns
+
+
+def test_add_turnout_factor(va_governor_county_data):
+    estimands = ["party_vote_share_dem", "turnout"]
+    estimand_baselines = {"turnout": "turnout", "party_vote_share_dem": "party_vote_share_dem"}
+    estimandizer = Estimandizer()
+    output_df = estimandizer.add_estimand_baselines(
+        va_governor_county_data.copy(), estimand_baselines, False, include_results_estimand=False
+    )
+    output_df, __ = estimandizer.add_estimand_results(output_df, estimands, False)
+
+    # check that nan turns into 0
+    output_df.loc[0, "baseline_weights"] = 0.0
+    import pdb
+
+    pdb.set_trace()
+    output_df = estimandizer.add_turnout_factor(output_df)
+
+    assert "turnout_factor" in output_df.columns
+    assert 0 == pytest.approx(output_df.loc[0, "turnout_factor"])

From 1effc1116fe0be5954161d4605ee6eb7aa5d59ff Mon Sep 17 00:00:00 2001
From: lbvienna <lenny.bronner@gmail.com>
Date: Mon, 25 Sep 2023 11:49:08 -0400
Subject: [PATCH 06/14] updated elex-solver version

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index f4507b08..6742c3bf 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@
 
 from setuptools import find_packages, setup
 
-INSTALL_REQUIRES = ("click<8.1", "elex-solver<2", "pandas<1.5.0", "boto3<2", "python-dotenv==0.19.2", "scipy==1.10.1")
+INSTALL_REQUIRES = ("click<8.1", "elex-solver<3", "pandas<1.5.0", "boto3<2", "python-dotenv==0.19.2", "scipy==1.10.1")
 
 THIS_FILE_DIR = os.path.dirname(__file__)
 

From bf0ed06bbf390239c2ca170f4bdbb840615b1124 Mon Sep 17 00:00:00 2001
From: lbvienna <lenny.bronner@gmail.com>
Date: Mon, 25 Sep 2023 11:56:09 -0400
Subject: [PATCH 07/14] updated unit test

---
 tests/handlers/test_estimandizer.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/tests/handlers/test_estimandizer.py b/tests/handlers/test_estimandizer.py
index 23509c9f..d2321fcc 100644
--- a/tests/handlers/test_estimandizer.py
+++ b/tests/handlers/test_estimandizer.py
@@ -66,9 +66,6 @@ def test_add_turnout_factor(va_governor_county_data):
 
     # check that nan turns into 0
     output_df.loc[0, "baseline_weights"] = 0.0
-    import pdb
-
-    pdb.set_trace()
     output_df = estimandizer.add_turnout_factor(output_df)
 
     assert "turnout_factor" in output_df.columns

From bc81f0e50ac8cdfee2da3382ebd195111e935aed Mon Sep 17 00:00:00 2001
From: lbvienna <lenny.bronner@gmail.com>
Date: Mon, 25 Sep 2023 12:22:40 -0400
Subject: [PATCH 08/14] updated historical

---
 src/elexmodel/client.py                     | 4 ++--
 src/elexmodel/handlers/data/Estimandizer.py | 8 +++++---
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/elexmodel/client.py b/src/elexmodel/client.py
index 6cd8b308..db83feee 100644
--- a/src/elexmodel/client.py
+++ b/src/elexmodel/client.py
@@ -416,8 +416,8 @@ def _format_historical_current_data(
             historical=True,
             include_results_estimand=True,
         )
-
-        results_to_return = [f"results_{estimand}" for estimand in estimands]
+        # we always want to pass turnout so that we can generate results weights
+        results_to_return = list(set([f"results_{estimand}" for estimand in estimands] + ["results_turnout"]))
         geo_columns = set(["geographic_unit_fips", "postal_code"] + [a for a in self.aggregates if a != "unit"])
         preprocessed_data = preprocessed_data_handler.data[list(geo_columns) + results_to_return].copy()
         historical_current_data = preprocessed_data.merge(formatted_data, on=["postal_code", "geographic_unit_fips"])
diff --git a/src/elexmodel/handlers/data/Estimandizer.py b/src/elexmodel/handlers/data/Estimandizer.py
index bfa8ec98..431c7235 100644
--- a/src/elexmodel/handlers/data/Estimandizer.py
+++ b/src/elexmodel/handlers/data/Estimandizer.py
@@ -13,6 +13,8 @@ def add_estimand_results(self, data_df, estimands, historical):
         columns_to_return = []
         turnout_col = f"{RESULTS_PREFIX}turnout"
 
+        data_df = self.add_weights(data_df, RESULTS_PREFIX)
+
         for estimand in estimands:
             results_col = f"{RESULTS_PREFIX}{estimand}"
             additional_columns_added = []
@@ -31,6 +33,7 @@ def add_estimand_results(self, data_df, estimands, historical):
                         # Hence, this is the only special case in which we'd want to add
                         # an empty results_ column.
                         data_df[results_col] = np.nan
+                        data_df[turnout_col] = np.nan
                     else:
                         # If this is not a historical run, then this is a live election
                         # so we are expecting that there will be actual results data
@@ -43,14 +46,14 @@ def add_estimand_results(self, data_df, estimands, historical):
         if turnout_col not in columns_to_return:
             columns_to_return.append(turnout_col)
 
-        data_df = self.add_weights(data_df, RESULTS_PREFIX)
-
         return data_df, columns_to_return
 
     def add_estimand_baselines(self, data_df, estimand_baselines, historical, include_results_estimand=False):
         # if we are in a historical election we are only reading preprocessed data to get
         # the historical election results of the currently reporting units.
         # so we don't care about the total voters or the baseline election.
+        
+        data_df = self.add_weights(data_df, BASELINE_PREFIX)
 
         for estimand, pointer in estimand_baselines.items():
             if pointer is None:
@@ -73,7 +76,6 @@ def add_estimand_baselines(self, data_df, estimand_baselines, historical, includ
             # we need to add the results from the historical election as well.
             data_df, ___ = self.add_estimand_results(data_df, estimand_baselines.keys(), historical)
 
-        data_df = self.add_weights(data_df, BASELINE_PREFIX)
         return data_df
 
     def add_weights(self, data_df, col_prefix):

From 7662f0c0f2cc498b073f3a7d72b6f206ef34766b Mon Sep 17 00:00:00 2001
From: lbvienna <lenny.bronner@gmail.com>
Date: Mon, 25 Sep 2023 12:23:06 -0400
Subject: [PATCH 09/14] linter

---
 src/elexmodel/handlers/data/Estimandizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/elexmodel/handlers/data/Estimandizer.py b/src/elexmodel/handlers/data/Estimandizer.py
index 431c7235..0359d148 100644
--- a/src/elexmodel/handlers/data/Estimandizer.py
+++ b/src/elexmodel/handlers/data/Estimandizer.py
@@ -52,7 +52,7 @@ def add_estimand_baselines(self, data_df, estimand_baselines, historical, includ
         # if we are in a historical election we are only reading preprocessed data to get
         # the historical election results of the currently reporting units.
         # so we don't care about the total voters or the baseline election.
-        
+
         data_df = self.add_weights(data_df, BASELINE_PREFIX)
 
         for estimand, pointer in estimand_baselines.items():

From 23b9f7d2b13c22c522cc680ef2ca7cbb1440f6f9 Mon Sep 17 00:00:00 2001
From: lbvienna <lenny.bronner@gmail.com>
Date: Mon, 25 Sep 2023 12:40:44 -0400
Subject: [PATCH 10/14] move weights adding

---
 src/elexmodel/handlers/data/Estimandizer.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/elexmodel/handlers/data/Estimandizer.py b/src/elexmodel/handlers/data/Estimandizer.py
index 0359d148..9e45003a 100644
--- a/src/elexmodel/handlers/data/Estimandizer.py
+++ b/src/elexmodel/handlers/data/Estimandizer.py
@@ -13,8 +13,6 @@ def add_estimand_results(self, data_df, estimands, historical):
         columns_to_return = []
         turnout_col = f"{RESULTS_PREFIX}turnout"
 
-        data_df = self.add_weights(data_df, RESULTS_PREFIX)
-
         for estimand in estimands:
             results_col = f"{RESULTS_PREFIX}{estimand}"
             additional_columns_added = []
@@ -46,6 +44,8 @@ def add_estimand_results(self, data_df, estimands, historical):
         if turnout_col not in columns_to_return:
             columns_to_return.append(turnout_col)
 
+        data_df = self.add_weights(data_df, RESULTS_PREFIX)
+
         return data_df, columns_to_return
 
     def add_estimand_baselines(self, data_df, estimand_baselines, historical, include_results_estimand=False):
@@ -53,8 +53,6 @@ def add_estimand_baselines(self, data_df, estimand_baselines, historical, includ
         # the historical election results of the currently reporting units.
         # so we don't care about the total voters or the baseline election.
 
-        data_df = self.add_weights(data_df, BASELINE_PREFIX)
-
         for estimand, pointer in estimand_baselines.items():
             if pointer is None:
                 # when we are creating a new estimand
@@ -75,6 +73,8 @@ def add_estimand_baselines(self, data_df, estimand_baselines, historical, includ
             # Since this method is only called by the PreprocessedDataHandler, for historical runs,
             # we need to add the results from the historical election as well.
             data_df, ___ = self.add_estimand_results(data_df, estimand_baselines.keys(), historical)
+        
+        data_df = self.add_weights(data_df, BASELINE_PREFIX)
 
         return data_df
 

From 82567c3b24d043ca30260914b478962e10bd0e61 Mon Sep 17 00:00:00 2001
From: lbvienna <lenny.bronner@gmail.com>
Date: Mon, 25 Sep 2023 12:47:49 -0400
Subject: [PATCH 11/14] linter

---
 src/elexmodel/handlers/data/Estimandizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/elexmodel/handlers/data/Estimandizer.py b/src/elexmodel/handlers/data/Estimandizer.py
index 9e45003a..2807154f 100644
--- a/src/elexmodel/handlers/data/Estimandizer.py
+++ b/src/elexmodel/handlers/data/Estimandizer.py
@@ -73,7 +73,7 @@ def add_estimand_baselines(self, data_df, estimand_baselines, historical, includ
             # Since this method is only called by the PreprocessedDataHandler, for historical runs,
             # we need to add the results from the historical election as well.
             data_df, ___ = self.add_estimand_results(data_df, estimand_baselines.keys(), historical)
-        
+
         data_df = self.add_weights(data_df, BASELINE_PREFIX)
 
         return data_df

From 72dd9a2d16500b47da32e4ab7714efba8315de64 Mon Sep 17 00:00:00 2001
From: lbvienna <lenny.bronner@gmail.com>
Date: Mon, 25 Sep 2023 14:56:09 -0400
Subject: [PATCH 12/14] using nan to num in party_vote_share_dem

---
 src/elexmodel/handlers/data/Estimandizer.py | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/src/elexmodel/handlers/data/Estimandizer.py b/src/elexmodel/handlers/data/Estimandizer.py
index 2807154f..6acc43b1 100644
--- a/src/elexmodel/handlers/data/Estimandizer.py
+++ b/src/elexmodel/handlers/data/Estimandizer.py
@@ -97,11 +97,5 @@ def add_turnout_factor(self, data_df):
 
 
 def party_vote_share_dem(data_df, col_prefix):
-    numer = f"{col_prefix}dem"
-    denom = f"{col_prefix}turnout"
-
-    data_df[f"{col_prefix}party_vote_share_dem"] = data_df.apply(
-        lambda x: 0 if x[numer] == 0 or x[denom] == 0 else x[numer] / x[denom], axis=1
-    )
-
+    data_df[f"{col_prefix}party_vote_share_dem"] = np.nan_to_num(data_df[f"{col_prefix}dem"] /data_df[f"{col_prefix}turnout"])
     return data_df, []

From 88719c7e01822d98db30d5f42d140fb6b24f9d3f Mon Sep 17 00:00:00 2001
From: lbvienna <lenny.bronner@gmail.com>
Date: Mon, 25 Sep 2023 15:04:59 -0400
Subject: [PATCH 13/14] changed tests slightly

---
 tests/handlers/test_combined_data.py | 123 ++++++++-------------------
 1 file changed, 37 insertions(+), 86 deletions(-)

diff --git a/tests/handlers/test_combined_data.py b/tests/handlers/test_combined_data.py
index 674c988d..b66de0ba 100644
--- a/tests/handlers/test_combined_data.py
+++ b/tests/handlers/test_combined_data.py
@@ -16,21 +16,13 @@ def test_load(va_governor_county_data):
         election_id, office_id, geographic_unit_type, estimands=["turnout"], data=va_governor_county_data
     )
     current_data = live_data_handler.data
-    preprocessed_data_handler = PreprocessedDataHandler(
-        election_id,
-        office_id,
-        geographic_unit_type,
-        estimand_baselines=estimand_baselines,
-        estimands=estimands,
-        data=va_governor_county_data,
-    )
-    preprocessed_data = preprocessed_data_handler.data
+    va_governor_county_data['baseline_weights'] = va_governor_county_data.baseline_turnout
 
     combined_data_handler = CombinedDataHandler(
-        preprocessed_data, current_data, estimands, "county", handle_unreporting="drop"
+        va_governor_county_data, current_data, estimands, "county", handle_unreporting="drop"
     )
 
-    assert combined_data_handler.data.shape == (133, 33)
+    assert combined_data_handler.data.shape == (133, 32)
 
 
 def test_zero_unreporting_missing_single_estimand_value(va_governor_county_data):
@@ -49,25 +41,17 @@ def test_zero_unreporting_missing_single_estimand_value(va_governor_county_data)
     current_data["percent_expected_vote"] = 100
     current_data.loc[0, "results_dem"] = np.nan
 
-    preprocessed_data_handler = PreprocessedDataHandler(
-        election_id,
-        office_id,
-        geographic_unit_type,
-        estimand_baselines=estimand_baselines,
-        estimands=estimands,
-        data=va_governor_county_data,
-    )
-    preprocessed_data = preprocessed_data_handler.data
+    va_governor_county_data['baseline_weights'] = va_governor_county_data.baseline_turnout
 
     combined_data_handler = CombinedDataHandler(
-        preprocessed_data, current_data, estimands, "county", handle_unreporting="zero"
+        va_governor_county_data, current_data, estimands, "county", handle_unreporting="zero"
     )
     assert combined_data_handler.data["results_dem"].iloc[0] == 0.0  # value with na result has been set to zero
     assert combined_data_handler.data["results_turnout"].iloc[0] != 0  # has not been set to zero
     assert (
         combined_data_handler.data["percent_expected_vote"].iloc[0] == 0
     )  # percent expected vote with na result has been set to zero
-    assert combined_data_handler.data.shape == (133, 36)  # didn't drop any
+    assert combined_data_handler.data.shape == (133, 34)  # didn't drop any
     assert combined_data_handler.data["results_dem"].iloc[1] != 0  # didn't accidentally set other to zero
 
 
@@ -88,23 +72,15 @@ def test_zero_unreporting_missing_multiple_estimands_value(va_governor_county_da
     current_data.loc[0, "results_dem"] = np.nan
     current_data.loc[0, "results_turnout"] = np.nan
 
-    preprocessed_data_handler = PreprocessedDataHandler(
-        election_id,
-        office_id,
-        geographic_unit_type,
-        estimand_baselines=estimand_baselines,
-        estimands=estimands,
-        data=va_governor_county_data,
-    )
-    preprocessed_data = preprocessed_data_handler.data
+    va_governor_county_data['baseline_weights'] = va_governor_county_data.baseline_turnout
 
     combined_data_handler = CombinedDataHandler(
-        preprocessed_data, current_data, estimands, "county", handle_unreporting="zero"
+        va_governor_county_data, current_data, estimands, "county", handle_unreporting="zero"
     )
     assert combined_data_handler.data["results_dem"].iloc[0] == 0.0
     assert combined_data_handler.data["results_turnout"].iloc[0] == 0.0
     assert combined_data_handler.data["percent_expected_vote"].iloc[0] == 0.0
-    assert combined_data_handler.data.shape == (133, 36)
+    assert combined_data_handler.data.shape == (133, 34)
     assert combined_data_handler.data["results_dem"].iloc[1] != 0  # didn't accidentally set other to zero
     assert combined_data_handler.data["results_turnout"].iloc[1] != 0  # didn't accidentally set other to zero
 
@@ -126,22 +102,14 @@ def test_zero_unreporting_missing_percent_expected_vote_value(va_governor_county
     current_data.loc[0, "percent_expected_vote"] = np.nan
     current_data.loc[0, "results_dem"] = np.nan
 
-    preprocessed_data_handler = PreprocessedDataHandler(
-        election_id,
-        office_id,
-        geographic_unit_type,
-        estimand_baselines=estimand_baselines,
-        estimands=estimands,
-        data=va_governor_county_data,
-    )
-    preprocessed_data = preprocessed_data_handler.data
+    va_governor_county_data['baseline_weights'] = va_governor_county_data.baseline_turnout
 
     combined_data_handler = CombinedDataHandler(
-        preprocessed_data, current_data, estimands, "county", handle_unreporting="zero"
+        va_governor_county_data, current_data, estimands, "county", handle_unreporting="zero"
     )
     assert combined_data_handler.data["results_dem"].iloc[0] == 0.0
     assert combined_data_handler.data["percent_expected_vote"].iloc[0] == 0.0
-    assert combined_data_handler.data.shape == (133, 36)
+    assert combined_data_handler.data.shape == (133, 34)
     assert combined_data_handler.data["results_dem"].iloc[1] != 0  # didn't accidentally set other to zero
 
 
@@ -161,22 +129,14 @@ def test_zero_unreporting_random_percent_expected_vote_value(va_governor_county_
     current_data["percent_expected_vote"] = np.random.randint(1, 100, current_data.shape[0])
     current_data.loc[0, "results_dem"] = np.nan
 
-    preprocessed_data_handler = PreprocessedDataHandler(
-        election_id,
-        office_id,
-        geographic_unit_type,
-        estimand_baselines=estimand_baselines,
-        estimands=estimands,
-        data=va_governor_county_data,
-    )
-    preprocessed_data = preprocessed_data_handler.data
+    va_governor_county_data['baseline_weights'] = va_governor_county_data.baseline_turnout
 
     combined_data_handler = CombinedDataHandler(
-        preprocessed_data, current_data, estimands, "county", handle_unreporting="zero"
+        va_governor_county_data, current_data, estimands, "county", handle_unreporting="zero"
     )
     assert combined_data_handler.data["results_dem"].iloc[0] == 0.0  # all values set to 0.0
     assert combined_data_handler.data["percent_expected_vote"].iloc[0] == 0.0
-    assert combined_data_handler.data.shape == (133, 36)
+    assert combined_data_handler.data.shape == (133, 34)
     assert combined_data_handler.data["results_dem"].iloc[1] != 0  # didn't accidentally set other to zero
 
 
@@ -196,20 +156,12 @@ def test_drop_unreporting_missing_single_estimand_value(va_governor_county_data)
     current_data["percent_expected_vote"] = 100
     current_data.loc[0, "results_dem"] = np.nan
 
-    preprocessed_data_handler = PreprocessedDataHandler(
-        election_id,
-        office_id,
-        geographic_unit_type,
-        estimand_baselines=estimand_baselines,
-        estimands=estimands,
-        data=va_governor_county_data,
-    )
-    preprocessed_data = preprocessed_data_handler.data
+    va_governor_county_data['baseline_weights'] = va_governor_county_data.baseline_turnout
 
     combined_data_handler = CombinedDataHandler(
-        preprocessed_data, current_data, estimands, "county", handle_unreporting="drop"
+        va_governor_county_data, current_data, estimands, "county", handle_unreporting="drop"
     )
-    assert combined_data_handler.data.shape == (132, 36)  # dropped one
+    assert combined_data_handler.data.shape == (132, 34)  # dropped one
     assert combined_data_handler.data["results_dem"].iloc[0] != 0  # didn't accidentally set other to zero
 
 
@@ -224,13 +176,13 @@ def test_get_reporting_data(va_governor_county_data):
         election_id, office, geographic_unit_type, estimands, data=va_governor_county_data
     )
     current_data = live_data_handler.get_n_fully_reported(n=20)
-    preprocessed_data_handler = PreprocessedDataHandler(
-        election_id, office, geographic_unit_type, estimands, estimand_baseline, data=va_governor_county_data
-    )
+
+    va_governor_county_data['baseline_weights'] = va_governor_county_data.baseline_turnout
+    va_governor_county_data['last_election_results_turnout'] = va_governor_county_data.baseline_turnout + 1
 
     # no fixed effects
     combined_data_handler = CombinedDataHandler(
-        preprocessed_data_handler.data, current_data, estimands, geographic_unit_type
+        va_governor_county_data, current_data, estimands, geographic_unit_type
     )
     observed_data = combined_data_handler.get_reporting_units(100)
     assert observed_data.shape[0] == 20
@@ -249,12 +201,12 @@ def test_get_reporting_data_dropping_with_turnout_factor(va_governor_county_data
         election_id, office, geographic_unit_type, estimands, data=va_governor_county_data
     )
     current_data = live_data_handler.get_n_fully_reported(n=20)
-    preprocessed_data_handler = PreprocessedDataHandler(
-        election_id, office, geographic_unit_type, estimands, estimand_baseline, data=va_governor_county_data
-    )
+    
+    va_governor_county_data['baseline_weights'] = va_governor_county_data.baseline_turnout
+    va_governor_county_data['last_election_results_turnout'] = va_governor_county_data.baseline_turnout + 1
 
     combined_data_handler = CombinedDataHandler(
-        preprocessed_data_handler.data, current_data, estimands, geographic_unit_type
+        va_governor_county_data, current_data, estimands, geographic_unit_type
     )
 
     turnout_factor_lower = 0.95
@@ -290,12 +242,12 @@ def test_get_nonreporting_adding_with_turnout_factor(va_governor_county_data):
     )
     n = 20
     current_data = live_data_handler.get_n_fully_reported(n=n)
-    preprocessed_data_handler = PreprocessedDataHandler(
-        election_id, office, geographic_unit_type, estimands, estimand_baseline, data=va_governor_county_data
-    )
+    
+    va_governor_county_data['baseline_weights'] = va_governor_county_data.baseline_turnout
+    va_governor_county_data['last_election_results_turnout'] = va_governor_county_data.baseline_turnout + 1
 
     combined_data_handler = CombinedDataHandler(
-        preprocessed_data_handler.data, current_data, estimands, geographic_unit_type
+        va_governor_county_data, current_data, estimands, geographic_unit_type
     )
 
     turnout_factor_lower = 0.95
@@ -339,12 +291,12 @@ def test_get_unexpected_units_county_district(va_assembly_county_data):
         unexpected_units=unexpected_units,
     )
     current_data = live_data_handler.get_n_fully_reported(n=20)
-    preprocessed_data_handler = PreprocessedDataHandler(
-        election_id, office, geographic_unit_type, estimands, estimand_baseline, data=va_assembly_county_data
-    )
+    
+    va_assembly_county_data['baseline_weights'] = va_assembly_county_data.baseline_turnout
+    va_assembly_county_data['last_election_results_turnout'] = va_assembly_county_data.baseline_turnout + 1
 
     combined_data_handler = CombinedDataHandler(
-        preprocessed_data_handler.data, current_data, estimands, geographic_unit_type
+        va_assembly_county_data, current_data, estimands, geographic_unit_type
     )
     unexpected_data = combined_data_handler.get_unexpected_units(100, ["county_fips", "district"])
     assert unexpected_data.shape[0] == unexpected_units
@@ -377,12 +329,11 @@ def test_get_unexpected_units_county(va_governor_county_data):
     extra_row["percent_expected_vote"] = 50
     current_data = pd.concat([current_data, extra_row])
 
-    preprocessed_data_handler = PreprocessedDataHandler(
-        election_id, office, geographic_unit_type, estimands, estimand_baseline, data=va_governor_county_data
-    )
+    va_governor_county_data['baseline_weights'] = va_governor_county_data.baseline_turnout
+    va_governor_county_data['last_election_results_turnout'] = va_governor_county_data.baseline_turnout + 1
 
     combined_data_handler = CombinedDataHandler(
-        preprocessed_data_handler.data, current_data, estimands, geographic_unit_type
+        va_governor_county_data, current_data, estimands, geographic_unit_type
     )
     unexpected_data = combined_data_handler.get_unexpected_units(100, ["county_fips"])
     assert unexpected_data.shape[0] == reporting_unexpected_units + 1

From eb5de999e0d08bfce5615fac6e0abca27131d99e Mon Sep 17 00:00:00 2001
From: lbvienna <lenny.bronner@gmail.com>
Date: Mon, 25 Sep 2023 15:12:57 -0400
Subject: [PATCH 14/14] Apply linter fixes: normalize quotes, wrap long lines, drop unused test variables and import

---
 src/elexmodel/handlers/data/Estimandizer.py |  4 +-
 tests/handlers/test_combined_data.py        | 70 +++++++--------------
 2 files changed, 27 insertions(+), 47 deletions(-)

diff --git a/src/elexmodel/handlers/data/Estimandizer.py b/src/elexmodel/handlers/data/Estimandizer.py
index 6acc43b1..038bd623 100644
--- a/src/elexmodel/handlers/data/Estimandizer.py
+++ b/src/elexmodel/handlers/data/Estimandizer.py
@@ -97,5 +97,7 @@ def add_turnout_factor(self, data_df):
 
 
 def party_vote_share_dem(data_df, col_prefix):
-    data_df[f"{col_prefix}party_vote_share_dem"] = np.nan_to_num(data_df[f"{col_prefix}dem"] /data_df[f"{col_prefix}turnout"])
+    data_df[f"{col_prefix}party_vote_share_dem"] = np.nan_to_num(
+        data_df[f"{col_prefix}dem"] / data_df[f"{col_prefix}turnout"]
+    )
     return data_df, []
diff --git a/tests/handlers/test_combined_data.py b/tests/handlers/test_combined_data.py
index b66de0ba..d1cd53aa 100644
--- a/tests/handlers/test_combined_data.py
+++ b/tests/handlers/test_combined_data.py
@@ -3,7 +3,6 @@
 
 from elexmodel.handlers.data.CombinedData import CombinedDataHandler
 from elexmodel.handlers.data.LiveData import MockLiveDataHandler
-from elexmodel.handlers.data.PreprocessedData import PreprocessedDataHandler
 
 
 def test_load(va_governor_county_data):
@@ -11,12 +10,11 @@ def test_load(va_governor_county_data):
     office_id = "G"
     geographic_unit_type = "county"
     estimands = ["turnout"]
-    estimand_baselines = {"turnout": "turnout"}
     live_data_handler = MockLiveDataHandler(
         election_id, office_id, geographic_unit_type, estimands=["turnout"], data=va_governor_county_data
     )
     current_data = live_data_handler.data
-    va_governor_county_data['baseline_weights'] = va_governor_county_data.baseline_turnout
+    va_governor_county_data["baseline_weights"] = va_governor_county_data.baseline_turnout
 
     combined_data_handler = CombinedDataHandler(
         va_governor_county_data, current_data, estimands, "county", handle_unreporting="drop"
@@ -32,7 +30,6 @@ def test_zero_unreporting_missing_single_estimand_value(va_governor_county_data)
     election_id = "2017-11-07_VA_G"
     office_id = "G"
     geographic_unit_type = "county"
-    estimand_baselines = {"turnout": "turnout", "dem": "dem"}
     estimands = ["turnout", "dem"]
     live_data_handler = MockLiveDataHandler(
         election_id, office_id, geographic_unit_type, estimands, data=va_governor_county_data
@@ -41,7 +38,7 @@ def test_zero_unreporting_missing_single_estimand_value(va_governor_county_data)
     current_data["percent_expected_vote"] = 100
     current_data.loc[0, "results_dem"] = np.nan
 
-    va_governor_county_data['baseline_weights'] = va_governor_county_data.baseline_turnout
+    va_governor_county_data["baseline_weights"] = va_governor_county_data.baseline_turnout
 
     combined_data_handler = CombinedDataHandler(
         va_governor_county_data, current_data, estimands, "county", handle_unreporting="zero"
@@ -62,7 +59,6 @@ def test_zero_unreporting_missing_multiple_estimands_value(va_governor_county_da
     election_id = "2017-11-07_VA_G"
     office_id = "G"
     geographic_unit_type = "county"
-    estimand_baselines = {"turnout": "turnout", "dem": "dem"}
     estimands = ["turnout", "dem"]
     live_data_handler = MockLiveDataHandler(
         election_id, office_id, geographic_unit_type, estimands, data=va_governor_county_data
@@ -72,7 +68,7 @@ def test_zero_unreporting_missing_multiple_estimands_value(va_governor_county_da
     current_data.loc[0, "results_dem"] = np.nan
     current_data.loc[0, "results_turnout"] = np.nan
 
-    va_governor_county_data['baseline_weights'] = va_governor_county_data.baseline_turnout
+    va_governor_county_data["baseline_weights"] = va_governor_county_data.baseline_turnout
 
     combined_data_handler = CombinedDataHandler(
         va_governor_county_data, current_data, estimands, "county", handle_unreporting="zero"
@@ -92,7 +88,6 @@ def test_zero_unreporting_missing_percent_expected_vote_value(va_governor_county
     election_id = "2017-11-07_VA_G"
     office_id = "G"
     geographic_unit_type = "county"
-    estimand_baselines = {"turnout": "turnout", "dem": "dem"}
     estimands = ["turnout", "dem"]
     live_data_handler = MockLiveDataHandler(
         election_id, office_id, geographic_unit_type, estimands, data=va_governor_county_data
@@ -102,7 +97,7 @@ def test_zero_unreporting_missing_percent_expected_vote_value(va_governor_county
     current_data.loc[0, "percent_expected_vote"] = np.nan
     current_data.loc[0, "results_dem"] = np.nan
 
-    va_governor_county_data['baseline_weights'] = va_governor_county_data.baseline_turnout
+    va_governor_county_data["baseline_weights"] = va_governor_county_data.baseline_turnout
 
     combined_data_handler = CombinedDataHandler(
         va_governor_county_data, current_data, estimands, "county", handle_unreporting="zero"
@@ -120,7 +115,6 @@ def test_zero_unreporting_random_percent_expected_vote_value(va_governor_county_
     election_id = "2017-11-07_VA_G"
     office_id = "G"
     geographic_unit_type = "county"
-    estimand_baselines = {"turnout": "turnout", "dem": "dem"}
     estimands = ["turnout", "dem"]
     live_data_handler = MockLiveDataHandler(
         election_id, office_id, geographic_unit_type, estimands, data=va_governor_county_data
@@ -129,7 +123,7 @@ def test_zero_unreporting_random_percent_expected_vote_value(va_governor_county_
     current_data["percent_expected_vote"] = np.random.randint(1, 100, current_data.shape[0])
     current_data.loc[0, "results_dem"] = np.nan
 
-    va_governor_county_data['baseline_weights'] = va_governor_county_data.baseline_turnout
+    va_governor_county_data["baseline_weights"] = va_governor_county_data.baseline_turnout
 
     combined_data_handler = CombinedDataHandler(
         va_governor_county_data, current_data, estimands, "county", handle_unreporting="zero"
@@ -147,7 +141,6 @@ def test_drop_unreporting_missing_single_estimand_value(va_governor_county_data)
     election_id = "2017-11-07_VA_G"
     office_id = "G"
     geographic_unit_type = "county"
-    estimand_baselines = {"turnout": "turnout", "dem": "dem"}
     estimands = ["turnout", "dem"]
     live_data_handler = MockLiveDataHandler(
         election_id, office_id, geographic_unit_type, estimands, data=va_governor_county_data
@@ -156,7 +149,7 @@ def test_drop_unreporting_missing_single_estimand_value(va_governor_county_data)
     current_data["percent_expected_vote"] = 100
     current_data.loc[0, "results_dem"] = np.nan
 
-    va_governor_county_data['baseline_weights'] = va_governor_county_data.baseline_turnout
+    va_governor_county_data["baseline_weights"] = va_governor_county_data.baseline_turnout
 
     combined_data_handler = CombinedDataHandler(
         va_governor_county_data, current_data, estimands, "county", handle_unreporting="drop"
@@ -170,20 +163,17 @@ def test_get_reporting_data(va_governor_county_data):
     office = "G"
     geographic_unit_type = "county"
     estimands = ["turnout"]
-    estimand_baseline = {"turnout": "turnout"}
 
     live_data_handler = MockLiveDataHandler(
         election_id, office, geographic_unit_type, estimands, data=va_governor_county_data
     )
     current_data = live_data_handler.get_n_fully_reported(n=20)
 
-    va_governor_county_data['baseline_weights'] = va_governor_county_data.baseline_turnout
-    va_governor_county_data['last_election_results_turnout'] = va_governor_county_data.baseline_turnout + 1
+    va_governor_county_data["baseline_weights"] = va_governor_county_data.baseline_turnout
+    va_governor_county_data["last_election_results_turnout"] = va_governor_county_data.baseline_turnout + 1
 
     # no fixed effects
-    combined_data_handler = CombinedDataHandler(
-        va_governor_county_data, current_data, estimands, geographic_unit_type
-    )
+    combined_data_handler = CombinedDataHandler(va_governor_county_data, current_data, estimands, geographic_unit_type)
     observed_data = combined_data_handler.get_reporting_units(100)
     assert observed_data.shape[0] == 20
     assert observed_data.reporting.iloc[0] == 1
@@ -195,19 +185,16 @@ def test_get_reporting_data_dropping_with_turnout_factor(va_governor_county_data
     office = "G"
     geographic_unit_type = "county"
     estimands = ["turnout"]
-    estimand_baseline = {"turnout": "turnout"}
 
     live_data_handler = MockLiveDataHandler(
         election_id, office, geographic_unit_type, estimands, data=va_governor_county_data
     )
     current_data = live_data_handler.get_n_fully_reported(n=20)
-    
-    va_governor_county_data['baseline_weights'] = va_governor_county_data.baseline_turnout
-    va_governor_county_data['last_election_results_turnout'] = va_governor_county_data.baseline_turnout + 1
 
-    combined_data_handler = CombinedDataHandler(
-        va_governor_county_data, current_data, estimands, geographic_unit_type
-    )
+    va_governor_county_data["baseline_weights"] = va_governor_county_data.baseline_turnout
+    va_governor_county_data["last_election_results_turnout"] = va_governor_county_data.baseline_turnout + 1
+
+    combined_data_handler = CombinedDataHandler(va_governor_county_data, current_data, estimands, geographic_unit_type)
 
     turnout_factor_lower = 0.95
     turnout_factor_upper = 1.2
@@ -235,20 +222,17 @@ def test_get_nonreporting_adding_with_turnout_factor(va_governor_county_data):
     office = "G"
     geographic_unit_type = "county"
     estimands = ["turnout"]
-    estimand_baseline = {"turnout": "turnout"}
 
     live_data_handler = MockLiveDataHandler(
         election_id, office, geographic_unit_type, estimands, data=va_governor_county_data
     )
     n = 20
     current_data = live_data_handler.get_n_fully_reported(n=n)
-    
-    va_governor_county_data['baseline_weights'] = va_governor_county_data.baseline_turnout
-    va_governor_county_data['last_election_results_turnout'] = va_governor_county_data.baseline_turnout + 1
 
-    combined_data_handler = CombinedDataHandler(
-        va_governor_county_data, current_data, estimands, geographic_unit_type
-    )
+    va_governor_county_data["baseline_weights"] = va_governor_county_data.baseline_turnout
+    va_governor_county_data["last_election_results_turnout"] = va_governor_county_data.baseline_turnout + 1
+
+    combined_data_handler = CombinedDataHandler(va_governor_county_data, current_data, estimands, geographic_unit_type)
 
     turnout_factor_lower = 0.95
     turnout_factor_upper = 1.2
@@ -280,7 +264,6 @@ def test_get_unexpected_units_county_district(va_assembly_county_data):
     geographic_unit_type = "county-district"
     estimands = ["turnout"]
     unexpected_units = 5
-    estimand_baseline = {"turnout": "turnout"}
 
     live_data_handler = MockLiveDataHandler(
         election_id,
@@ -291,13 +274,11 @@ def test_get_unexpected_units_county_district(va_assembly_county_data):
         unexpected_units=unexpected_units,
     )
     current_data = live_data_handler.get_n_fully_reported(n=20)
-    
-    va_assembly_county_data['baseline_weights'] = va_assembly_county_data.baseline_turnout
-    va_assembly_county_data['last_election_results_turnout'] = va_assembly_county_data.baseline_turnout + 1
 
-    combined_data_handler = CombinedDataHandler(
-        va_assembly_county_data, current_data, estimands, geographic_unit_type
-    )
+    va_assembly_county_data["baseline_weights"] = va_assembly_county_data.baseline_turnout
+    va_assembly_county_data["last_election_results_turnout"] = va_assembly_county_data.baseline_turnout + 1
+
+    combined_data_handler = CombinedDataHandler(va_assembly_county_data, current_data, estimands, geographic_unit_type)
     unexpected_data = combined_data_handler.get_unexpected_units(100, ["county_fips", "district"])
     assert unexpected_data.shape[0] == unexpected_units
     assert unexpected_data[unexpected_data.county_fips == ""].shape[0] == 0
@@ -312,7 +293,6 @@ def test_get_unexpected_units_county(va_governor_county_data):
     geographic_unit_type = "county"
     estimands = ["turnout"]
     reporting_unexpected_units = 5
-    estimand_baseline = {"turnout": "turnout"}
 
     live_data_handler = MockLiveDataHandler(
         election_id,
@@ -329,12 +309,10 @@ def test_get_unexpected_units_county(va_governor_county_data):
     extra_row["percent_expected_vote"] = 50
     current_data = pd.concat([current_data, extra_row])
 
-    va_governor_county_data['baseline_weights'] = va_governor_county_data.baseline_turnout
-    va_governor_county_data['last_election_results_turnout'] = va_governor_county_data.baseline_turnout + 1
+    va_governor_county_data["baseline_weights"] = va_governor_county_data.baseline_turnout
+    va_governor_county_data["last_election_results_turnout"] = va_governor_county_data.baseline_turnout + 1
 
-    combined_data_handler = CombinedDataHandler(
-        va_governor_county_data, current_data, estimands, geographic_unit_type
-    )
+    combined_data_handler = CombinedDataHandler(va_governor_county_data, current_data, estimands, geographic_unit_type)
     unexpected_data = combined_data_handler.get_unexpected_units(100, ["county_fips"])
     assert unexpected_data.shape[0] == reporting_unexpected_units + 1
     assert unexpected_data[unexpected_data.county_fips == ""].shape[0] == 0