Merge pull request #77 from washingtonpost/elex-2771-new-ols-qr

Elex 2771 new ols qr

lennybronner authored Sep 25, 2023
2 parents f359935 + eb5de99 commit 2043a30
Showing 11 changed files with 257 additions and 86 deletions.
2 changes: 1 addition & 1 deletion setup.py
@@ -3,7 +3,7 @@

from setuptools import find_packages, setup

-INSTALL_REQUIRES = ("click<8.1", "elex-solver<2", "pandas<1.5.0", "boto3<2", "python-dotenv==0.19.2", "scipy==1.10.1")
+INSTALL_REQUIRES = ("click<8.1", "elex-solver<3", "pandas<1.5.0", "boto3<2", "python-dotenv==0.19.2", "scipy==1.10.1")

THIS_FILE_DIR = os.path.dirname(__file__)

4 changes: 2 additions & 2 deletions src/elexmodel/client.py
@@ -416,8 +416,8 @@ def _format_historical_current_data(
historical=True,
include_results_estimand=True,
)

-results_to_return = [f"results_{estimand}" for estimand in estimands]
+# we always want to pass turnout so that we can generate results weights
+results_to_return = list(set([f"results_{estimand}" for estimand in estimands] + ["results_turnout"]))
geo_columns = set(["geographic_unit_fips", "postal_code"] + [a for a in self.aggregates if a != "unit"])
preprocessed_data = preprocessed_data_handler.data[list(geo_columns) + results_to_return].copy()
historical_current_data = preprocessed_data.merge(formatted_data, on=["postal_code", "geographic_unit_fips"])
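A quick sketch (not part of the diff, toy values) of why the new line builds the column list through a set: unioning with "results_turnout" cannot duplicate the column when turnout is itself an estimand.

```python
estimands = ["turnout", "dem"]

# set union deduplicates "results_turnout" when turnout is already an estimand
results_to_return = list(set([f"results_{estimand}" for estimand in estimands] + ["results_turnout"]))
print(sorted(results_to_return))  # ['results_dem', 'results_turnout']
```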
28 changes: 23 additions & 5 deletions src/elexmodel/handlers/data/CombinedData.py
@@ -29,6 +29,7 @@ def __init__(
self.current_data = current_data
self.geographic_unit_type = geographic_unit_type
data = preprocessed_data.merge(current_data, how="left", on=["postal_code", "geographic_unit_fips"])
+data = estimandizer.add_turnout_factor(data)
# if unreporting is 'drop' then drop units that are not reporting (ie. units where results are na)
# this is necessary if units will not be returning results in this election,
# but we didn't know that (ie. townships)
@@ -37,21 +38,31 @@
# Drop the whole row if an estimand is not reporting
data = data.dropna(axis=0, how="any", subset=result_cols)
# if unreporting is 'zero' then we set the votes for non-reporting units to zero
-# this is necessary if we are worried that there is no zero state for units (ie. some precincts)
+# this is necessary if we are worried that there is no zero state for units (ie. some precinct states)
elif handle_unreporting == "zero":
indices_with_null_val = data[result_cols].isna().any(axis=1)
data.update(data[result_cols].fillna(value=0))
data.loc[indices_with_null_val, "percent_expected_vote"] = 0

self.data = data

-def get_reporting_units(self, percent_reporting_threshold, features_to_normalize=[], add_intercept=True):
+def get_reporting_units(
+    self,
+    percent_reporting_threshold,
+    turnout_factor_lower=0.5,
+    turnout_factor_upper=1.5,
+    features_to_normalize=[],
+    add_intercept=True,
+):
"""
Get reporting data. These are units where the expected vote is greater than or equal to the percent reporting threshold.
"""
reporting_units = self.data[self.data.percent_expected_vote >= percent_reporting_threshold].reset_index(
drop=True
)
+# if the turnout factor is less than turnout_factor_lower or greater than turnout_factor_upper,
+# assume the AP made a mistake and don't treat those as reporting units
+reporting_units = reporting_units[reporting_units.turnout_factor > turnout_factor_lower]
+reporting_units = reporting_units[reporting_units.turnout_factor < turnout_factor_upper]

# residualize + normalize
for estimand in self.estimands:
@@ -64,16 +75,23 @@ def get_reporting_units(self, percent_reporting_threshold, features_to_normalize

return reporting_units
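A minimal sketch of the new turnout-factor gate with made-up units: anything outside the open interval (turnout_factor_lower, turnout_factor_upper) is dropped from the reporting set, on the assumption that a factor far from 1 reflects an AP data error rather than a genuine turnout swing.

```python
import pandas as pd

# hypothetical units; turnout_factor = results_weights / baseline_weights
units = pd.DataFrame({
    "geographic_unit_fips": ["00001", "00002", "00003"],
    "percent_expected_vote": [100, 100, 100],
    "turnout_factor": [0.9, 0.2, 1.8],
})
lower, upper = 0.5, 1.5

# same two-step filter as get_reporting_units above
reporting = units[units.turnout_factor > lower]
reporting = reporting[reporting.turnout_factor < upper]
print(reporting.geographic_unit_fips.tolist())  # ['00001']
```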

-def get_nonreporting_units(self, percent_reporting_threshold, features_to_normalize=[], add_intercept=True):
+def get_nonreporting_units(
+    self,
+    percent_reporting_threshold,
+    turnout_factor_lower=0.5,
+    turnout_factor_upper=1.5,
+    features_to_normalize=[],
+    add_intercept=True,
+):
"""
Get nonreporting data. These are units where the expected vote is less than the percent reporting threshold, or whose turnout factor falls outside the allowed bounds.
"""
+# if turnout factor <= turnout_factor_lower or >= turnout_factor_upper assume the AP made a mistake and treat them as non-reporting units
nonreporting_units = self.data.query(
"percent_expected_vote < @percent_reporting_threshold"
"(percent_expected_vote < @percent_reporting_threshold) | (turnout_factor <= @turnout_factor_lower) | (turnout_factor >= @turnout_factor_upper)" #
).reset_index( # not checking if results.isnull() anymore across multiple estimands
drop=True
)

nonreporting_units["reporting"] = int(0)
nonreporting_units["expected"] = True

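For reference, a small sketch of how the @-prefixed names in the query string above resolve: pandas pulls them from the enclosing Python scope, so the thresholds passed into get_nonreporting_units drive the filter directly. Toy values:

```python
import pandas as pd

df = pd.DataFrame({"percent_expected_vote": [10, 99], "turnout_factor": [1.0, 0.3]})
percent_reporting_threshold = 50
turnout_factor_lower, turnout_factor_upper = 0.5, 1.5

# @name reaches back into the local Python scope
out = df.query(
    "(percent_expected_vote < @percent_reporting_threshold)"
    " | (turnout_factor <= @turnout_factor_lower)"
    " | (turnout_factor >= @turnout_factor_upper)"
)
print(len(out))  # 2: one unit under the threshold, one with an implausible turnout factor
```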
48 changes: 36 additions & 12 deletions src/elexmodel/handlers/data/Estimandizer.py
@@ -1,4 +1,4 @@
-from numpy import nan
+import numpy as np

RESULTS_PREFIX = "results_"
BASELINE_PREFIX = "baseline_"
@@ -11,12 +11,15 @@ class Estimandizer:

def add_estimand_results(self, data_df, estimands, historical):
columns_to_return = []
+turnout_col = f"{RESULTS_PREFIX}turnout"

for estimand in estimands:
results_col = f"{RESULTS_PREFIX}{estimand}"
+additional_columns_added = []
if results_col not in data_df.columns:
# will raise a KeyError if a function with the same name as `estimand` doesn't exist
try:
-data_df = globals()[estimand](data_df, RESULTS_PREFIX)
+data_df, additional_columns_added = globals()[estimand](data_df, RESULTS_PREFIX)
except KeyError as e:
if historical:
# A historical run is one where we pull in data from a past election
@@ -27,12 +30,21 @@ def add_estimand_results(self, data_df, estimands, historical):
# and that data handler expects a results_ column for every estimand specified.
# Hence, this is the only special case in which we'd want to add
# an empty results_ column.
-data_df[results_col] = nan
+data_df[results_col] = np.nan
+data_df[turnout_col] = np.nan
else:
# If this is not a historical run, then this is a live election
# so we are expecting that there will be actual results data
raise e
-columns_to_return.append(results_col)
+columns_to_return.extend([results_col] + additional_columns_added)

+# always adding turnout since we will want to generate weights
+# but if turnout is the estimand, then we only want to add it once
+if turnout_col not in columns_to_return:
+    columns_to_return.append(turnout_col)

+data_df = self.add_weights(data_df, RESULTS_PREFIX)

return data_df, columns_to_return

@@ -49,7 +61,7 @@ def add_estimand_baselines(self, data_df, estimand_baselines, historical, includ
baseline_col = f"{BASELINE_PREFIX}{pointer}"

if baseline_col not in data_df.columns:
-data_df = globals()[estimand](data_df, BASELINE_PREFIX)
+data_df, __ = globals()[estimand](data_df, BASELINE_PREFIX)

if not historical:
data_df[f"last_election_results_{estimand}"] = data_df[baseline_col].copy() + 1
@@ -62,18 +74,30 @@ def add_estimand_baselines(self, data_df, estimand_baselines, historical, includ
# we need to add the results from the historical election as well.
data_df, ___ = self.add_estimand_results(data_df, estimand_baselines.keys(), historical)

+data_df = self.add_weights(data_df, BASELINE_PREFIX)

return data_df

+def add_weights(self, data_df, col_prefix):
+    data_df[f"{col_prefix}weights"] = data_df[f"{col_prefix}turnout"]
+    return data_df
+
+def add_turnout_factor(self, data_df):
+    # posinf and neginf are also set to zero because dividing by zero can lead to nan/posinf/neginf depending
+    # on the type of the numeric in the numpy array. Assume that if baseline_weights is zero then turnout
+    # would be incredibly low in this election too (ie. this is effectively an empty precinct) and so setting
+    # the turnout factor to zero is fine
+    data_df["turnout_factor"] = np.nan_to_num(
+        data_df.results_weights / data_df.baseline_weights, nan=0, posinf=0, neginf=0
+    )
+    return data_df
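A sketch of the guard in add_turnout_factor with toy numbers: a zero baseline produces nan (0/0) or inf (x/0), and np.nan_to_num maps all of those to 0, matching the comment's assumption that a zero-baseline unit is effectively empty.

```python
import numpy as np

results_weights = np.array([100.0, 0.0, 50.0])
baseline_weights = np.array([80.0, 0.0, 0.0])

# 100/80 -> 1.25, 0/0 -> nan -> 0, 50/0 -> inf -> 0
with np.errstate(divide="ignore", invalid="ignore"):
    turnout_factor = np.nan_to_num(results_weights / baseline_weights, nan=0, posinf=0, neginf=0)
print(turnout_factor)  # [1.25 0.   0.  ]
```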


# custom estimands


def party_vote_share_dem(data_df, col_prefix):
-    numer = f"{col_prefix}dem"
-    denom = f"{col_prefix}turnout"
-
-    data_df[f"{col_prefix}party_vote_share_dem"] = data_df.apply(
-        lambda x: 0 if x[numer] == 0 or x[denom] == 0 else x[numer] / x[denom], axis=1
-    )
+    data_df[f"{col_prefix}party_vote_share_dem"] = np.nan_to_num(
+        data_df[f"{col_prefix}dem"] / data_df[f"{col_prefix}turnout"]
+    )

-    return data_df
+    return data_df, []
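The party_vote_share_dem rewrite trades a row-wise apply for one vectorized division; a sketch with toy columns showing both paths give the same shares here (np.nan_to_num stands in for the old explicit zero checks):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"results_dem": [40.0, 0.0], "results_turnout": [100.0, 0.0]})

# old row-wise version
old = df.apply(lambda x: 0 if x.results_dem == 0 or x.results_turnout == 0 else x.results_dem / x.results_turnout, axis=1)

# new vectorized version: 0/0 -> nan -> 0
new = np.nan_to_num(df.results_dem / df.results_turnout)
print(old.tolist(), new.tolist())  # [0.4, 0.0] [0.4, 0.0]
```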
29 changes: 15 additions & 14 deletions src/elexmodel/models/ConformalElectionModel.py
@@ -22,7 +22,6 @@
class ConformalElectionModel(BaseElectionModel.BaseElectionModel, ABC):
def __init__(self, model_settings: dict):
super(ConformalElectionModel, self).__init__(model_settings)
-self.qr = QuantileRegressionSolver(solver="ECOS")
self.lambda_ = model_settings.get("lambda_", 0)

@classmethod
@@ -56,11 +55,10 @@ def fit_model(
model.fit(
X,
y,
-tau_value=tau,
+taus=tau,
weights=weights,
lambda_=self.lambda_,
fit_intercept=self.add_intercept,
-normalize_weights=normalize_weights,
)
except (UserWarning, cvxpy.error.SolverError):
LOG.warning("Warning: solution was inaccurate or solver broke. Re-running with normalize_weights=False.")
@@ -88,10 +86,11 @@ def get_unit_predictions(
x_all[self.n_train : self.n_train + n_test] # noqa: E203
)

-self.fit_model(self.qr, reporting_units_features, reporting_units_residuals, 0.5, weights, True)
-self.features_to_coefficients = dict(zip(featurizer.complete_features, self.qr.coefficients))
+qr = QuantileRegressionSolver()
+self.fit_model(qr, reporting_units_features, reporting_units_residuals, 0.5, weights, True)
+self.features_to_coefficients = dict(zip(featurizer.complete_features, qr.coefficients))

-preds = self.qr.predict(nonreporting_units_features)
+preds = qr.predict(nonreporting_units_features.values).flatten()

# multiply by total voters to get unnormalized residuals
preds = preds * nonreporting_units[f"last_election_results_{estimand}"]
@@ -150,15 +149,15 @@ def get_unit_prediction_interval_bounds(
train_data_weights = train_data[f"last_election_results_{estimand}"]

# fit lower and upper model to training data. ECOS solver is better than SCS.
-lower_qr = QuantileRegressionSolver(solver="ECOS")
+lower_qr = QuantileRegressionSolver()
self.fit_model(lower_qr, train_data_features, train_data_residuals, lower_bound, train_data_weights, True)

-upper_qr = QuantileRegressionSolver(solver="ECOS")
+upper_qr = QuantileRegressionSolver()
self.fit_model(upper_qr, train_data_features, train_data_residuals, upper_bound, train_data_weights, True)

# apply to conformalization data. Conformalization bounds will later tell us how much to adjust lower/upper
# bounds for nonreporting data.
-conformalization_data = reporting_units_shuffled[train_rows:]
+conformalization_data = reporting_units_shuffled[train_rows:].reset_index(drop=True)

# all_data starts with reporting_units_shuffled, so the rows between train_rows and n_train are the
# conformalization set
@@ -169,10 +168,12 @@ def get_unit_prediction_interval_bounds(
# we are interested in f(X) - r
# since later conformity scores care about deviation of bounds from residuals
conformalization_lower_bounds = (
-    lower_qr.predict(conformalization_data_features) - conformalization_data[f"residuals_{estimand}"].values
+    lower_qr.predict(conformalization_data_features.values).flatten()
+    - conformalization_data[f"residuals_{estimand}"].values
)
-conformalization_upper_bounds = conformalization_data[f"residuals_{estimand}"].values - upper_qr.predict(
-    conformalization_data_features
+conformalization_upper_bounds = (
+    conformalization_data[f"residuals_{estimand}"].values
+    - upper_qr.predict(conformalization_data_features.values).flatten()
)

# save conformalization bounds for later
@@ -185,8 +186,8 @@ def get_unit_prediction_interval_bounds(
# are the same across train_data, conformalization_data and nonreporting_units
nonreporting_units_features = interval_featurizer.generate_holdout_data(x_all[self.n_train :]) # noqa: E203

-nonreporting_lower_bounds = lower_qr.predict(nonreporting_units_features)
-nonreporting_upper_bounds = upper_qr.predict(nonreporting_units_features)
+nonreporting_lower_bounds = lower_qr.predict(nonreporting_units_features.values).flatten()
+nonreporting_upper_bounds = upper_qr.predict(nonreporting_units_features.values).flatten()

return PredictionIntervals(nonreporting_lower_bounds, nonreporting_upper_bounds, conformalization_data)

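Taken together, the changes in this file track the elex-solver 2.x API: no solver= constructor argument, taus= instead of tau_value=, no normalize_weights= on fit, and predictions flattened before use. A hedged sketch of that usage, inferred only from the calls in this diff (the import path and intercept handling are assumptions):

```python
import numpy as np
from elexsolver.QuantileRegressionSolver import QuantileRegressionSolver  # assumed import path

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 3))
y = X @ np.array([1.0, -2.0, 0.5]) + rng.normal(size=100)

qr = QuantileRegressionSolver()  # no solver="ECOS" argument anymore
qr.fit(X, y, taus=0.5, weights=np.ones(100), lambda_=0, fit_intercept=True)

# predict output is flattened before downstream use, as in get_unit_predictions
preds = qr.predict(X).flatten()
print(preds.shape, qr.coefficients)
```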
14 changes: 7 additions & 7 deletions tests/conftest.py
@@ -66,43 +66,43 @@ def conformal_election_model():
return ConformalElectionModel.ConformalElectionModel(model_settings)


@pytest.fixture(scope="session")
@pytest.fixture(scope="function")
def va_config(get_fixture):
path = os.path.join("config", "2017-11-07_VA_G.json")
return get_fixture(path, load=True, pandas=False)


@pytest.fixture(scope="session")
@pytest.fixture(scope="function")
def tx_primary_governor_config(get_fixture):
path = os.path.join("config", "2018-03-06_TX_R.json")
return get_fixture(path, load=True, pandas=False)


@pytest.fixture(scope="session")
@pytest.fixture(scope="function")
def va_governor_precinct_data(get_fixture):
path = os.path.join("data", "2017-11-07_VA_G", "G", "data_precinct.csv")
return get_fixture(path, load=False, pandas=True)


@pytest.fixture(scope="module")
@pytest.fixture(scope="function")
def va_governor_county_data(get_fixture):
path = os.path.join("data", "2017-11-07_VA_G", "G", "data_county.csv")
return get_fixture(path, load=False, pandas=True)


@pytest.fixture(scope="session")
@pytest.fixture(scope="function")
def va_assembly_county_data(get_fixture):
path = os.path.join("data", "2017-11-07_VA_G", "Y", "data_county-district.csv")
return get_fixture(path, load=False, pandas=True)


@pytest.fixture(scope="session")
@pytest.fixture(scope="function")
def va_assembly_precinct_data(get_fixture):
path = os.path.join("data", "2017-11-07_VA_G", "Y", "data_precinct-district.csv")
return get_fixture(path, load=False, pandas=True)


@pytest.fixture(scope="session")
@pytest.fixture(scope="function")
def az_assembly_precinct_data(get_fixture):
path = os.path.join("data", "2020-08-04_AZ_R", "S", "data_precinct.csv")
return get_fixture(path, load=False, pandas=True)
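All of these fixtures move from scope="session" (or "module") to scope="function", so every test now receives a fresh copy instead of a shared object; a plausible reason, though the diff doesn't say so, is that the new estimandizer steps mutate these DataFrames in place. A minimal sketch of the scope semantics:

```python
import pytest

@pytest.fixture(scope="function")
def counter():
    # rebuilt for every test that requests it
    return {"n": 0}

def test_a(counter):
    counter["n"] += 1
    assert counter["n"] == 1

def test_b(counter):
    # with scope="session" this would see test_a's mutation and fail
    counter["n"] += 1
    assert counter["n"] == 1
```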
(diffs for the remaining 5 changed files not loaded)