diff --git a/README.md b/README.md index c712cbb8..75ee47cf 100644 --- a/README.md +++ b/README.md @@ -100,12 +100,27 @@ elexmodel 2017-11-07_VA_G --estimands=turnout --estimands my_estimand --estimand Some model types have specific model parameters that can be included. -| Name | Type | Acceptable values | model | -|-----------|---------|-----------------------------|-----------------| -| lambda | numeric | 0-inf | all | -| robust | boolean | larger prediction intervals | `nonparametric` | -| beta | numeric | variance inflation | `gaussian` | -| winsorize | boolean | winsorize std estimate | `gaussian` | +| Name | Type | Acceptable values | model | +|-----------------------------------|---------|----------------------------------|-----------------| +| lambda | numeric | regularization constant | all | +| turnout_factor_lower | numeric | drop units with < turnout factor | all | +| turnout_factor_upper | numeric | drop units with > turnout factor | all | +| robust | boolean | larger prediction intervals | `nonparametric` | +| beta | numeric | variance inflation | `gaussian` | +| winsorize | boolean | winsorize std estimate | `gaussian` | +| B | numeric | bootstrap samples | `bootstrap` | +| T | numeric | temperature for aggregate | `bootstrap` | +| strata | list | groups to stratify bootstrap by | `bootstrap` | +| agg_model_hard_threshold | bool | hard threshold aggregate model | `bootstrap` | +| y_LB | numeric | lower bound norm. margin dist | `bootstrap` | +| y_UB | numeric | upper bound norm. margin dist | `bootstrap` | +| z_LB | numeric | lower bound turnout factor dist | `bootstrap` | +| z_UB | numeric | upper bound turnout factor dist | `bootstrap` | +| y_unobserved_lower_bound | numeric | lower bound for norm. margin | `bootstrap` | +| y_unobserved_upper_bound | numeric | upper bound for norm. margin | `bootstrap` | +| percent_expected_vote_error_bound | numeric | error tolerance on expected vote | `bootstrap` | +| z_unobserved_lower_bound | numeric | lower bound for turnout factor | `bootstrap` | +| z_unobserved_upper_bound | numeric | upper bound for turnout factor | `bootstrap` | This is the class and function that invokes the general function to generate estimates. You can install `elex-model` as a Python package and use this code snippet in other projects.
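For reference only (not part of the diff), the sketch below shows what a `model_parameters` dictionary for the new `bootstrap` method could look like. The keys and most values mirror the defaults visible in `BootstrapElectionModel.__init__` and `client.py` in this change; the `lambda_` value is an assumed placeholder, not a recommended setting.

```python
# Illustrative only: model_parameters for pi_method="bootstrap".
# Defaults mirror BootstrapElectionModel.__init__ and client.get_estimates below;
# lambda_ is a placeholder value.
model_parameters = {
    "lambda_": 0.1,                        # OLS regularization constant (non-negative)
    "turnout_factor_lower": 0.2,           # drop units whose turnout factor is below this
    "turnout_factor_upper": 2.5,           # drop units whose turnout factor is above this
    "B": 2000,                             # number of bootstrap samples
    "T": 5000,                             # temperature for the aggregate model
    "strata": ["county_classification"],   # columns to stratify the bootstrap by
    "agg_model_hard_threshold": False,     # hard threshold vs. sigmoid in the aggregate model
    "y_LB": -0.3,                          # normalized margin bounds for the strata distributions
    "y_UB": 0.3,
    "z_LB": -0.5,                          # turnout factor bounds for the strata distributions
    "z_UB": 0.5,
    "y_unobserved_lower_bound": -1.0,      # bounds on normalized margin for outstanding vote
    "y_unobserved_upper_bound": 1.0,
    "percent_expected_vote_error_bound": 0.5,  # error tolerance on expected vote
    "z_unobserved_lower_bound": 0.5,       # bounds on turnout factor for outstanding vote
    "z_unobserved_upper_bound": 1.5,
}
```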
diff --git a/src/elexmodel/cli.py b/src/elexmodel/cli.py index c7128ebe..387f24ed 100644 --- a/src/elexmodel/cli.py +++ b/src/elexmodel/cli.py @@ -35,12 +35,7 @@ def type_cast_value(self, ctx, value): "--pi_method", "pi_method", default="nonparametric", - type=click.Choice( - [ - "gaussian", - "nonparametric", - ] - ), + type=click.Choice(["gaussian", "nonparametric", "bootstrap"]), ) @click.option("--prediction_intervals", "prediction_intervals", default=[0.7, 0.9], multiple=True) @click.option("--percent_reporting_threshold", "percent_reporting_threshold", default=100) diff --git a/src/elexmodel/client.py b/src/elexmodel/client.py index db83feee..31f00f4b 100644 --- a/src/elexmodel/client.py +++ b/src/elexmodel/client.py @@ -10,6 +10,7 @@ from elexmodel.handlers.data.ModelResults import ModelResultsHandler from elexmodel.handlers.data.PreprocessedData import PreprocessedDataHandler from elexmodel.logging import initialize_logging +from elexmodel.models.BootstrapElectionModel import BootstrapElectionModel from elexmodel.models.ConformalElectionModel import ConformalElectionModel from elexmodel.models.GaussianElectionModel import GaussianElectionModel from elexmodel.models.NonparametricElectionModel import NonparametricElectionModel @@ -39,6 +40,7 @@ def __init__(self): super().__init__() self.all_conformalization_data_unit_dict = defaultdict(dict) self.all_conformalization_data_agg_dict = defaultdict(dict) + self.model = None def _check_input_parameters( self, @@ -88,8 +90,7 @@ def _check_input_parameters( ] if len(invalid_fixed_effects) > 0: raise ValueError(f"Fixed effect(s): {invalid_fixed_effects} not valid. Please check config") - - if pi_method not in {"gaussian", "nonparametric"}: + if pi_method not in {"gaussian", "nonparametric", "bootstrap"}: raise ValueError( f"Prediction interval method: {pi_method} is not valid. \ pi_method has to be either `gaussian` or `nonparametric`." @@ -102,6 +103,14 @@ def _check_input_parameters( not isinstance(model_parameters["lambda_"], (float, int)) or model_parameters["lambda_"] < 0 ): raise ValueError("lambda is not valid. It has to be numeric and greater than zero.") + if "turnout_factor_lower" in model_parameters and not isinstance( + model_parameters["turnout_factor_lower"], float + ): + raise ValueError("turnout_factor_lower is not valid. Has to be a float.") + if "turnout_factor_upper" in model_parameters and not isinstance( + model_parameters["turnout_factor_upper"], float + ): + raise ValueError("turnout_factor_upper is not valid. Has to be a float.") if pi_method == "gaussian": if "beta" in model_parameters and not isinstance(model_parameters["beta"], (int, float)): raise ValueError("beta is not valid. Has to be either an integer or a float.") @@ -110,7 +119,45 @@ def _check_input_parameters( elif pi_method == "nonparametric": if "robust" in model_parameters and not isinstance(model_parameters["robust"], bool): raise ValueError("robust is not valid. Has to be a boolean.") - + elif pi_method == "bootstrap": + if "B" in model_parameters and not isinstance(model_parameters["B"], int): + raise ValueError("B is not valid. Has to be either an integer.") + if "T" in model_parameters and not isinstance(model_parameters["T"], (int, float)): + raise ValueError("T is not valid. Has to be either an integer or a float.") + if "strata" in model_parameters and not isinstance(model_parameters["strata"], list): + raise ValueError("strata is not valid. 
Has to be a list.") + if "agg_model_hard_threshold" in model_parameters and not isinstance( + model_parameters["agg_model_hard_threshold"], bool + ): + raise ValueError("agg_model_hard_threshold is not valid. Has to be a boolean.") + if "y_LB" in model_parameters and not isinstance(model_parameters["y_LB"], float): + raise ValueError("y_LB is not valid. Has to be a float.") + if "y_UB" in model_parameters and not isinstance(model_parameters["y_UB"], float): + raise ValueError("y_UB is not valid. Has to be a float.") + if "z_LB" in model_parameters and not isinstance(model_parameters["z_LB"], float): + raise ValueError("z_LB is not valid. Has to be a float.") + if "z_UB" in model_parameters and not isinstance(model_parameters["z_UB"], float): + raise ValueError("z_UB is not valid. Has to be a float.") + if "y_unobserved_upper_bound" in model_parameters and not isinstance( + model_parameters["y_unobserved_upper_bound"], float + ): + raise ValueError("y_unobserved_upper_bound is not valid. Has to be a float.") + if "y_unobserved_lower_bound" in model_parameters and not isinstance( + model_parameters["y_unobserved_lower_bound"], float + ): + raise ValueError("y_unobserved_lower_bound is not valid. Has to be a float.") + if "percent_expected_vote_error_bound" in model_parameters and not isinstance( + model_parameters["percent_expected_vote_error_bound"], float + ): + raise ValueError("percent_expected_vote_error_bound is not valid. Has to be a float.") + if "z_unobserved_upper_bound" in model_parameters and not isinstance( + model_parameters["z_unobserved_upper_bound"], float + ): + raise ValueError("z_unobserved_upper_bound is not valid. Has to be a float.") + if "z_unobserved_lower_bound" in model_parameters and not isinstance( + model_parameters["z_unobserved_lower_bound"], float + ): + raise ValueError("z_unobserved_lower_bound is not valid. 
Has to be a float.") if handle_unreporting not in {"drop", "zero"}: raise ValueError("handle_unreporting must be either `drop` or `zero`") @@ -122,6 +169,9 @@ def get_aggregate_list(self, office, aggregate): raw_aggregate_list = base_aggregate + [aggregate] return sorted(list(set(raw_aggregate_list)), key=lambda x: AGGREGATE_ORDER.index(x)) + def get_national_summary_votes_estimates(self, nat_sum_data_dict=None, called_states={}, base_to_add=0, alpha=0.99): + return self.model.get_national_summary_estimates(nat_sum_data_dict, called_states, base_to_add, alpha=alpha) + def get_estimates( self, current_data, # list of lists @@ -158,10 +208,15 @@ def get_estimates( save_conformalization = "conformalization" in save_output handle_unreporting = kwargs.get("handle_unreporting", "drop") + district_election = False + if office in {"H", "Y", "Z"}: + district_election = True + model_settings = { "election_id": election_id, "office": office, "geographic_unit_type": geographic_unit_type, + "district_election": district_election, "features": features, "fixed_effects": fixed_effects, "save_conformalization": save_conformalization, @@ -172,6 +227,7 @@ def get_estimates( config_handler = ConfigHandler( election_id, config=raw_config, s3_client=s3.S3JsonUtil(TARGET_BUCKET), save=save_config ) + self._check_input_parameters( config_handler, office, @@ -184,6 +240,7 @@ def get_estimates( model_parameters, handle_unreporting, ) + states_with_election = config_handler.get_states(office) estimand_baselines = config_handler.get_estimand_baselines(office, estimands) @@ -213,13 +270,18 @@ def get_estimates( handle_unreporting=handle_unreporting, ) + turnout_factor_lower = model_parameters.get("turnout_factor_lower", 0.2) + turnout_factor_upper = model_parameters.get("turnout_factor_upper", 2.5) + reporting_units = data.get_reporting_units( - percent_reporting_threshold, features_to_normalize=features, add_intercept=True + percent_reporting_threshold, turnout_factor_lower, turnout_factor_upper ) nonreporting_units = data.get_nonreporting_units( - percent_reporting_threshold, features_to_normalize=features, add_intercept=True + percent_reporting_threshold, turnout_factor_lower, turnout_factor_upper + ) + unexpected_units = data.get_unexpected_units( + percent_reporting_threshold, aggregates, turnout_factor_lower, turnout_factor_upper ) - unexpected_units = data.get_unexpected_units(percent_reporting_threshold, aggregates) LOG.info( "Model parameters: \n prediction intervals: %s, percent reporting threshold: %s, \ @@ -232,13 +294,15 @@ def get_estimates( ) if pi_method == "nonparametric": - model = NonparametricElectionModel(model_settings=model_settings) + self.model = NonparametricElectionModel(model_settings=model_settings) elif pi_method == "gaussian": - model = GaussianElectionModel(model_settings=model_settings) + self.model = GaussianElectionModel(model_settings=model_settings) + elif pi_method == "bootstrap": + self.model = BootstrapElectionModel(model_settings=model_settings) minimum_reporting_units_max = 0 for alpha in prediction_intervals: - minimum_reporting_units = model.get_minimum_reporting_units(alpha) + minimum_reporting_units = self.model.get_minimum_reporting_units(alpha) if minimum_reporting_units > minimum_reporting_units_max: minimum_reporting_units_max = minimum_reporting_units @@ -270,24 +334,26 @@ def get_estimates( ) for estimand in estimands: - unit_predictions = model.get_unit_predictions(reporting_units, nonreporting_units, estimand) + unit_predictions = self.model.get_unit_predictions( + 
reporting_units, nonreporting_units, estimand, unexpected_units=unexpected_units + ) results_handler.add_unit_predictions(estimand, unit_predictions) # gets prediciton intervals for each alpha alpha_to_unit_prediction_intervals = {} for alpha in prediction_intervals: - alpha_to_unit_prediction_intervals[alpha] = model.get_unit_prediction_intervals( + alpha_to_unit_prediction_intervals[alpha] = self.model.get_unit_prediction_intervals( results_handler.reporting_units, results_handler.nonreporting_units, alpha, estimand ) - if isinstance(model, ConformalElectionModel): + if isinstance(self.model, ConformalElectionModel): self.all_conformalization_data_unit_dict[alpha][ estimand - ] = model.get_all_conformalization_data_unit() + ] = self.model.get_all_conformalization_data_unit() results_handler.add_unit_intervals(estimand, alpha_to_unit_prediction_intervals) for aggregate in results_handler.aggregates: aggregate_list = self.get_aggregate_list(office, aggregate) - estimates_df = model.get_aggregate_predictions( + estimates_df = self.model.get_aggregate_predictions( results_handler.reporting_units, results_handler.nonreporting_units, results_handler.unexpected_units, @@ -296,7 +362,7 @@ def get_estimates( ) alpha_to_agg_prediction_intervals = {} for alpha in prediction_intervals: - alpha_to_agg_prediction_intervals[alpha] = model.get_aggregate_prediction_intervals( + alpha_to_agg_prediction_intervals[alpha] = self.model.get_aggregate_prediction_intervals( results_handler.reporting_units, results_handler.nonreporting_units, results_handler.unexpected_units, @@ -305,10 +371,10 @@ def get_estimates( alpha_to_unit_prediction_intervals[alpha], estimand, ) - if isinstance(model, ConformalElectionModel): + if isinstance(self.model, ConformalElectionModel): self.all_conformalization_data_agg_dict[alpha][ estimand - ] = model.get_all_conformalization_data_agg() + ] = self.model.get_all_conformalization_data_agg() # get all of the prediction intervals here results_handler.add_agg_predictions( diff --git a/src/elexmodel/handlers/config.py b/src/elexmodel/handlers/config.py index f04874eb..b6e8fb39 100644 --- a/src/elexmodel/handlers/config.py +++ b/src/elexmodel/handlers/config.py @@ -69,11 +69,15 @@ def get_estimand_baselines(self, office, estimands): Return dict of baseline pointers for requested estimands """ baseline_pointers = {estimand: self.get_baseline_pointer(office).get(estimand) for estimand in estimands} + if "margin" in estimands: + baseline_pointers["margin"] = "margin" return baseline_pointers def get_estimands(self, office): baseline_pointer = self.get_baseline_pointer(office) estimands = list(baseline_pointer.keys()) + if self.election_id.endswith("G"): + estimands += ["margin"] # would otherwise need to add margin to every single config return estimands def get_states(self, office): @@ -92,7 +96,12 @@ def get_geographic_unit_types(self, office): return self._get_office_subconfig(office).get("geographic_unit_types") def get_features(self, office): - return self._get_office_subconfig(office).get("features", []) + features = self._get_office_subconfig(office).get("features", []) + if self.election_id.endswith("G"): + features += [ + "baseline_normalized_margin" + ] # would otherwise need to add baseline_margin to every single config + return features def get_aggregates(self, office): return self._get_office_subconfig(office).get("aggregates", []) diff --git a/src/elexmodel/handlers/data/CombinedData.py b/src/elexmodel/handlers/data/CombinedData.py index d21f64a3..eb4c0863 100644 --- 
a/src/elexmodel/handlers/data/CombinedData.py +++ b/src/elexmodel/handlers/data/CombinedData.py @@ -1,3 +1,6 @@ +import numpy as np +import pandas as pd + from elexmodel.handlers import s3 from elexmodel.handlers.data.Estimandizer import Estimandizer from elexmodel.utils.file_utils import S3_FILE_PATH, TARGET_BUCKET, convert_df_to_csv @@ -20,7 +23,6 @@ def __init__( estimandizer = Estimandizer() (current_data, _) = estimandizer.add_estimand_results(current_data.copy(), self.estimands, False) - # if we're running this for a past election, drop results columns from preprocessed data # so we use results_{estimand} numbers from current_data preprocessed_results_columns = list(filter(lambda col: col.startswith("results_"), preprocessed_data.columns)) @@ -46,23 +48,22 @@ def __init__( self.data = data - def get_reporting_units( - self, - percent_reporting_threshold, - turnout_factor_lower=0.5, - turnout_factor_upper=1.5, - features_to_normalize=[], - add_intercept=True, - ): + def get_reporting_units(self, percent_reporting_threshold, turnout_factor_lower, turnout_factor_upper): """ Get reporting data. These are units where the expected vote is greater than the percent reporting threshold. """ + # units where the expected vote is greater than the percent reporting threshold reporting_units = self.data[self.data.percent_expected_vote >= percent_reporting_threshold].reset_index( drop=True ) - # if turnout factor less than 0.5 or greater than 1.5 assume AP made a mistake and don't treat those as reporting units - reporting_units = reporting_units[reporting_units.turnout_factor > turnout_factor_lower] - reporting_units = reporting_units[reporting_units.turnout_factor < turnout_factor_upper] + + # remove unexpected units + unexpected_units = self._get_unexpected_units( + percent_reporting_threshold, turnout_factor_lower, turnout_factor_upper + ) + reporting_units = reporting_units[ + ~reporting_units.geographic_unit_fips.isin(unexpected_units.geographic_unit_fips) + ].reset_index(drop=True) # residualize + normalize for estimand in self.estimands: @@ -75,23 +76,23 @@ def get_reporting_units( return reporting_units - def get_nonreporting_units( - self, - percent_reporting_threshold, - turnout_factor_lower=0.5, - turnout_factor_upper=1.5, - features_to_normalize=[], - add_intercept=True, - ): + def get_nonreporting_units(self, percent_reporting_threshold, turnout_factor_lower, turnout_factor_upper): """ Get nonreporting data. 
These are units where expected vote is less than the percent reporting threshold """ - # if turnout factor <= turnout_factor_lower or >= turnout_factor_upper assume the AP made a mistake and treat them as non-reporting units - nonreporting_units = self.data.query( - "(percent_expected_vote < @percent_reporting_threshold) | (turnout_factor <= @turnout_factor_lower) | (turnout_factor >= @turnout_factor_upper)" # - ).reset_index( # not checking if results.isnull() anymore across multiple estimands + # units where expected vote is less than the percent reporting threshold + nonreporting_units = self.data[self.data.percent_expected_vote < percent_reporting_threshold].reset_index( drop=True ) + + # remove unexpected units + unexpected_units = self._get_unexpected_units( + percent_reporting_threshold, turnout_factor_lower, turnout_factor_upper + ) + nonreporting_units = nonreporting_units[ + ~nonreporting_units.geographic_unit_fips.isin(unexpected_units.geographic_unit_fips) + ].reset_index(drop=True) + nonreporting_units["reporting"] = int(0) nonreporting_units["expected"] = True @@ -104,6 +105,9 @@ def _get_expected_geographic_unit_fips(self): # data is only expected units since left join of preprocessed data in initialization return self.data.geographic_unit_fips + def _get_units_without_baseline(self): + return self.data[np.isclose(self.data.baseline_weights, 0)].geographic_unit_fips + def _get_county_fips_from_geographic_unit_fips(self, geographic_unit_fips): """ Get county fips for geographic unit fips @@ -123,17 +127,37 @@ def _get_district_from_geographic_unit_fips(self, geographic_unit_fips): components = geographic_unit_fips.split("_") return components[0] - def get_unexpected_units(self, percent_reporting_threshold, aggregates): - """ - Gets reporting but unexpected data. These are units that are may or may not be fully - reporting, but we have no preprocessed data for them. - """ + # TODO: rename unexpected units to be non-modeled units + def _get_unexpected_units(self, percent_reporting_threshold, turnout_factor_lower, turnout_factor_upper): expected_geographic_units = self._get_expected_geographic_unit_fips().tolist() + no_baseline_units = self._get_units_without_baseline() # Note: this uses current_data because self.data drops unexpected units unexpected_units = self.current_data[ ~self.current_data["geographic_unit_fips"].isin(expected_geographic_units) + | self.current_data.geographic_unit_fips.isin(no_baseline_units) ].reset_index(drop=True) + units_with_strange_turnout_factor = self.data[ + (self.data.percent_expected_vote >= percent_reporting_threshold) + & ((self.data.turnout_factor <= turnout_factor_lower) | (self.data.turnout_factor >= turnout_factor_upper)) + ][self.current_data.columns] + + all_unexpected_units = pd.concat([unexpected_units, units_with_strange_turnout_factor]).reset_index(drop=True) + all_unexpected_units.drop_duplicates(subset="geographic_unit_fips", inplace=True) + return all_unexpected_units + + def get_unexpected_units(self, percent_reporting_threshold, aggregates, turnout_factor_lower, turnout_factor_upper): + """ + Gets units for which we will not be making predictions: + - unexpected units (ie. units for which we have no covariates prepared) + - units for which the baseline results is zero (ie. units that are tiny) + - units with strange turnout factors (ie. 
units that are likely precinct mismatches) + """ + + unexpected_units = self._get_unexpected_units( + percent_reporting_threshold, turnout_factor_lower, turnout_factor_upper + ) + # since we were not expecting them, we have don't have their county or district # from preprocessed data. so we have to add that back in. if "county_fips" in aggregates: @@ -146,7 +170,7 @@ def get_unexpected_units(self, percent_reporting_threshold, aggregates): self._get_district_from_geographic_unit_fips ) - unexpected_units["reporting"] = int(1) + unexpected_units["reporting"] = int(0) unexpected_units["expected"] = False return unexpected_units diff --git a/src/elexmodel/handlers/data/Estimandizer.py b/src/elexmodel/handlers/data/Estimandizer.py index 038bd623..8f6e41a4 100644 --- a/src/elexmodel/handlers/data/Estimandizer.py +++ b/src/elexmodel/handlers/data/Estimandizer.py @@ -13,6 +13,12 @@ def add_estimand_results(self, data_df, estimands, historical): columns_to_return = [] turnout_col = f"{RESULTS_PREFIX}turnout" + # if historical is true then we are running this only to add turnout and results + # columns that can be empty (since this is run by the live results handler to get + # the reporting counties. So we don't actually need to add the weights yet. + if not historical and f"{RESULTS_PREFIX}weights" not in data_df: + data_df = self.add_weights(data_df, RESULTS_PREFIX) + for estimand in estimands: results_col = f"{RESULTS_PREFIX}{estimand}" additional_columns_added = [] @@ -44,14 +50,13 @@ def add_estimand_results(self, data_df, estimands, historical): if turnout_col not in columns_to_return: columns_to_return.append(turnout_col) - data_df = self.add_weights(data_df, RESULTS_PREFIX) - return data_df, columns_to_return def add_estimand_baselines(self, data_df, estimand_baselines, historical, include_results_estimand=False): # if we are in a historical election we are only reading preprocessed data to get # the historical election results of the currently reporting units. # so we don't care about the total voters or the baseline election. + data_df = self.add_weights(data_df, BASELINE_PREFIX) for estimand, pointer in estimand_baselines.items(): if pointer is None: @@ -74,8 +79,20 @@ def add_estimand_baselines(self, data_df, estimand_baselines, historical, includ # we need to add the results from the historical election as well. data_df, ___ = self.add_estimand_results(data_df, estimand_baselines.keys(), historical) - data_df = self.add_weights(data_df, BASELINE_PREFIX) + return data_df + def add_weights(self, data_df, col_prefix): + data_df[f"{col_prefix}weights"] = data_df[f"{col_prefix}turnout"] + return data_df + + def add_turnout_factor(self, data_df): + # posinf and neginf are also set to zero because dividing by zero can lead to nan/posinf/neginf depending + # on the type of the numeric in the numpy array. Assume that if baseline_weights is zero then turnout + # would be incredibly low in this election too (ie. 
this is effectively an empty precinct) and so setting + # the turnout factor to zero is fine + data_df["turnout_factor"] = np.nan_to_num( + data_df.results_weights / data_df.baseline_weights, nan=0, posinf=0, neginf=0 + ) return data_df def add_weights(self, data_df, col_prefix): @@ -100,4 +117,16 @@ def party_vote_share_dem(data_df, col_prefix): data_df[f"{col_prefix}party_vote_share_dem"] = np.nan_to_num( data_df[f"{col_prefix}dem"] / data_df[f"{col_prefix}turnout"] ) + return data_df, [] + + +def margin(data_df, col_prefix): + # in the margin case we are overwriting baseline_weights with two party turnout + generated_weights_column_name = f"{col_prefix}weights" + generated_margin_column_name = f"{col_prefix}margin" + generated_normalized_margin_column_name = f"{col_prefix}normalized_margin" + data_df[generated_weights_column_name] = data_df[f"{col_prefix}dem"] + data_df[f"{col_prefix}gop"] + data_df[generated_margin_column_name] = data_df[f"{col_prefix}dem"] - data_df[f"{col_prefix}gop"] + data_df[generated_normalized_margin_column_name] = data_df[f"{col_prefix}margin"] / data_df[f"{col_prefix}weights"] + return data_df, [generated_weights_column_name, generated_normalized_margin_column_name] diff --git a/src/elexmodel/models/BootstrapElectionModel.py b/src/elexmodel/models/BootstrapElectionModel.py new file mode 100644 index 00000000..d81e74a5 --- /dev/null +++ b/src/elexmodel/models/BootstrapElectionModel.py @@ -0,0 +1,1350 @@ +from __future__ import annotations + +import logging + +import numpy as np +import pandas as pd +from elexsolver.OLSRegressionSolver import OLSRegressionSolver +from elexsolver.QuantileRegressionSolver import QuantileRegressionSolver +from scipy.special import expit + +from elexmodel.handlers.data.Featurizer import Featurizer +from elexmodel.models.BaseElectionModel import BaseElectionModel, PredictionIntervals + +LOG = logging.getLogger(__name__) + + +class BootstrapElectionModelException(Exception): + pass + + +class BootstrapElectionModel(BaseElectionModel): + """ + The bootstrap election model. + + This model uses ordinary least squares regression for point predictions and the bootstrap to generate prediction intervals. + + In this setup, we are modeling normalized two party margin. But because we need to sum our estimand from the unit level to the aggregate level, we need + to be able to convert normalized margin to unnormalized margin (since normalized margins don't sum). This means on the unit level we will be modeling + unnormalized margin and on the aggregate level normalized margin. + + We have found that instead of modeling the unit margin directly, decomposing the estimand into two quantities works better because the linear + model assumption is more plausible for each individually. The two quantities are normalized margin (y) and a turnout factor (z) + y = (D^{Y} - R^{Y}) / (D^{Y} + R^{Y}) + z = (D^{Y} + R^{Y}) / (D^{Y'} + R^{Y'}) + where Y is the year we are modeling and Y' is the previous election year. + + If we define weights to be the total two party turnout in a previous election: + w = (D^{Y'} + R^{Y'}) -> w * y * z = (D^{Y} - R^{Y}) + so (w * y * z) is the unnormalized margin. + + We define our model as: + y_i = f_y(x) + epsilon_y(x) + z_i = f_z(x) + epsilon_z(x) + where f_y(x) and f_z(x) are ordinary least squares regressions and the epsilons are contest (state/district) level random effects. 
+ """ + + def __init__(self, model_settings={}): + super().__init__(model_settings) + self.B = model_settings.get("B", 2000) # number of bootstrap samples + self.strata = model_settings.get("strata", ["county_classification"]) # columns to stratify the data by + self.T = model_settings.get("T", 5000) # temperature for aggregate model + self.hard_threshold = model_settings.get( + "agg_model_hard_threshold", False + ) # use sigmoid or hard threshold when calculating agg model + self.district_election = model_settings.get("district_election", False) + + # upper and lower bounds for the quantile regression which define the strata distributions + # these make sure that we can control the worst cases for the distributions in case we + # haven't seen enough data yet + self.y_LB = model_settings.get("y_LB", -0.3) # normalized margin lower bound + self.y_UB = model_settings.get("y_UB", 0.3) # normalized margin upper bound + self.z_LB = model_settings.get("z_LB", -0.5) # turnout factor lower bound + self.z_UB = model_settings.get("z_UB", 0.5) # turnout factor upper bound + + # percentiles to compute the strata distributions for + self.taus_lower = np.arange(0.01, 0.5, 0.01) + self.taus_upper = np.arange(0.50, 1, 0.01) + self.taus = np.concatenate([self.taus_lower, self.taus_upper]) + + # upper and lower bounds for how far the "outstanding vote" in + # non-reporting units can push the normalized margin and turnout factor. Used to clip our predictions + self.y_unobserved_lower_bound = model_settings.get("y_unobserved_lower_bound", -1.0) + self.y_unobserved_upper_bound = model_settings.get("y_unobserved_upper_bound", 1.0) + self.percent_expected_vote_error_bound = model_settings.get("percent_expected_vote_error_bound", 0.5) + self.z_unobserved_upper_bound = model_settings.get("z_unobserved_upper_bound", 1.5) + self.z_unobserved_lower_bound = model_settings.get("z_unobserved_lower_bound", 0.5) + + self.featurizer = Featurizer(self.features, self.fixed_effects) + self.seed = model_settings.get("seed", 0) + self.rng = np.random.default_rng(seed=self.seed) # used for sampling + self.ran_bootstrap = False + + # We assume that the baseline normalized margin (D^{Y'} - R^{Y'}) / (D^{Y'} + R^{Y'}) is one of the covariates + if "baseline_normalized_margin" not in self.features: + raise BootstrapElectionModelException( + "baseline_normalized_margin not included as feature. This is necessary for the model to work." + ) + + def cv_lambda( + self, x: np.ndarray, y: np.ndarray, lambdas_: np.ndarray, weights: np.ndarray | None = None, k: int = 5 + ) -> float: + """ + This function does k-fold cross validation for an OLS regression model given x, y and a set of lambdas to try out + This function returns the lambda that minimizes the k-fold cross validation loss + """ + # if weights are none assume that all samples have equal weights + if weights is None: + weights = np.ones((y.shape[0], 1)) + # concatenate since we need to keep x, y, weight samples together + x_y_w = np.concatenate([x, y, weights], axis=1) + self.rng.shuffle(x_y_w) + # generate k chunks + chunks = np.array_split(x_y_w, k, axis=0) + errors = np.zeros((len(lambdas_),)) + # for each possible lambda value perform k-fold cross validation + # ie. 
train model on k-1 chunks and evaluate on one chunk (for all possible k combinations of heldout chunk) + for i, lambda_ in enumerate(lambdas_): + for test_chunk in range(k): + x_y_w_test = chunks[test_chunk] + # extract all chunks except for the current test chunk + x_y_w_train = np.concatenate(chunks[:test_chunk] + chunks[(test_chunk + 1) :], axis=0) # noqa: 203 + # undo the concatenation above + x_test = x_y_w_test[:, :-2] + y_test = x_y_w_test[:, -2] + w_test = x_y_w_test[:, -1] + x_train = x_y_w_train[:, :-2] + y_train = x_y_w_train[:, -2] + w_train = x_y_w_train[:, -1] + ols_lambda = OLSRegressionSolver() + ols_lambda.fit( + x_train, + y_train, + weights=w_train, + lambda_=lambda_, + fit_intercept=True, + regularize_intercept=False, + n_feat_ignore_reg=1, + ) + y_hat_lambda = ols_lambda.predict(x_test) + # error is the weighted sum of squares of the residual between the actual heldout y and the predicted y on the heldout set + errors[i] += np.sum( + w_test * ols_lambda.residuals(y_test, y_hat_lambda, loo=False, center=False) ** 2 + ) / np.sum(w_test) + # return lambda that minimizes the k-fold error + # np.argmin returns the first occurrence if there are multiple minimum values + return lambdas_[np.argmin(errors)] + + def get_minimum_reporting_units(self, alpha: float) -> int: + # arbitrary, just enough to fit coefficients + return 10 + + def _estimate_epsilon(self, residuals: np.ndarray, aggregate_indicator: np.ndarray) -> np.ndarray: + """ + This function estimates the epsilon (contest level random effects) + """ + # the estimate for epsilon is the average of the residuals in each contest + epsilon_hat = (aggregate_indicator.T @ residuals) / aggregate_indicator.sum(axis=0).reshape(-1, 1) + # we can't estimate a contest level effect if we only have 1 unit in that contest (since our residual can just + # be made equal to zero by setting the random effect to that value) + epsilon_hat[aggregate_indicator.sum(axis=0) < 2] = 0 + + return epsilon_hat + + def _estimate_delta( + self, residuals: np.ndarray, epsilon_hat: np.ndarray, aggregate_indicator: np.ndarray + ) -> np.ndarray: + """ + This function estimates delta (the final unit level residual of the model) + """ + # our estimate for delta is the difference between the residual and + # what can be explained by the contest level random effect + return (residuals - (aggregate_indicator @ epsilon_hat)).flatten() + + def _estimate_model_errors( + self, model: OLSRegressionSolver, x: np.ndarray, y: np.ndarray, aggregate_indicator: np.ndarray + ) -> tuple[np.ndarray, np.ndarray, np.ndarray]: + """ + This function estimates all components of the error in our bootstrap model + residual: the centered leave one out residual, ie the difference between our OLS prediction and our actual training values + this includes the component of the error that we want to explain using a contest level random effect + epsilon_hat: our estimate for the contest level effect in our model + (e.g. how much did each state contribute) + delta_hat: the unit level error (ie. 
the difference between the OLS residual and the contest level state effect) + """ + # get unit level predictions from OLS + y_pred = model.predict(x) + # compute residuals + residuals_y = model.residuals(y, y_pred, loo=True, center=True) + # estimate contest level effect (the average residual in units of a contest) + epsilon_y_hat = self._estimate_epsilon(residuals_y, aggregate_indicator) + # compute delta, which is the left over residual after removing the contest level effect + delta_y_hat = self._estimate_delta(residuals_y, epsilon_y_hat, aggregate_indicator) + return residuals_y, epsilon_y_hat, delta_y_hat + + def _estimate_strata_dist( + self, + x_train: np.ndarray, + x_train_strata: np.ndarray, + x_test: np.ndarray, + x_test_strata: np.ndarray, + delta_hat: np.ndarray, + lb: float, + ub: float, + ) -> tuple[dict, dict]: + """ + This function generates the distribution (ie. CDF/PPF) for the strata in which we want to exchange + bootstrap errors in this model. + """ + stratum_ppfs_delta = {} + stratum_cdfs_delta = {} + + def ppf_creator(betas: np.ndarray, taus: np.ndarray, lb: float, ub: float) -> float: + """ + Creates a probability point function (inverse of a cumulative distribution function -- CDF) + Given a percentile, provides the value of the CDF at that point + """ + # we interpolate, because we want to return smooth betas + return lambda p: np.interp(p, taus, betas, lb, ub) + + def cdf_creator(betas: np.ndarray, taus: np.ndarray) -> float: + """ + Creates a cumulative distribution function (CDF) + Provides the probability that a value is at most x + """ + # interpolates because we want to provide smooth probabilites + return lambda x: np.interp(x, betas, taus, right=1) + + # we need the unique strata that exist in both the training and in the holdout data + # since we want a distribution for all strata + x_strata = np.unique(np.concatenate([x_train_strata, x_test_strata], axis=0), axis=0).astype(int) + + # We compute the probability distribution for each stratum by fitting quantile regressions + # this works because the regression covariates are only dummy variables that define + # the strata. Therefore the i-th coefficient for a quantile regression at level tau is the + # tau-th delta for where dummy == 1 + # ie. if tau is 0.5 and there are two unique dummies (x = [[0, 1], [1, 0], ...]), then + # the first coefficient is the median of all deltas where x = [1, 0] and the second coefficient + # is the median of all deltas where x = [0, 1] + # since the covariates define the strata, we get the tau-th (e.g. median or 30th percentile) + # delta per strata as the beta for that regression, which defines a probability distribution. + + # for each stratum we want to add a worst case lower bound (for the taus between 0-0.49) + # and upper bound (for the taus between 0.5-1) so that if we sample a uniform from a different + # stratum that is larger/smaller than any one we have seen in this strata, we have a worst case + # value that is larger/smaller than what we saw in that strata. We do this by simply adding the + # lower/upper bound to the regression, one pair for each stratum. 
+ # This lower/upper bound is set manually, note that if we observe a value that is more extreme than + # the lower/upper bound we are fine, since the quantile regression will use that instead + for x_stratum in x_strata: + x_train_aug = np.concatenate([x_train_strata, x_stratum.reshape(1, -1)], axis=0) + + delta_aug_lb = np.concatenate([delta_hat, [lb]]) + delta_aug_ub = np.concatenate([delta_hat, [ub]]) + + # fit the regressions to create the probability distributions + # for a single regression beta[i] is the tau-th (e.g. median or 30th percentile) + # for where dummy variable position i is equal to 1 + # since we are fitting many quantile regressions at the same time, our beta is + # beta[tau, i] where tau stretches from 0.01 to 0.99 + qr_lower = QuantileRegressionSolver() + qr_lower.fit(x_train_aug, delta_aug_lb, self.taus_lower, fit_intercept=False) + betas_lower = qr_lower.coefficients + + qr_upper = QuantileRegressionSolver() + qr_upper.fit(x_train_aug, delta_aug_ub, self.taus_upper, fit_intercept=False) + betas_upper = qr_upper.coefficients + + betas = np.concatenate([betas_lower, betas_upper]) + + # for each strata, we take the betas that belong to that stratum + # ie. for stratum [0, 1, 0] we take the betas (there is one for each tau between 0.01, 0.99]) + # at position 1 (0-indexed) + + # get all the betas for where x_stratum has a 1 (ie [1, 0, 0] position 0, [0, 1, 0] position 1 etc.) + betas_stratum = betas[:, np.where(x_stratum == 1)[0]].sum(axis=1) + + # for this stratum value create ppf + # we want the lower bounds and upper bounds to be the actual lower and upper values taken from beta + stratum_ppfs_delta[tuple(x_stratum)] = ppf_creator( + betas_stratum, self.taus, np.min(betas_stratum), np.max(betas_stratum) + ) + + # for this stratum value create cdf + stratum_cdfs_delta[tuple(x_stratum)] = cdf_creator(betas_stratum, self.taus) + + return stratum_ppfs_delta, stratum_cdfs_delta + + # TODO: figure out how to better estimate margin_upper/lower_bound + def _generate_nonreporting_bounds( + self, nonreporting_units: pd.DataFrame, bootstrap_estimand: str, n_bins: int = 10 + ) -> tuple[np.ndarray, np.ndarray]: + """ + This function creates upper and lower bounds for y and z based on the expected vote + that we have for each unit. This is used to clip our predictions + """ + + # turn expected for nonreporting units into decimal (also clip at 100) + nonreporting_expected_vote_frac = nonreporting_units.percent_expected_vote.values.clip(max=100) / 100 + if bootstrap_estimand == "results_normalized_margin": + unobserved_upper_bound = self.y_unobserved_upper_bound + unobserved_lower_bound = self.y_unobserved_lower_bound + # the upper bound for LHS party if all the outstanding vote go in their favour + upper_bound = ( + nonreporting_expected_vote_frac * nonreporting_units[bootstrap_estimand] + + (1 - nonreporting_expected_vote_frac) * unobserved_upper_bound + ) + # the lower bound for the LHS party if all the outstanding vote go against them + lower_bound = ( + nonreporting_expected_vote_frac * nonreporting_units[bootstrap_estimand] + + (1 - nonreporting_expected_vote_frac) * unobserved_lower_bound + ) + elif bootstrap_estimand == "turnout_factor": + # our error bound for how much error we think our results provider has with expected vote + # e.g. 
0.7 of the vote is in for a county, if percent_expected_vote_error_bound is 0.1 + # then we are saying that we believe between 0.6 and 0.8 of the vote is in for that county + percent_expected_vote_error_bound = self.percent_expected_vote_error_bound + unobserved_upper_bound = self.z_unobserved_upper_bound + unobserved_lower_bound = self.z_unobserved_lower_bound + # inflate or deflate turnout factor appropriately + lower_bound = nonreporting_units[bootstrap_estimand] / ( + nonreporting_expected_vote_frac + percent_expected_vote_error_bound + ) + upper_bound = nonreporting_units[bootstrap_estimand] / ( + nonreporting_expected_vote_frac - percent_expected_vote_error_bound + ).clip(min=0.01) + # if 0 percent of the vote is in, the upper bound would be zero if we used the above + # code. So instead we set it to the naive bound + upper_bound[np.isclose(upper_bound, 0)] = unobserved_upper_bound + + # if percent reporting is 0 or 1, don't try to compute anything and revert to naive bounds + lower_bound[ + np.isclose(nonreporting_expected_vote_frac, 0) | np.isclose(nonreporting_expected_vote_frac, 1) + ] = unobserved_lower_bound + upper_bound[ + np.isclose(nonreporting_expected_vote_frac, 0) | np.isclose(nonreporting_expected_vote_frac, 1) + ] = unobserved_upper_bound + + return lower_bound.values.reshape(-1, 1), upper_bound.values.reshape(-1, 1) + + def _strata_pit( + self, + x_train_strata: pd.DataFrame, + x_train_strata_unique: np.ndarray, + delta_hat: np.ndarray, + stratum_cdfs_delta: dict, + ) -> np.ndarray: + """ + Apply the probability integral transform for each strata + """ + # We convert the deltas in the training data to their percentiles in uniform space + unifs = [] + # for each strata, apply the CDF to the residual to turn the residual into perecntil + for strata_dummies in x_train_strata_unique: + # grab all deltas that belong to strata defined by strata_dummies + delta_strata = delta_hat[np.where((strata_dummies == x_train_strata).all(axis=1))[0]] + + # plug the deltas into their CDF to get uniform random variables (probability integral transform) + # 1e-6 is added to solve with numerical issues + unifs_strata = stratum_cdfs_delta[tuple(strata_dummies)](delta_strata + 1e-6).reshape(-1, 1) + + # if the uniform is close to 1/0 set percentile to 0.01/0.99 also for numerical issues + unifs_strata[np.isclose(unifs_strata, 1)] = np.max(self.taus) + unifs_strata[np.isclose(unifs_strata, 0)] = np.min(self.taus) + + unifs.append(unifs_strata) + return np.concatenate(unifs).reshape(-1, 1) + + def _bootstrap_deltas( + self, + unifs: np.ndarray, + x_train_strata: pd.DataFrame, + x_train_strata_unique: np.ndarray, + stratum_ppfs_delta_y: dict, + stratum_ppfs_delta_z: dict, + ) -> tuple[np.ndarray, np.ndarray]: + """ + Bootstrap deltas (unit level errors of our entire model) this is done by re-sampling from uniform random variables + """ + n_train = unifs.shape[0] + + # re-sample uniform random variables + # we are re-sampling over all uniforms (not just per strata), which gives us more randomness + # but we can guarantee that the deltas per strata will be correct because when we convert + # the uniforms back, we use the distribution of the stratum that the training point came from. 
+ # also note unifs_B is of size: (n_train, self.B, 2) so we are re-sampling the deltas + # for y and z jointly + unifs_B = self.rng.choice(unifs, (n_train, self.B), replace=True) + + delta_y_B = np.zeros((n_train, self.B)) + delta_z_B = np.zeros((n_train, self.B)) + + # convert percentile (uniforms) back into residuals + # we need to do this separately for each stratum because there is a different + # inverse function per stratum + for strata_dummies in x_train_strata_unique: + # grab all training indices where units belong to the stratum defined by strata_dummies + strata_indices = np.where((strata_dummies == x_train_strata).all(axis=1))[0] + # get their corresponding uniform random variables + unifs_strata = unifs_B[strata_indices] + # convert back to deltas. unifs_strata's last dimension defines either y or z + delta_y_B[strata_indices] = stratum_ppfs_delta_y[tuple(strata_dummies)](unifs_strata[:, :, 0]) + delta_z_B[strata_indices] = stratum_ppfs_delta_z[tuple(strata_dummies)](unifs_strata[:, :, 1]) + return delta_y_B, delta_z_B + + # TODO: what if hat_epsilon_y, hat_epsilon_z are correlated? + def _bootstrap_epsilons( + self, + epsilon_y_hat: np.ndarray, + epsilon_z_hat: np.ndarray, + x_train_strata: pd.DataFrame, + x_train_strata_unique: np.ndarray, + stratum_ppfs_delta_y: dict, + stratum_ppfs_delta_z: dict, + aggregate_indicator_train: np.ndarray, + ) -> tuple[np.ndarray, np.ndarray]: + """ + Bootstrap epsilons (contest level random effects) using the parametric bootstrap + + In a random effects model we assume that the contest level random effects are drawn from a normal distribution with mean zero and variance Sigma + epsilon_y, epsilon_z ~ N(0, Sigma) + and for now we assume Sigma is diagonal (e.g. contest level random effects between normalized margin and turnout are uncorrelated) + """ + # We first fit the parameters of the multivariate normal distribution that we are assuming epsilon comes + # from and then we sample from it B times to generate bootstrapped contest level random effects. + + # The mean of our normal distribution is epsilon_hat (our best guess for contest level effects) + # we do this to maintain the sign of the contest level effect + + # We now need to estimate Sigma, the covariance matrix of our contest level effects. + # We assume that the contest level random effects between contests are uncorrelated so the off-diagonals of \Sigma are zero. + # To estimate the variance (diagonals) we use the sum of the variances of the units within a contest. + # This is because residuals = epsilons + delta, if we have observed units in a contest then + # the true epsilon is no longer a random quantity and so residual_{i, j} ~ N(epsilon_{i}, \sigma_{j}) (for i in contest and j in strata) + # which means that var(residuals) = var(delta). So that epsilon_hat = mean(residuals) + # so that var(epsilon_hat) = var(mean(residuals)) = mean(var(residuals)) = mean(var(delta)) + + # The variance of deltas is defined by their probability distribution (it's the same per stratum + # since we assume that deltas are iid within a stratum). So we use the distribution to compute + # the variance of the delta. 
We use interquartile-range (IQR) since this is more robust than + # sample standard deviation + + # we first compute the unit variance estimate for each stratum (since that defines the variance per unit) + iqr_y_strata = {} + iqr_z_strata = {} + for x_stratum in x_train_strata_unique: + x_stratum_delta_y_ppf = stratum_ppfs_delta_y[tuple(x_stratum)] + iqr_y = x_stratum_delta_y_ppf(0.75) - x_stratum_delta_y_ppf(0.25) + iqr_y_strata[tuple(x_stratum)] = iqr_y + + x_stratum_delta_z_ppf = stratum_ppfs_delta_z[tuple(x_stratum)] + iqr_z = x_stratum_delta_z_ppf(0.75) - x_stratum_delta_z_ppf(0.25) + iqr_z_strata[tuple(x_stratum)] = iqr_z + + # set variance per contest to be zero + var_epsilon_y = np.zeros((aggregate_indicator_train.shape[1],)) + var_epsilon_z = np.zeros((aggregate_indicator_train.shape[1],)) + + # we are now adding the unit variances to create the contest level variances + for strata_dummies in x_train_strata_unique: + # grab the indices of the units per state that within strata defined by strata_dummies + strata_indices = np.where((strata_dummies == x_train_strata).all(axis=1))[0] + # add variances + # we square because IQR approximates standard deviation + var_epsilon_y += ( + aggregate_indicator_train[strata_indices] * (iqr_y_strata[tuple(strata_dummies)] ** 2) + ).sum(axis=0) + var_epsilon_z += ( + aggregate_indicator_train[strata_indices] * (iqr_z_strata[tuple(strata_dummies)] ** 2) + ).sum(axis=0) + + # IQR constant for a normal random variable + iqr_scale = 1.349 + var_epsilon_y /= iqr_scale**2 + var_epsilon_z /= iqr_scale**2 + var_epsilon_y /= aggregate_indicator_train.sum(axis=0) + var_epsilon_z /= aggregate_indicator_train.sum(axis=0) + + # if we only have 1 unit in a contest we define the variance to be zero + var_epsilon_y[aggregate_indicator_train.sum(axis=0) < 2] = 0 + var_epsilon_z[aggregate_indicator_train.sum(axis=0) < 2] = 0 + + # sample B new epsilons + epsilon_y_B = self.rng.multivariate_normal( + mean=epsilon_y_hat.flatten(), cov=np.diag(var_epsilon_y), size=self.B + ).T + epsilon_z_B = self.rng.multivariate_normal( + mean=epsilon_z_hat.flatten(), cov=np.diag(var_epsilon_z), size=self.B + ).T + return epsilon_y_B, epsilon_z_B + + def _bootstrap_errors( + self, + epsilon_y_hat: np.ndarray, + epsilon_z_hat: np.ndarray, + delta_y_hat: np.ndarray, + delta_z_hat: np.ndarray, + x_train_strata: pd.DataFrame, + stratum_cdfs_y: dict, + stratum_cdfs_z: dict, + stratum_ppfs_delta_y: dict, + stratum_ppfs_delta_z: dict, + aggregate_indicator_train: np.ndarray, + ) -> tuple[tuple[np.ndarray, np.ndarray], tuple[np.ndarray, np.ndarray]]: + """ + Bootstrap the errors of our model (epsilon and delta) + """ + # get unique strata that appea in the training data + x_train_strata_unique = np.unique(x_train_strata, axis=0).astype(int) + + # bootstrap the epsilons. 
Uses the parametric bootstrap + epsilon_y_B, epsilon_z_B = self._bootstrap_epsilons( + epsilon_y_hat, + epsilon_z_hat, + x_train_strata, + x_train_strata_unique, + stratum_ppfs_delta_y, + stratum_ppfs_delta_z, + aggregate_indicator_train, + ) + + # turn deltas into percentiles (uniforms) so that we can re-sample those + unifs_y = self._strata_pit(x_train_strata, x_train_strata_unique, delta_y_hat, stratum_cdfs_y) + unifs_z = self._strata_pit(x_train_strata, x_train_strata_unique, delta_z_hat, stratum_cdfs_z) + unifs = np.concatenate([unifs_y, unifs_z], axis=1) + + # re-sample uniforms and apply the inverse CDF (PPF) to convert back to re-sampled residuals + delta_y_B, delta_z_B = self._bootstrap_deltas( + unifs, x_train_strata, x_train_strata_unique, stratum_ppfs_delta_y, stratum_ppfs_delta_z + ) + + return (epsilon_y_B, epsilon_z_B), (delta_y_B, delta_z_B) + + def _sample_test_delta( + self, x_test_strata: pd.DataFrame, stratum_ppfs_delta_y: dict, stratum_ppfs_delta_z: dict + ) -> tuple[np.ndarray, np.ndarray]: + """ + This function generates new test deltas (unit level errors) + """ + # we previously estimated the distribution of the deltas per stratum + # we can now generate a new set of uniform random variables (percentiles) + # and then convert them to deltas using the deltas PPFs. We just need + # to make sure that we use the correct distribution defined by the strata + + n_test = x_test_strata.shape[0] + + # sample a new set of uniforms + test_unifs = self.rng.uniform(low=0, high=1, size=(n_test, self.B, 2)) + + test_delta_y = np.zeros((n_test, self.B)) + test_delta_z = np.zeros((n_test, self.B)) + + x_test_strata_unique = np.unique(x_test_strata, axis=0).astype(int) + for strata_dummies in x_test_strata_unique: + # get indices of the nonreporting data that are in strata strata_dummies + strata_indices = np.where((strata_dummies == x_test_strata).all(axis=1))[0] + # get their corresponding newly sampled uniforms + unifs_strata = test_unifs[strata_indices] + # use PPF to generate new deltas from the new uniforms + test_delta_y[strata_indices] = stratum_ppfs_delta_y[tuple(strata_dummies)](unifs_strata[:, :, 0]) + test_delta_z[strata_indices] = stratum_ppfs_delta_z[tuple(strata_dummies)](unifs_strata[:, :, 1]) + + return test_delta_y, test_delta_z + + def _sample_test_epsilon( + self, + residuals_y: np.ndarray, + residuals_z: np.ndarray, + epsilon_y_hat: np.ndarray, + epsilon_z_hat: np.ndarray, + aggregate_indicator_train: np.ndarray, + aggregate_indicator_test: np.ndarray, + ) -> tuple[np.ndarray, np.ndarray]: + """ + This function generates new test epsilons (contest level random effects) + NOTE: for now we are sampling from a normal with mean zero and sample variance. + """ + # the contests that need a sampled contest level random effect are those that still have outstanding + # units (since for the others, the contest level random effect is a known fixed quantity) but no + # current estimate for the epsilon in that contest (so ones with NO training examples) + + # this gives us indices of contests for which we have no samples in the training set + contests_not_in_reporting_units = np.where(np.all(aggregate_indicator_train == 0, axis=0))[0] + # gives us contests for which there is at least one county not reporting + contests_in_nonreporting_units = np.where(np.any(aggregate_indicator_test > 0, axis=0))[0] + # contests that need a random effect are: + # contests that we want to make predictions on (ie. 
there are still outstanding units) + # BUT no units in that contest are reporting + contests_that_need_random_effect = np.intersect1d( + contests_not_in_reporting_units, contests_in_nonreporting_units + ) + + # \epsilon ~ N(0, \Sigma) + mu_hat = [0, 0] + + # the variance is the sample variance of epsilons for which we have an estimate + non_zero_epsilon_indices = np.nonzero(epsilon_y_hat)[0] + # if there is only one non zero contest OR if the contest is a state level election only + # then just sample from nearly zero since no variance + if non_zero_epsilon_indices.shape[0] == 1: + return np.zeros((1, self.B)), np.zeros((1, self.B)) + else: + sigma_hat = np.cov(np.concatenate([epsilon_y_hat, epsilon_z_hat], axis=1)[np.nonzero(epsilon_y_hat)[0]].T) + + # sample new test epsilons, but only for states that need them + test_epsilon = self.rng.multivariate_normal( + mu_hat, sigma_hat, size=(len(contests_that_need_random_effect), self.B) + ) + + test_epsilon_y = aggregate_indicator_test[:, contests_that_need_random_effect] @ test_epsilon[:, :, 0] + test_epsilon_z = aggregate_indicator_test[:, contests_that_need_random_effect] @ test_epsilon[:, :, 1] + return test_epsilon_y, test_epsilon_z + + def _sample_test_errors( + self, + residuals_y: np.ndarray, + residuals_z: np.ndarray, + epsilon_y_hat: np.ndarray, + epsilon_z_hat: np.ndarray, + x_test_strata: pd.DataFrame, + stratum_ppfs_delta_y: dict, + stratum_ppfs_delta_z: dict, + aggregate_indicator_train: np.ndarray, + aggregate_indicator_test: np.ndarray, + ) -> tuple[np.ndarray, np.ndarray]: + """ + This function samples new test errors for our model (ie. new test residuals) + """ + # we generate new test residuals by generating new test epsilons and new test deltas and then adding them together + test_epsilon_y, test_epsilon_z = self._sample_test_epsilon( + residuals_y, residuals_z, epsilon_y_hat, epsilon_z_hat, aggregate_indicator_train, aggregate_indicator_test + ) + test_delta_y, test_delta_z = self._sample_test_delta(x_test_strata, stratum_ppfs_delta_y, stratum_ppfs_delta_z) + test_error_y = test_epsilon_y + test_delta_y + test_error_z = test_epsilon_z + test_delta_z + return test_error_y, test_error_z + + # TODO: potentially generalize binning features for strata + def _get_strata( + self, reporting_units: pd.DataFrame, nonreporting_units: pd.DataFrame + ) -> tuple[pd.DataFrame, pd.DataFrame]: + """ + Gets strata for stratified bootstrap sampling + """ + n_train = reporting_units.shape[0] + # we can use the featurizer, since strata are defined by dummy variables in the same way that + # fixed effects are (ie. rural could be (1, 0, 0) while urban could be (0, 1, 0) while suburban could be (0, 0, 1)) + # but like with fixed effects we drop one strata category and use the intercept instead so the example would be + # rural: 0, 0 urban: 1, 0 and suburban: 0, 1 + strata_featurizer = Featurizer([], self.strata) + all_units = pd.concat([reporting_units, nonreporting_units], axis=0) + + strata_all = strata_featurizer.prepare_data( + all_units, center_features=False, scale_features=False, add_intercept=self.add_intercept + ) + x_train_strata = strata_all[:n_train] + x_test_strata = strata_all[n_train:] + return x_train_strata, x_test_strata + + def compute_bootstrap_errors( + self, reporting_units: pd.DataFrame, nonreporting_units: pd.DataFrame, unexpected_units: pd.DataFrame + ): + """ + Computes unit level point predictions and runs the bootstrap to generate quantities needed for prediction intervals. 
+ + The bootstrap generally re-samples the observed data with replacement in order to generate synthetic "bootstrap" datasets, + which can be used to estimate the sampling distribution of a quantity that we are interested in. + Our implementation is the stratified residual bootstrap. The residual bootstrap samples residuals of a model (instead of the original dataset), + which is preferable in a regression setting because it removes the component of the observation that is not random. + We use the stratified bootstrap because our units are not independent and identically distributed, which means that we cannot assign + the error of any unit to any other unit (e.g. the residual for an urban unit would likely not fit for a rural unit). For now, this model + stratifies on county classification (rural/urban/suburban). + + Generally we are interested in predicting functions of: + w * y * z = weights * normalized_margin * turnout_factor = unnormalized_margin + + There are three cases: + 1) In the unit case we are interested in the unnormalized margin: + w_i * y_i * z_i + 2) In the aggregate (e.g. state aggregation) case we are interested in the normalized sum of the unnormalized margin of units + (sum_{i = 1}^N w_i * y_i * z_i) / (sum_{i = 1}^N w_i * z_i) + 3) In the national case we are interested in an interval over the sum of electoral votes generated by the predictions + sum_{s = 1}^{51} sigmoid{sum_{i = 1}^{N_s} w_i * y_i * z_i} * ev_s + + Our point prediction for each is: + 1) w_i * hat{y_i} * hat{z_i} + 2) (sum_{i = 1}^N w_i * hat{y_i} * hat{z_i}) / (sum_{i = 1}^N w_i hat{z_i}) + 3) sum_{s = 1}^{51} sigmoid{sum_{i=1}^{N_s} w_i * hat{y_i} * hat{z_i}} * ev_s + remember that our model for y_i, z_i is OLS plus a contest random effect: + y_i = f_y(x) + epsilon_y(x) + z_i = f_z(x) + epsilon_z(x) + this means that: + hat{y_i} = hat{f_y(x)} + hat{epsilon_y(x)} + hat{z_i} = hat{f_z(x)} + hat{epsilon_z(x)} + + As point predictions, this function only computes: + 1) w_i * hat{y_i} * hat{z_i} + 2) w_i * hat{z_i} + which are then used in the respective prediction functions + + We are also interested in generating prediction intervals for the quantities, we do that by bootstrapping the error in our predictions + and then taking the appropriate percentiles of those errors. The errors we are interested in are between the true quantity and + our prediction: + + There are three cases that mirror the cases above (error between prediction and true quantity) + 1) w_i * hat{y_i} * hat{z_i} - w_i * y_i * z_i + 2) (sum_{i = 1}^N w_i * hat{y_i} * hat{z_i}) / (sum_{i = 1}^n w_i hat{z_i}) - (sum_{i = 1}^N w_i * y_i * z_i) / (sum_{i = 1}^n w_i z_i) + 3) sum_{s = 1}^{51} sigmoid{sum_{i=1}^{N_s} w_i * hat{y_i} * hat{z_i}} * ev_s - sum_{s = 1}^{51} sigmoid{sum_{i=1}^{N_s} w_i * y_i * z_i} * ev_s + + In order to keep this model as flexible as possible for all potential cases, this function generates bootstrap estimates for + 1) w_i * hat{y_i} * hat{z_i} + 2) w_i * y_i * z_i + 3) w_i * hat{z_i} + 4) w_i * z_i + and stores those so that we can later compute prediction intervals for any function of these quantities in their respective functions. + + In a normal setting the bootstrap works by assuming that our fitted predictions (e.g. w_i * hat{y_i} * hat{z_i}) are now the true + values. Then using the bootstrap to generate synthetic samples (e.g. w_i hat{y_i}^b * hat{z_i}^b) and computing the error + between the two. This would give us a confidence interval (ie. 
the error between a quantity and its mean), but we are interested + in a prediction interval, which means we also need to take into account the additional uncertainty in sampling new y_i and z_i + + This means that our new "true" quantity (the equivalent of w_i * y_i * z_i) needs its own freshly sampled uncertainty, so we sample + new test errors + residuals_{y, i}^{b}, residuals_{z, i}^{b} + in order to compute: + hat{y_i} + residuals_{y, i}^{b} + hat{z_i} + residuals_{z, i}^{b} + so that: + w_i * y_i * z_i --> w_i * (hat{y_i} + residuals_{y, i}^{b}) * (hat{z_i} + residuals_{z, i}^{b}) + + We also need new "estimated" quantities (the equivalent of w_i * hat{y_i} * hat{z_i}); these are the outcome + of the stratified residual bootstrap: + w_i * hat{y_i} * hat{z_i} --> w_i * tilde{y_i}^{b} * tilde{z_i}^{b} + + For completeness, we also generate estimates for the other two quantities: + w_i * z_i -> w_i * (hat{z_i} + epsilon_{z, i}^{b}) + w_i * hat{z_i} -> w_i * tilde{z_i}^{b} + """ + # prepare data (generate fixed effects, add intercept etc.) + all_units = pd.concat([reporting_units, nonreporting_units, unexpected_units], axis=0) + x_all = self.featurizer.prepare_data( + all_units, center_features=False, scale_features=False, add_intercept=self.add_intercept + ) + n_train = reporting_units.shape[0] + n_test = nonreporting_units.shape[0] + + x_train_df = self.featurizer.filter_to_active_features(x_all[:n_train]) + x_train = x_train_df.values + y_train = reporting_units["results_normalized_margin"].values.reshape(-1, 1) + z_train = reporting_units["turnout_factor"].values.reshape(-1, 1) + weights_train = reporting_units["baseline_weights"].values.reshape(-1, 1) + + x_test_df = self.featurizer.generate_holdout_data(x_all[n_train : (n_train + n_test)]) # noqa: 203 + x_test = x_test_df.values + weights_test = nonreporting_units["baseline_weights"].values.reshape(-1, 1) + + # Create a matrix of size (n_total_units, n_contests) which acts as a crosswalk + # between unit and contest (ie. a 1 in position (i, j) says that unit i belongs to contest j) + # in the case of a district election we need to create a variable that defines the (state, district) pair, + # which is what the contest is + if self.district_election: + all_units["postal_code-district"] = all_units[["postal_code", "district"]].agg("_".join, axis=1) + aggregate_indicator = pd.get_dummies(all_units["postal_code-district"]).values + else: + aggregate_indicator = pd.get_dummies(all_units["postal_code"]).values + + aggregate_indicator_expected = aggregate_indicator[: (n_train + n_test)] + aggregate_indicator_train = aggregate_indicator_expected[:n_train] + aggregate_indicator_test = aggregate_indicator_expected[n_train:] + + # we compute bounds for normalized margin and turnout factor based on our results provider's current estimate for expected vote + # ie. if 95% of the votes of a unit are in, what is the max/min the normalized_margin and turnout factor could still reach?
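To make the bound logic concrete, here is a minimal sketch of the normalized-margin case under the default assumption that all outstanding vote could break toward either party (illustrative only; the `_generate_nonreporting_bounds` call just below also handles the turnout factor and the configurable bound parameters):

```python
def margin_bounds(observed_normalized_margin: float, percent_expected_vote: float) -> tuple[float, float]:
    """Possible range for the final normalized margin of a partially reporting unit (toy version)."""
    p = percent_expected_vote / 100.0
    # the reported vote contributes a share p of the final total; the outstanding share (1 - p)
    # can move the normalized margin by at most +/- (1 - p) if it all breaks one way
    return observed_normalized_margin * p - (1 - p), observed_normalized_margin * p + (1 - p)


# e.g. a unit at 75% reporting with a +0.1 normalized margin so far could still finish anywhere in
# margin_bounds(0.1, 75) == (-0.175, 0.325)
```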
+ y_partial_reporting_lower, y_partial_reporting_upper = self._generate_nonreporting_bounds( + nonreporting_units, "results_normalized_margin" + ) + z_partial_reporting_lower, z_partial_reporting_upper = self._generate_nonreporting_bounds( + nonreporting_units, "turnout_factor" + ) + + # we use k-fold cross validation to find the optimal lambda for our OLS regression + optimal_lambda_y = self.cv_lambda(x_train, y_train, np.logspace(-3, 2, 20), weights=weights_train) + optimal_lambda_z = self.cv_lambda(x_train, z_train, np.logspace(-3, 2, 20), weights=weights_train) + + # step 1) fit the initial model + # we don't want to regularize the intercept or the coefficient for baseline_normalized_margin + ols_y = OLSRegressionSolver() + ols_y.fit( + x_train, + y_train, + weights=weights_train, + lambda_=optimal_lambda_y, + fit_intercept=True, + regularize_intercept=False, + n_feat_ignore_reg=1, + ) + ols_z = OLSRegressionSolver() + ols_z.fit( + x_train, + z_train, + weights=weights_train, + lambda_=optimal_lambda_z, + fit_intercept=True, + regularize_intercept=False, + n_feat_ignore_reg=1, + ) + + # step 2) calculate the fitted values + y_train_pred = ols_y.predict(x_train) + z_train_pred = ols_z.predict(x_train) + + # step 3) calculate residuals + # residuals are the residuals from OLS (ie. the amount of error that our regression does not explain) + # they have been inflated by (1 - hat_matrix) to be leave-one-out residuals so they are an estimate of + # the test residuals + # epsilon is the contest level effect (it is estimated as the average error in OLS over all the units in a contest) + # delta is the unit level error that is unaccounted for by the contest level effect + residuals_y, epsilon_y_hat, delta_y_hat = self._estimate_model_errors( + ols_y, x_train, y_train, aggregate_indicator_train + ) + residuals_z, epsilon_z_hat, delta_z_hat = self._estimate_model_errors( + ols_z, x_train, z_train, aggregate_indicator_train + ) + + # As part of the residual bootstrap we now need to generate B synthetic versions of residuals_y and residuals_z + # residuals are broken into an epsilon (contest level error) and delta (unit level error component). This means + # in order to generate bootstrapped residuals we need to bootstrap samples for epsilon and for delta. + # We bootstrap epsilons using the parametric bootstrap. Our procedure for bootstrapping the deltas is more complicated. + + # The stratified bootstrap assumes that the final model residuals (delta) are independent and identically distributed per + # strata. So we now need to generate new bootstrapped deltas for each unit in each strata. + + # Instead of sampling the deltas directly with replacement we generate a probability distribution for each strata. + # We convert the deltas into uniform random variables using the distribution's CDF (probability integral transform), + # we then re-sample the uniforms with replacement and then convert the uniform random variables back into deltas + # using the PPF of the distribution (inverse CDF). + # We do this because we have two deltas to sample (delta_z and delta_y); by sampling the observed CDFs we can maintain the correlation + # between the y and z deltas + # E.g.
take a rural delta (delta_y, delta_z), evaluate CDF -> get percentile for each residual (0.75, 0.85) + # If you do this for every pair of points, you will notice that these two percentiles are correlated (a big y error and a big z error co-occur) + # This approach using uniform random variables allows us to smooth over the distribution in cases where we have only seen very few observations + # per strata. We can also impose a worst/best case scenario by adding an additional datapoint for each stratum when generating the distribution. + + # we only want to re-sample deltas in each strata (ie. rural counties should only receive errors from rural counties, same for suburban and urban) + # this means that we need to generate error distributions conditional on each strata value (ie. conditional on urban, rural and suburban) + # to do this, we first need to get the strata + # x_train_strata is an array where each row defines the strata for that training unit (e.g. 00, 01, 10) + # x_test_strata is equivalent + x_train_strata, x_test_strata = self._get_strata(reporting_units, nonreporting_units) + + # we then compute the probability distribution (CDF/PPF) for the deltas given each strata; this will allow us to move from the residual space + # to the percentile space [0, 1] and back again after re-sampling + stratum_ppfs_delta_y, stratum_cdfs_delta_y = self._estimate_strata_dist( + x_train, x_train_strata, x_test, x_test_strata, delta_y_hat, self.y_LB, self.y_UB + ) + stratum_ppfs_delta_z, stratum_cdfs_delta_z = self._estimate_strata_dist( + x_train, x_train_strata, x_test, x_test_strata, delta_z_hat, self.z_LB, self.z_UB + ) + + # step 4) bootstrap resampling + # step 4a) we resample B new epsilons and deltas + epsilon_B, delta_B = self._bootstrap_errors( + epsilon_y_hat, + epsilon_z_hat, + delta_y_hat, + delta_z_hat, + x_train_strata, + stratum_cdfs_delta_y, + stratum_cdfs_delta_z, + stratum_ppfs_delta_y, + stratum_ppfs_delta_z, + aggregate_indicator_train, + ) + epsilon_y_B, epsilon_z_B = epsilon_B + delta_y_B, delta_z_B = delta_B + + # step 4b) create our bootstrapped dataset by adding the bootstrapped errors (epsilon + delta) to our fitted values + y_train_B = y_train_pred + (aggregate_indicator_train @ epsilon_y_B) + delta_y_B + z_train_B = z_train_pred + (aggregate_indicator_train @ epsilon_z_B) + delta_z_B + + # step 5) refit the model + # we need to generate bootstrapped test predictions and to do that we need to fit a bootstrap model + # we are using the normal equations from the original model since x_train has stayed the same and the normal + # equations are only dependent on x_train. This saves compute. + ols_y_B = OLSRegressionSolver() + ols_y_B.fit( + x_train, + y_train_B, + weights_train, + normal_eqs=ols_y.normal_eqs, + fit_intercept=True, + regularize_intercept=False, + n_feat_ignore_reg=1, + ) + ols_z_B = OLSRegressionSolver() + ols_z_B.fit( + x_train, + z_train_B, + weights_train, + normal_eqs=ols_z.normal_eqs, + fit_intercept=True, + regularize_intercept=False, + n_feat_ignore_reg=1, + ) + + LOG.info("orig. ols coefficients, normalized margin: \n %s", ols_y.coefficients.flatten()) + LOG.info("boot.
ols coefficients, normalized margin: \n %s", ols_y_B.coefficients.mean(axis=-1)) + + # we cannot just apply ols_y_B/old_z_B to the test units because that would be missing our contest level random effect + # so we need to compute an bootstrapped estimate of the contest level random effect (epsilon) + # to do that we first compute the bootstrapped leave-one-out training residuals and then use that to + # estimate bootstrapped epsilons + y_train_pred_B = ols_y_B.predict(x_train) + z_train_pred_B = ols_z_B.predict(x_train) + residuals_y_B = ols_y_B.residuals(y_train_B, y_train_pred_B, loo=True, center=True) + residuals_z_B = ols_z_B.residuals(z_train_B, z_train_pred_B, loo=True, center=True) + epsilon_y_hat_B = self._estimate_epsilon(residuals_y_B, aggregate_indicator_train) + epsilon_z_hat_B = self._estimate_epsilon(residuals_z_B, aggregate_indicator_train) + + # We can then use our bootstrapped ols_y_B/ols_z_B and our bootstrapped contest level effect (epsilon) to make bootstrapped predictions + # on our non-reporting units + # This is \tilde{y_i}^{b} and \tilde{z_i}^{b} + y_test_pred_B = (ols_y_B.predict(x_test) + (aggregate_indicator_test @ epsilon_y_hat_B)).clip( + min=y_partial_reporting_lower, max=y_partial_reporting_upper + ) + z_test_pred_B = (ols_z_B.predict(x_test) + (aggregate_indicator_test @ epsilon_z_hat_B)).clip( + min=z_partial_reporting_lower, max=z_partial_reporting_upper + ) + + # \tilde{y_i}^{b} * \tilde{z_i}^{b} + yz_test_pred_B = y_test_pred_B * z_test_pred_B + + # In order to generate our point prediction, we also need to apply our non-bootstrapped model to the testset + # this is \hat{y_i} and \hat{z_i} + y_test_pred = (ols_y.predict(x_test) + (aggregate_indicator_test @ epsilon_y_hat)).clip( + min=y_partial_reporting_lower, max=y_partial_reporting_upper + ) + z_test_pred = (ols_z.predict(x_test) + (aggregate_indicator_test @ epsilon_z_hat)).clip( + min=z_partial_reporting_lower, max=z_partial_reporting_upper + ) + yz_test_pred = y_test_pred * z_test_pred + + # we now need to generate our bootstrapped "true" quantities (in order to subtract the bootstrapped estimates from these quantities to get an estimate for our error) + # In a normal bootstrap setting we would replace y and z with \hat{y} and \hat{z} + # however, we want to produce prediction intervals (rather than just confidence intervals) so we need to take into account the extra + # uncertainty that comes with prediction (ie. estimating the mean has some variance, but sampling from our mean estimate introduces an additional error) + # To do that we need an estimate for our prediction error (ie. an estimate of the residuals = epsilon + delta) + # we cannot use our original residuals_y, residuals_z because they would cancel out our error used in the bootstrap + test_residuals_y, test_residuals_z = self._sample_test_errors( + residuals_y, + residuals_z, + epsilon_y_hat, + epsilon_z_hat, + x_test_strata, + stratum_ppfs_delta_y, + stratum_ppfs_delta_z, + aggregate_indicator_train, + aggregate_indicator_test, + ) + + # multiply by weights to turn into unnormalized margin + # w_i * \hat{y_i} * \hat{z_i} + self.errors_B_1 = yz_test_pred_B * weights_test + + # This is (\hat{y_i} + \residuals_{y, i}^{b}) * (\hat{z_i} + \residuals_{z, i}^{b}) + # we clip them based on bounds we generated earlier. 
These are defined by the estimates amount of + # outstanding vote from our election results provider + errors_B_2 = (y_test_pred + test_residuals_y).clip(min=y_partial_reporting_lower, max=y_partial_reporting_upper) + errors_B_2 *= (z_test_pred + test_residuals_z).clip( + min=z_partial_reporting_lower, max=z_partial_reporting_upper + ) + + # multiply by weights turn into unnormalized margin + # this is w_i * (\hat{y_i} + \residuals_{y, i}^{b}) * (\hat{z_i} + \residuals_{z, i}^{b}) + self.errors_B_2 = errors_B_2 * weights_test + + # we also need errors for the denominator of the aggregate + # this is \tilde{z_i}^{b} + self.errors_B_3 = z_test_pred_B * weights_test # has already been clipped above + # this is (\hat{z_i} + \residuals_{z, i}^{b}) + self.errors_B_4 = (z_test_pred + test_residuals_z).clip( + min=z_partial_reporting_lower, max=z_partial_reporting_upper + ) * weights_test + + # this is for the unit point prediction. turn into unnormalized margin + self.weighted_yz_test_pred = yz_test_pred * weights_test + # and turn into turnout estimate + self.weighted_z_test_pred = z_test_pred * weights_test + self.ran_bootstrap = True + + def get_unit_predictions( + self, reporting_units: pd.DataFrame, nonreporting_units: pd.DataFrame, estimand: str, **kwargs + ) -> np.ndarray: + """ + Returns the unit predictions, if necessary also generates them + The unit predictions are the *unnormalized margin* + w_i * hat{y_i} * hat{z_i} + """ + # if bootstrap hasn't been run yet, run it + if not self.ran_bootstrap: + unexpected_units = kwargs["unexpected_units"] + self.compute_bootstrap_errors(reporting_units, nonreporting_units, unexpected_units) + return self.weighted_yz_test_pred + + def _is_top_level_aggregate(self, aggregate: list) -> bool: + """ + Function to figure out whether we are at the top level aggregation (ie. postal code for state level model or postal code, district for district model) + """ + # case 1: + # top level aggregate is postal code (ie. we are generating up to a state level -> ECV or Senate). We know this is the case + # because aggregate length is just 1 and postal code is the only aggregate + # case 2: + # top level aggregate is postal code and district (ie. we are generating up to a district level -> House or State Senate). + # We know this is the case because aggregate length is 2 and postal code and district are the two aggregates. 
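To make the two cases concrete (hypothetical aggregate lists; `model` stands for a fitted BootstrapElectionModel, and the method simply evaluates the boolean expression in the return statement just below):

```python
model._is_top_level_aggregate(["postal_code"])                 # True: case 1, state-level top aggregate
model._is_top_level_aggregate(["postal_code", "district"])     # True: case 2, district-level top aggregate
model._is_top_level_aggregate(["postal_code", "county_fips"])  # False: not a top-level aggregation
```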
+ return (len(aggregate) == 1 and "postal_code" in aggregate) or ( + len(aggregate) == 2 and "postal_code" in aggregate and "district" in aggregate + ) + + def get_aggregate_predictions( + self, + reporting_units: pd.DataFrame, + nonreporting_units: pd.DataFrame, + unexpected_units: pd.DataFrame, + aggregate: list, + estimand: str, + ) -> pd.DataFrame: + """ + Generates and returns the normalized margin for arbitrary aggregates + sum_{i = 1}^N (w_i * hat{y_i} * hat{z_i}) / sum_{i = 1}^N (w_i * hat{z_i}) + """ + n_train = reporting_units.shape[0] + n_test = nonreporting_units.shape[0] + + all_units = pd.concat([reporting_units, nonreporting_units, unexpected_units], axis=0) + + # if we want to aggregate to something that isn't postal_code we need to generate a temporary + # column so that we create a dummary variable for each level of the aggregate + # aggreagate_1 * aggregate_2 rather than aggregate_1 + aggregate_2 which is what would happen otherwise + if len(aggregate) > 1: + aggregate_temp_column_name = "-".join(aggregate) + all_units[aggregate_temp_column_name] = all_units[aggregate].agg("_".join, axis=1) + aggregate_indicator = pd.get_dummies(all_units[aggregate_temp_column_name]).values + else: + aggregate_indicator = pd.get_dummies(all_units[aggregate]).values + aggregate_temp_column_name = aggregate + + # the unit level predictions that come in through reporting_units and nonreporting_units + # are unnormalized. Since we want the normalized margin for the aggregate predictions + # we need to divide the sum of unnormalized aggregates by the total turnout predictions + # so we first compute the total turnout predictions + + aggregate_indicator_expected = aggregate_indicator[: (n_train + n_test)] # noqa: 203 + aggregate_indicator_unexpected = aggregate_indicator[(n_train + n_test) :] # noqa: 203 + + # two party turnout + # results weights is unexpected_units["results_dem"] + unexpected_units["results_gop"] + turnout_unexpected = (unexpected_units["results_weights"]).values.reshape(-1, 1) + + aggregate_indicator_train = aggregate_indicator_expected[:n_train] + aggregate_indicator_test = aggregate_indicator_expected[n_train:] + weights_train = reporting_units["baseline_weights"].values.reshape(-1, 1) + z_train = reporting_units["turnout_factor"].values.reshape(-1, 1) + + # get turnout for aggregate (w_i * z_i) + aggregate_z_train = aggregate_indicator_train.T @ (weights_train * z_train) + aggregate_z_unexpected = aggregate_indicator_unexpected.T @ turnout_unexpected + + # total turnout predictions + aggregate_z_total = ( + aggregate_z_unexpected + aggregate_z_train + aggregate_indicator_test.T @ self.weighted_z_test_pred + ) + + # use get_aggregate_predictions from BaseElectionModel to sum unnormalized margin of all the units + raw_margin_df = super().get_aggregate_predictions( + reporting_units, nonreporting_units, unexpected_units, aggregate, estimand + ) + + # divide the unnormalized margin and results by the total turnout predictions to get the normalized margin for the aggregate + # turnot prediction could be zero, in which case predicted margin is also zero, so replace NaNs with zero in that case + raw_margin_df["pred_margin"] = np.nan_to_num(raw_margin_df.pred_margin / aggregate_z_total.flatten()) + raw_margin_df["results_margin"] = np.nan_to_num(raw_margin_df.results_margin / aggregate_z_total.flatten()) + # if we are in the top level prediction, then save the aggregated baseline margin, which we will need for the national + # summary (e.g. 
ecv) model + if self._is_top_level_aggregate(aggregate): + aggregate_sum = all_units.groupby(aggregate_temp_column_name).sum() + self.aggregate_baseline_margin = ( + (aggregate_sum.baseline_dem - aggregate_sum.baseline_gop) / (aggregate_sum.baseline_turnout + 1) + ).values + return raw_margin_df + + def _get_quantiles(self, alpha): + """ + Given a confidence level for the prediction interval, calculates the quantiles + necessary to be computed + """ + lower_alpha = (1 - alpha) / 2 + upper_alpha = 1 - lower_alpha + + # adjust percentiles to account for bootstrap + lower_q = np.floor(lower_alpha * (self.B + 1)) / self.B + upper_q = np.ceil(upper_alpha * (self.B - 1)) / self.B + + return lower_q, upper_q + + def get_unit_prediction_intervals( + self, reporting_units: pd.DataFrame, nonreporting_units: pd.DataFrame, alpha: float, estimand: str + ) -> PredictionIntervals: + """ + Generate and return unit level prediction intervals + + In the unit case, the error in our prediciton is: + w_i * hat{y_i} * hat{z_i} - w_i * y_i * z_i + In the bootstrap setting this has been estimated as: + w_i * tilde{y_i}^{b} * tilde{z_i}^{b} - w_i * (hat{y_i} + residual_{y, i}^{b}) * (hat{z_i} + residual_{z, i}^{b}) + + The alpha% prediction interval is the (1 - alpha) / 2 and (1 + alpha) / 2 percentiles over the bootstrap samples of this quantity + """ + # error_B_1: w_i * \tilde{y_i}^{b} * \tilde{z_i}^{b} + # error_B_2: w_i * (\hat{y_i} + \residual_{y, i}^{b}) * (\hat{z_i} + \residual_{z, i}^{b}) + errors_B = self.errors_B_1 - self.errors_B_2 + + lower_q, upper_q = self._get_quantiles(alpha) + + # sum in the prediction to our lower and upper esimate of the error in our prediction + interval_upper, interval_lower = ( + self.weighted_yz_test_pred - np.quantile(errors_B, q=[lower_q, upper_q], axis=-1).T + ).T + + interval_upper = interval_upper.reshape(-1, 1) + interval_lower = interval_lower.reshape(-1, 1) + + return PredictionIntervals(interval_lower.round(decimals=0), interval_upper.round(decimals=0)) + + def get_aggregate_prediction_intervals( + self, + reporting_units: pd.DataFrame, + nonreporting_units: pd.DataFrame, + unexpected_units: pd.DataFrame, + aggregate: list, + alpha: float, + unit_prediction_intervals: PredictionIntervals, + estimand: str, + ) -> PredictionIntervals: + """ + Generate and return aggregate prediction intervals for arbitrary aggregates + + In the aggregate case, the error in our prediction is: + (sum_{i = 1}^N w_i * hat{y_i} * hat{z_i}) / (sum_{i = 1}^n w_i hat{z_i}) - (sum_{i = 1}^N w_i * y_i * z_i) / (sum_{i = 1}^n w_i z_i) + In the bootstrap setting this has been estimated as: + (sum_{i = 1}^N w_i * tilde_{y_i}^b * tilde_{z_i}^b) / (sum_{i = 1}^N w_i * tilde_{z_i}^b) - (sum_{i = 1}^N w_i * hat_{y_i} + residual_{y, i}^b) * (hat{z_i} + residual_{z, i}^b)) / (sum_{i = 1}^N w_i * (hat{z_i} + residual_{z, i}^b)) + + The alpha% prediction interval is the (1 - alpha) / 2 and (1 + alpha) / 2 percentiles over the bootstrap samples of this quantity + """ + n_train = reporting_units.shape[0] + n_test = nonreporting_units.shape[0] + + all_units = pd.concat([reporting_units, nonreporting_units, unexpected_units], axis=0) + + if len(aggregate) > 1: + aggregate_temp_column_name = "-".join(aggregate) + all_units[aggregate_temp_column_name] = all_units[aggregate].agg("_".join, axis=1) + aggregate_indicator = pd.get_dummies(all_units[aggregate_temp_column_name]).values + else: + aggregate_indicator = pd.get_dummies(all_units[aggregate]).values + aggregate_indicator_expected = aggregate_indicator[: 
(n_train + n_test)] + + # first compute turnout and unnormalized margin for unexpected units. + # this is a known quantity + aggregate_indicator_unexpected = aggregate_indicator[(n_train + n_test) :] # noqa: 1185 + margin_unexpected = unexpected_units["results_margin"].values.reshape(-1, 1) + # results weights is unexpected_units["results_dem"] + unexpected_units["results_gop"] + turnout_unexpected = (unexpected_units["results_weights"]).values.reshape(-1, 1) + aggregate_z_unexpected = aggregate_indicator_unexpected.T @ turnout_unexpected + aggregate_yz_unexpected = aggregate_indicator_unexpected.T @ margin_unexpected + + aggregate_indicator_train = aggregate_indicator_expected[:n_train] + aggregate_indicator_test = aggregate_indicator_expected[n_train:] + weights_train = reporting_units["baseline_weights"].values.reshape(-1, 1) + + # compute turnout and unnormalized margin for reporting units. + # this is also a known quantity with no uncertainty + y_train = reporting_units["results_normalized_margin"].values.reshape(-1, 1) + z_train = reporting_units["turnout_factor"].values.reshape(-1, 1) + yz_train = y_train * z_train + aggregate_z_train = aggregate_indicator_train.T @ (weights_train * z_train) + aggregate_yz_train = aggregate_indicator_train.T @ (weights_train * yz_train) + + # (sum_{i = 1}^N w_i * \tilde_{y_i}^b * \tilde_{z_i}^b) + aggregate_yz_test_B = aggregate_indicator_test.T @ self.errors_B_1 + + # (\sum_{i = 1}^N w_i * (\hat_{y_i} + \residual_{y, i}^b) * (\hat{z_i} + \residual_{z, i}^b)) + aggregate_yz_test_pred = aggregate_indicator_test.T @ self.errors_B_2 + + # (\sum_{i = 1}^N w_i * \tilde_{z_i}^b) + aggregate_z_test_B = aggregate_indicator_test.T @ self.errors_B_3 + + # (\sum_{i = 1}^N w_i * (\hat{z_i} + \residual_{z, i}^b)) + aggregate_z_test_pred = aggregate_indicator_test.T @ self.errors_B_4 + + # sum the aggregate error components with the known quantities from reporting and unexpected units + aggregate_yz_total_B = aggregate_yz_train + aggregate_yz_test_B + aggregate_yz_unexpected + aggregate_yz_total_pred = aggregate_yz_train + aggregate_yz_test_pred + aggregate_yz_unexpected + aggregate_z_total_B = aggregate_z_train + aggregate_z_test_B + aggregate_z_unexpected + aggregate_z_total_pred = aggregate_z_train + aggregate_z_test_pred + aggregate_z_unexpected + + aggregate_error_B_1 = aggregate_yz_total_B + aggregate_error_B_2 = aggregate_yz_total_pred + aggregate_error_B_3 = aggregate_z_total_B + aggregate_error_B_4 = aggregate_z_total_pred + + # (sum_{i = 1}^N w_i * \tilde_{y_i}^b * \tilde_{z_i}^b) / (\sum_{i = 1}^N w_i * \tilde_{z_i}^b) + divided_error_B_1 = np.nan_to_num(aggregate_error_B_1 / aggregate_error_B_3) + + # (\sum_{i = 1}^N w_i * (\hat_{y_i} + \residual_{y, i}^b) * (\hat{z_i} + \residual_{z, i}^b)) / (\sum_{i = 1}^N w_i * (\hat{z_i} + \residual_{z, i}^b)) + divided_error_B_2 = np.nan_to_num(aggregate_error_B_2 / aggregate_error_B_4) + + # subtract to get bootstrap error for estimate in our predictions + aggregate_error_B = divided_error_B_1 - divided_error_B_2 + + lower_q, upper_q = self._get_quantiles(alpha) + + # we also need to re-compute our aggregate prediction to add to our error to get the prediction interval + # first the turnout component + aggregate_z_total = ( + aggregate_z_unexpected + aggregate_z_train + aggregate_indicator_test.T @ self.weighted_z_test_pred + ) + # then the unnormalied margin component + aggregate_yz_total = ( + aggregate_yz_unexpected + aggregate_yz_train + aggregate_indicator_test.T @ self.weighted_yz_test_pred + ) + # 
calculate normalized margin in the aggregate prediction + # turnout prediction could be zero, so convert NaN -> 0 + aggregate_perc_margin_total = np.nan_to_num(aggregate_yz_total / aggregate_z_total) + + # saves the aggregate errors in case we want to generate some form of national predictions (like ecv) + if self._is_top_level_aggregate(aggregate): + self.aggregate_error_B_1 = aggregate_error_B_1 + self.aggregate_error_B_2 = aggregate_error_B_2 + self.aggregate_error_B_3 = aggregate_error_B_3 + self.aggregate_error_B_4 = aggregate_error_B_4 + self.aggregate_perc_margin_total = aggregate_perc_margin_total + + interval_upper, interval_lower = ( + aggregate_perc_margin_total - np.quantile(aggregate_error_B, q=[lower_q, upper_q], axis=-1).T + ).T + interval_upper = interval_upper.reshape(-1, 1) + interval_lower = interval_lower.reshape(-1, 1) + + return PredictionIntervals(interval_lower, interval_upper) + + def get_national_summary_estimates( + self, nat_sum_data_dict: dict, called_states: dict, base_to_add: int | float, alpha: float + ) -> list: + """ + Generates and returns a national summary estimate (ie. electoral votes or total number of senate seats) + This function does both the point prediction and the lower and upper estimates + + First element in the list is the prediction, second is the lower end of the interval and third is the upper end of the interval + + The point prediction and prediction intervals are very similar to get_aggregate_prediction / get_aggregate_prediction_intervals + except that we pass our bootstrapped predictions (and our stand-in for the "true" value) through a sigmoid (or a threshold) and assign + weights. This gives us a bootstrapped national summary estimate (e.g. electoral votes), which we can use to generate + a prediction interval + """ + # if nat_sum_data_dict is None then we assign 1 for every contest (ie. Senate or House) + if nat_sum_data_dict is None: + # the order does not matter since all contests have the same weight, so we can use anything as the key when sorting + nat_sum_data_dict = {i: 1 for i in range(self.aggregate_error_B_1.shape[0])} + + # if we didn't pass the right number of national summary weights (ie. the number of contests) then raise an exception + if len(nat_sum_data_dict) != self.aggregate_error_B_1.shape[0]: + raise BootstrapElectionModelException( + f"nat_sum_data_dict is of length {len(nat_sum_data_dict)} but there are {self.aggregate_error_B_1.shape[0]} contests" + ) + + # called states is a dictionary where 1 means that the LHS party has won, 0 means that the RHS party has won + # and -1 means that the state is not called. If called_states is None, assume that all states are not called.
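As a minimal sketch of how win probabilities, race calls, and contest weights combine (made-up margins and weights; the code below applies the same pattern to the bootstrapped aggregate quantities):

```python
import numpy as np
from scipy.special import expit

margins = np.array([0.08, -0.02, 0.01])   # normalized margin per contest (illustrative)
weights = np.array([10, 16, 11])          # national summary weights, e.g. electoral votes
called = np.array([1.0, np.nan, np.nan])  # 1 = called for LHS party, 0 = called for RHS, NaN = uncalled

T = 50                                    # temperature; larger values approach a hard threshold
win_prob = expit(T * margins)             # soft threshold on the margin
# called contests are forced to exactly 1 or 0, while fmax/fmin ignore NaN so
# uncalled contests keep their computed probability
win_prob_called = np.fmin(np.fmax(win_prob, called), called)
expected_lhs_total = np.sum(weights * win_prob_called)
```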
+ if called_states is None: + called_states = {i: -1 for i in range(self.aggregate_error_B_1.shape[0])} + + if len(called_states) != self.aggregate_error_B_1.shape[0]: + raise BootstrapElectionModelException( + f"called_states is of length {len(called_states)} but there are {self.aggregate_error_B_1.shape[0]} contests" + ) + + # sort in order to get in the same order as the contests, which have been sorted when getting dummies for aggregate indicators + # in get_aggregate_prediction_intervals + nat_sum_data_dict_sorted = sorted(nat_sum_data_dict.items()) + nat_sum_data_dict_sorted_vals = np.asarray([x[1] for x in nat_sum_data_dict_sorted]).reshape(-1, 1) + + called_states_sorted = sorted(called_states.items()) + called_states_sorted_vals = ( + np.asarray([x[1] for x in called_states_sorted]).reshape(-1, 1) * 1.0 + ) # multiplying by 1.0 to turn into floats + # since we max/min the state called values with contest win probabilities, we don't want the uncalled states to have a number to max/min + # in order for those states to keep their original computed win probability + called_states_sorted_vals[called_states_sorted_vals == -1] = np.nan + + # technically we do not need to do this division, since the margin (ie. aggregate_error_B_1 and aggregate_error_B_2) + # are enough to know who has won a contest (we don't need the normalized margin) + # but we normalize so that the temperature we use to set aggressiveness of sigmoid is in the right scale + + # divided_error_B_1 = np.nan_to_num(self.aggregate_error_B_1 / self.aggregate_baseline_margin.reshape(-1, 1)) + divided_error_B_1 = np.nan_to_num(self.aggregate_error_B_1 / self.aggregate_error_B_3) + # divided_error_B_2 = np.nan_to_num(self.aggregate_error_B_2 / self.aggregate_baseline_margin.reshape(-1, 1)) + divided_error_B_2 = np.nan_to_num(self.aggregate_error_B_2 / self.aggregate_error_B_4) + + if self.hard_threshold: + aggregate_dem_prob_B_1 = divided_error_B_1 > 0.5 + aggregate_dem_prob_B_2 = divided_error_B_2 > 0.5 + else: + aggregate_dem_prob_B_1 = expit(self.T * divided_error_B_1) + aggregate_dem_prob_B_2 = expit(self.T * divided_error_B_2) + + # since called_states_sorted_vals has value 1 if the state is called for the LHS party, taking the max with the probabilities + # gives a probability of 1 for the LHS party + # and called_states_sorted_vals has value 0 if the state is called for the RHS party, so taking the min with the probabilities + # gives a probability of 0 for the LHS party + # and called_states_sorted_vals has value np.nan if the state is uncalled; since we use fmax/fmin, the actual number + # and not NaN gets propagated, so we maintain the probability + aggregate_dem_prob_B_1_called = np.fmin( + np.fmax(aggregate_dem_prob_B_1, called_states_sorted_vals), called_states_sorted_vals + ) + aggregate_dem_prob_B_2_called = np.fmin( + np.fmax(aggregate_dem_prob_B_2, called_states_sorted_vals), called_states_sorted_vals + ) + + # multiply by weights of each contest + aggregate_dem_vals_B_1 = nat_sum_data_dict_sorted_vals * aggregate_dem_prob_B_1_called + aggregate_dem_vals_B_2 = nat_sum_data_dict_sorted_vals * aggregate_dem_prob_B_2_called + + # calculate the error in our national aggregate prediction + aggregate_dem_vals_B = np.sum(aggregate_dem_vals_B_1, axis=0) - np.sum(aggregate_dem_vals_B_2, axis=0) + + # we also need a national aggregate point prediction + if self.hard_threshold: + aggregate_dem_probs_total = self.aggregate_perc_margin_total > 0.5 + else: + aggregate_dem_probs_total = expit(self.T * self.aggregate_perc_margin_total) + + # same
as for the intervals + aggregate_dem_probs_total_called = np.fmin( + np.fmax(aggregate_dem_probs_total, called_states_sorted_vals), called_states_sorted_vals + ) + aggregate_dem_vals_pred = np.sum(nat_sum_data_dict_sorted_vals * aggregate_dem_probs_total_called) + + lower_q, upper_q = self._get_quantiles(alpha) + + interval_upper, interval_lower = ( + aggregate_dem_vals_pred - np.quantile(aggregate_dem_vals_B, q=[lower_q, upper_q], axis=-1).T + ).T + national_summary_estimates = { + "margin": [ + aggregate_dem_vals_pred + base_to_add, + interval_lower + base_to_add, + interval_upper + base_to_add, + ] + } + + print(national_summary_estimates) + + return national_summary_estimates diff --git a/tests/conftest.py b/tests/conftest.py index 871879e4..7d0cfad6 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,11 +3,12 @@ import os import sys +import numpy as np import pandas as pd import pytest from elexmodel.client import HistoricalModelClient, ModelClient -from elexmodel.models import BaseElectionModel, ConformalElectionModel +from elexmodel.models import BaseElectionModel, BootstrapElectionModel, ConformalElectionModel _TEST_FOLDER = os.path.dirname(__file__) FIXTURE_DIR = os.path.join(_TEST_FOLDER, "fixtures") @@ -66,6 +67,18 @@ def conformal_election_model(): return ConformalElectionModel.ConformalElectionModel(model_settings) +@pytest.fixture(scope="function") +def bootstrap_election_model(): + model_settings = {"features": ["baseline_normalized_margin"]} + return BootstrapElectionModel.BootstrapElectionModel(model_settings) + + +@pytest.fixture(scope="session") +def rng(): + seed = 1941 + return np.random.default_rng(seed=seed) + + @pytest.fixture(scope="function") def va_config(get_fixture): path = os.path.join("config", "2017-11-07_VA_G.json") diff --git a/tests/handlers/test_combined_data.py b/tests/handlers/test_combined_data.py index d1cd53aa..8b5f1ac9 100644 --- a/tests/handlers/test_combined_data.py +++ b/tests/handlers/test_combined_data.py @@ -174,7 +174,9 @@ def test_get_reporting_data(va_governor_county_data): # no fixed effects combined_data_handler = CombinedDataHandler(va_governor_county_data, current_data, estimands, geographic_unit_type) - observed_data = combined_data_handler.get_reporting_units(100) + turnout_factor_lower = 0.5 + turnout_factor_upper = 1.5 + observed_data = combined_data_handler.get_reporting_units(100, turnout_factor_lower, turnout_factor_upper) assert observed_data.shape[0] == 20 assert observed_data.reporting.iloc[0] == 1 assert observed_data.reporting.sum() == 20 @@ -206,58 +208,15 @@ def test_get_reporting_data_dropping_with_turnout_factor(va_governor_county_data & (combined_data_handler.data.turnout_factor < turnout_factor_lower) ].shape[0] - observed_data = combined_data_handler.get_reporting_units( - 100, turnout_factor_lower=turnout_factor_lower, turnout_factor_upper=turnout_factor_upper - ) + observed_data = combined_data_handler.get_reporting_units(100, turnout_factor_lower, turnout_factor_upper) - # 20 units should be reporting, but the additional ones are dropped to nonreporting because they are above/below threshold + # 20 units should be reporting, but the additional ones are dropped to unexpected because they are above/below threshold # and so are subtracted from the reporting ones assert observed_data.shape[0] == 20 - ( reporting_units_above_turnout_factor_threshold + reporting_units_below_turnout_factor_threshold ) -def test_get_nonreporting_adding_with_turnout_factor(va_governor_county_data): - election_id = 
"2017-11-07_VA_G" - office = "G" - geographic_unit_type = "county" - estimands = ["turnout"] - - live_data_handler = MockLiveDataHandler( - election_id, office, geographic_unit_type, estimands, data=va_governor_county_data - ) - n = 20 - current_data = live_data_handler.get_n_fully_reported(n=n) - - va_governor_county_data["baseline_weights"] = va_governor_county_data.baseline_turnout - va_governor_county_data["last_election_results_turnout"] = va_governor_county_data.baseline_turnout + 1 - - combined_data_handler = CombinedDataHandler(va_governor_county_data, current_data, estimands, geographic_unit_type) - - turnout_factor_lower = 0.95 - turnout_factor_upper = 1.2 - - reporting_units_above_turnout_factor_threshold = combined_data_handler.data[ - combined_data_handler.data.turnout_factor > turnout_factor_upper - ].shape[0] - reporting_units_below_turnout_factor_threshold = combined_data_handler.data[ - (combined_data_handler.data.percent_expected_vote == 100) - & (combined_data_handler.data.turnout_factor < turnout_factor_lower) - ].shape[0] - - nonreporting_data = combined_data_handler.get_nonreporting_units( - 100, turnout_factor_lower=turnout_factor_lower, turnout_factor_upper=turnout_factor_upper - ) - - assert ( - nonreporting_data.shape[0] - == va_governor_county_data.shape[0] - - n - + reporting_units_above_turnout_factor_threshold - + reporting_units_below_turnout_factor_threshold - ) - - def test_get_unexpected_units_county_district(va_assembly_county_data): election_id = "2017-11-07_VA_G" office = "Y" @@ -279,7 +238,11 @@ def test_get_unexpected_units_county_district(va_assembly_county_data): va_assembly_county_data["last_election_results_turnout"] = va_assembly_county_data.baseline_turnout + 1 combined_data_handler = CombinedDataHandler(va_assembly_county_data, current_data, estimands, geographic_unit_type) - unexpected_data = combined_data_handler.get_unexpected_units(100, ["county_fips", "district"]) + turnout_factor_lower = 0 # set to extreme values so as not to add any more "unexpected" + turnout_factor_upper = 100 + unexpected_data = combined_data_handler.get_unexpected_units( + 100, ["county_fips", "district"], turnout_factor_lower, turnout_factor_upper + ) assert unexpected_data.shape[0] == unexpected_units assert unexpected_data[unexpected_data.county_fips == ""].shape[0] == 0 assert unexpected_data["county_fips"].map(lambda x: len(x) == 6).all() @@ -313,9 +276,69 @@ def test_get_unexpected_units_county(va_governor_county_data): va_governor_county_data["last_election_results_turnout"] = va_governor_county_data.baseline_turnout + 1 combined_data_handler = CombinedDataHandler(va_governor_county_data, current_data, estimands, geographic_unit_type) - unexpected_data = combined_data_handler.get_unexpected_units(100, ["county_fips"]) + turnout_factor_lower = 0.5 + turnout_factor_upper = 1.5 + unexpected_data = combined_data_handler.get_unexpected_units( + 100, ["county_fips"], turnout_factor_lower, turnout_factor_upper + ) assert unexpected_data.shape[0] == reporting_unexpected_units + 1 assert unexpected_data[unexpected_data.county_fips == ""].shape[0] == 0 assert unexpected_data["county_fips"].map(lambda x: len(x) == 6).all() # test that nonreporting unexpected unit is captured here assert unexpected_data[unexpected_data.percent_expected_vote == 50].shape[0] == 1 + + +def test_zero_baseline_turnout_as_unexpected(va_governor_county_data): + election_id = "2017-11-07_VA_G" + office = "G" + geographic_unit_type = "county" + estimands = ["turnout"] + + live_data_handler = 
MockLiveDataHandler( + election_id, office, geographic_unit_type, estimands, data=va_governor_county_data + ) + current_data = live_data_handler.get_n_fully_reported(n=20) + # Add an additional row of a nonreporting unexpected unit that we test below + va_governor_county_data.loc[0, "baseline_turnout"] = 0 + va_governor_county_data["baseline_weights"] = va_governor_county_data.baseline_turnout + va_governor_county_data["last_election_results_turnout"] = va_governor_county_data.baseline_turnout + 1 + combined_data_handler = CombinedDataHandler(va_governor_county_data, current_data, estimands, geographic_unit_type) + turnout_factor_lower = 0.5 + turnout_factor_upper = 1.5 + unexpected_data = combined_data_handler.get_unexpected_units( + 100, ["county_fips"], turnout_factor_lower, turnout_factor_upper + ) + assert va_governor_county_data.loc[0].geographic_unit_fips in unexpected_data.geographic_unit_fips.tolist() + assert len(unexpected_data) == 1 + + reporting_units = combined_data_handler.get_reporting_units(100, turnout_factor_lower, turnout_factor_upper) + assert len(reporting_units) == 20 - 1 + assert va_governor_county_data.loc[0].geographic_unit_fips not in reporting_units.geographic_unit_fips.tolist() + + nonreporting_units = combined_data_handler.get_nonreporting_units(100, turnout_factor_lower, turnout_factor_upper) + assert va_governor_county_data.loc[0].geographic_unit_fips not in nonreporting_units.geographic_unit_fips.tolist() + + +def test_turnout_factor_as_unexpected(va_governor_county_data): + election_id = "2017-11-07_VA_G" + office = "G" + geographic_unit_type = "county" + estimands = ["turnout"] + + live_data_handler = MockLiveDataHandler( + election_id, office, geographic_unit_type, estimands, data=va_governor_county_data + ) + current_data = live_data_handler.get_n_fully_reported(n=133) + # Add an additional row of a nonreporting unexpected unit that we test below + va_governor_county_data.loc[0, "baseline_turnout"] = 0 + va_governor_county_data["baseline_weights"] = va_governor_county_data.baseline_turnout + va_governor_county_data["last_election_results_turnout"] = va_governor_county_data.baseline_turnout + 1 + combined_data_handler = CombinedDataHandler(va_governor_county_data, current_data, estimands, geographic_unit_type) + turnout_factor_lower = 0.95 + turnout_factor_upper = 1.2 + unexpected_data = combined_data_handler.get_unexpected_units( + 100, ["county_fips"], turnout_factor_lower, turnout_factor_upper + ) + over = combined_data_handler.data[combined_data_handler.data.turnout_factor >= turnout_factor_upper].shape[0] + under = combined_data_handler.data[combined_data_handler.data.turnout_factor < turnout_factor_lower].shape[0] + unexpected_data.shape[0] == over + under diff --git a/tests/handlers/test_config.py b/tests/handlers/test_config.py index 2567b535..0aed2ef3 100644 --- a/tests/handlers/test_config.py +++ b/tests/handlers/test_config.py @@ -77,7 +77,7 @@ def test_get_estimands_general(va_config): office = "G" estimands = config_handler.get_estimands(office) - expected = ["dem", "gop", "turnout"] + expected = ["dem", "gop", "turnout", "margin"] assert expected == estimands @@ -120,9 +120,10 @@ def test_get_features(va_config): office = "G" features = config_handler.get_features(office) - assert len(features) == 13 + assert len(features) == 14 assert features[0] == "age_le_30" - assert features[-1] == "percent_bachelor_or_higher" + assert features[-2] == "percent_bachelor_or_higher" + assert features[-1] == "baseline_normalized_margin" def 
test_get_aggregates(va_config): diff --git a/tests/handlers/test_estimandizer.py b/tests/handlers/test_estimandizer.py index d2321fcc..69f1fd1e 100644 --- a/tests/handlers/test_estimandizer.py +++ b/tests/handlers/test_estimandizer.py @@ -30,7 +30,6 @@ def test_add_estimand_results_historical(va_governor_county_data): (output_df, result_columns) = estimandizer.add_estimand_results(va_data_copy, estimands, True) assert "results_party_vote_share_dem" in output_df.columns - assert "results_weights" in output_df.columns assert result_columns == ["results_party_vote_share_dem", "results_turnout"] diff --git a/tests/handlers/test_featurizer.py b/tests/handlers/test_featurizer.py index 64e1fe16..75a86f74 100644 --- a/tests/handlers/test_featurizer.py +++ b/tests/handlers/test_featurizer.py @@ -347,8 +347,8 @@ def test_generate_fixed_effects(va_governor_county_data): handle_unreporting="drop", ) - reporting_data = combined_data_handler.get_reporting_units(99) - nonreporting_data = combined_data_handler.get_nonreporting_units(99) + reporting_data = combined_data_handler.get_reporting_units(99, 0.5, 1.5) + nonreporting_data = combined_data_handler.get_nonreporting_units(99, 0.5, 1.5) featurizer = Featurizer([], {"county_classification": "all"}) @@ -382,8 +382,8 @@ def test_generate_fixed_effects(va_governor_county_data): featurizer = Featurizer([], {"county_classification": ["all"], "county_fips": ["all"]}) - reporting_data = combined_data_handler.get_reporting_units(99) - nonreporting_data = combined_data_handler.get_nonreporting_units(99) + reporting_data = combined_data_handler.get_reporting_units(99, 0.5, 1.5) + nonreporting_data = combined_data_handler.get_nonreporting_units(99, 0.5, 1.5) n_train = reporting_data.shape[0] all_units = pd.concat([reporting_data, nonreporting_data], axis=0) @@ -438,8 +438,8 @@ def test_generate_fixed_effects_not_all_reporting(va_governor_county_data): handle_unreporting="drop", ) - reporting_data = combined_data_handler.get_reporting_units(99) - nonreporting_data = combined_data_handler.get_nonreporting_units(99) + reporting_data = combined_data_handler.get_reporting_units(99, 0.5, 1.5) + nonreporting_data = combined_data_handler.get_nonreporting_units(99, 0.5, 1.5) featurizer = Featurizer([], {"county_fips": ["all"]}) n_train = reporting_data.shape[0] @@ -504,8 +504,8 @@ def test_generate_fixed_effects_mixed_reporting(va_governor_precinct_data): handle_unreporting="drop", ) - reporting_data = combined_data_handler.get_reporting_units(99) - nonreporting_data = combined_data_handler.get_nonreporting_units(99) + reporting_data = combined_data_handler.get_reporting_units(99, 0.5, 1.5) + nonreporting_data = combined_data_handler.get_nonreporting_units(99, 0.5, 1.5) featurizer = Featurizer([], ["county_fips"]) diff --git a/tests/integration_test.sh b/tests/integration_test.sh index 33a3e4e6..c25be886 100644 --- a/tests/integration_test.sh +++ b/tests/integration_test.sh @@ -28,3 +28,6 @@ elexmodel 2021-11-02_VA_G --estimands=dem --office_id=Y --geographic_unit_type=p echo "Running AZ 2021 Republican Senate primary precinct model" elexmodel 2020-08-04_AZ_R --estimands mcsally_61631 --office_id=S --geographic_unit_type=precinct --percent_reporting 20 --aggregates=postal_code + +echo "Running VA Governor 2017 precinct model using the bootstrap model with model parameters " +elexmodel 2017-11-07_VA_G --estimands=margin --office_id=G --geographic_unit_type=precinct --pi_method=bootstrap --percent_reporting=10 --aggregates=postal_code --fixed_effects=county_classification 
--features=baseline_normalized_margin --model_parameters='{"B": 10}' \ No newline at end of file diff --git a/tests/models/test_bootstrap_election_model.py b/tests/models/test_bootstrap_election_model.py new file mode 100644 index 00000000..1b608802 --- /dev/null +++ b/tests/models/test_bootstrap_election_model.py @@ -0,0 +1,1045 @@ +import numpy as np +import pandas as pd +import pytest +import scipy as sp + +from elexmodel.handlers.data.CombinedData import CombinedDataHandler +from elexmodel.handlers.data.LiveData import MockLiveDataHandler +from elexmodel.handlers.data.PreprocessedData import PreprocessedDataHandler +from elexmodel.models.BootstrapElectionModel import BootstrapElectionModelException, OLSRegressionSolver + +TOL = 1e-5 + + +def test_cross_validation(bootstrap_election_model, rng): + """ + Tests cross validation for finding the optimal lambda. + """ + x = rng.normal(loc=2, scale=5, size=(100, 75)) + x[:, 0] = 1 + beta = rng.normal(size=(75, 1)) + y = x @ beta + rng.normal(loc=0, scale=1, size=(100, 1)) + lambdas = [-0.1, 0, 0.01, 0.1, 1, 100] + res = bootstrap_election_model.cv_lambda(x, y, lambdas) + assert res == 0.01 + + beta = rng.normal(scale=0.001, size=(75, 1)) + y = x @ beta + rng.normal(loc=0, scale=1, size=(100, 1)) + lambdas = [-0.1, 0, 0.01, 0.1, 1, 100] + res = bootstrap_election_model.cv_lambda(x, y, lambdas) + assert res == 100 + + +def test_estimate_epsilon(bootstrap_election_model): + """ + Testing estimating the contest level effect. + """ + # the contest level effect (epsilon) is estiamted as the average of the residuals in that contest + # if a contest has less then 2 units reporting then the contest level effect is set to zero + residuals = np.asarray([0.5, 0.5, 0.3, 0.8, 0.5]).reshape(-1, 1) + aggregate_indicator = np.asarray([[1, 1, 0, 0, 0], [0, 0, 1, 1, 0], [0, 0, 0, 0, 1]]).T + epsilon_hat = bootstrap_election_model._estimate_epsilon(residuals, aggregate_indicator) + assert epsilon_hat[0] == (residuals[0] + residuals[1]) / 2 + assert epsilon_hat[1] == (residuals[2] + residuals[3]) / 2 + assert epsilon_hat[2] == 0 # since the aggrgate indicator for that row only has one 1 which is less than 2 + + +def test_estimate_delta(bootstrap_election_model): + """ + Testing estimating the unit level error + """ + # the unit level error is the difference between the residu and the state level effect + residuals = np.asarray([0.5, 0.5, 0.3, 0.8, 0.5]).reshape(-1, 1) + aggregate_indicator = np.asarray([[1, 1, 0, 0, 0], [0, 0, 1, 1, 0], [0, 0, 0, 0, 1]]).T + epsilon_hat = bootstrap_election_model._estimate_epsilon(residuals, aggregate_indicator) # [0.5, 0.55, 0] + delta_hat = bootstrap_election_model._estimate_delta(residuals, epsilon_hat, aggregate_indicator) + desired = np.asarray([0.0, 0.0, -0.25, 0.25, 0.5]) + np.testing.assert_array_almost_equal(desired, delta_hat) + + +def test_estimate_model_error(bootstrap_election_model, rng): + x = rng.normal(loc=2, scale=5, size=(100, 5)) + x[:, 0] = 1 + beta = rng.integers(low=-100, high=100, size=(5, 1)) + y = x @ beta + ols_regression = OLSRegressionSolver() + ols_regression.fit(x, y) + aggregate_indicator = rng.multivariate_hypergeometric([1] * 5, 1, size=100) + residuals, epsilon_hat, delta_hat = bootstrap_election_model._estimate_model_errors( + ols_regression, x, y, aggregate_indicator + ) + y_hat = ols_regression.predict(x) + np.testing.assert_array_almost_equal(ols_regression.residuals(y, y_hat, loo=True, center=True), residuals) + # epsilon_hat and delta_hat are tested above + + +def 
test_get_strata(bootstrap_election_model): + reporting_units = pd.DataFrame( + [["a", True, True], ["b", True, True], ["c", True, True]], + columns=["county_classification", "reporting", "expected"], + ) + nonreporting_units = pd.DataFrame( + [["c", False, True], ["d", False, True]], columns=["county_classification", "reporting", "expected"] + ) + x_train_strata, x_test_strata = bootstrap_election_model._get_strata(reporting_units, nonreporting_units) + + assert "intercept" in x_train_strata.columns + # a has been dropped + assert "county_classification_b" in x_train_strata.columns + assert "county_classification_c" in x_train_strata.columns + assert "county_classification_d" in x_train_strata.columns + np.testing.assert_array_almost_equal(x_train_strata.values, np.asarray([[1, 0, 0, 0], [1, 1, 0, 0], [1, 0, 1, 0]])) + + assert "intercept" in x_test_strata.columns + assert "county_classification_b" in x_test_strata.columns + assert "county_classification_c" in x_test_strata.columns + assert "county_classification_d" in x_test_strata.columns + + np.testing.assert_array_almost_equal(x_test_strata.values, np.asarray([[1, 0, 1, 0], [1, 0, 0, 1]])) + + +def test_estimate_strata_dist(bootstrap_election_model, rng): + x_train, x_test = None, None + x_train_strata = pd.DataFrame([[1, 0, 0], [1, 0, 0], [1, 1, 0], [1, 1, 0], [1, 1, 0]]) + x_test_strata = pd.DataFrame([[1, 1, 0], [1, 0, 1], [1, 0, 1]]) + delta_hat = np.asarray([-0.3, 0.5, 0.2, 0.25, 0.28]) + lb = 0.1 + ub = 0.3 + ppf, cdf = bootstrap_election_model._estimate_strata_dist( + x_train, x_train_strata, x_test, x_test_strata, delta_hat, lb, ub + ) + assert ppf[(1, 0, 0)](0.01) == pytest.approx(delta_hat[:2].min()) + assert ppf[(1, 0, 0)](0.34) == pytest.approx(lb) + assert ppf[(1, 0, 0)](0.66) == pytest.approx(ub) + assert ppf[(1, 0, 0)](0.99) == pytest.approx(delta_hat[:2].max()) + + # since all elements of the second strata are > 0.1 and < 0.3 we use the lb and ub for the first quantiles + assert ppf[(1, 1, 0)](0.01) == pytest.approx(lb) + assert ppf[(1, 1, 0)](0.25) == pytest.approx(lb) + assert ppf[(1, 1, 0)](0.26) == pytest.approx(delta_hat[2:].min()) + assert ppf[(1, 1, 0)](0.50) == pytest.approx(np.median(delta_hat[2:])) + assert ppf[(1, 1, 0)](0.51) == pytest.approx(delta_hat[2:].max()) + assert ppf[(1, 1, 0)](0.74) == pytest.approx(delta_hat[2:].max()) + assert ppf[(1, 1, 0)](0.75) == pytest.approx(ub) + assert ppf[(1, 1, 0)](0.99) == pytest.approx(ub) + + assert ppf[(1, 0, 1)](0.49) == pytest.approx(lb) + assert ppf[(1, 0, 1)](0.50) == pytest.approx(ub) + + # anything less than -0.3 returns 0.01 + assert cdf[(1, 0, 0)](-0.4) == pytest.approx(0.01) + # lower QR sees three datapoints: -0.3, 0.1 and 0.5, so -0.3 is at the right boundary of 0.01 and 0.33 + assert cdf[(1, 0, 0)](-0.3) == pytest.approx(0.33) + # upper QR sees three datapoints: -0.3, 0.3 and 0.5 and so 0.3 is that right boundary of 0.33 and 0.66 + assert cdf[(1, 0, 0)](0.3) == pytest.approx(0.66) + # upper QR sees three datapoints: -0.3, 0.3 and 0.5 and so 0.5 is on the right boundary of 0.66 and 0.99 + assert cdf[(1, 0, 0)](0.5) == pytest.approx(0.99) + # since interp keyword righ=1 + assert cdf[(1, 0, 0)](0.51) == pytest.approx(1) + + assert cdf[(1, 1, 0)](0.1) == pytest.approx(0.01) + assert cdf[(1, 1, 0)](0.2) == pytest.approx(0.49) + assert cdf[(1, 1, 0)](0.28) == pytest.approx(0.74) + assert cdf[(1, 1, 0)](0.3) == pytest.approx(0.99) + + assert cdf[(1, 0, 1)](0.1) == pytest.approx(0.01) + assert cdf[(1, 0, 1)](0.3) == pytest.approx(0.99) + + +def 
test_generate_nonreporting_bounds(bootstrap_election_model, rng): + nonreporting_units = pd.DataFrame( + [[0.1, 1.2, 75], [0.8, 0.8, 24], [0.1, 0.01, 0], [-0.2, 0.8, 99], [-0.3, 0.9, 100]], + columns=["results_normalized_margin", "turnout_factor", "percent_expected_vote"], + ) + + # assumes that all outstanding vote will go to one of the two parties + lower, upper = bootstrap_election_model._generate_nonreporting_bounds( + nonreporting_units, "results_normalized_margin" + ) + + # hand checked in excel + assert lower[0] == pytest.approx(-0.175) + assert upper[0] == pytest.approx(0.325) + + assert lower[1] == pytest.approx(-0.568) + assert upper[1] == pytest.approx(0.952) + + # if expected vote is close to 0 or 1 we set the bounds to be the extreme case + assert lower[2] == bootstrap_election_model.y_unobserved_lower_bound + assert upper[2] == bootstrap_election_model.y_unobserved_upper_bound + + assert lower[3] == pytest.approx(-0.208) + assert upper[3] == pytest.approx(-0.188) + + assert lower[4] == bootstrap_election_model.y_unobserved_lower_bound + assert upper[4] == bootstrap_election_model.y_unobserved_upper_bound + + lower, upper = bootstrap_election_model._generate_nonreporting_bounds(nonreporting_units, "turnout_factor") + + assert lower[0] == pytest.approx(0.96) + assert upper[0] == pytest.approx(4.8) + + assert lower[1] == pytest.approx(1.081081081) + assert upper[1] == pytest.approx(80) # this is 80 since we divide 0.8 / 0.01 (it's clipped) + + assert lower[2] == bootstrap_election_model.z_unobserved_lower_bound + assert upper[2] == bootstrap_election_model.z_unobserved_upper_bound + + assert lower[3] == pytest.approx(0.536912752) + assert upper[3] == pytest.approx(1.632653061) + + assert lower[4] == bootstrap_election_model.z_unobserved_lower_bound + assert upper[4] == bootstrap_election_model.z_unobserved_upper_bound + + # changing parameters + bootstrap_election_model.y_unobserved_lower_bound = -0.8 + bootstrap_election_model.y_unobserved_upper_bound = 0.8 + + lower, upper = bootstrap_election_model._generate_nonreporting_bounds( + nonreporting_units, "results_normalized_margin" + ) + assert lower[0] == pytest.approx(-0.125) + assert upper[0] == pytest.approx(0.275) + + assert lower[1] == pytest.approx(-0.416) + assert upper[1] == pytest.approx(0.8) + + assert lower[2] == bootstrap_election_model.y_unobserved_lower_bound + assert upper[2] == bootstrap_election_model.y_unobserved_upper_bound + + assert lower[3] == pytest.approx(-0.206) + assert upper[3] == pytest.approx(-0.19) + + assert lower[4] == bootstrap_election_model.y_unobserved_lower_bound + assert upper[4] == bootstrap_election_model.y_unobserved_upper_bound + + bootstrap_election_model.y_unobserved_lower_bound = 0.8 + bootstrap_election_model.y_unobserved_upper_bound = 1.2 + bootstrap_election_model.percent_expected_vote_error_bound = 0.1 + + lower, upper = bootstrap_election_model._generate_nonreporting_bounds(nonreporting_units, "turnout_factor") + assert lower[0] == pytest.approx(1.411764706) + assert upper[0] == pytest.approx(1.846153846) + + assert lower[1] == pytest.approx(2.352941176) + assert upper[1] == pytest.approx(5.714285714) + + assert lower[2] == bootstrap_election_model.z_unobserved_lower_bound + assert upper[2] == bootstrap_election_model.z_unobserved_upper_bound + + assert lower[3] == pytest.approx(0.733944954) + assert upper[3] == pytest.approx(0.898876404) + + assert lower[4] == bootstrap_election_model.z_unobserved_lower_bound + assert upper[4] == bootstrap_election_model.z_unobserved_upper_bound 
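The strata tests below exercise the CDF/PPF round trip (probability integral transform) described in the model's comments; here is a toy version of that resampling idea, assuming a single stratum with made-up residuals, not the library's `_estimate_strata_dist` or `_strata_pit`:

```python
import numpy as np

rng = np.random.default_rng(0)
deltas = np.sort(np.array([-0.3, 0.2, 0.25, 0.28, 0.5]))  # residuals observed in one stratum
percentiles = np.linspace(0.01, 0.99, deltas.shape[0])    # percentile assigned to each sorted residual


def cdf(d):
    # residual -> approximate percentile
    return np.interp(d, deltas, percentiles)


def ppf(u):
    # percentile -> residual
    return np.interp(u, percentiles, deltas)


uniforms = cdf(deltas)                                          # move residuals into [0, 1]
resampled = ppf(rng.choice(uniforms, size=1000, replace=True))  # resample uniforms, map back to residuals
```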
+ + +def test_strata_pit(bootstrap_election_model, rng): + x_train, x_test = None, None + x_train_strata = pd.DataFrame([[1, 0, 0], [1, 0, 0], [1, 1, 0], [1, 1, 0], [1, 1, 0]]) + x_test_strata = pd.DataFrame([[1, 1, 0], [1, 0, 1], [1, 0, 1]]) + delta_hat = np.asarray([-0.3, 0.5, 0.2, 0.25, 0.28]) + lb = 0.1 + ub = 0.3 + ppf, cdf = bootstrap_election_model._estimate_strata_dist( + x_train, x_train_strata, x_test, x_test_strata, delta_hat, lb, ub + ) + + x_train_strata_unique = np.unique(x_train_strata, axis=0).astype(int) + uniforms = bootstrap_election_model._strata_pit(x_train_strata, x_train_strata_unique, delta_hat, cdf) + + assert uniforms[0] == pytest.approx( + 0.33 + ) # since -0.3 is the lowest of the three elements [-0.3, 0.5, 0.3] where 0.3 is the UB (LB is -0.3) + assert uniforms[1] == pytest.approx(0.99) # since 0.5 is the largest + assert uniforms[2] == pytest.approx(0.49) # [-0.3, 0.2, 0.25, 0.28, 0.3] + assert uniforms[3] == pytest.approx(0.5) + assert uniforms[4] == pytest.approx(0.74) + + +def test_bootstrap_deltas(bootstrap_election_model): + x_train, x_test = None, None + x_train_strata = pd.DataFrame([[1, 0, 0], [1, 0, 0], [1, 1, 0], [1, 1, 0], [1, 1, 0]]) + x_test_strata = pd.DataFrame([[1, 1, 0], [1, 0, 1], [1, 0, 1]]) + delta_hat = np.asarray([-0.3, 0.5, 0.2, 0.25, 0.28]) + lb = 0.1 + ub = 0.3 + ppf, cdf = bootstrap_election_model._estimate_strata_dist( + x_train, x_train_strata, x_test, x_test_strata, delta_hat, lb, ub + ) + x_train_strata_unique = np.unique(x_train_strata, axis=0).astype(int) + uniforms = bootstrap_election_model._strata_pit(x_train_strata, x_train_strata_unique, delta_hat, cdf) + unifs = np.concatenate([uniforms, uniforms], axis=1) + x_train_strata_unique = np.unique(x_train_strata, axis=0).astype(int) + bootstrap_deltas_y, bootstrap_deltas_z = bootstrap_election_model._bootstrap_deltas( + unifs, x_train_strata, x_train_strata_unique, ppf, ppf + ) + assert bootstrap_deltas_y.shape == (delta_hat.shape[0], bootstrap_election_model.B) + assert bootstrap_deltas_z.shape == (delta_hat.shape[0], bootstrap_election_model.B) + # testing that all elements of bootstrap delta is one of delta hat or lb or ub + assert np.isclose( + bootstrap_deltas_z.flatten().reshape(1, -1), np.concatenate([delta_hat, [0.1, 0.3]]).reshape(-1, 1), rtol=0.001 + ).any(0).mean() == pytest.approx(1) + assert np.isclose( + bootstrap_deltas_y.flatten().reshape(1, -1), np.concatenate([delta_hat, [0.1, 0.3]]).reshape(-1, 1), rtol=0.001 + ).any(0).mean() == pytest.approx(1) + + +def test_bootstrap_epsilons(bootstrap_election_model): + residuals = np.asarray([0.5, 0.5, 0.3, 0.8, 0.5]).reshape(-1, 1) + aggregate_indicator = np.asarray([[1, 1, 0, 0, 0], [0, 0, 1, 1, 0], [0, 0, 0, 0, 1]]).T + epsilon_hat = bootstrap_election_model._estimate_epsilon(residuals, aggregate_indicator) + delta_hat = bootstrap_election_model._estimate_delta(residuals, epsilon_hat, aggregate_indicator) + + x_train, x_test = None, None + x_train_strata = pd.DataFrame([[1, 0, 0], [1, 0, 0], [1, 1, 0], [1, 1, 0], [1, 1, 0]]) + x_test_strata = pd.DataFrame([[1, 1, 0], [1, 0, 1], [1, 0, 1]]) + x_train_strata_unique = np.unique(x_train_strata, axis=0).astype(int) + lb = 0.1 + ub = 0.3 + ppf, cdf = bootstrap_election_model._estimate_strata_dist( + x_train, x_train_strata, x_test, x_test_strata, delta_hat, lb, ub + ) + + bootstrap_epsilons_y, bootstrap_epsilons_z = bootstrap_election_model._bootstrap_epsilons( + epsilon_hat, epsilon_hat, x_train_strata, x_train_strata_unique, ppf, ppf, aggregate_indicator + ) + assert 
bootstrap_epsilons_y.shape == (epsilon_hat.shape[0], bootstrap_election_model.B) + assert bootstrap_epsilons_z.shape == (epsilon_hat.shape[0], bootstrap_election_model.B) + + # the last aggregate only has one element, so its epsilon_hat is zero and the bootstrapped versions should be zero as well + assert np.isclose(bootstrap_epsilons_y[-1], 0).mean() == pytest.approx(1) + # the others have the correct mean + assert bootstrap_epsilons_y[0].mean() == pytest.approx(epsilon_hat[0], rel=0.1) + assert bootstrap_epsilons_y[1].mean() == pytest.approx(epsilon_hat[1], rel=0.1) + + +def test_bootstrap_errors(bootstrap_election_model): + residuals = np.asarray([0.5, 0.5, 0.3, 0.8, 0.5]).reshape(-1, 1) + aggregate_indicator = np.asarray([[1, 1, 0, 0, 0], [0, 0, 1, 1, 0], [0, 0, 0, 0, 1]]).T + epsilon_hat = bootstrap_election_model._estimate_epsilon(residuals, aggregate_indicator) + delta_hat = bootstrap_election_model._estimate_delta(residuals, epsilon_hat, aggregate_indicator) + + x_train, x_test = None, None + x_train_strata = pd.DataFrame([[1, 0, 0], [1, 0, 0], [1, 1, 0], [1, 1, 0], [1, 1, 0]]) + x_test_strata = pd.DataFrame([[1, 1, 0], [1, 0, 1], [1, 0, 1]]) + lb = 0.1 + ub = 0.3 + ppf, cdf = bootstrap_election_model._estimate_strata_dist( + x_train, x_train_strata, x_test, x_test_strata, delta_hat, lb, ub + ) + epsilon_B, delta_B = bootstrap_election_model._bootstrap_errors( + epsilon_hat, epsilon_hat, delta_hat, delta_hat, x_train_strata, cdf, cdf, ppf, ppf, aggregate_indicator + ) + epsilon_y_B, epsilon_z_B = epsilon_B + delta_y_B, delta_z_B = delta_B + + assert epsilon_y_B.shape == (aggregate_indicator.shape[1], bootstrap_election_model.B) + assert epsilon_z_B.shape == (aggregate_indicator.shape[1], bootstrap_election_model.B) + assert delta_y_B.shape == (residuals.shape[0], bootstrap_election_model.B) + assert delta_z_B.shape == (residuals.shape[0], bootstrap_election_model.B) + # the values of bootstrap epsilon and delta are checked above + + +def test_sample_test_delta(bootstrap_election_model): + residuals = np.asarray([0.5, 0.5, 0.3, 0.8, 0.5]).reshape(-1, 1) + aggregate_indicator = np.asarray([[1, 1, 0, 0, 0], [0, 0, 1, 1, 0], [0, 0, 0, 0, 1]]).T + epsilon_hat = bootstrap_election_model._estimate_epsilon(residuals, aggregate_indicator) + delta_hat = bootstrap_election_model._estimate_delta(residuals, epsilon_hat, aggregate_indicator) + + x_train, x_test = None, None + x_train_strata = pd.DataFrame([[1, 0, 0], [1, 0, 0], [1, 1, 0], [1, 1, 0], [1, 1, 0]]) + x_test_strata = pd.DataFrame([[1, 1, 0], [1, 0, 1], [1, 0, 1], [1, 0, 0]]) + lb = 0.1 + ub = 0.3 + ppf, cdf = bootstrap_election_model._estimate_strata_dist( + x_train, x_train_strata, x_test, x_test_strata, delta_hat, lb, ub + ) + + delta_y, delta_z = bootstrap_election_model._sample_test_delta(x_test_strata, ppf, ppf) + + assert delta_y.shape == (x_test_strata.shape[0], bootstrap_election_model.B) + assert delta_z.shape == (x_test_strata.shape[0], bootstrap_election_model.B) + # delta_y should be in delta_hat or equal to lb or ub for all sampled uniforms, EXCEPT when we sample exactly at a break (i.e. between the percentiles of two samples) + # this should only happen ~1% of the time per breakpoint; for this stratum there are 5 such breakpoints (this stratum is (1, 1, 0)) + assert ( + np.isclose(delta_y[0].reshape(1, -1), np.concatenate([delta_hat, [0.1, 0.3]]).reshape(-1, 1), rtol=0.01) + .any(0) + .mean() + >= 0.94 + ) + # for this stratum there are 5 such breakpoints: this stratum is (1, 0, 0) + assert ( + np.isclose(delta_y[3].reshape(1, -1),
np.concatenate([delta_hat, [0.1, 0.3]]).reshape(-1, 1), rtol=0.01) + .any(0) + .mean() + >= 0.95 + ) + # one breakpoint at 0.1 and 0.3 for stratum (1, 0, 1) + assert ( + np.isclose(delta_y[1].reshape(1, -1), np.concatenate([delta_hat, [0.1, 0.3]]).reshape(-1, 1), rtol=0.01) + .any(0) + .mean() + >= 0.98 + ) + assert ( + np.isclose(delta_y[2].reshape(1, -1), np.concatenate([delta_hat, [0.1, 0.3]]).reshape(-1, 1), rtol=0.01) + .any(0) + .mean() + >= 0.98 + ) + + +def test_sample_test_epsilon(bootstrap_election_model): + residuals = np.asarray([0.5, 0.5, 0.3, 0.8, 0.5, 0.2]).reshape(-1, 1) + aggregate_indicator_train = np.asarray( + [[1, 0, 0, 0], [1, 0, 0, 0], [0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 1, 0, 0]] + ) + aggregate_indicator_test = np.asarray( + [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 1, 0], [0, 0, 1, 0], [0, 1, 0, 0], [0, 0, 1, 0]] + ) + epsilon_hat = bootstrap_election_model._estimate_epsilon(residuals, aggregate_indicator_train) + + epsilon_y, epsilon_z = bootstrap_election_model._sample_test_epsilon( + residuals, residuals, epsilon_hat, epsilon_hat, aggregate_indicator_train, aggregate_indicator_test + ) + + assert epsilon_y.shape == (aggregate_indicator_test.shape[0], bootstrap_election_model.B) + assert epsilon_z.shape == (aggregate_indicator_test.shape[0], bootstrap_election_model.B) + + # aggregate 3 has no elements in the training set, and rows 2,3,5 in the test set + # are part of aggregate 3 + assert np.isclose(epsilon_z[0], 0).all() + assert np.isclose(epsilon_z[1], 0).all() + assert not np.isclose(epsilon_z[2], 0).all() + assert not np.isclose(epsilon_z[3], 0).all() + assert np.isclose(epsilon_z[4], 0).all() + assert not np.isclose(epsilon_z[5], 0).all() + + # testing that if epsilon_hat has only one element we return 0 + epsilon_y, epsilon_z = bootstrap_election_model._sample_test_epsilon( + residuals, residuals, [[4]], [[4]], aggregate_indicator_train, aggregate_indicator_test + ) + assert epsilon_y.shape == (1, bootstrap_election_model.B) + assert epsilon_z.shape == (1, bootstrap_election_model.B) + + assert np.isclose(epsilon_z[0], 0).all() + + +def test_sample_test_errors(bootstrap_election_model): + residuals = np.asarray([0.5, 0.5, 0.3, 0.8, 0.5]).reshape(-1, 1) + aggregate_indicator_train = np.asarray([[1, 0, 0, 0], [1, 0, 0, 0], [0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1]]) + aggregate_indicator_test = np.asarray( + [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 1, 0], [0, 0, 1, 0], [0, 1, 0, 0], [0, 0, 1, 0]] + ) + epsilon_hat = bootstrap_election_model._estimate_epsilon(residuals, aggregate_indicator_train) + delta_hat = bootstrap_election_model._estimate_delta(residuals, epsilon_hat, aggregate_indicator_train) + + x_train, x_test = None, None + x_train_strata = pd.DataFrame([[1, 0, 0], [1, 0, 0], [1, 1, 0], [1, 1, 0], [1, 1, 0]]) + x_test_strata = pd.DataFrame([[1, 1, 0], [1, 0, 1], [1, 0, 1], [1, 1, 0], [1, 0, 1], [1, 0, 0]]) + lb = 0.1 + ub = 0.3 + ppf, cdf = bootstrap_election_model._estimate_strata_dist( + x_train, x_train_strata, x_test, x_test_strata, delta_hat, lb, ub + ) + test_error_y, test_error_z = bootstrap_election_model._sample_test_errors( + residuals, + residuals, + epsilon_hat, + epsilon_hat, + x_test_strata, + ppf, + ppf, + aggregate_indicator_train, + aggregate_indicator_test, + ) + assert test_error_y.shape == (aggregate_indicator_test.shape[0], bootstrap_election_model.B) + assert test_error_z.shape == (aggregate_indicator_test.shape[0], bootstrap_election_model.B) + # the values of the sampled epsilons and deltas are checked above + + +def
test_compute_bootstrap_errors(bootstrap_election_model, va_governor_county_data): + election_id = "2017-11-07_VA_G" + office_id = "G" + geographic_unit_type = "county" + estimands = ["margin"] + percent_reporting_threshold = 100 + + data_handler = MockLiveDataHandler( + election_id, office_id, geographic_unit_type, estimands, data=va_governor_county_data + ) + + data_handler.shuffle() + current_data = data_handler.get_percent_fully_reported(50) + + preprocessed_data_handler = PreprocessedDataHandler( + election_id, office_id, geographic_unit_type, estimands, {"margin": "margin"}, data=va_governor_county_data + ) + + combined_data_handler = CombinedDataHandler( + preprocessed_data_handler.data, current_data, estimands, geographic_unit_type, handle_unreporting="drop" + ) + + turnout_factor_lower = 0.5 + turnout_factor_upper = 2.0 + reporting_units = combined_data_handler.get_reporting_units( + percent_reporting_threshold, + turnout_factor_lower=turnout_factor_lower, + turnout_factor_upper=turnout_factor_upper, + ) + nonreporting_units = combined_data_handler.get_nonreporting_units( + percent_reporting_threshold, + turnout_factor_lower=turnout_factor_lower, + turnout_factor_upper=turnout_factor_upper, + ) + unexpected_units = combined_data_handler.get_unexpected_units( + percent_reporting_threshold, + ["postal_code"], + turnout_factor_lower=turnout_factor_lower, + turnout_factor_upper=turnout_factor_upper, + ) + + assert not bootstrap_election_model.ran_bootstrap + bootstrap_election_model.B = 10 + bootstrap_election_model.compute_bootstrap_errors(reporting_units, nonreporting_units, unexpected_units) + assert bootstrap_election_model.ran_bootstrap + assert bootstrap_election_model.weighted_yz_test_pred.shape == (nonreporting_units.shape[0], 1) + assert bootstrap_election_model.weighted_yz_test_pred.shape == bootstrap_election_model.weighted_z_test_pred.shape + assert bootstrap_election_model.errors_B_1.shape == (nonreporting_units.shape[0], bootstrap_election_model.B) + assert bootstrap_election_model.errors_B_2.shape == (nonreporting_units.shape[0], bootstrap_election_model.B) + assert bootstrap_election_model.errors_B_3.shape == (nonreporting_units.shape[0], bootstrap_election_model.B) + assert bootstrap_election_model.errors_B_4.shape == (nonreporting_units.shape[0], bootstrap_election_model.B) + + +def test_get_unit_predictions(bootstrap_election_model, va_governor_county_data): + election_id = "2017-11-07_VA_G" + office_id = "G" + geographic_unit_type = "county" + estimands = ["margin"] + percent_reporting_threshold = 100 + + data_handler = MockLiveDataHandler( + election_id, office_id, geographic_unit_type, estimands, data=va_governor_county_data + ) + + data_handler.shuffle() + current_data = data_handler.get_percent_fully_reported(50) + + preprocessed_data_handler = PreprocessedDataHandler( + election_id, office_id, geographic_unit_type, estimands, {"margin": "margin"}, data=va_governor_county_data + ) + + combined_data_handler = CombinedDataHandler( + preprocessed_data_handler.data, current_data, estimands, geographic_unit_type, handle_unreporting="drop" + ) + + turnout_factor_lower = 0.5 + turnout_factor_upper = 2.0 + reporting_units = combined_data_handler.get_reporting_units( + percent_reporting_threshold, + turnout_factor_lower=turnout_factor_lower, + turnout_factor_upper=turnout_factor_upper, + ) + nonreporting_units = combined_data_handler.get_nonreporting_units( + percent_reporting_threshold, + turnout_factor_lower=turnout_factor_lower, + 
turnout_factor_upper=turnout_factor_upper, + ) + unexpected_units = combined_data_handler.get_unexpected_units( + percent_reporting_threshold, ["postal_code"], turnout_factor_lower, turnout_factor_upper + ) + + bootstrap_election_model.B = 10 + assert not bootstrap_election_model.ran_bootstrap + unit_predictions = bootstrap_election_model.get_unit_predictions( + reporting_units, nonreporting_units, estimand="margin", unexpected_units=unexpected_units + ) + assert bootstrap_election_model.ran_bootstrap + assert unit_predictions.shape == (nonreporting_units.shape[0], 1) + + +def test_is_top_level_aggregate(bootstrap_election_model): + assert bootstrap_election_model._is_top_level_aggregate(["postal_code"]) + assert bootstrap_election_model._is_top_level_aggregate(["postal_code", "district"]) + + assert not bootstrap_election_model._is_top_level_aggregate(["postal_code", "county_fips"]) + assert not bootstrap_election_model._is_top_level_aggregate(["postal_code", "district", "county_classification"]) + assert not bootstrap_election_model._is_top_level_aggregate(["county_fips"]) + assert not bootstrap_election_model._is_top_level_aggregate([]) + + +def test_aggregate_predictions(bootstrap_election_model): + reporting_units = pd.DataFrame( + [ + ["a", -3, 0.2, 1, 1, 1, 1, 3, 5, 8, "c"], + ["a", 1, 0, 1, 1, 1, 1, 2, 1, 3, "a"], + ["b", 5, -0.1, 1, 1, 1, 1, 8, 3, 11, "a"], + ["c", 3, -0.2, 1, 1, 1, 1, 9, 1, 9, "c"], + ["c", 3, 0.8, 1, 1, 1, 1, 2, 4, 6, "c"], + ], + columns=[ + "postal_code", + "pred_margin", + "results_margin", + "results_weights", + "baseline_weights", + "turnout_factor", + "reporting", + "baseline_dem", + "baseline_gop", + "baseline_turnout", + "district", + ], + ) + nonreporting_units = pd.DataFrame( + [ + ["a", -3, 0.2, 1, 1, 1, 0, 3, 5, 8, "c"], + ["b", 1, 0, 1, 1, 1, 0, 2, 1, 3, "a"], + ["d", 5, -0.1, 1, 1, 1, 0, 8, 3, 11, "c"], + ["d", 3, 0.8, 1, 1, 1, 0, 2, 4, 6, "c"], + ["e", 4, 0.1, 1, 1, 1, 0, 5, 1, 9, "e"], + ["e", 4, 0.1, 1, 1, 1, 0, 5, 1, 9, "a"], + ], + columns=[ + "postal_code", + "pred_margin", + "results_margin", + "results_weights", + "baseline_weights", + "turnout_factor", + "reporting", + "baseline_dem", + "baseline_gop", + "baseline_turnout", + "district", + ], + ) + unexpected_units = pd.DataFrame( + [ + ["a", -3, 0.2, 1, 1, 1, 0, np.nan, np.nan, np.nan, "c"], + ["d", 1, 0, 1, 1, 1, 0, np.nan, np.nan, np.nan, "c"], + ["f", 5, -0.1, 1, 1, 1, 0, np.nan, np.nan, np.nan, "f"], + ["f", 3, 0.8, 1, 1, 1, 0, np.nan, np.nan, np.nan, "f"], + ], + columns=[ + "postal_code", + "pred_margin", + "results_margin", + "results_weights", + "baseline_weights", + "turnout_factor", + "reporting", + "baseline_dem", + "baseline_gop", + "baseline_turnout", + "district", + ], + ) + + bootstrap_election_model.weighted_z_test_pred = np.asarray([1, 1, 1, 1, 1, 1]).reshape(-1, 1) + + # test that is top level aggregate is working + aggregate_predictions = bootstrap_election_model.get_aggregate_predictions( + reporting_units, nonreporting_units, unexpected_units, ["district"], "margin" + ) + with pytest.raises(AttributeError): + bootstrap_election_model.aggregate_baseline_margin + + aggregate_predictions = bootstrap_election_model.get_aggregate_predictions( + reporting_units, nonreporting_units, unexpected_units, ["postal_code"], "margin" + ) + + assert bootstrap_election_model.aggregate_baseline_margin is not None + + assert aggregate_predictions[aggregate_predictions.postal_code == "a"].pred_margin[0] == pytest.approx( + -2.6 / 4 + ) # (-3 (pred) + 0.2 + 0 (reporting margin) + 0.2 
(unexpected margin))/ 4 + assert aggregate_predictions[aggregate_predictions.postal_code == "b"].pred_margin[1] == pytest.approx( + 0.9 / 2 + ) # (-0.1 + 1) / 2 + assert aggregate_predictions[aggregate_predictions.postal_code == "c"].pred_margin[2] == pytest.approx( + 0.3 + ) # (-0.2 + 0.8) / 2, both c units are reporting + assert aggregate_predictions[aggregate_predictions.postal_code == "d"].pred_margin[3] == pytest.approx( + 8 / 3 + ) # (5 + 3 + 0) / 3 + assert aggregate_predictions[aggregate_predictions.postal_code == "e"].pred_margin[4] == pytest.approx( + 4 + ) # (4 + 4) / 2 + assert aggregate_predictions[aggregate_predictions.postal_code == "f"].pred_margin[5] == pytest.approx(0.7 / 2) + + assert aggregate_predictions[aggregate_predictions.postal_code == "a"].results_margin[0] == pytest.approx(0.6 / 4) + assert aggregate_predictions[aggregate_predictions.postal_code == "b"].results_margin[1] == pytest.approx(-0.1 / 2) + assert aggregate_predictions[aggregate_predictions.postal_code == "c"].results_margin[2] == pytest.approx(0.3) + assert aggregate_predictions[aggregate_predictions.postal_code == "d"].results_margin[3] == pytest.approx(0.7 / 3) + assert aggregate_predictions[aggregate_predictions.postal_code == "e"].results_margin[4] == pytest.approx(0.1) + assert aggregate_predictions[aggregate_predictions.postal_code == "f"].results_margin[5] == pytest.approx(0.7 / 2) + + assert aggregate_predictions[aggregate_predictions.postal_code == "a"].reporting[0] == pytest.approx(2) + assert aggregate_predictions[aggregate_predictions.postal_code == "b"].reporting[1] == pytest.approx(1) + assert aggregate_predictions[aggregate_predictions.postal_code == "c"].reporting[2] == pytest.approx(2) + assert aggregate_predictions[aggregate_predictions.postal_code == "d"].reporting[3] == pytest.approx(0) + assert aggregate_predictions[aggregate_predictions.postal_code == "e"].reporting[4] == pytest.approx(0) + assert aggregate_predictions[aggregate_predictions.postal_code == "f"].reporting[5] == pytest.approx(0) + + # test more complicated z predictions + bootstrap_election_model.weighted_z_test_pred = np.asarray([2, 3, 1, 1, 1, 1]).reshape(-1, 1) + + aggregate_predictions = bootstrap_election_model.get_aggregate_predictions( + reporting_units, nonreporting_units, unexpected_units, ["postal_code"], "margin" + ) + + assert aggregate_predictions[aggregate_predictions.postal_code == "a"].pred_margin[0] == pytest.approx( + -2.6 / 5 + ) # now divided by 5 since the a unit in nonreporting has weight 2 + assert aggregate_predictions[aggregate_predictions.postal_code == "b"].pred_margin[1] == pytest.approx(0.9 / 4) + + # test more complicated aggregate (postal code-district) + bootstrap_election_model.weighted_z_test_pred = np.asarray([1, 1, 1, 1, 1, 1]).reshape(-1, 1) + aggregate_predictions = bootstrap_election_model.get_aggregate_predictions( + reporting_units, nonreporting_units, unexpected_units, ["postal_code", "district"], "margin" + ) + + assert aggregate_predictions.shape[0] == 8 + assert aggregate_predictions[ + (aggregate_predictions.postal_code == "a") & (aggregate_predictions.district == "a") + ].pred_margin[0] == pytest.approx(0) + assert aggregate_predictions[ + (aggregate_predictions.postal_code == "a") & (aggregate_predictions.district == "c") + ].pred_margin[1] == pytest.approx(-2.6 / 3) + assert aggregate_predictions[ + (aggregate_predictions.postal_code == "b") & (aggregate_predictions.district == "a") + ].pred_margin[2] == pytest.approx(0.9 / 2) + assert aggregate_predictions[ + (aggregate_predictions.postal_code == "c") &
(aggregate_predictions.district == "c") + ].pred_margin[3] == pytest.approx(0.6 / 2) + assert aggregate_predictions[ + (aggregate_predictions.postal_code == "d") & (aggregate_predictions.district == "c") + ].pred_margin[4] == pytest.approx(8 / 3) + assert aggregate_predictions[ + (aggregate_predictions.postal_code == "e") & (aggregate_predictions.district == "a") + ].pred_margin[5] == pytest.approx(4) + assert aggregate_predictions[ + (aggregate_predictions.postal_code == "e") & (aggregate_predictions.district == "e") + ].pred_margin[6] == pytest.approx(4) + assert aggregate_predictions[ + (aggregate_predictions.postal_code == "f") & (aggregate_predictions.district == "f") + ].pred_margin[7] == pytest.approx(0.7 / 2) + + assert aggregate_predictions[ + (aggregate_predictions.postal_code == "a") & (aggregate_predictions.district == "a") + ].reporting[0] == pytest.approx(1) + assert aggregate_predictions[ + (aggregate_predictions.postal_code == "a") & (aggregate_predictions.district == "c") + ].reporting[1] == pytest.approx(1) + assert aggregate_predictions[ + (aggregate_predictions.postal_code == "b") & (aggregate_predictions.district == "a") + ].reporting[2] == pytest.approx(1) + assert aggregate_predictions[ + (aggregate_predictions.postal_code == "c") & (aggregate_predictions.district == "c") + ].reporting[3] == pytest.approx(2) + assert aggregate_predictions[ + (aggregate_predictions.postal_code == "d") & (aggregate_predictions.district == "c") + ].reporting[4] == pytest.approx(0) + assert aggregate_predictions[ + (aggregate_predictions.postal_code == "e") & (aggregate_predictions.district == "a") + ].reporting[5] == pytest.approx(0) + assert aggregate_predictions[ + (aggregate_predictions.postal_code == "e") & (aggregate_predictions.district == "e") + ].reporting[6] == pytest.approx(0) + assert aggregate_predictions[ + (aggregate_predictions.postal_code == "f") & (aggregate_predictions.district == "f") + ].reporting[7] == pytest.approx(0) + + +def test_get_quantile(bootstrap_election_model): + bootstrap_election_model.B = 1000 + alpha = 0.95 + lower_q, upper_q = bootstrap_election_model._get_quantiles(alpha) + assert lower_q == pytest.approx(0.025) + assert upper_q == pytest.approx(0.975) + + +def test_get_unit_prediction_intervals(bootstrap_election_model, rng): + reporting_units, nonreporting_units = None, None + n = 10 + B = 10000 + alpha = 0.95 + + s = 2 + bootstrap_election_model.weighted_yz_test_pred = rng.normal(scale=1, size=(n, 1)) + bootstrap_election_model.errors_B_1 = rng.normal(scale=s, size=(n, B)) + bootstrap_election_model.errors_B_2 = rng.normal(scale=s, size=(n, B)) + lower, upper = bootstrap_election_model.get_unit_prediction_intervals( + reporting_units, nonreporting_units, alpha, "margin" + ) + + lower_alpha = (1 - alpha) / 2 + upper_alpha = 1 - lower_alpha + + lower_q = np.floor(lower_alpha * (B + 1)) / B + upper_q = np.ceil(upper_alpha * (B - 1)) / B + + assert lower.shape == (n, 1) + assert upper.shape == (n, 1) + assert all(upper > lower) + assert all(upper > bootstrap_election_model.weighted_yz_test_pred) + assert all(bootstrap_election_model.weighted_yz_test_pred > lower) + + # sqrt of variance of two errors above + normal_lower_q = sp.stats.norm.ppf(lower_q, scale=np.sqrt(s**2 + s**2)) + normal_upper_q = sp.stats.norm.ppf(upper_q, scale=np.sqrt(s**2 + s**2)) + # arbitrarily one can be off because of sampling variability and rounding + assert ((bootstrap_election_model.weighted_yz_test_pred - normal_upper_q).round() == lower).mean() >= 0.9 + assert 
((bootstrap_election_model.weighted_yz_test_pred - normal_lower_q).round() == upper).mean() >= 0.9 + + +def test_get_aggregate_prediction_intervals(bootstrap_election_model, rng): + reporting_units = pd.DataFrame( + [ + ["a", -3, 0.2, 1, 1, 1, 1, 3, 5, 8, "c"], + ["a", 1, 0, 1, 1, 1, 1, 2, 1, 3, "a"], + ["b", 5, -0.1, 1, 1, 1, 1, 8, 3, 11, "a"], + ["c", 3, -0.2, 1, 1, 1, 1, 9, 1, 9, "c"], + ["c", 3, 0.8, 1, 1, 1, 1, 2, 4, 6, "c"], + ], + columns=[ + "postal_code", + "pred_margin", + "results_margin", + "results_weights", + "baseline_weights", + "turnout_factor", + "reporting", + "baseline_dem", + "baseline_gop", + "baseline_turnout", + "district", + ], + ) + reporting_units["results_normalized_margin"] = reporting_units.results_margin / reporting_units.results_weights + nonreporting_units = pd.DataFrame( + [ + ["a", -3, 0.2, 1, 1, 1, 0, 3, 5, 8, "c"], + ["b", 1, 0, 1, 1, 1, 0, 2, 1, 3, "a"], + ["d", 5, -0.1, 1, 1, 1, 0, 8, 3, 11, "c"], + ["d", 3, 0.8, 1, 1, 1, 0, 2, 4, 6, "c"], + ["e", 4, 0.1, 1, 1, 1, 0, 5, 1, 9, "e"], + ["e", 4, 0.1, 1, 1, 1, 0, 5, 1, 9, "a"], + ], + columns=[ + "postal_code", + "pred_margin", + "results_margin", + "results_weights", + "baseline_weights", + "turnout_factor", + "reporting", + "baseline_dem", + "baseline_gop", + "baseline_turnout", + "district", + ], + ) + nonreporting_units["results_normalized_margin"] = ( + nonreporting_units.results_margin / nonreporting_units.results_weights + ) + unexpected_units = pd.DataFrame( + [ + ["a", -3, 0.2, 1, 1, 1, 0, np.nan, np.nan, np.nan, "c"], + ["d", 1, 0, 1, 1, 1, 0, np.nan, np.nan, np.nan, "c"], + ["f", 5, -0.1, 1, 1, 1, 0, np.nan, np.nan, np.nan, "f"], + ["f", 3, 0.8, 1, 1, 1, 0, np.nan, np.nan, np.nan, "f"], + ], + columns=[ + "postal_code", + "pred_margin", + "results_margin", + "results_weights", + "baseline_weights", + "turnout_factor", + "reporting", + "baseline_dem", + "baseline_gop", + "baseline_turnout", + "district", + ], + ) + unexpected_units["results_normalized_margin"] = unexpected_units.results_margin / unexpected_units.results_weights + + n = nonreporting_units.shape[0] + B = 10 + s = 1.0 + bootstrap_election_model.B = B + bootstrap_election_model.errors_B_1 = rng.normal(scale=s, size=(n, B)) + bootstrap_election_model.errors_B_2 = rng.normal(scale=s, size=(n, B)) + bootstrap_election_model.errors_B_3 = rng.normal(scale=s, size=(n, B)) + bootstrap_election_model.errors_B_4 = rng.normal(scale=s, size=(n, B)) + bootstrap_election_model.weighted_z_test_pred = rng.normal(scale=s, size=(n, 1)) + bootstrap_election_model.weighted_yz_test_pred = rng.normal(scale=s, size=(n, 1)) + + lower, upper = bootstrap_election_model.get_aggregate_prediction_intervals( + reporting_units, nonreporting_units, unexpected_units, ["postal_code"], 0.95, None, None + ) + + assert lower.shape == (6, 1) + assert upper.shape == (6, 1) + + assert lower[2] == pytest.approx(upper[2]) # since c is fully reporting + assert lower[5] == pytest.approx(upper[5]) # since all f units are unexpected + assert all(lower <= upper) + + # test with more complicated aggregate + lower, upper = bootstrap_election_model.get_aggregate_prediction_intervals( + reporting_units, nonreporting_units, unexpected_units, ["postal_code", "district"], 0.95, None, None + ) + + assert lower.shape == (8, 1) + assert upper.shape == (8, 1) + + assert lower[0] == pytest.approx(upper[0]) # a-a is fully reporting + assert lower[3] == pytest.approx(upper[3]) # c-c is fully reporting + assert lower[7] == pytest.approx(upper[7]) # c-c is fully reporting + assert lower[7] == 
pytest.approx(upper[7]) # f-f is fully unexpected + assert all(lower <= upper) + + +def test_get_national_summary_estimates(bootstrap_election_model, rng): + n = 10 + s = 2.0 + B = 20 + bootstrap_election_model.B = B + bootstrap_election_model.aggregate_error_B_1 = rng.normal(scale=s, size=(n, B)) + bootstrap_election_model.aggregate_error_B_2 = rng.normal(scale=s, size=(n, B)) + bootstrap_election_model.aggregate_error_B_3 = rng.normal(scale=s, size=(n, B)) + bootstrap_election_model.aggregate_error_B_4 = rng.normal(scale=s, size=(n, B)) + bootstrap_election_model.aggregate_perc_margin_total = rng.normal(scale=s, size=(n, 1)) + + nat_sum_estimates = bootstrap_election_model.get_national_summary_estimates(None, None, 0, 0.95) + assert "margin" in nat_sum_estimates + assert len(nat_sum_estimates["margin"]) == 3 + assert nat_sum_estimates["margin"][0] >= nat_sum_estimates["margin"][1] + assert nat_sum_estimates["margin"][0] <= nat_sum_estimates["margin"][2] + + # testing adding to base + base_to_add = rng.random() + nat_sum_estimates_w_base = bootstrap_election_model.get_national_summary_estimates(None, None, base_to_add, 0.95) + assert nat_sum_estimates_w_base["margin"][0] == pytest.approx(nat_sum_estimates["margin"][0] + base_to_add) + assert nat_sum_estimates_w_base["margin"][1] == pytest.approx(nat_sum_estimates["margin"][1] + base_to_add) + assert nat_sum_estimates_w_base["margin"][2] == pytest.approx(nat_sum_estimates["margin"][2] + base_to_add) + + # test calling races + states_called = {i: 1 for i in range(n)} + nat_sum_data_dict = {i: 3 for i in range(n)} + nat_sum_data_dict[1] = 7 + nat_sum_estimates = bootstrap_election_model.get_national_summary_estimates( + nat_sum_data_dict, states_called, 0, 0.95 + ) + assert nat_sum_estimates["margin"][0] == pytest.approx(34) + assert nat_sum_estimates["margin"][1] == pytest.approx(34) + assert nat_sum_estimates["margin"][2] == pytest.approx(34) + + states_called = {i: 0 for i in range(n)} + nat_sum_estimates = bootstrap_election_model.get_national_summary_estimates( + nat_sum_data_dict, states_called, 0, 0.95 + ) + assert nat_sum_estimates["margin"][0] == pytest.approx(0) + assert nat_sum_estimates["margin"][1] == pytest.approx(0) + assert nat_sum_estimates["margin"][2] == pytest.approx(0) + + states_called = {i: 0 for i in range(n)} + states_called[1] = 1 + nat_sum_estimates = bootstrap_election_model.get_national_summary_estimates( + nat_sum_data_dict, states_called, 0, 0.95 + ) + assert nat_sum_estimates["margin"][0] == pytest.approx(7) + assert nat_sum_estimates["margin"][1] == pytest.approx(7) + assert nat_sum_estimates["margin"][2] == pytest.approx(7) + + # test exceptions + states_called = {i: 0 for i in range(n - 1)} + with pytest.raises(BootstrapElectionModelException): + nat_sum_estimates = bootstrap_election_model.get_national_summary_estimates( + nat_sum_data_dict, states_called, 0, 0.95 + ) + + nat_sum_data_dict = {i: 3 for i in range(n - 1)} + with pytest.raises(BootstrapElectionModelException): + nat_sum_estimates = bootstrap_election_model.get_national_summary_estimates( + nat_sum_data_dict, states_called, 0, 0.95 + ) + + +# TODO: write unit test for combined aggregation (e.g. prediction, intervals, aggregate etc.) 
+# also where an unexpected or non-reporting unit starts ahead of a reporting unit +def test_total_aggregation(bootstrap_election_model, va_assembly_precinct_data): + election_id = "2017-11-07_VA_G" + office_id = "Y" + geographic_unit_type = "precinct-district" + estimands = ["margin"] + estimands_baseline = {"margin": "margin"} + unexpected_units = 50 + alpha = 0.9 + percent_reporting_threshold = 100 + + live_data_handler = MockLiveDataHandler( + election_id, + office_id, + geographic_unit_type, + estimands, + unexpected_units=unexpected_units, + data=va_assembly_precinct_data, + ) + + live_data_handler.shuffle() + current_data = live_data_handler.get_percent_fully_reported(400) + + preprocessed_data_handler = PreprocessedDataHandler( + election_id, office_id, geographic_unit_type, estimands, estimands_baseline, data=va_assembly_precinct_data + ) + + combined_data_handler = CombinedDataHandler( + preprocessed_data_handler.data, current_data, estimands, geographic_unit_type + ) + + reporting_units = combined_data_handler.get_reporting_units(percent_reporting_threshold, 0.5, 1.5) + nonreporting_units = combined_data_handler.get_nonreporting_units(percent_reporting_threshold, 0.5, 1.5) + unexpected_units = combined_data_handler.get_unexpected_units( + percent_reporting_threshold, + aggregates=["postal_code", "district"], + turnout_factor_lower=0.5, + turnout_factor_upper=1.5, + ) + + bootstrap_election_model.B = 300 + + unit_predictions = bootstrap_election_model.get_unit_predictions( + reporting_units, nonreporting_units, "margin", unexpected_units=unexpected_units + ) + unit_lower, unit_upper = bootstrap_election_model.get_unit_prediction_intervals( + reporting_units, nonreporting_units, alpha, "margin" + ) + + assert unit_predictions.shape[0] == unit_lower.shape[0] + assert unit_predictions.shape[0] == unit_upper.shape[0] + assert all(unit_lower.flatten() <= unit_predictions.flatten()) + assert all(unit_predictions.flatten() <= unit_upper.flatten()) + + reporting_units["pred_margin"] = reporting_units.results_margin + nonreporting_units["pred_margin"] = unit_predictions + unexpected_units["pred_margin"] = unexpected_units.results_margin + + district_predictions = bootstrap_election_model.get_aggregate_predictions( + reporting_units, nonreporting_units, unexpected_units, ["postal_code", "district"], "margin" + ) + district_lower, district_upper = bootstrap_election_model.get_aggregate_prediction_intervals( + reporting_units, nonreporting_units, unexpected_units, ["postal_code", "district"], alpha, None, "margin" + ) + assert district_predictions.shape[0] == len(preprocessed_data_handler.data.district.unique()) + + assert district_predictions.shape[0] == district_lower.shape[0] + assert district_predictions.shape[0] == district_upper.shape[0] + + assert all(district_lower.flatten() <= district_predictions.pred_margin + TOL) + assert all(district_predictions.pred_margin <= district_upper.flatten() + TOL) diff --git a/tests/test_client.py b/tests/test_client.py index 802d20b4..93402322 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -618,7 +618,6 @@ def test_get_estimates_some_reporting(model_client, va_governor_county_data, va_ "upper_0.9_turnout", "results_turnout", ] - assert result["state_data"]["postal_code"][0] == "VA" assert result["state_data"]["pred_turnout"][0] == 2587563.0 assert result["state_data"]["results_turnout"][0] == 1570077.0