diff --git a/src/elexmodel/client.py b/src/elexmodel/client.py
index 554689ca..feb8cd65 100644
--- a/src/elexmodel/client.py
+++ b/src/elexmodel/client.py
@@ -232,7 +232,8 @@ def get_estimates(
         handle_unreporting = kwargs.get("handle_unreporting", "drop")

         district_election = False
-        if office in {"H", "Y", "Z"}:
+        # district elections are those whose office code starts with one of these letters
+        if office.startswith("H") or office.startswith("Y") or office.startswith("Z"):
             district_election = True

         model_settings = {
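Aside (not part of the patch): `str.startswith` also accepts a tuple of prefixes, so the three chained checks above could be collapsed; a minimal equivalent sketch:

```python
# equivalent to the three chained startswith calls in the hunk above
district_election = office.startswith(("H", "Y", "Z"))
```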
diff --git a/src/elexmodel/models/BootstrapElectionModel.py b/src/elexmodel/models/BootstrapElectionModel.py
index 003fe182..d271c483 100644
--- a/src/elexmodel/models/BootstrapElectionModel.py
+++ b/src/elexmodel/models/BootstrapElectionModel.py
@@ -1,11 +1,13 @@
 from __future__ import annotations  # pylint: disable=too-many-lines

 import logging
+from itertools import combinations

 import numpy as np
 import pandas as pd
 from elexsolver.OLSRegressionSolver import OLSRegressionSolver
 from elexsolver.QuantileRegressionSolver import QuantileRegressionSolver
+from scipy.linalg import block_diag
 from scipy.special import expit

 from elexmodel.handlers.data.Featurizer import Featurizer
@@ -90,6 +92,9 @@ def __init__(self, model_settings={}):
         self.lhs_called_threshold = 0.005
         self.rhs_called_threshold = -0.005

+        # this is the correlation structure we impose when we sample from the contest level random effects
+        self.contest_correlations = model_settings.get("contest_correlations", [])
+
         # Assume that we have a baseline normalized margin
         # (D^{Y'} - R^{Y'}) / (D^{Y'} + R^{Y'}) is one of the covariates
         if "baseline_normalized_margin" not in self.features:
@@ -156,11 +161,11 @@ def _estimate_epsilon(self, residuals: np.ndarray, aggregate_indicator: np.ndarr
         This function estimates the epsilon (contest level random effects)
         """
         # the estimate for epsilon is the average of the residuals in each contest
-        epsilon_hat = (aggregate_indicator.T @ residuals) / aggregate_indicator.sum(axis=0).reshape(-1, 1)
+        epsilon_hat, _, _, _ = np.linalg.lstsq(aggregate_indicator, residuals)
+
         # we can't estimate a contest level effect if only have 1 unit in that contest (since our residual can just)
         # be made equal to zero by setting the random effect to that value
         epsilon_hat[aggregate_indicator.sum(axis=0) < 2] = 0
-
         return epsilon_hat

     def _estimate_delta(
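A quick illustration (standalone sketch, not from the patch) of why the `lstsq` formulation matches the per-contest mean it replaces: with a one-hot contest indicator as the design matrix, the least-squares solution is exactly the average residual within each contest.

```python
import numpy as np

# 5 units across 2 contests; columns are one-hot contest memberships
aggregate_indicator = np.array([[1, 0], [1, 0], [0, 1], [0, 1], [0, 1]])
residuals = np.array([0.5, 0.7, 0.1, 0.2, 0.3]).reshape(-1, 1)

# least-squares estimate of the contest level random effects
epsilon_hat, _, _, _ = np.linalg.lstsq(aggregate_indicator, residuals, rcond=None)

# the old closed-form estimate: per-contest mean of the residuals
group_means = (aggregate_indicator.T @ residuals) / aggregate_indicator.sum(axis=0).reshape(-1, 1)

assert np.allclose(epsilon_hat, group_means)  # both are [[0.6], [0.2]]
```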
@@ -584,42 +589,94 @@ def _sample_test_epsilon(
     ) -> tuple[np.ndarray, np.ndarray]:
         """
         This function generates new test epsilons (contest level random effects)
-        NOTE: for now we are sampling from a normal with mean zero and sample variance.
-        """
-        # the contests that need a sampled contest level random effect are those that still have outstanding
-        # units (since for the others, the contest level random effect is a known fixed quantity) but no
-        # current estimate for the epsilon in that contest (so ones with NO training examples)
-
-        # this gives us indices of contests for which we have no samples in the training set
-        contests_not_in_reporting_units = np.where(np.all(aggregate_indicator_train == 0, axis=0))[0]
-        # gives us contests for which there is at least one county not reporting
-        contests_in_nonreporting_units = np.where(np.any(aggregate_indicator_test > 0, axis=0))[0]
-        # contests that need a random effect are:
-        #   contests that we want to make predictions on (ie. there are still outstanding units)
-        #   BUT no units in that contest are reporting
-        contests_that_need_random_effect = np.intersect1d(
-            contests_not_in_reporting_units, contests_in_nonreporting_units
-        )
-        # \epsilon ~ N(0, \Sigma)
-        mu_hat = [0, 0]
+        For partially observed contests, we apply a normal approximation and sample
+        from the normal distribution implied by the remaining uncertainty of a
+        sampling without replacement process.

-        # the variance is the sample variance of epsilons for which we have an estimate
+        For unobserved contests, we sample from the normal distribution implied by the
+        sample covariance of the partially observed epsilons.
+        """
         non_zero_epsilon_indices = np.nonzero(epsilon_y_hat)[0]
         # if there is only one non zero contest OR if the contest is a state level election only
         # then just sample from nearly zero since no variance
         if non_zero_epsilon_indices.shape[0] == 1:
             return np.zeros((1, self.B)), np.zeros((1, self.B))

-        sigma_hat = np.cov(np.concatenate([epsilon_y_hat, epsilon_z_hat], axis=1)[np.nonzero(epsilon_y_hat)[0]].T)
+        aggregate_indicator = np.concatenate([aggregate_indicator_train, aggregate_indicator_test], axis=0)
+
+        # computes the standard error of the epsilon_hat estimate for each contest
+        # formula is given by square root of (1 - n/N) * (pop variance) / n
+        # where n is the number of observed units and N is the number of units in the contest
+        # pop variance is the variance of the residuals in the contest
+        def _get_epsilon_hat_std(residuals, epsilon):
+            var = np.var(aggregate_indicator_train * residuals, axis=0)  # denominator is total rows, not n; rescaled below
+            var *= aggregate_indicator_train.shape[0] / (
+                aggregate_indicator_train.sum(axis=0) - 1
+            )  # Bessel's correction
+            var *= (
+                1 - aggregate_indicator_train.sum(axis=0) / aggregate_indicator.sum(axis=0)
+            ) / aggregate_indicator_train.sum(axis=0)
+
+            # where we have < 2 units in a contest, we set the variance to the variance
+            # of the observed epsilon_hat values
+            var[np.isnan(var) | np.isinf(var)] = np.var(epsilon[np.nonzero(epsilon)[0]].T, ddof=1)
+            return np.sqrt(var)
+
+        std_y = _get_epsilon_hat_std(residuals_y, epsilon_y_hat)
+        std_z = _get_epsilon_hat_std(residuals_z, epsilon_z_hat)
+
+        std = np.zeros((std_y.shape[0] + std_z.shape[0],))
+        std[0::2] = std_y
+        std[1::2] = std_z
+
+        # high observed correlation between epsilon_y_hat and epsilon_z_hat in 2020, so this is important
+        corr = np.corrcoef(np.concatenate([epsilon_y_hat, epsilon_z_hat], axis=1)[np.nonzero(epsilon_y_hat)[0]].T)
+        # tile corr into a block diagonal matrix
+        corr_list = [corr] * aggregate_indicator.shape[1]
+        corr_hat = block_diag(*corr_list)
+
+        # if we have additional inter-contest correlations to impose
+        # self.contest_correlations is a list of (contests, correlation) tuples
+        # e.g., [(("AK", "AL", "AR"), 0.5), (("CA", "CO", "CT"), 0.5)]
+        # this will impose a correlation of 0.5 among the sampled state-level swings
+        # for AK, AL, AR and for CA, CO, CT
+        if len(self.contest_correlations) > 0:
+            for contests, correlation in self.contest_correlations:
+                for c_1, c_2 in combinations(contests, 2):
+                    i, j = 2 * self.aggregate_names[c_1], 2 * self.aggregate_names[c_2]
+                    corr_hat[i, j] = correlation  # epsilon_y correlation
+                    corr_hat[i + 1, j + 1] = correlation  # epsilon_z correlation
+                    corr_hat[j, i] = correlation
+                    corr_hat[j + 1, i + 1] = correlation
+                    corr_hat[i + 1, j] = min(corr_hat[i, i + 1], correlation)
+                    corr_hat[j, i + 1] = min(corr_hat[i, i + 1], correlation)
+                    corr_hat[i, j + 1] = min(corr_hat[i, i + 1], correlation)
+                    corr_hat[j + 1, i] = min(corr_hat[i, i + 1], correlation)
+
+        # project the correlation matrix back onto the PSD cone
+        evals, evecs = np.linalg.eigh(corr_hat)
+        if (evals < -1e-5).any():
+            LOG.info("Epsilon correl. matrix is not PSD and requires projection.")
+            evals = np.clip(evals, 0, None)
+            corr_hat = evecs @ np.diag(evals) @ evecs.T
+            corr_hat /= np.sqrt(np.outer(np.diag(corr_hat), np.diag(corr_hat)))

-        # sample new test epsilons, but only for states that need tem
-        test_epsilon = self.rng.multivariate_normal(
-            mu_hat, sigma_hat, size=(len(contests_that_need_random_effect), self.B)
-        )
+        # \epsilon ~ N(0, \Sigma)
+        # up to the inter-contest correlations imposed above, Sigma is a block diagonal matrix
+        # with 2x2 blocks running down the diagonal and 0s elsewhere
+        # each block is the covariance matrix of epsilon_y and epsilon_z for a particular contest,
+        # e.g., the first block is for the AK contest, the second block is for the AL contest, etc.
+        # we can sample from this distribution to get new epsilons
+        mu_hat = np.zeros(corr_hat.shape[0])
+        sigma_hat = np.diag(std) @ corr_hat @ np.diag(std)
+
+        test_epsilon = self.rng.multivariate_normal(mu_hat, sigma_hat, size=self.B)
+
+        test_epsilon_y = test_epsilon[:, 0::2].T
+        test_epsilon_z = test_epsilon[:, 1::2].T
+
+        test_epsilon_y = aggregate_indicator_test @ test_epsilon_y
+        test_epsilon_z = aggregate_indicator_test @ test_epsilon_z

-        test_epsilon_y = aggregate_indicator_test[:, contests_that_need_random_effect] @ test_epsilon[:, :, 0]
-        test_epsilon_z = aggregate_indicator_test[:, contests_that_need_random_effect] @ test_epsilon[:, :, 1]
         return test_epsilon_y, test_epsilon_z

     def _sample_test_errors(
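A self-contained sketch (illustrative, not from the patch) of the eigenvalue-clipping step in the hunk above: clip negative eigenvalues to zero and rescale so the diagonal returns to 1, the standard way to push an indefinite "correlation" matrix back to a valid one.

```python
import numpy as np

# an invalid "correlation" matrix: these pairwise entries cannot coexist
corr = np.array([
    [1.0, 0.9, -0.9],
    [0.9, 1.0, 0.9],
    [-0.9, 0.9, 1.0],
])

evals, evecs = np.linalg.eigh(corr)
print(evals)  # one eigenvalue is negative, so corr is not PSD

evals = np.clip(evals, 0, None)                       # drop the negative mass
psd = evecs @ np.diag(evals) @ evecs.T                # reassemble
psd /= np.sqrt(np.outer(np.diag(psd), np.diag(psd)))  # restore unit diagonal

print(np.linalg.eigvalsh(psd).min() >= -1e-12)  # True: valid correlation matrix
```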
@@ -785,11 +842,34 @@ def compute_bootstrap_errors(
         # between unit and contest (ie. a 1 in i,j says that unit j belongs to contest i)
         # in case district election we need to create a variable that defines the state, district
         # which is what the contest is
-        if self.district_election:
+        if self.district_election:  # want to model aggregate effect at both district and state levels
             all_units["postal_code-district"] = all_units[["postal_code", "district"]].agg("_".join, axis=1)
-            aggregate_indicator = pd.get_dummies(all_units["postal_code-district"]).values
+
+            contest_indicator = pd.get_dummies(all_units["postal_code-district"])
+            postal_code_indicator = pd.get_dummies(all_units["postal_code"])
+
+            # drop districts that are at-large districts for a state
+            postal_code_filter = all_units.groupby("postal_code")["postal_code-district"].nunique() > 1
+            valid_postal_codes = postal_code_filter[postal_code_filter].index
+            valid_districts = all_units[all_units.postal_code.isin(valid_postal_codes)]["postal_code-district"].unique()
+            contest_indicator_filtered = contest_indicator.loc[:, valid_districts]
+
+            # drop contest indicators if there are 10 or fewer units in the contest
+            contest_indicator_filtered = contest_indicator_filtered.loc[:, contest_indicator.sum(axis=0) > 10]
+
+            self.aggregate_names = {
+                c: i
+                for i, c in enumerate(
+                    postal_code_indicator.columns.tolist() + contest_indicator_filtered.columns.tolist()
+                )
+            }
+            aggregate_indicator = np.concatenate(
+                (postal_code_indicator.values, contest_indicator_filtered.values), axis=1
+            )
         else:
-            aggregate_indicator = pd.get_dummies(all_units["postal_code"]).values
+            contest_indicator = pd.get_dummies(all_units["postal_code"])
+            self.aggregate_names = {c: i for i, c in enumerate(contest_indicator.columns.tolist())}
+            aggregate_indicator = contest_indicator.values

         aggregate_indicator_expected = aggregate_indicator[: (n_train + n_test)]
         aggregate_indicator_train = aggregate_indicator_expected[:n_train]
@@ -1023,7 +1103,7 @@ def compute_bootstrap_errors(
         # and turn into turnout estimate
         self.weighted_z_test_pred = z_test_pred * weights_test
         self.ran_bootstrap = True
-        self.n_contests = aggregate_indicator.shape[1]
+        self.n_contests = contest_indicator.values.shape[1]

     def get_unit_predictions(
         self, reporting_units: pd.DataFrame, nonreporting_units: pd.DataFrame, estimand: str, **kwargs
@@ -1437,6 +1517,10 @@ def get_aggregate_prediction_intervals(
         interval_upper = interval_upper.reshape(-1, 1)
         interval_lower = interval_lower.reshape(-1, 1)

+        # guarantee overlap between the prediction interval and the point prediction
+        interval_lower = np.minimum(interval_lower, aggregate_perc_margin_total - 0.001)
+        interval_upper = np.maximum(interval_upper, aggregate_perc_margin_total + 0.001)
+
         return PredictionIntervals(interval_lower, interval_upper)

     def get_national_summary_estimates(self, nat_sum_data_dict: dict, base_to_add: int | float, alpha: float) -> list:
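How the new setting would be supplied (a hypothetical sketch; everything except the `contest_correlations` structure, which follows the docstring example in the patch, is illustrative): `BootstrapElectionModel.__init__` reads `contest_correlations` out of `model_settings`, and `_sample_test_epsilon` uses it to correlate the sampled state-level swings.

```python
# hypothetical configuration: a list of (contest_group, correlation) tuples
model_settings = {
    "contest_correlations": [
        (("AK", "AL", "AR"), 0.5),  # 0.5 correlation among these states' swings
        (("CA", "CO", "CT"), 0.5),
    ],
}
# inside BootstrapElectionModel.__init__ (from the patch):
#     self.contest_correlations = model_settings.get("contest_correlations", [])
```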
diff --git a/tests/models/test_bootstrap_election_model.py b/tests/models/test_bootstrap_election_model.py
index 39021b6f..5bd0edc9 100644
--- a/tests/models/test_bootstrap_election_model.py
+++ b/tests/models/test_bootstrap_election_model.py
@@ -39,9 +39,11 @@ def test_estimate_epsilon(bootstrap_election_model):
     residuals = np.asarray([0.5, 0.5, 0.3, 0.8, 0.5]).reshape(-1, 1)
     aggregate_indicator = np.asarray([[1, 1, 0, 0, 0], [0, 0, 1, 1, 0], [0, 0, 0, 0, 1]]).T
     epsilon_hat = bootstrap_election_model._estimate_epsilon(residuals, aggregate_indicator)
-    assert epsilon_hat[0] == (residuals[0] + residuals[1]) / 2
-    assert epsilon_hat[1] == (residuals[2] + residuals[3]) / 2
-    assert epsilon_hat[2] == 0  # since the aggrgate indicator for that row only has one 1 which is less than 2
+    assert np.isclose(epsilon_hat[0], (residuals[0] + residuals[1]) / 2)
+    assert np.isclose(epsilon_hat[1], (residuals[2] + residuals[3]) / 2)
+    assert np.isclose(
+        epsilon_hat[2], 0
+    )  # since the aggregate indicator for that row only has one 1, which is less than 2


 def test_estimate_delta(bootstrap_election_model):
@@ -405,31 +407,21 @@ def test_sample_test_epsilon(bootstrap_election_model):
     # circumstance is here: https://arcpublishing.atlassian.net/browse/ELEX-3431
     epsilon_hat = bootstrap_election_model._estimate_epsilon(residuals, aggregate_indicator_train)

-    epsilon_y, epsilon_z = bootstrap_election_model._sample_test_epsilon(
-        residuals, residuals, epsilon_hat, epsilon_hat, aggregate_indicator_train, aggregate_indicator_test
-    )
+    epsilon_y, epsilon_z = bootstrap_election_model._sample_test_epsilon(
+        residuals, residuals, epsilon_hat, epsilon_hat, aggregate_indicator_train, aggregate_indicator_test
+    )

     assert epsilon_y.shape == (aggregate_indicator_test.shape[0], bootstrap_election_model.B)
     assert epsilon_z.shape == (aggregate_indicator_test.shape[0], bootstrap_election_model.B)

-    # aggregate 3 has no elements in the training set, and rows 2,3,5 in the test set
-    # are parts of aggregate 3
-    assert np.isclose(epsilon_z[0], 0).all()
-    assert np.isclose(epsilon_z[1], 0).all()
+    # test epsilons should never be zero
+    assert not np.isclose(epsilon_z[0], 0).all()
+    assert not np.isclose(epsilon_z[1], 0).all()
     assert not np.isclose(epsilon_z[2], 0).all()
     assert not np.isclose(epsilon_z[3], 0).all()
-    assert np.isclose(epsilon_z[4], 0).all()
+    assert not np.isclose(epsilon_z[4], 0).all()
     assert not np.isclose(epsilon_z[5], 0).all()

-    # testing that if there is only one element epsilon_hat that we return 0
-    epsilon_y, epsilon_z = bootstrap_election_model._sample_test_epsilon(
-        residuals, residuals, [[4]], [[4]], aggregate_indicator_train, aggregate_indicator_test
-    )
-    assert epsilon_y.shape == (1, bootstrap_election_model.B)
-    assert epsilon_z.shape == (1, bootstrap_election_model.B)
-
-    assert np.isclose(epsilon_z[0], 0).all()
-

 def test_sample_test_errors(bootstrap_election_model):
     residuals = np.asarray([0.5, 0.5, 0.3, 0.8, 0.5]).reshape(-1, 1)
@@ -965,8 +957,8 @@ def test_get_aggregate_prediction_intervals(bootstrap_election_model, rng):
     assert lower.shape == (6, 1)
     assert upper.shape == (6, 1)

-    assert lower[2] == pytest.approx(upper[2])  # since c is fully reporting
-    assert lower[5] == pytest.approx(upper[5])  # since all f units are unexpected
+    assert lower[2] == pytest.approx(upper[2] - 0.002)  # since c is fully reporting
+    assert lower[5] == pytest.approx(upper[5] - 0.002)  # since all f units are unexpected
     assert all(lower <= upper)

     # test race calls
@@ -989,7 +981,7 @@
         None,
         lhs_called_contests=lhs_called_contests,
     )
-    assert (lower >= bootstrap_election_model.lhs_called_threshold).all()
+    assert (lower >= bootstrap_election_model.lhs_called_threshold - 0.001).all()
     assert (upper >= bootstrap_election_model.lhs_called_threshold).all()
     assert (bootstrap_election_model.divided_error_B_1 == bootstrap_election_model.lhs_called_threshold).all()
     assert (bootstrap_election_model.divided_error_B_2 == bootstrap_election_model.lhs_called_threshold).all()
@@ -1014,7 +1006,7 @@
         rhs_called_contests=rhs_called_contests,
    )
     assert (lower <= bootstrap_election_model.rhs_called_threshold).all()
-    assert (upper <= bootstrap_election_model.rhs_called_threshold).all()
+    assert (upper <= bootstrap_election_model.rhs_called_threshold + 0.001).all()
     assert (bootstrap_election_model.divided_error_B_1 == bootstrap_election_model.rhs_called_threshold).all()
     assert (bootstrap_election_model.divided_error_B_2 == bootstrap_election_model.rhs_called_threshold).all()
@@ -1045,10 +1037,10 @@
     assert lower.shape == (8, 1)
     assert upper.shape == (8, 1)

-    assert lower[0] == pytest.approx(upper[0])  # a-a is fully reporting
-    assert lower[3] == pytest.approx(upper[3])  # c-c is fully reporting
-    assert lower[7] == pytest.approx(upper[7])  # c-c is fully reporting
-    assert lower[7] == pytest.approx(upper[7])  # f-f is fully unexpected
+    assert lower[0] == pytest.approx(upper[0] - 0.002)  # a-a is fully reporting
+    assert lower[3] == pytest.approx(upper[3] - 0.002)  # c-c is fully reporting
+    assert lower[7] == pytest.approx(upper[7] - 0.002)  # c-c is fully reporting
+    assert lower[7] == pytest.approx(upper[7] - 0.002)  # f-f is fully unexpected
     assert all(lower <= upper)
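A worked numeric check (standalone sketch) of the standard-error formula these new test tolerances depend on, sqrt((1 - n/N) * s^2 / n): as n approaches N, the finite population correction (1 - n/N) drives the standard error to zero, which is why fully reporting contests collapse to nearly degenerate intervals.

```python
import numpy as np

def epsilon_hat_se(sample_var: float, n: int, N: int) -> float:
    """Standard error under sampling without replacement:
    sqrt((1 - n/N) * sample_var / n)."""
    return np.sqrt((1 - n / N) * sample_var / n)

s2 = 0.04  # illustrative within-contest residual variance
print(epsilon_hat_se(s2, n=10, N=100))   # ~0.0600: most units outstanding
print(epsilon_hat_se(s2, n=90, N=100))   # ~0.0067: nearly fully reported
print(epsilon_hat_se(s2, n=100, N=100))  # 0.0: fully reported, no sampling uncertainty
```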
diff --git a/tests/test_client.py b/tests/test_client.py
index 3b821d28..8ecb690b 100644
--- a/tests/test_client.py
+++ b/tests/test_client.py
@@ -876,5 +876,9 @@ def test_get_national_summary_votes_estimates(model_client, va_governor_county_d

     current = model_client.get_national_summary_votes_estimates(None, 0, [0.99])

-    pd.testing.assert_frame_equal(current, model_client.results_handler.final_results["nat_sum_data"])
-    pd.testing.assert_frame_equal(expected_df, model_client.results_handler.final_results["nat_sum_data"])
+    pd.testing.assert_frame_equal(
+        current, model_client.results_handler.final_results["nat_sum_data"], check_dtype=False
+    )
+    pd.testing.assert_frame_equal(
+        expected_df, model_client.results_handler.final_results["nat_sum_data"], check_dtype=False
+    )
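For context (standalone sketch, not part of the patch): `check_dtype=False` makes `assert_frame_equal` compare values while tolerating dtype drift, e.g. an int64 column versus the float64 it becomes after aggregation.

```python
import pandas as pd

left = pd.DataFrame({"margin": [1, 2]})       # int64 column
right = pd.DataFrame({"margin": [1.0, 2.0]})  # float64 column

# would raise with the default check_dtype=True; passes when dtypes are ignored
pd.testing.assert_frame_equal(left, right, check_dtype=False)
```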