From 64debdeb8b8ed5c784b5e027ac5b465baedf4765 Mon Sep 17 00:00:00 2001 From: lennybronner Date: Wed, 3 Jul 2024 00:17:12 -0400 Subject: [PATCH 1/5] improving residuals --- src/elexsolver/LinearSolver.py | 59 ++++++++++++++++++++-- src/elexsolver/OLSRegressionSolver.py | 43 +++++++++------- src/elexsolver/QuantileRegressionSolver.py | 46 +++++++++++++---- 3 files changed, 115 insertions(+), 33 deletions(-) diff --git a/src/elexsolver/LinearSolver.py b/src/elexsolver/LinearSolver.py index 949cb5bd..4ae29b9c 100644 --- a/src/elexsolver/LinearSolver.py +++ b/src/elexsolver/LinearSolver.py @@ -29,21 +29,24 @@ class LinearSolver(ABC): def __init__(self): self.coefficients = None + self.rng = np.random.default_rng(seed=0) @classmethod - def fit(self, x: np.ndarray, y: np.ndarray, weights: np.ndarray | None = None, lambda_: float = 0.0, *kwargs): + def fit(self, x: np.ndarray, y: np.ndarray, weights: np.ndarray | None = None, lambda_: float = 0.0, cache: bool = True, **kwargs): """ Fits model """ raise NotImplementedError - def predict(self, x: np.ndarray) -> np.ndarray: + def predict(self, x: np.ndarray, coefficients: np.ndarray | None = None) -> np.ndarray: """ Use coefficients to predict """ self._check_any_element_nan_or_inf(x) - return x @ self.coefficients + if coefficients is None: + return x @ self.coefficients + return x @ coefficients def get_coefficients(self) -> np.ndarray: """ @@ -77,3 +80,53 @@ def _check_intercept(self, x): """ if ~np.all(x[:, 0] == 1): warnings.warn("Warning: fit_intercept=True and not all elements of the first columns are 1s") + + def residuals(self, x: np.ndarray, y: np.ndarray, weights: np.ndarray | None = None, K: int | None = None, center: bool = True, **kwargs) -> np.ndarray: + """ + Computes residuals for the model + """ + if K is None: + # when K is None we are just using the training residuals + y_hat = self.predict(x).reshape(*y.shape) + residuals = y - y_hat + else: + if weights is None: + weights = np.ones_like(y) + + # create indices for k-fold cross-fitting + indices = np.arange(x.shape[0]) + # shuffle for random order of datapoints + self.rng.shuffle(indices) + x_shuffled, y_shuffled, weights_shuffled = x[indices], y[indices], weights[indices] + + # create folds + x_folds = np.array_split(x_shuffled, K) + y_folds = np.array_split(y_shuffled, K) + weights_folds = np.array_split(weights_shuffled, K) + + residuals = [] + for k in range(K): + # extract test points + x_test, y_test, weights_test, = x_folds[k], y_folds[k], weights_folds[k] + + # extract training points + x_train = np.concatenate([x_folds[j] for j in range(K) if j != k]) + y_train = np.concatenate([y_folds[j] for j in range(K) if j != k]) + weights_train = np.concatenate([weights_folds[j] for j in range(K) if j != k]) + + # fit k-th model + coefficients_k = self.fit(x_train, y_train, weights=weights_train, cache=False, **kwargs) + y_hat_k = self.predict(x_test, coefficients=coefficients_k) + + # k-th residuals + residuals_k = y_test - y_hat_k + residuals.append(residuals_k) + + residuals = np.concatenate(residuals) + # undo shuffling of residuals, to put them in the original dataset order + residuals = residuals[np.argsort(indices)] + + if center: + residuals -= np.mean(residuals, axis=0) + + return residuals diff --git a/src/elexsolver/OLSRegressionSolver.py b/src/elexsolver/OLSRegressionSolver.py index ca055354..de066491 100644 --- a/src/elexsolver/OLSRegressionSolver.py +++ b/src/elexsolver/OLSRegressionSolver.py @@ -92,6 +92,7 @@ def fit( fit_intercept: bool = True, regularize_intercept: 
bool = False, n_feat_ignore_reg: int = 0, + cache: bool = True ): self._check_any_element_nan_or_inf(x) self._check_any_element_nan_or_inf(y) @@ -111,33 +112,37 @@ def fit( # in the bootstrap setting we can now pass in the normal equations and can # save time re-computing them if normal_eqs is None: - self.normal_eqs = self._compute_normal_equations( + normal_eqs = self._compute_normal_equations( x, L, lambda_, fit_intercept, regularize_intercept, n_feat_ignore_reg ) - else: - self.normal_eqs = normal_eqs # compute hat matrix: X (X^T X)^{-1} X^T - self.hat_vals = np.diag(x @ self.normal_eqs @ L) - + hat_vals = np.diag(x @ normal_eqs @ L) # compute coefficients: (X^T X)^{-1} X^T y - self.coefficients = self.normal_eqs @ L @ y + coefficients = normal_eqs @ L @ y - def residuals(self, y: np.ndarray, y_hat: np.ndarray, loo: bool = True, center: bool = True) -> np.ndarray: - """ - Computes residuals for the model - """ - # compute standard residuals - residuals = y - y_hat + if cache: + self.normal_eqs = normal_eqs + self.hat_vals = hat_vals + self.coefficients = coefficients + + return coefficients - # if leave one out is True, inflate by (1 - P) - # in OLS setting inflating by (1 - P) is the same as computing the leave one out residuals - # the un-inflated training residuals are too small, since training covariates were observed during fitting - if loo: + def residuals(self, x: np.ndarray, y: np.ndarray, weights: np.ndarray | None = None, K: int | None = None, center: bool = True, **kwargs) -> np.ndarray: + if K == x.shape[0]: + # compute standard residuals + y_hat = self.predict(x) + residuals = y - y_hat + + # inflate by (1 - P) + # in OLS setting inflating by (1 - P) is the same as computing the leave one out residuals + # the un-inflated training residuals are too small, since training covariates were observed during fitting residuals /= (1 - self.hat_vals).reshape(-1, 1) - # centering removes the column mean - if center: - residuals -= np.mean(residuals, axis=0) + # centering removes the column mean + if center: + residuals -= np.mean(residuals, axis=0) + else: + residuals = super().residuals(x, y, weights, K, center, **kwargs) return residuals diff --git a/src/elexsolver/QuantileRegressionSolver.py b/src/elexsolver/QuantileRegressionSolver.py index cfe4c525..97211db0 100644 --- a/src/elexsolver/QuantileRegressionSolver.py +++ b/src/elexsolver/QuantileRegressionSolver.py @@ -30,7 +30,11 @@ def _fit(self, x: np.ndarray, y: np.ndarray, weights: np.ndarray, tau: float) -> # A_eq are the equality constraint matrix # b_eq is the equality constraint vector (ie. 
A_eq @ x = b_eq) # bounds are the (min, max) possible values of every element of x - res = linprog(-1 * S, A_eq=Phi.T, b_eq=zeros, bounds=bounds, method="highs", options={"presolve": False}) + try: + res = linprog(-1 * S, A_eq=Phi.T, b_eq=zeros, bounds=bounds, method="highs", options={"presolve": False}) + except: + import pdb; pdb.set_trace() + # marginal are the dual values, since we are solving the dual this is equivalent to the primal return -1 * res.eqlin.marginals @@ -84,6 +88,7 @@ def fit( regularize_intercept: bool = False, n_feat_ignore_reg: int = 0, normalize_weights: bool = True, + cache: bool = True ): """ Fits quantile regression @@ -105,23 +110,42 @@ def fit( raise ZeroDivisionError weights = weights / weights_sum + if y.ndim == 1: # code expects 2-dim array + y = y.reshape(-1,1) + # _fit assumes that taus is list, so if we want to do one value of tau then turn into a list if isinstance(taus, float): taus = [taus] + else: + assert y.shape[1] == 1 # you can either have multiple taus or multiple ys + coefficients_array = [] for tau in taus: - if lambda_ > 0: - coefficients = self._fit_with_regularization( - x, y, weights, tau, lambda_, regularize_intercept, n_feat_ignore_reg - ) - else: - coefficients = self._fit(x, y, weights, tau) - self.coefficients.append(coefficients) - - def predict(self, x: np.ndarray) -> np.ndarray: + for y_arr in y.T: + y_arr = y_arr.reshape(-1,1) + if lambda_ > 0: + coefficients = self._fit_with_regularization( + x, y_arr, weights, tau, lambda_, regularize_intercept, n_feat_ignore_reg + ) + else: + coefficients = self._fit(x, y_arr, weights, tau) + coefficients_array.append(coefficients) + + coefficients_array = np.asarray(coefficients_array).T + if cache: + self.coefficients = coefficients_array + + return coefficients_array + + def predict(self, x: np.ndarray, coefficients: np.ndarray | None = None) -> np.ndarray: """ Use coefficients to predict """ self._check_any_element_nan_or_inf(x) - return self.coefficients @ x.T + if coefficients is None: + preds = x @ self.coefficients + else: + preds = x @ coefficients + + return preds \ No newline at end of file From d752cd7720d7204af0b47ba2a81c41c425ff4cbc Mon Sep 17 00:00:00 2001 From: lennybronner Date: Tue, 24 Sep 2024 21:09:55 -0400 Subject: [PATCH 2/5] fixed unit tests --- src/elexsolver/QuantileRegressionSolver.py | 2 - tests/test_ols.py | 14 +++---- tests/test_quantile.py | 48 +++++++++++----------- 3 files changed, 31 insertions(+), 33 deletions(-) diff --git a/src/elexsolver/QuantileRegressionSolver.py b/src/elexsolver/QuantileRegressionSolver.py index f8f6fd39..91665ee1 100644 --- a/src/elexsolver/QuantileRegressionSolver.py +++ b/src/elexsolver/QuantileRegressionSolver.py @@ -118,11 +118,9 @@ def fit( taus = [taus] else: assert y.shape[1] == 1 # you can either have multiple taus or multiple ys - coefficients_array = [] for tau in taus: for y_arr in y.T: - y_arr = y_arr.reshape(-1,1) if lambda_ > 0: coefficients = self._fit_with_regularization( x, y_arr, weights, tau, lambda_, regularize_intercept, n_feat_ignore_reg diff --git a/tests/test_ols.py b/tests/test_ols.py index 935b2e63..e3999915 100644 --- a/tests/test_ols.py +++ b/tests/test_ols.py @@ -211,17 +211,17 @@ def test_residuals_no_weights(random_data_no_weights): lm.fit(x, y, fit_intercept=False) predictions = lm.predict(x) - residuals = lm.residuals(y, predictions, loo=False, center=False) - + residuals = lm.residuals(x, y, K=None, center=False) assert residuals[0] == pytest.approx(0.885973530) assert residuals[-1] == 
pytest.approx(0.841996302) - residuals = lm.residuals(y, predictions, loo=True, center=False) + # equivalent of leave-one-out residuals + residuals = lm.residuals(x, y, K=100, center=False) assert residuals[0] == pytest.approx(0.920112164) assert residuals[-1] == pytest.approx(0.875896477) - residuals = lm.residuals(y, predictions, loo=True, center=True) + residuals = lm.residuals(x, y, K=100, center=True) assert np.sum(residuals) == pytest.approx(0) @@ -234,17 +234,17 @@ def test_residuals_weights(random_data_weights): lm.fit(x, y, weights=weights, fit_intercept=False) predictions = lm.predict(x) - residuals = lm.residuals(y, predictions, loo=False, center=False) + residuals = lm.residuals(x, y, weights=weights, K=None, center=False) assert residuals[0] == pytest.approx(-1.971798590) assert residuals[-1] == pytest.approx(-1.373951578) - residuals = lm.residuals(y, predictions, loo=True, center=False) + residuals = lm.residuals(x, y, weights=weights, K=100, center=False) assert residuals[0] == pytest.approx(-1.999718445) assert residuals[-1] == pytest.approx(-1.438563033) - residuals = lm.residuals(y, predictions, loo=True, center=True) + residuals = lm.residuals(x, y, weights=weights, K=100, center=True) assert np.sum(residuals) == pytest.approx(0) diff --git a/tests/test_quantile.py b/tests/test_quantile.py index 792c6bd5..515bdc0f 100644 --- a/tests/test_quantile.py +++ b/tests/test_quantile.py @@ -23,7 +23,7 @@ def test_basic_median_1(): preds = quantreg.predict(x) # you'd think it would be 8 instead of 7.5, but run quantreg in R to confirm # has to do with missing intercept - np.testing.assert_array_equal(preds, [[7.5, 7.5, 7.5, 15]]) + np.testing.assert_array_equal(preds, [[7.5], [7.5], [7.5], [15]]) def test_basic_median_2(): @@ -33,7 +33,7 @@ def test_basic_median_2(): quantreg = QuantileRegressionSolver() tau = 0.5 x = np.asarray([[1, 1], [1, 1], [1, 1], [1, 2]]) y = np.asarray([3, 8, 9, 15]) quantreg.fit(x, y, tau) preds = quantreg.predict(x) - np.testing.assert_array_equal(preds, [[8, 8, 8, 15]]) + np.testing.assert_array_equal(preds, [[8], [8], [8], [15]]) def test_basic_lower(): @@ -43,7 +43,7 @@ def test_basic_lower(): quantreg = QuantileRegressionSolver() tau = 0.1 x = np.asarray([[1, 1], [1, 1], [1, 1], [1, 2]]) y = np.asarray([3, 8, 9, 15]) quantreg.fit(x, y, tau) preds = quantreg.predict(x) - np.testing.assert_array_equal(preds, [[3, 3, 3, 15]]) + np.testing.assert_array_equal(preds, [[3], [3], [3], [15]]) def test_basic_upper(): @@ -53,7 +53,7 @@ def test_basic_upper(): quantreg = QuantileRegressionSolver() tau = 0.9 x = np.asarray([[1, 1], [1, 1], [1, 1], [1, 2]]) y = np.asarray([3, 8, 9, 15]) quantreg.fit(x, y, tau) preds = quantreg.predict(x) - np.testing.assert_array_equal(preds, [[9, 9, 9, 15]]) + np.testing.assert_array_equal(preds, [[9], [9], [9], [15]]) ###################### @@ -68,7 +68,7 @@ def test_random_median(random_data_no_weights): y = random_data_no_weights["y"].values quantreg.fit(x, y, tau, fit_intercept=False) quantreg.predict(x) - np.testing.assert_allclose(quantreg.coefficients, [[1.57699, 6.74906, 4.40175, 4.85346, 4.51814]], rtol=TOL) + np.testing.assert_allclose(quantreg.coefficients, [[1.57699], [6.74906], [4.40175], [4.85346], [4.51814]], rtol=TOL) def test_random_lower(random_data_no_weights): @@ -78,7 +78,7 @@ def test_random_lower(random_data_no_weights): y = random_data_no_weights["y"].values quantreg.fit(x, y, tau, fit_intercept=False) quantreg.predict(x) - np.testing.assert_allclose(quantreg.coefficients, [[0.17759, 6.99588, 4.18896, 4.83906, 3.22546]], rtol=TOL) + np.testing.assert_allclose(quantreg.coefficients, [[0.17759], [6.99588], [4.18896], [4.83906], [3.22546]], rtol=TOL) def test_random_upper(random_data_no_weights): @@ -88,7 +88,7 @@ def test_random_upper(random_data_no_weights): y = random_data_no_weights["y"].values 
quantreg.fit(x, y, tau, fit_intercept=False) quantreg.predict(x) - np.testing.assert_allclose(quantreg.coefficients, [[1.85617, 6.81286, 6.05586, 5.51965, 4.19864]], rtol=TOL) + np.testing.assert_allclose(quantreg.coefficients, [[1.85617], [6.81286], [6.05586], [5.51965], [4.19864]], rtol=TOL) ################# @@ -103,10 +103,10 @@ def test_multiple(random_data_no_weights): y = random_data_no_weights["y"].values quantreg.fit(x, y, taus, fit_intercept=False) quantreg.predict(x) - assert len(quantreg.coefficients) == 3 - np.testing.assert_allclose(quantreg.coefficients[0], [0.17759, 6.99588, 4.18896, 4.83906, 3.22546], rtol=TOL) - np.testing.assert_allclose(quantreg.coefficients[1], [1.57699, 6.74906, 4.40175, 4.85346, 4.51814], rtol=TOL) - np.testing.assert_allclose(quantreg.coefficients[2], [1.85617, 6.81286, 6.05586, 5.51965, 4.19864], rtol=TOL) + assert quantreg.coefficients.shape == (5, 3) + np.testing.assert_allclose(quantreg.coefficients[:,0], [0.17759, 6.99588, 4.18896, 4.83906, 3.22546], rtol=TOL) + np.testing.assert_allclose(quantreg.coefficients[:,1], [1.57699, 6.74906, 4.40175, 4.85346, 4.51814], rtol=TOL) + np.testing.assert_allclose(quantreg.coefficients[:,2], [1.85617, 6.81286, 6.05586, 5.51965, 4.19864], rtol=TOL) ###################### @@ -122,7 +122,7 @@ def test_basic_median_weights(): weights = np.asarray([1, 1, 100, 3]) quantreg.fit(x, y, tau, weights) preds = quantreg.predict(x) - np.testing.assert_array_equal(preds, [[9, 9, 9, 15]]) + np.testing.assert_array_equal(preds, [[9], [9], [9], [15]]) def test_random_median_weights(random_data_weights): @@ -133,7 +133,7 @@ def test_random_median_weights(random_data_weights): weights = random_data_weights["weights"].values quantreg.fit(x, y, tau, weights=weights, fit_intercept=False) quantreg.predict(x) - np.testing.assert_allclose(quantreg.coefficients, [[1.59521, 2.17864, 4.68050, 3.10920, 9.63739]], rtol=TOL) + np.testing.assert_allclose(quantreg.coefficients, [[1.59521], [2.17864], [4.68050], [3.10920], [9.63739]], rtol=TOL) def test_random_lower_weights(random_data_weights): @@ -144,7 +144,7 @@ def test_random_lower_weights(random_data_weights): weights = random_data_weights["weights"].values quantreg.fit(x, y, tau, weights=weights, fit_intercept=False) quantreg.predict(x) - np.testing.assert_allclose(quantreg.coefficients, [[0.63670, 1.27028, 4.81500, 3.08055, 8.69929]], rtol=TOL) + np.testing.assert_allclose(quantreg.coefficients, [[0.63670], [1.27028], [4.81500], [3.08055], [8.69929]], rtol=TOL) def test_random_upper_weights(random_data_weights): @@ -155,7 +155,7 @@ def test_random_upper_weights(random_data_weights): weights = random_data_weights["weights"].values quantreg.fit(x, y, tau, weights=weights, fit_intercept=False) quantreg.predict(x) - np.testing.assert_allclose(quantreg.coefficients, [[3.47742, 2.07360, 4.51754, 4.15237, 9.58856]], rtol=TOL) + np.testing.assert_allclose(quantreg.coefficients, [[3.47742], [2.07360], [4.51754], [4.15237], [9.58856]], rtol=TOL) ############################# @@ -189,12 +189,12 @@ def test_weight_normalization_same_fit(random_data_weights): quantreg = QuantileRegressionSolver() quantreg.fit(x, y, tau, weights, normalize_weights=True) preds = quantreg.predict(x) - np.testing.assert_allclose(preds, [[9, 9, 9, 15]], rtol=TOL) + np.testing.assert_allclose(preds, [[9], [9], [9], [15]], rtol=TOL) quantreg = QuantileRegressionSolver() quantreg.fit(x, y, tau, weights, normalize_weights=False) preds = quantreg.predict(x) - np.testing.assert_allclose(preds, [[9, 9, 9, 15]], rtol=TOL) + 
np.testing.assert_allclose(preds, [[9], [9], [9], [15]], rtol=TOL) ######################## @@ -205,13 +205,13 @@ def test_weight_normalization_same_fit(random_data_weights): def test_regularization_without_intercept(random_data_no_weights): tau = 0.5 x = random_data_no_weights[["x0", "x1", "x2", "x3", "x4"]].values - y = random_data_no_weights["y"].values + y = random_data_no_weights["y"].values.flatten() quantreg = QuantileRegressionSolver() lambda_ = 1e6 quantreg.fit(x, y, tau, lambda_=lambda_, fit_intercept=False, regularize_intercept=True) np.testing.assert_allclose( - quantreg.coefficients, [[0, 0, 0, 0, 0]], atol=TOL + quantreg.coefficients, [[0], [0], [0], [0], [0]], atol=TOL ) # using absolute tolerance since comparing to zero @@ -225,7 +225,7 @@ def test_regularization_with_intercept(random_data_no_weights): lambda_ = 1e6 quantreg.fit(x, y, tau, lambda_=lambda_, fit_intercept=True, regularize_intercept=False) coefficients_w_reg = quantreg.coefficients - np.testing.assert_allclose(quantreg.coefficients[0][1:], [0, 0, 0, 0], atol=TOL) + np.testing.assert_allclose(quantreg.coefficients[1:], [[0], [0], [0], [0]], atol=TOL) assert np.abs(coefficients_w_reg[0][0]) > TOL @@ -239,10 +239,10 @@ def test_regularization_with_intercept_and_features(random_data_no_weights): lambda_ = 1e6 quantreg.fit(x, y, tau, lambda_=lambda_, fit_intercept=True, regularize_intercept=False, n_feat_ignore_reg=2) coefficients_w_reg = quantreg.coefficients - np.testing.assert_allclose(quantreg.coefficients[0][3:], [0, 0], atol=TOL) - assert np.abs(coefficients_w_reg[0][0]) > TOL - assert np.abs(coefficients_w_reg[0][1]) > TOL - assert np.abs(coefficients_w_reg[0][2]) > TOL + np.testing.assert_allclose(quantreg.coefficients[3:], [[0], [0]], atol=TOL) + assert np.abs(coefficients_w_reg[0]) > TOL + assert np.abs(coefficients_w_reg[1]) > TOL + assert np.abs(coefficients_w_reg[2]) > TOL ######################## From 835a3f65eb57dccc9bd4a4c1e1b8d22ab2b97c8c Mon Sep 17 00:00:00 2001 From: lennybronner Date: Tue, 24 Sep 2024 21:12:03 -0400 Subject: [PATCH 3/5] ran linter --- src/elexsolver/LinearSolver.py | 32 ++++++++++++++++++---- src/elexsolver/OLSRegressionSolver.py | 12 ++++++-- src/elexsolver/QuantileRegressionSolver.py | 15 ++++------ tests/test_ols.py | 4 +-- tests/test_quantile.py | 6 ++-- 5 files changed, 47 insertions(+), 22 deletions(-) diff --git a/src/elexsolver/LinearSolver.py b/src/elexsolver/LinearSolver.py index 4ae29b9c..fe393210 100644 --- a/src/elexsolver/LinearSolver.py +++ b/src/elexsolver/LinearSolver.py @@ -32,7 +32,15 @@ def __init__(self): self.rng = np.random.default_rng(seed=0) @classmethod - def fit(self, x: np.ndarray, y: np.ndarray, weights: np.ndarray | None = None, lambda_: float = 0.0, cache: bool = True, **kwargs): + def fit( + self, + x: np.ndarray, + y: np.ndarray, + weights: np.ndarray | None = None, + lambda_: float = 0.0, + cache: bool = True, + **kwargs, + ): """ Fits model """ @@ -81,7 +89,15 @@ def _check_intercept(self, x): if ~np.all(x[:, 0] == 1): warnings.warn("Warning: fit_intercept=True and not all elements of the first columns are 1s") - def residuals(self, x: np.ndarray, y: np.ndarray, weights: np.ndarray | None = None, K: int | None = None, center: bool = True, **kwargs) -> np.ndarray: + def residuals( + self, + x: np.ndarray, + y: np.ndarray, + weights: np.ndarray | None = None, + K: int | None = None, + center: bool = True, + **kwargs, + ) -> np.ndarray: """ Computes residuals for the model """ @@ -98,7 +114,7 @@ def residuals(self, x: np.ndarray, y: np.ndarray, 
weights: np.ndarray | None = N # shuffle for random order of datapoints self.rng.shuffle(indices) x_shuffled, y_shuffled, weights_shuffled = x[indices], y[indices], weights[indices] - + # create folds x_folds = np.array_split(x_shuffled, K) y_folds = np.array_split(y_shuffled, K) @@ -107,8 +123,12 @@ def residuals(self, x: np.ndarray, y: np.ndarray, weights: np.ndarray | None = N residuals = [] for k in range(K): # extract test points - x_test, y_test, weights_test, = x_folds[k], y_folds[k], weights_folds[k] - + x_test, y_test, _, = ( + x_folds[k], + y_folds[k], + weights_folds[k], + ) + # extract training points x_train = np.concatenate([x_folds[j] for j in range(K) if j != k]) y_train = np.concatenate([y_folds[j] for j in range(K) if j != k]) @@ -121,7 +141,7 @@ def residuals(self, x: np.ndarray, y: np.ndarray, weights: np.ndarray | None = N # k-th residuals residuals_k = y_test - y_hat_k residuals.append(residuals_k) - + residuals = np.concatenate(residuals) # undo shuffling of residuals, to put them in the original dataset order residuals = residuals[np.argsort(indices)] diff --git a/src/elexsolver/OLSRegressionSolver.py b/src/elexsolver/OLSRegressionSolver.py index de066491..68d68200 100644 --- a/src/elexsolver/OLSRegressionSolver.py +++ b/src/elexsolver/OLSRegressionSolver.py @@ -92,7 +92,7 @@ def fit( fit_intercept: bool = True, regularize_intercept: bool = False, n_feat_ignore_reg: int = 0, - cache: bool = True + cache: bool = True, ): self._check_any_element_nan_or_inf(x) self._check_any_element_nan_or_inf(y) @@ -128,7 +128,15 @@ def fit( return coefficients - def residuals(self, x: np.ndarray, y: np.ndarray, weights: np.ndarray | None = None, K: int | None = None, center: bool = True, **kwargs) -> np.ndarray: + def residuals( + self, + x: np.ndarray, + y: np.ndarray, + weights: np.ndarray | None = None, + K: int | None = None, + center: bool = True, + **kwargs + ) -> np.ndarray: if K == x.shape[0]: # compute standard residuals y_hat = self.predict(x) diff --git a/src/elexsolver/QuantileRegressionSolver.py b/src/elexsolver/QuantileRegressionSolver.py index 91665ee1..ce4476e0 100644 --- a/src/elexsolver/QuantileRegressionSolver.py +++ b/src/elexsolver/QuantileRegressionSolver.py @@ -30,10 +30,7 @@ def _fit(self, x: np.ndarray, y: np.ndarray, weights: np.ndarray, tau: float) -> # A_eq are the equality constraint matrix # b_eq is the equality constraint vector (ie. 
A_eq @ x = b_eq) # bounds are the (min, max) possible values of every element of x - try: - res = linprog(-1 * S, A_eq=Phi.T, b_eq=zeros, bounds=bounds, method="highs", options={"presolve": False}) - except: - import pdb; pdb.set_trace() + res = linprog(-1 * S, A_eq=Phi.T, b_eq=zeros, bounds=bounds, method="highs", options={"presolve": False}) # marginal are the dual values, since we are solving the dual this is equivalent to the primal return -1 * res.eqlin.marginals @@ -88,7 +85,7 @@ def fit( regularize_intercept: bool = False, n_feat_ignore_reg: int = 0, normalize_weights: bool = True, - cache: bool = True + cache: bool = True, ): """ Fits quantile regression @@ -110,14 +107,14 @@ def fit( raise ZeroDivisionError weights = weights / weights_sum - if y.ndim == 1: # code expects 2-dim array - y = y.reshape(-1,1) + if y.ndim == 1: # code expects 2-dim array + y = y.reshape(-1, 1) # _fit assumes that taus is list, so if we want to do one value of tau then turn into a list if isinstance(taus, float): taus = [taus] else: - assert y.shape[1] == 1 # you can either have multiple taus or multiple ys + assert y.shape[1] == 1 # you can either have multiple taus or multiple ys coefficients_array = [] for tau in taus: for y_arr in y.T: @@ -146,4 +143,4 @@ def predict(self, x: np.ndarray, coefficients: np.ndarray | None = None) -> np.n else: preds = x @ coefficients - return preds \ No newline at end of file + return preds diff --git a/tests/test_ols.py b/tests/test_ols.py index e3999915..bea6876b 100644 --- a/tests/test_ols.py +++ b/tests/test_ols.py @@ -209,7 +209,7 @@ def test_residuals_no_weights(random_data_no_weights): x = random_data_no_weights[["x0", "x1", "x2", "x3", "x4"]].values y = random_data_no_weights["y"].values.reshape(-1, 1) lm.fit(x, y, fit_intercept=False) - predictions = lm.predict(x) + lm.predict(x) residuals = lm.residuals(x, y, K=None, center=False) assert residuals[0] == pytest.approx(0.885973530) @@ -232,7 +232,7 @@ def test_residuals_weights(random_data_weights): weights = random_data_weights["weights"].values lm.fit(x, y, weights=weights, fit_intercept=False) - predictions = lm.predict(x) + lm.predict(x) residuals = lm.residuals(x, y, weights=weights, K=None, center=False) diff --git a/tests/test_quantile.py b/tests/test_quantile.py index 515bdc0f..a9386156 100644 --- a/tests/test_quantile.py +++ b/tests/test_quantile.py @@ -104,9 +104,9 @@ def test_multiple(random_data_no_weights): quantreg.fit(x, y, taus, fit_intercept=False) quantreg.predict(x) assert quantreg.coefficients.shape == (5, 3) - np.testing.assert_allclose(quantreg.coefficients[:,0], [0.17759, 6.99588, 4.18896, 4.83906, 3.22546], rtol=TOL) - np.testing.assert_allclose(quantreg.coefficients[:,1], [1.57699, 6.74906, 4.40175, 4.85346, 4.51814], rtol=TOL) - np.testing.assert_allclose(quantreg.coefficients[:,2], [1.85617, 6.81286, 6.05586, 5.51965, 4.19864], rtol=TOL) + np.testing.assert_allclose(quantreg.coefficients[:, 0], [0.17759, 6.99588, 4.18896, 4.83906, 3.22546], rtol=TOL) + np.testing.assert_allclose(quantreg.coefficients[:, 1], [1.57699, 6.74906, 4.40175, 4.85346, 4.51814], rtol=TOL) + np.testing.assert_allclose(quantreg.coefficients[:, 2], [1.85617, 6.81286, 6.05586, 5.51965, 4.19864], rtol=TOL) ###################### From fd442b329c07f0def5f51bee24ab99409de8a4bb Mon Sep 17 00:00:00 2001 From: lennybronner Date: Tue, 24 Sep 2024 22:22:57 -0400 Subject: [PATCH 4/5] updated tests --- tests/conftest.py | 6 ++++++ tests/test_linear_solver.py | 20 +++++++++++++++++++- tests/test_ols.py | 14 ++++++++++++++ 
tests/test_quantile.py | 11 +++++++++++ 4 files changed, 50 insertions(+), 1 deletion(-) diff --git a/tests/conftest.py b/tests/conftest.py index ca274686..0271ca64 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -4,6 +4,7 @@ import sys import pandas as pd +import numpy as np import pytest _TEST_FOLDER = os.path.dirname(__file__) @@ -40,3 +41,8 @@ def random_data_no_weights(get_fixture): @pytest.fixture(scope="session") def random_data_weights(get_fixture): return get_fixture("random_data_n100_p5_12549_weights.csv") + +@pytest.fixture(scope="session") +def rng(): + seed = 8232 + return np.random.default_rng(seed=seed) \ No newline at end of file diff --git a/tests/test_linear_solver.py b/tests/test_linear_solver.py index 98e1f163..4cc06d86 100644 --- a/tests/test_linear_solver.py +++ b/tests/test_linear_solver.py @@ -2,9 +2,27 @@ import pytest from elexsolver.LinearSolver import LinearSolver - +from elexsolver.QuantileRegressionSolver import QuantileRegressionSolver def test_fit(): solver = LinearSolver() with pytest.raises(NotImplementedError): solver.fit(np.ndarray((5, 3)), np.ndarray((1, 3))) + + +################## +# Test residuals # +################## +def test_residuals_without_weights(rng): + x = rng.normal(size=(100, 5)) + beta = rng.normal(size=(5, 1)) + y = x @ beta + + # we need an a subclass of LinearSolver to actually run a fit + reg = QuantileRegressionSolver() + reg.fit(x, y, fit_intercept=False) + reg.predict(x) + + residuals_train = reg.residuals(x, y, K=None, center=False) + residuals_K = reg.residuals(x, y, K=10, center=False) + import pdb; pdb.set_trace() \ No newline at end of file diff --git a/tests/test_ols.py b/tests/test_ols.py index bea6876b..875037e7 100644 --- a/tests/test_ols.py +++ b/tests/test_ols.py @@ -30,6 +30,20 @@ def test_basic2(): preds = lm.predict(x) assert all(np.abs(preds - [6.666667, 6.666667, 6.666667, 15]) <= TOL) +def test_cache(): + lm = OLSRegressionSolver() + x = np.asarray([[1, 1], [1, 1], [1, 1], [1, 2]]) + y = np.asarray([3, 8, 9, 15]) + lm.fit(x, y, fit_intercept=True, cache=False) + + assert lm.normal_eqs is None + assert lm.hat_vals is None + + lm.fit(x, y, fit_intercept=True, cache=True) + + assert lm.normal_eqs is not None + assert lm.hat_vals is not None + assert lm.coefficients is not None ###################### # Intermediate tests # diff --git a/tests/test_quantile.py b/tests/test_quantile.py index a9386156..7738501e 100644 --- a/tests/test_quantile.py +++ b/tests/test_quantile.py @@ -55,6 +55,17 @@ def test_basic_upper(): preds = quantreg.predict(x) np.testing.assert_array_equal(preds, [[9], [9], [9], [15]]) +def test_cache(): + quantreg = QuantileRegressionSolver() + tau = 0.9 + x = np.asarray([[1, 1], [1, 1], [1, 1], [1, 2]]) + y = np.asarray([3, 8, 9, 15]) + quantreg.fit(x, y, tau, cache=False) + + assert quantreg.coefficients == [] + + quantreg.fit(x, y, tau, cache=True) + assert len(quantreg.coefficients) > 0 ###################### # Intermediate tests # From db3c80bd50616d177eaa15709f0713068d8a72b9 Mon Sep 17 00:00:00 2001 From: lennybronner Date: Tue, 24 Sep 2024 22:23:23 -0400 Subject: [PATCH 5/5] linter --- tests/conftest.py | 5 +++-- tests/test_linear_solver.py | 6 +++--- tests/test_ols.py | 2 ++ tests/test_quantile.py | 2 ++ 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 0271ca64..7c811157 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,8 +3,8 @@ import os import sys -import pandas as pd import numpy as np +import pandas as pd import 
pytest _TEST_FOLDER = os.path.dirname(__file__) @@ -42,7 +42,8 @@ def random_data_no_weights(get_fixture): def random_data_weights(get_fixture): return get_fixture("random_data_n100_p5_12549_weights.csv") + @pytest.fixture(scope="session") def rng(): seed = 8232 - return np.random.default_rng(seed=seed) \ No newline at end of file + return np.random.default_rng(seed=seed) diff --git a/tests/test_linear_solver.py b/tests/test_linear_solver.py index 4cc06d86..f0994a31 100644 --- a/tests/test_linear_solver.py +++ b/tests/test_linear_solver.py @@ -4,6 +4,7 @@ from elexsolver.LinearSolver import LinearSolver from elexsolver.QuantileRegressionSolver import QuantileRegressionSolver + def test_fit(): solver = LinearSolver() with pytest.raises(NotImplementedError): @@ -23,6 +24,5 @@ def test_residuals_without_weights(rng): reg.fit(x, y, fit_intercept=False) reg.predict(x) - residuals_train = reg.residuals(x, y, K=None, center=False) - residuals_K = reg.residuals(x, y, K=10, center=False) - import pdb; pdb.set_trace() \ No newline at end of file + reg.residuals(x, y, K=None, center=False) + reg.residuals(x, y, K=10, center=False) diff --git a/tests/test_ols.py b/tests/test_ols.py index 875037e7..66adf620 100644 --- a/tests/test_ols.py +++ b/tests/test_ols.py @@ -30,6 +30,7 @@ def test_basic2(): preds = lm.predict(x) assert all(np.abs(preds - [6.666667, 6.666667, 6.666667, 15]) <= TOL) + def test_cache(): lm = OLSRegressionSolver() x = np.asarray([[1, 1], [1, 1], [1, 1], [1, 2]]) @@ -45,6 +46,7 @@ def test_cache(): assert lm.hat_vals is not None assert lm.coefficients is not None + ###################### # Intermediate tests # ###################### diff --git a/tests/test_quantile.py b/tests/test_quantile.py index 7738501e..c39e8052 100644 --- a/tests/test_quantile.py +++ b/tests/test_quantile.py @@ -55,6 +55,7 @@ def test_basic_upper(): preds = quantreg.predict(x) np.testing.assert_array_equal(preds, [[9], [9], [9], [15]]) + def test_cache(): quantreg = QuantileRegressionSolver() tau = 0.9 @@ -67,6 +68,7 @@ def test_cache(): quantreg.fit(x, y, tau, cache=True) assert len(quantreg.coefficients) > 0 + ###################### # Intermediate tests # ######################
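Usage sketch for the API introduced by this series (not part of the patches above; the data is synthetic, and the sketch assumes only behavior visible in the diffs: the OLS shortcuts for K=None and K equal to the number of rows, the generic K-fold cross-fitting path in LinearSolver, and column-stacked quantile coefficients):

import numpy as np

from elexsolver.OLSRegressionSolver import OLSRegressionSolver
from elexsolver.QuantileRegressionSolver import QuantileRegressionSolver

rng = np.random.default_rng(seed=0)
x = rng.normal(size=(100, 5))
beta = rng.normal(size=(5, 1))
y = x @ beta + rng.normal(scale=0.1, size=(100, 1))

# cache=True (the default) stores normal_eqs, hat_vals and coefficients on the solver
ols = OLSRegressionSolver()
ols.fit(x, y, fit_intercept=False)

# K=None: plain (uncorrected) training residuals
res_train = ols.residuals(x, y, K=None, center=False)

# K equal to the number of rows: leave-one-out residuals, computed cheaply by
# inflating the training residuals by 1 / (1 - hat values)
res_loo = ols.residuals(x, y, K=x.shape[0], center=False)

# any smaller K: the generic cross-fitting path in LinearSolver, which shuffles,
# splits into K folds, refits on the other K - 1 folds (with cache=False), predicts
# the held-out fold, and restores the original row order; as in the patch-5 test,
# the per-fold refits here use the quantile solver's default tau
quantreg = QuantileRegressionSolver()
quantreg.fit(x, y, taus=0.5, fit_intercept=False)
res_kfold = quantreg.residuals(x, y, K=10, center=True)

# multiple taus now stack column-wise: coefficients have shape (n_features, n_taus)
# and predictions have shape (n_rows, n_taus)
quantreg.fit(x, y, taus=[0.1, 0.5, 0.9], fit_intercept=False)
assert quantreg.coefficients.shape == (5, 3)
assert quantreg.predict(x).shape == (100, 3)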