Updating residuals #27
base: develop
Changes from all commits: 64debde, 585ae38, d752cd7, 835a3f6, fd442b3, db3c80b
File 1 of 3
```diff
@@ -29,21 +29,32 @@ class LinearSolver(ABC):

     def __init__(self):
         self.coefficients = None
+        self.rng = np.random.default_rng(seed=0)

     @classmethod
-    def fit(self, x: np.ndarray, y: np.ndarray, weights: np.ndarray | None = None, lambda_: float = 0.0, *kwargs):
+    def fit(
+        self,
+        x: np.ndarray,
+        y: np.ndarray,
+        weights: np.ndarray | None = None,
+        lambda_: float = 0.0,
+        cache: bool = True,
+        **kwargs,
+    ):
         """
         Fits model
         """
         raise NotImplementedError

-    def predict(self, x: np.ndarray) -> np.ndarray:
+    def predict(self, x: np.ndarray, coefficients: np.ndarray | None = None) -> np.ndarray:
```
[Reviewer comment on the new `predict` signature] How would you feel about leaving this (public method) like:

```python
def predict(self, x: np.ndarray) -> np.ndarray:
    return self._predict(x, self.coefficients)
```

and adding:

```python
def _predict(self, x: np.ndarray, coefficients: np.ndarray) -> np.ndarray:
    return x @ coefficients
```

and then in …
```diff
         """
         Use coefficients to predict
         """
         self._check_any_element_nan_or_inf(x)

-        return x @ self.coefficients
+        if coefficients is None:
+            return x @ self.coefficients
+        return x @ coefficients

     def get_coefficients(self) -> np.ndarray:
         """
```
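To make the new `predict` contract concrete, here is a self-contained sketch of the pattern (`TinySolver` and its `lstsq`-based fit are illustrative stand-ins, not the PR's actual solver): the cached `self.coefficients` are used unless explicit coefficients are passed in, which is what lets the cross-fitting code below predict with per-fold coefficients without mutating solver state.

```python
import numpy as np

class TinySolver:
    """Minimal stand-in mirroring the new fit()/predict() contract."""

    def __init__(self):
        self.coefficients = None

    def fit(self, x: np.ndarray, y: np.ndarray, cache: bool = True) -> np.ndarray:
        # plain least squares as a placeholder for the real solver
        coefficients = np.linalg.lstsq(x, y, rcond=None)[0]
        if cache:
            self.coefficients = coefficients
        return coefficients

    def predict(self, x: np.ndarray, coefficients: np.ndarray | None = None) -> np.ndarray:
        # fall back to the cached coefficients unless explicit ones are passed
        if coefficients is None:
            return x @ self.coefficients
        return x @ coefficients

rng = np.random.default_rng(0)
x = rng.normal(size=(100, 3))
y = x @ np.array([1.0, -0.5, 2.0]) + 0.1 * rng.normal(size=100)

solver = TinySolver()
beta = solver.fit(x, y)
assert np.allclose(solver.predict(x), solver.predict(x, coefficients=beta))
```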
```diff
@@ -77,3 +88,65 @@ def _check_intercept(self, x):
         """
         if ~np.all(x[:, 0] == 1):
             warnings.warn("Warning: fit_intercept=True and not all elements of the first columns are 1s")
+
+    def residuals(
+        self,
+        x: np.ndarray,
+        y: np.ndarray,
+        weights: np.ndarray | None = None,
+        K: int | None = None,
+        center: bool = True,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Computes residuals for the model
+        """
+        if K is None:
+            # when K is None we just use the training residuals
+            y_hat = self.predict(x).reshape(*y.shape)
+            residuals = y - y_hat
+        else:
+            if weights is None:
+                weights = np.ones_like(y)
+
+            # create indices for k-fold cross-fitting
+            indices = np.arange(x.shape[0])
+            # shuffle for a random order of datapoints
+            self.rng.shuffle(indices)
+            x_shuffled, y_shuffled, weights_shuffled = x[indices], y[indices], weights[indices]
+
+            # create folds
+            x_folds = np.array_split(x_shuffled, K)
+            y_folds = np.array_split(y_shuffled, K)
+            weights_folds = np.array_split(weights_shuffled, K)
+
+            residuals = []
+            for k in range(K):
+                # extract test points
+                x_test, y_test, _ = (
+                    x_folds[k],
+                    y_folds[k],
+                    weights_folds[k],
+                )
+
+                # extract training points
+                x_train = np.concatenate([x_folds[j] for j in range(K) if j != k])
+                y_train = np.concatenate([y_folds[j] for j in range(K) if j != k])
+                weights_train = np.concatenate([weights_folds[j] for j in range(K) if j != k])
+
+                # fit the k-th model
+                coefficients_k = self.fit(x_train, y_train, weights=weights_train, cache=False, **kwargs)
+                y_hat_k = self.predict(x_test, coefficients=coefficients_k)
+
+                # k-th residuals
+                residuals_k = y_test - y_hat_k
+                residuals.append(residuals_k)
+
+            residuals = np.concatenate(residuals)
+            # undo the shuffling of the residuals, to put them back in the original dataset order
+            residuals = residuals[np.argsort(indices)]
+
+        if center:
+            residuals -= np.mean(residuals, axis=0)
+
+        return residuals
```
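The loop above implements cross-fitting: every point's residual comes from a model that never saw that point during training. For reference, a compact, self-contained version of the same shuffle → split → fit-on-the-other-folds → unshuffle pattern (with `lstsq` standing in for the solver's `fit`):

```python
import numpy as np

def kfold_residuals(x: np.ndarray, y: np.ndarray, K: int, rng) -> np.ndarray:
    """Cross-fitted residuals: predict each fold with a model fit on the rest."""
    indices = np.arange(x.shape[0])
    rng.shuffle(indices)  # random fold assignment

    x_folds = np.array_split(x[indices], K)
    y_folds = np.array_split(y[indices], K)

    residuals = []
    for k in range(K):
        # train on every fold except the k-th, then predict the k-th
        x_train = np.concatenate([x_folds[j] for j in range(K) if j != k])
        y_train = np.concatenate([y_folds[j] for j in range(K) if j != k])
        beta_k = np.linalg.lstsq(x_train, y_train, rcond=None)[0]
        residuals.append(y_folds[k] - x_folds[k] @ beta_k)

    # undo the shuffle so residuals line up with the original rows
    return np.concatenate(residuals)[np.argsort(indices)]

rng = np.random.default_rng(0)
x = rng.normal(size=(50, 3))
y = x @ np.array([1.0, -2.0, 0.5]) + rng.normal(size=50)
print(kfold_residuals(x, y, K=5, rng=rng)[:5])
```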
File 2 of 3
```diff
@@ -92,6 +92,7 @@ def fit(
         fit_intercept: bool = True,
         regularize_intercept: bool = False,
         n_feat_ignore_reg: int = 0,
+        cache: bool = True,
     ):
         self._check_any_element_nan_or_inf(x)
         self._check_any_element_nan_or_inf(y)
```

```diff
@@ -111,33 +112,45 @@ def fit(
         # in the bootstrap setting we can now pass in the normal equations and can
         # save time re-computing them
         if normal_eqs is None:
-            self.normal_eqs = self._compute_normal_equations(
+            normal_eqs = self._compute_normal_equations(
                 x, L, lambda_, fit_intercept, regularize_intercept, n_feat_ignore_reg
             )
-        else:
-            self.normal_eqs = normal_eqs
-
-        # compute hat matrix: X (X^T X)^{-1} X^T
-        self.hat_vals = np.diag(x @ self.normal_eqs @ L)
-
+        hat_vals = np.diag(x @ normal_eqs @ L)
         # compute coefficients: (X^T X)^{-1} X^T y
-        self.coefficients = self.normal_eqs @ L @ y
+        coefficients = normal_eqs @ L @ y

-    def residuals(self, y: np.ndarray, y_hat: np.ndarray, loo: bool = True, center: bool = True) -> np.ndarray:
-        """
-        Computes residuals for the model
-        """
-        # compute standard residuals
-        residuals = y - y_hat
+        if cache:
+            self.normal_eqs = normal_eqs
+            self.hat_vals = hat_vals
+            self.coefficients = coefficients
+
+        return coefficients

-        # if leave one out is True, inflate by (1 - P)
-        # in OLS setting inflating by (1 - P) is the same as computing the leave one out residuals
-        # the un-inflated training residuals are too small, since training covariates were observed during fitting
-        if loo:
+    def residuals(
+        self,
+        x: np.ndarray,
+        y: np.ndarray,
+        weights: np.ndarray | None = None,
+        K: int | None = None,
+        center: bool = True,
+        **kwargs,
+    ) -> np.ndarray:
+        if K == x.shape[0]:
```
[Reviewer comment on `if K == x.shape[0]:`] Is it possible that any other subclasses would benefit from this logic? 🤔
```diff
+            # compute standard residuals
+            y_hat = self.predict(x)
+            residuals = y - y_hat
+
+            # inflate by (1 - P)
+            # in the OLS setting, inflating by (1 - P) is the same as computing the leave-one-out residuals
+            # the un-inflated training residuals are too small, since the training covariates were observed during fitting
             residuals /= (1 - self.hat_vals).reshape(-1, 1)

-        # centering removes the column mean
-        if center:
-            residuals -= np.mean(residuals, axis=0)
+            # centering removes the column mean
+            if center:
+                residuals -= np.mean(residuals, axis=0)
+        else:
+            residuals = super().residuals(x, y, weights, K, center, **kwargs)

         return residuals
```
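The `(1 - hat_vals)` inflation leans on the standard OLS identity that the leave-one-out residual equals the training residual divided by 1 - h_ii, where h_ii are the diagonal entries of the hat matrix X (X^T X)^{-1} X^T. A quick numerical check of that identity (not part of the PR; unweighted and unregularized for simplicity):

```python
import numpy as np

rng = np.random.default_rng(0)
n, p = 30, 3
x = np.hstack([np.ones((n, 1)), rng.normal(size=(n, p))])
y = x @ rng.normal(size=p + 1) + rng.normal(size=n)

# hat values h_ii = diag(X (X^T X)^{-1} X^T) and inflated training residuals
normal_eqs = np.linalg.inv(x.T @ x) @ x.T
hat_vals = np.diag(x @ normal_eqs)
loo_via_inflation = (y - x @ (normal_eqs @ y)) / (1 - hat_vals)

# brute-force leave-one-out residuals: refit without each point in turn
loo_brute = np.empty(n)
for i in range(n):
    mask = np.arange(n) != i
    beta_i = np.linalg.lstsq(x[mask], y[mask], rcond=None)[0]
    loo_brute[i] = y[i] - x[i] @ beta_i

assert np.allclose(loo_via_inflation, loo_brute)
```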
File 3 of 3
```diff
@@ -3,6 +3,7 @@
 import os
 import sys

+import numpy as np
```
[Reviewer comment on `import numpy as np`] I know you said you're struggling to add unit tests for all these changes and I'm curious what you think is missing. These look good to me, although I'll keep thinking about it 🤔 🎉
```diff
 import pandas as pd
 import pytest
```

```diff
@@ -40,3 +41,9 @@ def random_data_no_weights(get_fixture):
 @pytest.fixture(scope="session")
 def random_data_weights(get_fixture):
     return get_fixture("random_data_n100_p5_12549_weights.csv")
+
+
+@pytest.fixture(scope="session")
+def rng():
+    seed = 8232
+    return np.random.default_rng(seed=seed)
```
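On the unit-test question above, one property-style test the new `rng` fixture could support is checking the K = n shortcut against brute-force leave-one-out residuals. A sketch (the `OLSRegressionSolver` class name and the exact `fit`/`residuals` signatures are assumptions read off the diff, so this may need adjusting):

```python
import numpy as np

def test_residuals_k_equals_n_matches_loo(rng):
    """Hypothetical test: with K = n, residuals() should reproduce
    brute-force leave-one-out residuals."""
    n, p = 40, 3
    x = np.hstack([np.ones((n, 1)), rng.normal(size=(n, p))])
    y = (x @ rng.normal(size=p + 1) + rng.normal(size=n)).reshape(-1, 1)

    solver = OLSRegressionSolver()  # assumed subclass name
    solver.fit(x, y)
    residuals = solver.residuals(x, y, K=n, center=False).flatten()

    # brute-force LOO: refit without each point in turn
    loo = np.empty(n)
    for i in range(n):
        mask = np.arange(n) != i
        beta_i = np.linalg.lstsq(x[mask], y[mask], rcond=None)[0]
        loo[i] = (y[i] - x[i] @ beta_i).item()

    np.testing.assert_allclose(residuals, loo)
```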
[Reviewer comment on the `cache` parameter in the base-class `fit`] Can you help me understand how `cache` is used? It seems like there are cases where we want to (or don't want to) keep track of certain things computed during the k-fold residuals process, but I'm not sure I understand why or why not. Plus, this looks like it's only used in the subclasses, so I'd suggest we either (a) remove it from here, or (b) add a method in this super-class that's like `def cache_things()` (bad method name) where the logic for this lives and can be shared by all subclasses 🤔
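For what it's worth, option (b) might look something like the sketch below; the method name `_cache_fit_results` is made up, and each subclass would pass whatever its `fit` computes.

```python
# Sketch of option (b) in LinearSolver (hypothetical names):
def _cache_fit_results(self, **attributes) -> None:
    """Store whatever a subclass's fit() computed (coefficients,
    normal equations, hat values, ...) on the instance."""
    for name, value in attributes.items():
        setattr(self, name, value)

# A subclass's fit() would then end with:
#     if cache:
#         self._cache_fit_results(
#             normal_eqs=normal_eqs, hat_vals=hat_vals, coefficients=coefficients
#         )
#     return coefficients
```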