-
Notifications
You must be signed in to change notification settings - Fork 66
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add ability to analyse Regression Kink Analysis Designs #264
Changes from 28 commits
15ef4ec
81ea342
e8905f3
6b7e0e9
b4436c9
64c0b99
1aaa028
7847ede
8ce77a1
9cf6b51
6e47576
5e099c2
cc07b94
dca2844
e3c37be
2f89a97
3ca5f87
59a8b3e
0b70c9e
732707f
7e982c0
1ba0333
dab61dc
2ad1ba3
80537c6
b26a375
255ea83
232944b
4386f6b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,7 +12,7 @@ | |
""" | ||
|
||
import warnings | ||
from typing import Optional, Union | ||
from typing import Union | ||
|
||
import arviz as az | ||
import matplotlib.pyplot as plt | ||
|
@@ -328,7 +328,7 @@ | |
fontsize=LEGEND_FONT_SIZE, | ||
) | ||
|
||
return (fig, ax) | ||
return fig, ax | ||
|
||
def summary(self) -> None: | ||
""" | ||
|
@@ -424,7 +424,7 @@ | |
ax[0].plot( | ||
self.datapost.index, self.post_X, "-", c=[0.8, 0.8, 0.8], zorder=1 | ||
) | ||
return (fig, ax) | ||
return fig, ax | ||
|
||
|
||
class DifferenceInDifferences(ExperimentalDesign): | ||
|
@@ -794,7 +794,7 @@ | |
model=None, | ||
running_variable_name: str = "x", | ||
epsilon: float = 0.001, | ||
bandwidth: Optional[float] = None, | ||
bandwidth: float = np.inf, | ||
**kwargs, | ||
): | ||
super().__init__(model=model, **kwargs) | ||
|
@@ -807,7 +807,7 @@ | |
self.bandwidth = bandwidth | ||
self._input_validation() | ||
|
||
if self.bandwidth is not None: | ||
if self.bandwidth is not np.inf: | ||
fmin = self.treatment_threshold - self.bandwidth | ||
fmax = self.treatment_threshold + self.bandwidth | ||
filtered_data = self.data.query(f"{fmin} <= x <= {fmax}") | ||
|
@@ -836,7 +836,7 @@ | |
self.score = self.model.score(X=self.X, y=self.y) | ||
|
||
# get the model predictions of the observed data | ||
if self.bandwidth is not None: | ||
if self.bandwidth is not np.inf: | ||
xi = np.linspace(fmin, fmax, 200) | ||
else: | ||
xi = np.linspace( | ||
|
@@ -903,7 +903,7 @@ | |
self.data, | ||
x=self.running_variable_name, | ||
y=self.outcome_variable_name, | ||
c="k", # hue="treated", | ||
c="k", | ||
ax=ax, | ||
) | ||
|
||
|
@@ -939,7 +939,7 @@ | |
labels=labels, | ||
fontsize=LEGEND_FONT_SIZE, | ||
) | ||
return (fig, ax) | ||
return fig, ax | ||
|
||
def summary(self) -> None: | ||
""" | ||
|
@@ -957,6 +957,220 @@ | |
self.print_coefficients() | ||
|
||
|
||
class RegressionKink(ExperimentalDesign):
    """
    A class to analyse sharp regression kink experiments.

    :param data:
        A pandas dataframe
    :param formula:
        A statistical model formula
    :param kink_point:
        A scalar threshold value at which there is a change in the first derivative of
        the assignment function
    :param model:
        A PyMC model
    :param running_variable_name:
        The name of the predictor variable that the kink_point is based upon
    :param epsilon:
        A small scalar value which determines how far above and below the kink point to
        evaluate the causal impact.
    :param bandwidth:
        Data outside of the bandwidth (relative to the kink point) is not used to fit
        the model. The default, ``np.inf``, means that all of the data is used.
    """

    def __init__(
        self,
        data: pd.DataFrame,
        formula: str,
        kink_point: float,
        model=None,
        running_variable_name: str = "x",
        epsilon: float = 0.001,
        bandwidth: float = np.inf,
        **kwargs,
    ):
        super().__init__(model=model, **kwargs)
        self.expt_type = "Regression Kink"
        self.data = data
        self.formula = formula
        self.running_variable_name = running_variable_name
        self.kink_point = kink_point
        self.epsilon = epsilon
        self.bandwidth = bandwidth
        self._input_validation()

        # Test the bandwidth by value. An identity test against ``np.inf`` treats
        # any other infinite float object (e.g. ``float("inf")``) as a finite
        # bandwidth.
        use_bandwidth = np.isfinite(self.bandwidth)

        if use_bandwidth:
            fmin = self.kink_point - self.bandwidth
            fmax = self.kink_point + self.bandwidth
            # Filter on the configured running variable (inclusive at both ends)
            # rather than on a hard-coded column called ``x``.
            running_variable = self.data[self.running_variable_name]
            filtered_data = self.data[running_variable.between(fmin, fmax)]
            if len(filtered_data) <= 10:
                warnings.warn(
                    f"Choice of bandwidth parameter has lead to only {len(filtered_data)} remaining datapoints. Consider increasing the bandwidth parameter.",  # noqa: E501
                    UserWarning,
                )
            y, X = dmatrices(formula, filtered_data)
        else:
            y, X = dmatrices(formula, self.data)

        self._y_design_info = y.design_info
        self._x_design_info = X.design_info
        self.labels = X.design_info.column_names
        self.y, self.X = np.asarray(y), np.asarray(X)
        self.outcome_variable_name = y.design_info.column_names[0]

        COORDS = {"coeffs": self.labels, "obs_indx": np.arange(self.X.shape[0])}
        self.model.fit(X=self.X, y=self.y, coords=COORDS)

        # score the goodness of fit to the data the model was fitted to
        self.score = self.model.score(X=self.X, y=self.y)

        # get the model predictions over a grid spanning the fitted range
        if use_bandwidth:
            xi = np.linspace(fmin, fmax, 200)
        else:
            xi = np.linspace(
                np.min(self.data[self.running_variable_name]),
                np.max(self.data[self.running_variable_name]),
                200,
            )
        self.x_pred = pd.DataFrame(
            {self.running_variable_name: xi, "treated": self._is_treated(xi)}
        )
        # ``build_design_matrices`` returns one design matrix per design-info
        # object it is given. Exactly one is passed in, so unpack the single
        # result. ``np.asarray`` then hands the model a plain array.
        (new_x,) = build_design_matrices([self._x_design_info], self.x_pred)
        self.pred = self.model.predict(X=np.asarray(new_x))

        # evaluate gradient change around kink point
        mu_kink_left, mu_kink, mu_kink_right = self._probe_kink_point()
        self.gradient_change = self._eval_gradient_change(
            mu_kink_left, mu_kink, mu_kink_right, epsilon
        )

    @staticmethod
    def _eval_gradient_change(mu_kink_left, mu_kink, mu_kink_right, epsilon):
        """Evaluate the gradient change at the kink point.

        The gradient on each side is a finite difference over a step of
        ``epsilon``: ``(mu_kink - mu_kink_left) / epsilon`` on the left and
        ``(mu_kink_right - mu_kink) / epsilon`` on the right. The returned value
        is the right gradient minus the left gradient.

        This is a static method for ease of testing.
        """
        gradient_left = (mu_kink - mu_kink_left) / epsilon
        gradient_right = (mu_kink_right - mu_kink) / epsilon
        gradient_change = gradient_right - gradient_left
        return gradient_change

    def _probe_kink_point(self):
        """Predict the expected outcome just below, at, and just above the kink.

        :returns:
            A tuple ``(mu_kink_left, mu_kink, mu_kink_right)`` holding the
            predicted ``mu`` at ``kink_point - epsilon``, ``kink_point`` and
            ``kink_point + epsilon`` respectively.
        """
        # Create a dataframe to evaluate predicted outcome at the kink point and either
        # side. The point at the kink itself counts as treated, matching
        # ``_is_treated`` (greater than or equal to the kink point).
        x_predict = pd.DataFrame(
            {
                self.running_variable_name: np.array(
                    [
                        self.kink_point - self.epsilon,
                        self.kink_point,
                        self.kink_point + self.epsilon,
                    ]
                ),
                "treated": np.array([0, 1, 1]),
            }
        )
        (new_x,) = build_design_matrices([self._x_design_info], x_predict)
        predicted = self.model.predict(X=np.asarray(new_x))
        # extract predicted mu values
        mu_kink_left = predicted["posterior_predictive"].sel(obs_ind=0)["mu"]
        mu_kink = predicted["posterior_predictive"].sel(obs_ind=1)["mu"]
        mu_kink_right = predicted["posterior_predictive"].sel(obs_ind=2)["mu"]
        return mu_kink_left, mu_kink, mu_kink_right

    def _input_validation(self):
        """Validate the input data and model formula for correctness.

        :raises FormulaException:
            If ``treated`` does not appear in the formula.
        :raises DataException:
            If the ``treated`` column is not dummy coded.
        :raises ValueError:
            If ``bandwidth`` or ``epsilon`` is not greater than zero.
        """
        if "treated" not in self.formula:
            raise FormulaException(
                "A predictor called `treated` should be in the formula"
            )

        # ``not`` rather than ``is False`` so that a falsy non-builtin boolean
        # (e.g. a NumPy bool) is also rejected.
        if not _is_variable_dummy_coded(self.data["treated"]):
            raise DataException(
                """The treated variable should be dummy coded. Consisting of 0's and 1's only."""  # noqa: E501
            )

        # ``not (value > 0)`` also rejects NaN, which ``value <= 0`` lets through.
        if not (self.bandwidth > 0):
            raise ValueError("The bandwidth must be greater than zero.")

        if not (self.epsilon > 0):
            raise ValueError("Epsilon must be greater than zero.")

    def _is_treated(self, x):
        """Returns ``True`` if `x` is greater than or equal to the kink point."""
        return np.greater_equal(x, self.kink_point)

    def plot(self):
        """
        Plot the results

        :returns:
            The matplotlib ``(fig, ax)`` pair that was drawn on.
        """
        fig, ax = plt.subplots()
        # Plot raw data
        sns.scatterplot(
            self.data,
            x=self.running_variable_name,
            y=self.outcome_variable_name,
            c="k",
            ax=ax,
        )

        # Plot model fit to data
        h_line, h_patch = plot_xY(
            self.x_pred[self.running_variable_name],
            self.pred["posterior_predictive"].mu,
            ax=ax,
            plot_hdi_kwargs={"color": "C1"},
        )
        handles = [(h_line, h_patch)]
        labels = ["Posterior mean"]

        # create strings to compose title
        title_info = f"{self.score.r2:.3f} (std = {self.score.r2_std:.3f})"
        r2 = f"Bayesian $R^2$ on all data = {title_info}"
        percentiles = self.gradient_change.quantile([0.03, 1 - 0.03]).values
        ci = r"$CI_{94\%}$" + f"[{percentiles[0]:.2f}, {percentiles[1]:.2f}]"
        grad_change = f"""
            Change in gradient = {self.gradient_change.mean():.2f},
            """
        ax.set(title=r2 + "\n" + grad_change + ci)
        # Intervention line
        ax.axvline(
            x=self.kink_point,
            ls="-",
            lw=3,
            color="r",
            label="treatment threshold",
        )
        ax.legend(
            handles=handles,
            labels=labels,
            fontsize=LEGEND_FONT_SIZE,
        )
        return fig, ax

    def summary(self) -> None:
        """
        Print text output summarising the results
        """

        print(
            f"""
        {self.expt_type:=^80}
        Formula: {self.formula}
        Running variable: {self.running_variable_name}
        Kink point on running variable: {self.kink_point}

        Results:
        Change in slope at kink point = {self.gradient_change.mean():.2f}
        """
        )
        self.print_coefficients()
|
||
|
||
class PrePostNEGD(ExperimentalDesign): | ||
""" | ||
A class to analyse data from pretest/posttest designs | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I know you suggested not using the notebook as a place to explain the theory of kink designs, but I feel that the effect of tweaking at least one of the epsilon/bandwidth parameters could be mentioned or shown.
It's fine if tweaking them isn't needed for your example, but it'd be good to hint at why they are there.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I've added an example to demonstrate use of the `bandwidth` parameter, and I've added an admonition box to explain what epsilon does.