esantorella · jameshgrn · Mar 5, 2023 · Mar 5, 2023
diff --git a/binscatter/binscatter.py b/binscatter/binscatter.py
@@ -1,43 +1,73 @@
-"""Monkey-patch Matplotlib to add an 'ax.binscatter' method."""
 import warnings
 from typing import Dict, Iterable, List, Optional, Tuple
 
-import matplotlib
 import numpy as np
 import numpy.typing as npt
 from sklearn import linear_model
 
 
 def _get_bins(n_elements: int, n_bins: int) -> List[slice]:
+    """
+    Returns a list of slice objects representing bins of equal size.
+
+    Parameters
+    ----------
+    n_elements : int
+        The number of elements to divide into bins.
+    n_bins : int
+        The number of bins to create.
+
+    Returns
+    -------
+    bins : list of slice objects
+        The bins, represented as slice objects.
+    """
+
     bin_edges = np.linspace(0, n_elements, n_bins + 1).astype(int)
-    bins = [slice(bin_edges[i], bin_edges[i + 1]) for i in range(len(bin_edges) - 1)]
-    return bins
-
-
-def get_binscatter_objects(
-    y: np.ndarray,
-    x: np.ndarray,
-    controls,
-    n_bins: int,
-    recenter_x: bool,
-    recenter_y: bool,
-    bins: Optional[Iterable],
-) -> Tuple[List[float], List[float], float, float]:
+    return [slice(bin_edges[i], bin_edges[i + 1]) for i in range(len(bin_edges) - 1)]
+
+
+def get_binscatter_objects(y: np.ndarray, x: np.ndarray, controls, n_bins: int, recenter_x: bool, recenter_y: bool,
+                           bins: Optional[Iterable]) -> Tuple[List[float], List[float], float, float]:
     """
-    Returns mean x and mean y within each bin, and coefficients if residualizing.
-    Parameters are essentially the same as in binscatter.
+    Computes mean x and y values within bins, and optionally residuals.
+
+    Parameters
+    ----------
+    y : numpy.ndarray
+        The dependent variable to bin.
+    x : numpy.ndarray
+        The independent variable to bin.
+    controls : array-like or sparse matrix or None, default=None
+        Control variables to use for residualization. If provided, residuals will be computed
+        and used to plot a regression line.
+    n_bins : int
+        The number of bins to use in the plot.
+    recenter_x : bool
+        Whether to recenter residualized x by adding the mean of the original x.
+    recenter_y : bool
+        Whether to recenter the dependent variable y by adding its mean.
+    bins : iterable of slice objects or None, default=None
+        The bins to use in the plot. If None, equal-sized bins will be used.
+
+    Returns
+    -------
+    x_means : list of floats
+        The mean x values within each bin.
+    y_means : list of floats
+        The mean y values within each bin.
+    intercept : float
+        The intercept of the regression line, if residuals were computed.
+    coef : float
+        The slope of the regression line, if residuals were computed.
+
     """
-    # Check if data is sorted
 
     if controls is None:
         if np.any(np.diff(x) < 0):
-            argsort = np.argsort(x)
-            x = x[argsort]
-            y = y[argsort]
-        x_data = x
-        y_data = y
+            x, y = np.sort([x, y], axis = 1)
+        x_data, y_data = x, y
     else:
-        # Residualize
         if np.ndim(controls) == 1:
             controls = np.expand_dims(controls, 1)
 
@@ -46,9 +76,7 @@ def get_binscatter_objects(
 
         demeaning_x_reg = linear_model.LinearRegression().fit(controls, x)
         x_data = x - demeaning_x_reg.predict(controls)
-        argsort = np.argsort(x_data)
-        x_data = x_data[argsort]
-        y_data = y_data[argsort]
+        x_data, y_data = np.sort([x_data, y_data], axis = 1)
 
         if recenter_y:
             y_data += np.mean(y)
@@ -67,65 +95,68 @@ def get_binscatter_objects(
     return x_means, y_means, reg.intercept_, reg.coef_[0]
 
 
-def binscatter(
-    self,
-    x: npt.ArrayLike,
-    y: npt.ArrayLike,
-    controls=None,
-    n_bins=20,
-    line_kwargs: Optional[Dict] = None,
-    scatter_kwargs: Optional[Dict] = None,
-    recenter_x: bool = False,
-    recenter_y: bool = True,
-    # TODO: make 'bins' consistent with functions in other libraries, as in pd.cut
-    bins: Optional[Iterable[slice]] = None,
-    fit_reg: Optional[bool] = True,
-) -> Tuple[List[float], List[float], float, float]:
+def binscatter(self, x: npt.ArrayLike, y: npt.ArrayLike, controls = None, n_bins = 20,
+               line_kwargs: Optional[Dict] = None, scatter_kwargs: Optional[Dict] = None, recenter_x: bool = False,
+               recenter_y: bool = True, bins: Optional[Iterable[slice]] = None, fit_reg: Optional[bool] = True) -> \
+Tuple[List[float], List[float], float, float]:
     """
-    :param self: matplotlib.axes.Axes object.
-        i.e., fig, axes = plt.subplots(3)
-              axes[0].binscatter(x, y)
-
-    :param y: Numpy ArrayLike, such as numpy.ndarray or pandas.Series; must be 1d
-    :param x: Numpy ArrayLike, such as numpy.ndarray or pandas.Series
-    :param controls: Optional, {array-like, sparse matrix}; whatever can be passed to
-        sklearn.linear_model.LinearRegression
-    :param n_bins: int, default 20
-    :param line_kwargs: keyword arguments passed to the line in the
-    :param scatter_kwargs: dict
-    :param recenter_y: If true, recenter y-tilde so its mean is the mean of y
-    :param recenter_x: If true, recenter y-tilde so its mean is the mean of y
-    :param bins: Indices of each bin. By default, if you leave 'bins' as None,
-        binscatter constructs equal sized bins;
-        if you don't like that, use this parameter to construct your own.
-    :param fit_reg: Whether to plot a regression line.
+    Plots a binned scatter plot with optional regression line.
+
+    Parameters
+    ----------
+    self : matplotlib.axes.Axes object
+        The plot to which to add the binned scatter plot.
+    y : array-like
+        The dependent variable to plot.
+    x : array-like
+        The independent variable to plot.
+    controls : array-like or sparse matrix, default=None
+        Control variables to use for residualization. If provided, residuals will be plotted
+        against residualized x.
+    n_bins : int, default=20
+        The number of bins to use in the plot.
+    line_kwargs : dict or None, default=None
+        Keyword arguments to pass to the regression line plot.
+    scatter_kwargs : dict or None, default=None
+        Keyword arguments to pass to the scatter plot.
+    recenter_x : bool, default=False
+        Whether to recenter residualized x by adding the mean of the original x.
+    recenter_y : bool, default=True
+        Whether to recenter the dependent variable y by adding its mean.
+    bins : iterable of slice objects or None, default=None
+        The bins to use in the plot. If None, equal-sized bins will be used.
+    fit_reg : bool or None, default=True
+        Whether to plot a regression line. If None, no regression line will be plotted.
+
+    Returns
+    -------
+    x_means : list of floats
+        The mean x values within each bin.
+    y_means : list of floats
+        The mean y values within each bin.
+    intercept : float
+        The intercept of the regression line.
+    coef : float
+        The slope of the regression line.
     """
+
     if line_kwargs is None:
         line_kwargs = {}
     elif not fit_reg:
         warnings.warn("Both fit_reg=False and non-None line_kwargs were passed.")
     if scatter_kwargs is None:
         scatter_kwargs = {}
 
-    x_means, y_means, intercept, coef = get_binscatter_objects(
-        np.asarray(y), np.asarray(x), controls, n_bins, recenter_x, recenter_y, bins
-    )
+    x_means, y_means, intercept, coef = get_binscatter_objects(np.asarray(y), np.asarray(x), controls, n_bins,
+        recenter_x, recenter_y, bins)
 
     self.scatter(x_means, y_means, **scatter_kwargs)
     x_range = np.array(self.get_xlim())
     if fit_reg:
-        self.plot(
-            x_range,
-            intercept + x_range * coef,
-            label="beta=" + str(round(coef, 3)),
-            **line_kwargs
-        )
-    # If series were passed, might be able to label
+        self.plot(x_range, intercept + x_range * coef, label = "beta=" + str(round(coef, 3)), **line_kwargs)
+
     if hasattr(x, "name"):
         self.set_xlabel(x.name)
     if hasattr(y, "name"):
         self.set_ylabel(y.name)
     return x_means, y_means, intercept, coef
-
-
-matplotlib.axes.Axes.binscatter = binscatter