From bf922e1460539f0a630130e3c790fa353b6c6a68 Mon Sep 17 00:00:00 2001 From: Gokcen Eraslan Date: Mon, 2 Dec 2019 10:52:18 -0500 Subject: [PATCH 01/28] Add replace option to subsample. --- scanpy/preprocessing/_simple.py | 5 ++++- scanpy/tests/test_preprocessing.py | 2 ++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/scanpy/preprocessing/_simple.py b/scanpy/preprocessing/_simple.py index 9a9a23bb97..5a339d8dab 100644 --- a/scanpy/preprocessing/_simple.py +++ b/scanpy/preprocessing/_simple.py @@ -941,6 +941,7 @@ def subsample( n_obs: Optional[int] = None, random_state: Union[int, RandomState] = 0, copy: bool = False, + replace: bool = False, ) -> Optional[AnnData]: """\ Subsample to a fraction of the number of observations. @@ -959,6 +960,8 @@ def subsample( copy If an :class:`~anndata.AnnData` is passed, determines whether a copy is returned. + replace + If True, samples are drawn with replacement. Returns ------- @@ -979,7 +982,7 @@ def subsample( logg.debug(f'... subsampled to {new_n_obs} data points') else: raise ValueError('Either pass `n_obs` or `fraction`.') - obs_indices = np.random.choice(old_n_obs, size=new_n_obs, replace=False) + obs_indices = np.random.choice(old_n_obs, size=new_n_obs, replace=replace) if isinstance(data, AnnData): adata = data.copy() if copy else data adata._inplace_subset_obs(obs_indices) diff --git a/scanpy/tests/test_preprocessing.py b/scanpy/tests/test_preprocessing.py index 8c1add73a3..6496750568 100644 --- a/scanpy/tests/test_preprocessing.py +++ b/scanpy/tests/test_preprocessing.py @@ -99,6 +99,8 @@ def test_subsample(): assert adata.n_obs == 40 sc.pp.subsample(adata, fraction=0.1) assert adata.n_obs == 4 + sc.pp.subsample(adata, n_obs=201, replace=True) + assert adata.n_obs == 201 def test_scale(): From 671ec71fb76c2036aad3a771d1137d7b4677c455 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=B6k=C3=A7en=20Eraslan?= Date: Mon, 20 Apr 2020 16:34:36 -0400 Subject: [PATCH 02/28] Add sc.pp.sample with axis argument. --- scanpy/preprocessing/__init__.py | 2 +- scanpy/preprocessing/_simple.py | 105 +++++++++++++++++++++++------ scanpy/tests/test_preprocessing.py | 31 +++++++-- 3 files changed, 109 insertions(+), 29 deletions(-) diff --git a/scanpy/preprocessing/__init__.py b/scanpy/preprocessing/__init__.py index 7c2c4d7aca..81241da595 100644 --- a/scanpy/preprocessing/__init__.py +++ b/scanpy/preprocessing/__init__.py @@ -2,7 +2,7 @@ from ._simple import filter_cells, filter_genes from ._deprecated.highly_variable_genes import filter_genes_dispersion from ._highly_variable_genes import highly_variable_genes -from ._simple import log1p, sqrt, scale, subsample +from ._simple import log1p, sqrt, scale, subsample, sample from ._simple import normalize_per_cell, regress_out, downsample_counts from ._pca import pca from ._qc import calculate_qc_metrics diff --git a/scanpy/preprocessing/_simple.py b/scanpy/preprocessing/_simple.py index 2af4cff1c4..ad484f6f4f 100644 --- a/scanpy/preprocessing/_simple.py +++ b/scanpy/preprocessing/_simple.py @@ -743,16 +743,17 @@ def scale( return X if copy else None -def subsample( +def sample( data: Union[AnnData, np.ndarray, spmatrix], fraction: Optional[float] = None, - n_obs: Optional[int] = None, + n: Optional[int] = None, random_state: AnyRandom = 0, copy: bool = False, replace: bool = False, + axis: int = 0, ) -> Optional[AnnData]: """\ - Subsample to a fraction of the number of observations. + Sample observations or variables with or without replacement. Parameters ---------- @@ -760,9 +761,10 @@ def subsample( The (annotated) data matrix of shape `n_obs` × `n_vars`. Rows correspond to cells and columns to genes. fraction - Subsample to this `fraction` of the number of observations. - n_obs - Subsample to this number of observations. + Subsample to this `fraction` of the number of observations or variables. + See `axis`. + n + Sample to this number of observations or variables. See `axis`. random_state Random seed to change subsampling. copy @@ -770,35 +772,97 @@ def subsample( determines whether a copy is returned. replace If True, samples are drawn with replacement. + axis + Sample observations (axis=0) or variables (axis=1). Default is 0. Returns ------- - Returns `X[obs_indices], obs_indices` if data is array-like, otherwise - subsamples the passed :class:`~anndata.AnnData` (`copy == False`) or - returns a subsampled copy of it (`copy == True`). + Returns `X[indices] or X[:, indices], indices` depending on the axis + argument if data is array-like, otherwise samples the passed + :class:`~anndata.AnnData` (`copy == False`) or returns a sampled + copy of it (`copy == True`). """ np.random.seed(random_state) - old_n_obs = data.n_obs if isinstance(data, AnnData) else data.shape[0] - if n_obs is not None: - new_n_obs = n_obs + old_n = data.shape[axis] + if axis not in (0, 1): + raise ValueError(f'`axis` must be either 0 or 1.') + if fraction is None and n is None: + raise ValueError(f'Either `fraction` or `n` must be set.') + if fraction is not None and n is not None: + raise ValueError(f'Providing both `fraction` and `n` is not allowed.') + if n is not None: + new_n = n elif fraction is not None: - if fraction > 1 or fraction < 0: + if fraction < 0: + raise ValueError(f'`fraction needs to be nonnegative`, not {fraction}') + if not replace and fraction > 1: raise ValueError( - f'`fraction` needs to be within [0, 1], not {fraction}' + f'If replace=False, `fraction` needs to be within [0, 1], not {fraction}' ) - new_n_obs = int(fraction * old_n_obs) - logg.debug(f'... subsampled to {new_n_obs} data points') + new_n = int(fraction * old_n) + obs_or_var_str = 'observations' if axis == 0 else 'variables' + logg.debug(f'... sampled to {new_n} {obs_or_var_str}') else: raise ValueError('Either pass `n_obs` or `fraction`.') - obs_indices = np.random.choice(old_n_obs, size=new_n_obs, replace=replace) + indices = np.random.choice(old_n, size=new_n, replace=replace) if isinstance(data, AnnData): if copy: - return data[obs_indices].copy() + view = data[indices] if axis == 0 else data[:, indices] + return view.copy() else: - data._inplace_subset_obs(obs_indices) + if axis == 0: + data._inplace_subset_obs(indices) + else: + data._inplace_subset_var(indices) else: X = data - return X[obs_indices], obs_indices + return X[indices] if axis == 0 else X[:, indices], indices + + +def subsample( + data: Union[AnnData, np.ndarray, spmatrix], + fraction: Optional[float] = None, + n_obs: Optional[int] = None, + random_state: AnyRandom = 0, + copy: bool = False, +) -> Optional[AnnData]: + """\ + Subsample to a fraction of the number of observations. + + .. warning:: + .. deprecated:: 1.4.7 + Use :func:`~scanpy.pp.sample` instead. + + Parameters + ---------- + data + The (annotated) data matrix of shape `n_obs` × `n_vars`. + Rows correspond to cells and columns to genes. + fraction + Subsample to this `fraction` of the number of observations. + n_obs + Subsample to this number of observations. + random_state + Random seed to change subsampling. + copy + If an :class:`~anndata.AnnData` is passed, + determines whether a copy is returned. + + Returns + ------- + Returns `X[obs_indices], obs_indices` if data is array-like, otherwise + subsamples the passed :class:`~anndata.AnnData` (`copy == False`) or + returns a subsampled copy of it (`copy == True`). + """ + return sample( + data=data, + fraction=fraction, + n=n_obs, + random_state=random_state, + copy=copy, + replace=False, + axis=0, + ) @deprecated_arg_names({"target_counts": "counts_per_cell"}) @@ -972,7 +1036,6 @@ def _downsample_array( return col - def zscore_deprecated(X: np.ndarray) -> np.ndarray: """\ Z-score standardize each variable/gene in X. diff --git a/scanpy/tests/test_preprocessing.py b/scanpy/tests/test_preprocessing.py index b6fce8b11d..e96ad35033 100644 --- a/scanpy/tests/test_preprocessing.py +++ b/scanpy/tests/test_preprocessing.py @@ -91,20 +91,37 @@ def test_normalize_per_cell(): axis=1).A1.tolist() -def test_subsample(): +def test_sample(): adata = AnnData(np.ones((200, 10))) - sc.pp.subsample(adata, n_obs=40) + sc.pp.sample(adata, n=40) assert adata.n_obs == 40 - sc.pp.subsample(adata, fraction=0.1) + sc.pp.sample(adata, fraction=0.1) assert adata.n_obs == 4 - sc.pp.subsample(adata, n_obs=201, replace=True) + sc.pp.sample(adata, n=201, replace=True) assert adata.n_obs == 201 + sc.pp.sample(adata, n=10, axis=1) + assert adata.n_vars == 10 + sc.pp.sample(adata, n=11, axis=1, replace=True) + assert adata.n_vars == 11 + sc.pp.sample(adata, fraction=2.0, axis=1, replace=True) + assert adata.n_vars == 22 + adata = AnnData(sp.csr_matrix(np.ones((200, 10)))) + sc.pp.sample(adata, fraction=2.0, axis=1, replace=True) + assert adata.n_vars == 20 -def test_subsample_copy(): + +def test_sample_copy(): adata = AnnData(np.ones((200, 10))) - assert sc.pp.subsample(adata, n_obs=40, copy=True).shape == (40, 10) - assert sc.pp.subsample(adata, fraction=0.1, copy=True).shape == (20, 10) + assert sc.pp.sample(adata, n=40, copy=True).shape == (40, 10) + assert sc.pp.sample(adata, fraction=0.1, copy=True).shape == (20, 10) + assert sc.pp.sample(adata, fraction=0.1, copy=True).shape == (20, 10) + X = sc.pp.sample(adata, fraction=2.0, axis=1, replace=True, copy=True) + assert X.shape == (200, 20) + + adata = AnnData(sp.csr_matrix(np.ones((200, 10)))) + X = sc.pp.sample(adata, fraction=2.0, axis=1, replace=True, copy=True) + assert X.shape == (200, 20) def test_scale(): From 9e0739bb103c7d6138c4ccf36ea5a8e92965de83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=B6k=C3=A7en=20Eraslan?= Date: Mon, 20 Apr 2020 16:43:27 -0400 Subject: [PATCH 03/28] Fix fraction doc --- scanpy/preprocessing/_simple.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scanpy/preprocessing/_simple.py b/scanpy/preprocessing/_simple.py index ad484f6f4f..d02518f68d 100644 --- a/scanpy/preprocessing/_simple.py +++ b/scanpy/preprocessing/_simple.py @@ -761,8 +761,9 @@ def sample( The (annotated) data matrix of shape `n_obs` × `n_vars`. Rows correspond to cells and columns to genes. fraction - Subsample to this `fraction` of the number of observations or variables. - See `axis`. + Sample to this `fraction` of the number of observations or variables. + This can be larger than 1.0, if replace=True. + See `axis` and `replace`. n Sample to this number of observations or variables. See `axis`. random_state From 8ec8cf38aa0a570f216e9f6a877dcc3da8ee02a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=B6k=C3=A7en=20Eraslan?= Date: Mon, 20 Apr 2020 16:46:28 -0400 Subject: [PATCH 04/28] Add to release notes --- docs/release-latest.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/release-latest.rst b/docs/release-latest.rst index e568bd594e..ee52c332b3 100644 --- a/docs/release-latest.rst +++ b/docs/release-latest.rst @@ -4,6 +4,10 @@ On master ~~~~~~~~~~ +.. rubric:: New functionality + +- :func:`~scanpy.pp.sample` supports both upsampling and downsampling of observations and variables. :func:`~scanpy.pp.subsample` is now deprecated. + .. rubric:: Performance - :func:`~scanpy.pp.pca` now uses efficient implicit centering for sparse matrices. This can lead to signifigantly improved performance for large datasets :pr:`1066` :smaller:`A Tarashansky` From fdf524afd3fb4b2476f3d8295502280dfb21702c Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Thu, 14 Nov 2024 13:41:09 +0100 Subject: [PATCH 05/28] refactor --- src/scanpy/preprocessing/_simple.py | 130 ++++++++++++++++++---------- 1 file changed, 86 insertions(+), 44 deletions(-) diff --git a/src/scanpy/preprocessing/_simple.py b/src/scanpy/preprocessing/_simple.py index b5d76b323e..38e908d31a 100644 --- a/src/scanpy/preprocessing/_simple.py +++ b/src/scanpy/preprocessing/_simple.py @@ -8,7 +8,7 @@ import warnings from functools import singledispatch from itertools import repeat -from typing import TYPE_CHECKING, TypeVar +from typing import TYPE_CHECKING, TypeVar, overload import numba import numpy as np @@ -22,6 +22,7 @@ from .._settings import settings as sett from .._utils import ( _check_array_function_arguments, + _resolve_axis, axis_sum, is_backed_type, raise_not_implemented_error_if_backed_type, @@ -33,7 +34,6 @@ from ._distributed import materialize_as_ndarray from ._utils import _to_dense -# install dask if available try: import dask.array as da except ImportError: @@ -49,10 +49,13 @@ import pandas as pd from numpy.typing import NDArray + from scipy.sparse import csc_matrix from .._compat import DaskArray from .._utils import AnyRandom + CSMatrix = csr_matrix | csc_matrix + @old_positionals( "min_counts", "min_genes", "max_counts", "max_genes", "inplace", "copy" @@ -825,16 +828,49 @@ def _regress_out_chunk( return np.vstack(responses_chunk_list) +@overload def sample( - data: AnnData | np.ndarray | spmatrix, + data: AnnData, + fraction: float | None = None, + *, + n: int | None = None, + random_state: AnyRandom = 0, + copy: Literal[False] = False, + replace: bool = False, + axis: Literal["obs", 0, "var", 1] = "obs", +) -> None: ... +@overload +def sample( + data: AnnData, + fraction: float | None = None, + *, + n: int | None = None, + random_state: AnyRandom = 0, + copy: Literal[True], + replace: bool = False, + axis: Literal["obs", 0, "var", 1] = "obs", +) -> AnnData | None: ... +@overload +def sample( + data: np.ndarray | CSMatrix, fraction: float | None = None, *, n: int | None = None, random_state: AnyRandom = 0, copy: bool = False, replace: bool = False, - axis: int = 0, -) -> AnnData | None: + axis: Literal["obs", 0, "var", 1] = "obs", +) -> tuple[np.ndarray | CSMatrix, NDArray[np.int64]]: ... +def sample( + data: AnnData | np.ndarray | CSMatrix, + fraction: float | None = None, + *, + n: int | None = None, + random_state: AnyRandom = 0, + copy: bool = False, + replace: bool = False, + axis: Literal["obs", 0, "var", 1] = "obs", +) -> AnnData | None | tuple[np.ndarray | CSMatrix, NDArray[np.int64]]: """\ Sample observations or variables with or without replacement. @@ -845,7 +881,7 @@ def sample( Rows correspond to cells and columns to genes. fraction Sample to this `fraction` of the number of observations or variables. - This can be larger than 1.0, if replace=True. + This can be larger than 1.0, if `replace=True`. See `axis` and `replace`. n Sample to this number of observations or variables. See `axis`. @@ -857,58 +893,64 @@ def sample( replace If True, samples are drawn with replacement. axis - Sample observations (axis=0) or variables (axis=1). Default is 0. + Sample `obs`\\ ervations (axis 0) or `var`\\ iables (axis 1). Returns ------- - Returns `X[indices] or X[:, indices], indices` depending on the axis - argument if data is array-like, otherwise samples the passed - :class:`~anndata.AnnData` (`copy == False`) or returns a sampled - copy of it (`copy == True`). + If `isinstance(data, AnnData)` and `copy=False`, + this function returns `None`. Otherwise: + + `data[indices, :]` | `data[:, indices]` (depending on `axis`) + If `data` is array-like or `copy=True`, returns the subset. + `indices` : numpy.ndarray + If `data` is array-like, also returns the indices into the original. """ - np.random.seed(random_state) + axis, axis_name = _resolve_axis(axis) + match (fraction, n): + case (None, None): + msg = "Either `fraction` or `n` must be set." + raise TypeError(msg) + case (float(), int()): + msg = "Providing both `fraction` and `n` is not allowed." + raise TypeError(msg) + old_n = data.shape[axis] - if axis not in (0, 1): - raise ValueError("`axis` must be either 0 or 1.") - if fraction is None and n is None: - raise ValueError("Either `fraction` or `n` must be set.") - if fraction is not None and n is not None: - raise ValueError("Providing both `fraction` and `n` is not allowed.") if n is not None: new_n = n elif fraction is not None: if fraction < 0: - raise ValueError(f"`fraction needs to be nonnegative`, not {fraction}") + msg = f"fraction needs to be nonnegative, not {fraction}" + raise ValueError(msg) if not replace and fraction > 1: - raise ValueError( - f"If replace=False, `fraction` needs to be within [0, 1], not {fraction}" - ) + msg = f"If replace=False, `fraction` needs to be within [0, 1], not {fraction}" + raise ValueError(msg) new_n = int(fraction * old_n) - obs_or_var_str = "observations" if axis == 0 else "variables" - logg.debug(f"... sampled to {new_n} {obs_or_var_str}") + logg.debug(f"... sampled to {new_n} {axis_name}") else: - raise ValueError("Either pass `n_obs` or `fraction`.") + msg = "Either pass `n_obs` or `fraction`." + raise ValueError(msg) + + np.random.seed(random_state) indices = np.random.choice(old_n, size=new_n, replace=replace) - if isinstance(data, AnnData): - if data.isbacked: - if copy: - view = data[indices] if axis == 0 else data[:, indices] - return view.to_memory() - else: - raise NotImplementedError( - "Inplace sampling is not implemented for backed objects." - ) - else: - if copy: - view = data[indices] if axis == 0 else data[:, indices] - return view.copy() - elif axis == 0: - data._inplace_subset_obs(indices) - else: - data._inplace_subset_var(indices) + subset = data[indices] if axis_name == "obs" else data[:, indices] + + if not isinstance(data, AnnData): + assert not isinstance(subset, AnnData) + if copy: + subset = subset.copy() + return subset, indices + assert isinstance(subset, AnnData) + if copy: + return subset.to_memory() if data.isbacked else subset.copy() + + # in-place + if data.isbacked: + msg = "Inplace sampling (`copy=False`) is not implemented for backed objects." + raise NotImplementedError(msg) + if axis_name == "obs": + data._inplace_subset_obs(indices) else: - X = data - return X[indices] if axis == 0 else X[:, indices], indices + data._inplace_subset_var(indices) @old_positionals("n_obs", "random_state", "copy") From 061a19d4682118691684321272cbf9d1114c8038 Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Thu, 14 Nov 2024 14:16:41 +0100 Subject: [PATCH 06/28] Refactor tests --- src/scanpy/preprocessing/_simple.py | 8 +-- tests/test_preprocessing.py | 96 +++++++++++++++++------------ 2 files changed, 62 insertions(+), 42 deletions(-) diff --git a/src/scanpy/preprocessing/_simple.py b/src/scanpy/preprocessing/_simple.py index 38e908d31a..bc4f50b201 100644 --- a/src/scanpy/preprocessing/_simple.py +++ b/src/scanpy/preprocessing/_simple.py @@ -849,7 +849,7 @@ def sample( copy: Literal[True], replace: bool = False, axis: Literal["obs", 0, "var", 1] = "obs", -) -> AnnData | None: ... +) -> AnnData: ... @overload def sample( data: np.ndarray | CSMatrix, @@ -927,7 +927,7 @@ def sample( new_n = int(fraction * old_n) logg.debug(f"... sampled to {new_n} {axis_name}") else: - msg = "Either pass `n_obs` or `fraction`." + msg = "Either pass `n` or `fraction`." raise ValueError(msg) np.random.seed(random_state) @@ -955,13 +955,13 @@ def sample( @old_positionals("n_obs", "random_state", "copy") def subsample( - data: AnnData | np.ndarray | spmatrix, + data: AnnData | np.ndarray | CSMatrix, fraction: float | None = None, *, n_obs: int | None = None, random_state: AnyRandom = 0, copy: bool = False, -) -> AnnData | tuple[np.ndarray | spmatrix, NDArray[np.int64]] | None: +) -> AnnData | tuple[np.ndarray | CSMatrix, NDArray[np.int64]] | None: """\ Subsample to a fraction of the number of observations. diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py index 0939e3e74c..1354777ed5 100644 --- a/tests/test_preprocessing.py +++ b/tests/test_preprocessing.py @@ -3,6 +3,7 @@ import warnings from itertools import product from pathlib import Path +from typing import TYPE_CHECKING import numpy as np import pandas as pd @@ -23,6 +24,13 @@ from testing.scanpy._helpers.data import pbmc3k, pbmc68k_reduced from testing.scanpy._pytest.params import ARRAY_TYPES +if TYPE_CHECKING: + from collections.abc import Callable + from typing import Literal + + CSMatrix = sp.csc_matrix | sp.csr_matrix + + HERE = Path(__file__).parent DATA_PATH = HERE / "_data" @@ -135,52 +143,64 @@ def test_normalize_per_cell(): assert adata.X.sum(axis=1).tolist() == adata_sparse.X.sum(axis=1).A1.tolist() -def test_sample(): - warnings.filterwarnings("ignore", r".*names are not unique", UserWarning) - adata = AnnData(np.ones((200, 10))) - sc.pp.sample(adata, n=40) - assert adata.n_obs == 40 - sc.pp.sample(adata, fraction=0.1) - assert adata.n_obs == 4 - sc.pp.sample(adata, n=201, replace=True) - assert adata.n_obs == 201 - sc.pp.sample(adata, n=10, axis=1) - assert adata.n_vars == 10 - sc.pp.sample(adata, n=11, axis=1, replace=True) - assert adata.n_vars == 11 - sc.pp.sample(adata, fraction=2.0, axis=1, replace=True) - assert adata.n_vars == 22 - - adata = AnnData(sp.csr_matrix(np.ones((200, 10)))) - sc.pp.sample(adata, fraction=2.0, axis=1, replace=True) - assert adata.n_vars == 20 - - -def test_sample_copy(): - adata = AnnData(np.ones((200, 10))) - assert sc.pp.sample(adata, n=40, copy=True).shape == (40, 10) - assert sc.pp.sample(adata, fraction=0.1, copy=True).shape == (20, 10) - assert sc.pp.sample(adata, fraction=0.1, copy=True).shape == (20, 10) - X = sc.pp.sample(adata, fraction=2.0, axis=1, replace=True, copy=True) - assert X.shape == (200, 20) - - adata = AnnData(sp.csr_matrix(np.ones((200, 10)))) - X = sc.pp.sample(adata, fraction=2.0, axis=1, replace=True, copy=True) - assert X.shape == (200, 20) +@pytest.mark.parametrize("array_type", ARRAY_TYPES) +@pytest.mark.parametrize("copy", [True, False], ids=["copy", "inplace"]) +@pytest.mark.parametrize( + ("axis", "fraction", "n", "replace", "expected"), + [ + pytest.param(0, None, 40, False, 40, id="obs-40-no_replace"), + pytest.param(0, 0.1, None, False, 20, id="obs-0.1-no_replace"), + pytest.param(0, None, 201, True, 201, id="obs-201-replace"), + pytest.param(1, None, 10, False, 10, id="var-10-no_replace"), + pytest.param(1, None, 11, True, 11, id="var-11-replace"), + pytest.param(1, 2.0, None, True, 20, id="var-2.0-replace"), + ], +) +@pytest.mark.filterwarnings("ignore::dask.array.PerformanceWarning") +def test_sample( + *, + array_type: Callable[[np.ndarray], np.ndarray | CSMatrix], + copy: bool, + axis: Literal[0, 1], + fraction: float | None, + n: int | None, + replace: bool, + expected: int, +): + adata = AnnData(array_type(np.ones((200, 10)))) + + # can’t guarantee that duplicates are drawn when `replace=True`, + # so we just ignore the warning instead using `with pytest.warns(...)` + warnings.filterwarnings( + "ignore" if replace else "error", r".*names are not unique", UserWarning + ) + rv = sc.pp.sample(adata, fraction, n=n, replace=replace, axis=axis, copy=copy) + + if copy: + assert adata.shape == (200, 10) + subset = rv + else: + assert rv is None + subset = adata + + assert subset.shape == ((expected, 10) if axis == 0 else (200, expected)) def test_sample_copy_backed(tmp_path): - A = np.random.rand(200, 10).astype(np.float32) - adata_m = AnnData(A.copy()) - adata_d = AnnData(A.copy()) - filename = tmp_path / "test.h5ad" - adata_d.filename = filename - # This should not throw an error + adata_m = AnnData(np.random.rand(200, 10).astype(np.float32)) + adata_d = adata_m.copy() + adata_d.filename = tmp_path / "test.h5ad" + assert sc.pp.sample(adata_d, n=40, copy=True).shape == (40, 10) np.testing.assert_array_equal( sc.pp.sample(adata_m, n=40, copy=True).X, sc.pp.sample(adata_d, n=40, copy=True).X, ) + + +def test_sample_copy_backed_error(tmp_path): + adata_d = AnnData(np.random.rand(200, 10).astype(np.float32)) + adata_d.filename = tmp_path / "test.h5ad" with pytest.raises(NotImplementedError): sc.pp.sample(adata_d, n=40, copy=False) From 06d428006798727a3b3ea3fa97f2ddce916b7812 Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Thu, 14 Nov 2024 14:36:05 +0100 Subject: [PATCH 07/28] handle array case in test --- tests/test_preprocessing.py | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py index 1354777ed5..36c1726333 100644 --- a/tests/test_preprocessing.py +++ b/tests/test_preprocessing.py @@ -144,7 +144,7 @@ def test_normalize_per_cell(): @pytest.mark.parametrize("array_type", ARRAY_TYPES) -@pytest.mark.parametrize("copy", [True, False], ids=["copy", "inplace"]) +@pytest.mark.parametrize("which", ["copy", "inplace", "array"]) @pytest.mark.parametrize( ("axis", "fraction", "n", "replace", "expected"), [ @@ -160,7 +160,7 @@ def test_normalize_per_cell(): def test_sample( *, array_type: Callable[[np.ndarray], np.ndarray | CSMatrix], - copy: bool, + which: Literal["copy", "inplace", "array"], axis: Literal[0, 1], fraction: float | None, n: int | None, @@ -174,14 +174,30 @@ def test_sample( warnings.filterwarnings( "ignore" if replace else "error", r".*names are not unique", UserWarning ) - rv = sc.pp.sample(adata, fraction, n=n, replace=replace, axis=axis, copy=copy) + rv = sc.pp.sample( + adata.X if which == "array" else adata, + fraction, + n=n, + replace=replace, + axis=axis, + # `copy` only effects AnnData inputs + copy=dict(copy=True, inplace=False, array=False)[which], + ) - if copy: - assert adata.shape == (200, 10) - subset = rv - else: - assert rv is None - subset = adata + match which: + case "copy": + subset = rv + assert rv is not adata + assert adata.shape == (200, 10) + case "inplace": + subset = adata + assert rv is None + case "array": + subset, indices = rv + assert len(indices) == expected + assert adata.shape == (200, 10) + case _: + pytest.fail(f"Unknown `{which=}`") assert subset.shape == ((expected, 10) if axis == 0 else (200, expected)) From 6eeab2ea641a95a4506b39da3768c22f899d3ee5 Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Thu, 14 Nov 2024 14:58:49 +0100 Subject: [PATCH 08/28] Test errors --- src/scanpy/preprocessing/_simple.py | 33 +++++++++++++--------------- tests/test_preprocessing.py | 34 ++++++++++++++++++++++++++++- 2 files changed, 48 insertions(+), 19 deletions(-) diff --git a/src/scanpy/preprocessing/_simple.py b/src/scanpy/preprocessing/_simple.py index bc4f50b201..26f025ad95 100644 --- a/src/scanpy/preprocessing/_simple.py +++ b/src/scanpy/preprocessing/_simple.py @@ -906,32 +906,29 @@ def sample( If `data` is array-like, also returns the indices into the original. """ axis, axis_name = _resolve_axis(axis) + old_n = data.shape[axis] match (fraction, n): case (None, None): msg = "Either `fraction` or `n` must be set." raise TypeError(msg) - case (float(), int()): + case (None, _): + pass + case (_, None): + if fraction < 0: + msg = f"`{fraction=}` needs to be nonnegative." + raise ValueError(msg) + if not replace and fraction > 1: + msg = f"If `replace=False`, `{fraction=}` needs to be within [0, 1]." + raise ValueError(msg) + n = int(fraction * old_n) + logg.debug(f"... sampled to {n} {axis_name}") + case _: msg = "Providing both `fraction` and `n` is not allowed." raise TypeError(msg) - - old_n = data.shape[axis] - if n is not None: - new_n = n - elif fraction is not None: - if fraction < 0: - msg = f"fraction needs to be nonnegative, not {fraction}" - raise ValueError(msg) - if not replace and fraction > 1: - msg = f"If replace=False, `fraction` needs to be within [0, 1], not {fraction}" - raise ValueError(msg) - new_n = int(fraction * old_n) - logg.debug(f"... sampled to {new_n} {axis_name}") - else: - msg = "Either pass `n` or `fraction`." - raise ValueError(msg) + del fraction np.random.seed(random_state) - indices = np.random.choice(old_n, size=new_n, replace=replace) + indices = np.random.choice(old_n, size=n, replace=replace) subset = data[indices] if axis_name == "obs" else data[:, indices] if not isinstance(data, AnnData): diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py index 36c1726333..3a5be7d257 100644 --- a/tests/test_preprocessing.py +++ b/tests/test_preprocessing.py @@ -26,7 +26,7 @@ if TYPE_CHECKING: from collections.abc import Callable - from typing import Literal + from typing import Any, Literal CSMatrix = sp.csc_matrix | sp.csr_matrix @@ -202,6 +202,38 @@ def test_sample( assert subset.shape == ((expected, 10) if axis == 0 else (200, expected)) +@pytest.mark.parametrize( + ("args", "exc", "pattern"), + [ + pytest.param( + dict(), TypeError, r"Either `fraction` or `n` must be set", id="empty" + ), + pytest.param( + dict(n=10, fraction=0.2), + TypeError, + r"Providing both `fraction` and `n` is not allowed", + id="both", + ), + pytest.param( + dict(fraction=2), + ValueError, + r"If `replace=False`, `fraction=2` needs to be", + id="frac>1", + ), + pytest.param( + dict(fraction=-0.3), + ValueError, + r"`fraction=-0\.3` needs to be nonnegative", + id="frac<0", + ), + ], +) +def test_sample_error(args: dict[str, Any], exc: type[Exception], pattern: str): + adata = AnnData(np.ones((200, 10))) + with pytest.raises(exc, match=pattern): + sc.pp.sample(adata, **args) + + def test_sample_copy_backed(tmp_path): adata_m = AnnData(np.random.rand(200, 10).astype(np.float32)) adata_d = adata_m.copy() From b1f50610f41c937f0097f9719d603f67f449c904 Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Thu, 14 Nov 2024 15:02:57 +0100 Subject: [PATCH 09/28] prettier deprecations --- .../_deprecated/highly_variable_genes.py | 22 ++++++++--------- src/scanpy/preprocessing/_simple.py | 24 +++++++++---------- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/src/scanpy/preprocessing/_deprecated/highly_variable_genes.py b/src/scanpy/preprocessing/_deprecated/highly_variable_genes.py index f2c3ce971b..e322013d47 100644 --- a/src/scanpy/preprocessing/_deprecated/highly_variable_genes.py +++ b/src/scanpy/preprocessing/_deprecated/highly_variable_genes.py @@ -48,18 +48,18 @@ def filter_genes_dispersion( """\ Extract highly variable genes :cite:p:`Satija2015,Zheng2017`. - .. warning:: - .. deprecated:: 1.3.6 - Use :func:`~scanpy.pp.highly_variable_genes` - instead. The new function is equivalent to the present - function, except that + .. deprecated:: 1.3.6 - * the new function always expects logarithmized data - * `subset=False` in the new function, it suffices to - merely annotate the genes, tools like `pp.pca` will - detect the annotation - * you can now call: `sc.pl.highly_variable_genes(adata)` - * `copy` is replaced by `inplace` + Use :func:`~scanpy.pp.highly_variable_genes` + instead. The new function is equivalent to the present + function, except that + + * the new function always expects logarithmized data + * `subset=False` in the new function, it suffices to + merely annotate the genes, tools like `pp.pca` will + detect the annotation + * you can now call: `sc.pl.highly_variable_genes(adata)` + * `copy` is replaced by `inplace` If trying out parameters, pass the data matrix instead of AnnData. diff --git a/src/scanpy/preprocessing/_simple.py b/src/scanpy/preprocessing/_simple.py index 26f025ad95..e578cfdb90 100644 --- a/src/scanpy/preprocessing/_simple.py +++ b/src/scanpy/preprocessing/_simple.py @@ -501,16 +501,16 @@ def normalize_per_cell( """\ Normalize total counts per cell. - .. warning:: - .. deprecated:: 1.3.7 - Use :func:`~scanpy.pp.normalize_total` instead. - The new function is equivalent to the present - function, except that + .. deprecated:: 1.3.7 - * the new function doesn't filter cells based on `min_counts`, - use :func:`~scanpy.pp.filter_cells` if filtering is needed. - * some arguments were renamed - * `copy` is replaced by `inplace` + Use :func:`~scanpy.pp.normalize_total` instead. + The new function is equivalent to the present + function, except that + + * the new function doesn't filter cells based on `min_counts`, + use :func:`~scanpy.pp.filter_cells` if filtering is needed. + * some arguments were renamed + * `copy` is replaced by `inplace` Normalize each cell by total counts over all genes, so that every cell has the same total count after normalization. @@ -962,9 +962,9 @@ def subsample( """\ Subsample to a fraction of the number of observations. - .. warning:: - .. deprecated:: 1.4.7 - Use :func:`~scanpy.pp.sample` instead. + .. deprecated:: 1.11.0 + + Use :func:`~scanpy.pp.sample` instead. Parameters ---------- From cec8affe0c1fe881bad146846c4ba856081f6030 Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Thu, 14 Nov 2024 15:13:21 +0100 Subject: [PATCH 10/28] docs --- docs/api/deprecated.md | 1 + docs/api/preprocessing.md | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/api/deprecated.md b/docs/api/deprecated.md index 4511f4b3a7..d09c1af405 100644 --- a/docs/api/deprecated.md +++ b/docs/api/deprecated.md @@ -11,4 +11,5 @@ pp.filter_genes_dispersion pp.normalize_per_cell + pp.subsample ``` diff --git a/docs/api/preprocessing.md b/docs/api/preprocessing.md index 4b17567a6b..36e732a6dc 100644 --- a/docs/api/preprocessing.md +++ b/docs/api/preprocessing.md @@ -31,7 +31,7 @@ For visual quality control, see {func}`~scanpy.pl.highest_expr_genes` and pp.normalize_total pp.regress_out pp.scale - pp.subsample + pp.sample pp.downsample_counts ``` From daa147ef681483ba5bb0808f5da0724ff10a5ce9 Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Thu, 14 Nov 2024 15:38:16 +0100 Subject: [PATCH 11/28] ignore dask warning correctly --- tests/test_preprocessing.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py index 3a5be7d257..e168f66e14 100644 --- a/tests/test_preprocessing.py +++ b/tests/test_preprocessing.py @@ -22,7 +22,7 @@ maybe_dask_process_context, ) from testing.scanpy._helpers.data import pbmc3k, pbmc68k_reduced -from testing.scanpy._pytest.params import ARRAY_TYPES +from testing.scanpy._pytest.params import ARRAY_TYPES, param_with if TYPE_CHECKING: from collections.abc import Callable @@ -143,7 +143,16 @@ def test_normalize_per_cell(): assert adata.X.sum(axis=1).tolist() == adata_sparse.X.sum(axis=1).A1.tolist() -@pytest.mark.parametrize("array_type", ARRAY_TYPES) +ignore_dask_perf = pytest.mark.filterwarnings("ignore::dask.array.PerformanceWarning") + + +@pytest.mark.parametrize( + "array_type", + [ + param_with(p, marks=[ignore_dask_perf] if "dask" in (p.id or "") else []) + for p in ARRAY_TYPES + ], +) @pytest.mark.parametrize("which", ["copy", "inplace", "array"]) @pytest.mark.parametrize( ("axis", "fraction", "n", "replace", "expected"), @@ -156,7 +165,6 @@ def test_normalize_per_cell(): pytest.param(1, 2.0, None, True, 20, id="var-2.0-replace"), ], ) -@pytest.mark.filterwarnings("ignore::dask.array.PerformanceWarning") def test_sample( *, array_type: Callable[[np.ndarray], np.ndarray | CSMatrix], From 3c31abd898f816b61c9e7b867d8130929e5e50d3 Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Thu, 14 Nov 2024 15:43:06 +0100 Subject: [PATCH 12/28] sig exception --- tests/test_package_structure.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_package_structure.py b/tests/test_package_structure.py index 834c06d8b4..3541c561a5 100644 --- a/tests/test_package_structure.py +++ b/tests/test_package_structure.py @@ -138,6 +138,7 @@ class ExpectedSig(TypedDict): copy_sigs["sc.pp.filter_cells"] = None # unclear `inplace` situation copy_sigs["sc.pp.filter_genes"] = None # unclear `inplace` situation copy_sigs["sc.pp.subsample"] = None # returns indices along matrix +copy_sigs["sc.pp.sample"] = None # returns indices along matrix # partial exceptions: “data” instead of “adata” copy_sigs["sc.pp.log1p"]["first_name"] = "data" copy_sigs["sc.pp.normalize_per_cell"]["first_name"] = "data" From d3504114d2d36580b08ee7b0dc794ab7d0b24f1b Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Mon, 18 Nov 2024 16:57:27 +0100 Subject: [PATCH 13/28] WIP --- src/scanpy/_compat.py | 22 ++++++++++++++++++++++ src/scanpy/preprocessing/_simple.py | 27 ++++++++++++++------------- tests/test_utils.py | 21 ++++++++++++++++++++- 3 files changed, 56 insertions(+), 14 deletions(-) diff --git a/src/scanpy/_compat.py b/src/scanpy/_compat.py index c5fa4dbe84..d3de7b73ef 100644 --- a/src/scanpy/_compat.py +++ b/src/scanpy/_compat.py @@ -9,12 +9,14 @@ from pathlib import Path from typing import TYPE_CHECKING, Literal, ParamSpec, TypeVar, cast, overload +import numpy as np from packaging.version import Version if TYPE_CHECKING: from collections.abc import Callable from importlib.metadata import PackageMetadata + P = ParamSpec("P") R = TypeVar("R") @@ -194,3 +196,23 @@ def _numba_threading_layer() -> Layer: f" ({available=}, {numba.config.THREADING_LAYER_PRIORITY=})" ) raise ValueError(msg) + + +_LegacyRandom = int | np.random.RandomState | None + + +def _legacy_numpy_gen( + random_state: _LegacyRandom | None = None, +) -> np.random.RandomState: + """Return a random generator that behaves like the legacy one.""" + + if random_state is not None: + if isinstance(random_state, np.random.RandomState): + np.random.set_state(random_state.get_state(legacy=False)) + return random_state + np.random.seed(random_state) + state = np.random.get_state(legacy=True) + assert isinstance(state, tuple) + bit_gen = np.random.MT19937() + bit_gen.state = state + return np.random.RandomState(bit_gen) diff --git a/src/scanpy/preprocessing/_simple.py b/src/scanpy/preprocessing/_simple.py index e578cfdb90..9ec518f412 100644 --- a/src/scanpy/preprocessing/_simple.py +++ b/src/scanpy/preprocessing/_simple.py @@ -18,7 +18,7 @@ from sklearn.utils import check_array, sparsefuncs from .. import logging as logg -from .._compat import njit, old_positionals +from .._compat import _legacy_numpy_gen, njit, old_positionals from .._settings import settings as sett from .._utils import ( _check_array_function_arguments, @@ -51,8 +51,8 @@ from numpy.typing import NDArray from scipy.sparse import csc_matrix - from .._compat import DaskArray - from .._utils import AnyRandom + from .._compat import DaskArray, _LegacyRandom + from .._utils import RNGLike, SeedLike CSMatrix = csr_matrix | csc_matrix @@ -834,7 +834,7 @@ def sample( fraction: float | None = None, *, n: int | None = None, - random_state: AnyRandom = 0, + rng: RNGLike | SeedLike | None = 0, copy: Literal[False] = False, replace: bool = False, axis: Literal["obs", 0, "var", 1] = "obs", @@ -845,7 +845,7 @@ def sample( fraction: float | None = None, *, n: int | None = None, - random_state: AnyRandom = 0, + rng: RNGLike | SeedLike | None = None, copy: Literal[True], replace: bool = False, axis: Literal["obs", 0, "var", 1] = "obs", @@ -856,7 +856,7 @@ def sample( fraction: float | None = None, *, n: int | None = None, - random_state: AnyRandom = 0, + rng: RNGLike | SeedLike | None = None, copy: bool = False, replace: bool = False, axis: Literal["obs", 0, "var", 1] = "obs", @@ -866,7 +866,7 @@ def sample( fraction: float | None = None, *, n: int | None = None, - random_state: AnyRandom = 0, + rng: RNGLike | SeedLike | None = None, copy: bool = False, replace: bool = False, axis: Literal["obs", 0, "var", 1] = "obs", @@ -927,8 +927,9 @@ def sample( raise TypeError(msg) del fraction - np.random.seed(random_state) - indices = np.random.choice(old_n, size=n, replace=replace) + if not isinstance(rng, np.random.RandomState): + rng = np.random.default_rng(rng) + indices = rng.choice(old_n, size=n, replace=replace) subset = data[indices] if axis_name == "obs" else data[:, indices] if not isinstance(data, AnnData): @@ -956,7 +957,7 @@ def subsample( fraction: float | None = None, *, n_obs: int | None = None, - random_state: AnyRandom = 0, + random_state: _LegacyRandom = 0, copy: bool = False, ) -> AnnData | tuple[np.ndarray | CSMatrix, NDArray[np.int64]] | None: """\ @@ -991,7 +992,7 @@ def subsample( data=data, fraction=fraction, n=n_obs, - random_state=random_state, + rng=_legacy_numpy_gen(random_state), copy=copy, replace=False, axis=0, @@ -1004,7 +1005,7 @@ def downsample_counts( counts_per_cell: int | Collection[int] | None = None, total_counts: int | None = None, *, - random_state: AnyRandom = 0, + random_state: _LegacyRandom = 0, replace: bool = False, copy: bool = False, ) -> AnnData | None: @@ -1140,7 +1141,7 @@ def _downsample_array( col: np.ndarray, target: int, *, - random_state: AnyRandom = 0, + random_state: _LegacyRandom = 0, replace: bool = True, inplace: bool = False, ): diff --git a/tests/test_utils.py b/tests/test_utils.py index f8a38a5f9d..aebd6b7ec5 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -9,7 +9,7 @@ from packaging.version import Version from scipy.sparse import csr_matrix, issparse -from scanpy._compat import DaskArray, pkg_version +from scanpy._compat import DaskArray, _legacy_numpy_gen, pkg_version from scanpy._utils import ( axis_mul_or_truediv, axis_sum, @@ -247,3 +247,22 @@ def test_is_constant_dask(request: pytest.FixtureRequest, axis, expected, block_ x = da.from_array(np.array(x_data), chunks=2).map_blocks(block_type) result = is_constant(x, axis=axis).compute() np.testing.assert_array_equal(expected, result) + + +@pytest.mark.parametrize("seed", [0, 1, 1256712675]) +@pytest.mark.parametrize("func", ["choice"]) +def test_legacy_numpy_gen(seed: int, func: str): + arr_module = _mk_random(seed, func, legacy=True) + arr_generator = _mk_random(seed, func, legacy=False) + np.testing.assert_array_equal(arr_module, arr_generator) + + +def _mk_random(seed: int, func: str, *, legacy: bool) -> np.ndarray: + np.random.seed(seed) + gen = np.random if legacy else _legacy_numpy_gen() + match func: + case "choice": + arr = np.arange(1000) + return gen.choice(arr, size=(100, 100)) + case _: + pytest.fail(f"Unknown {func=}") From c24e9b21a6add88c3b2234aeeaf7e9cbc3feb8fe Mon Sep 17 00:00:00 2001 From: Phil Schaf Date: Tue, 19 Nov 2024 09:40:22 +0100 Subject: [PATCH 14/28] remove duplicate _LegacyRandom --- src/scanpy/_compat.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/scanpy/_compat.py b/src/scanpy/_compat.py index 6e14f4c2c4..1301d2c1b9 100644 --- a/src/scanpy/_compat.py +++ b/src/scanpy/_compat.py @@ -200,9 +200,6 @@ def _numba_threading_layer() -> Layer: raise ValueError(msg) -_LegacyRandom = int | np.random.RandomState | None - - def _legacy_numpy_gen( random_state: _LegacyRandom | None = None, ) -> np.random.RandomState: From e246f02d51548f3de77cea02921480bf137b26f4 Mon Sep 17 00:00:00 2001 From: Phil Schaf Date: Tue, 19 Nov 2024 09:43:29 +0100 Subject: [PATCH 15/28] undo compat thing --- src/scanpy/preprocessing/_simple.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/scanpy/preprocessing/_simple.py b/src/scanpy/preprocessing/_simple.py index 9ec518f412..ef0c568d60 100644 --- a/src/scanpy/preprocessing/_simple.py +++ b/src/scanpy/preprocessing/_simple.py @@ -927,8 +927,7 @@ def sample( raise TypeError(msg) del fraction - if not isinstance(rng, np.random.RandomState): - rng = np.random.default_rng(rng) + rng = np.random.default_rng(rng) indices = rng.choice(old_n, size=n, replace=replace) subset = data[indices] if axis_name == "obs" else data[:, indices] From 4ad40b71b61d16fac71936d59a7abbb9569e6a7f Mon Sep 17 00:00:00 2001 From: Phil Schaf Date: Tue, 19 Nov 2024 10:06:27 +0100 Subject: [PATCH 16/28] fix backwards compat --- src/scanpy/preprocessing/_simple.py | 17 ++++++++--------- tests/test_preprocessing.py | 4 ++-- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/src/scanpy/preprocessing/_simple.py b/src/scanpy/preprocessing/_simple.py index ef0c568d60..5ad611d419 100644 --- a/src/scanpy/preprocessing/_simple.py +++ b/src/scanpy/preprocessing/_simple.py @@ -8,7 +8,7 @@ import warnings from functools import singledispatch from itertools import repeat -from typing import TYPE_CHECKING, TypeVar, overload +from typing import TYPE_CHECKING, TypeVar, cast, overload import numba import numpy as np @@ -927,7 +927,10 @@ def sample( raise TypeError(msg) del fraction - rng = np.random.default_rng(rng) + # Our backwards compat code passes a `RandomState` here + rng: RNGLike | SeedLike | np.random.RandomState | None + if not isinstance(rng, np.random.RandomState): + rng = np.random.default_rng(rng) indices = rng.choice(old_n, size=n, replace=replace) subset = data[indices] if axis_name == "obs" else data[:, indices] @@ -987,14 +990,10 @@ def subsample( subsamples the passed :class:`~anndata.AnnData` (`copy == False`) or returns a subsampled copy of it (`copy == True`). """ + # `sample` can use `RandomState`, but we don’t want to advertise that + rng = cast(np.random.Generator, _legacy_numpy_gen(random_state)) return sample( - data=data, - fraction=fraction, - n=n_obs, - rng=_legacy_numpy_gen(random_state), - copy=copy, - replace=False, - axis=0, + data=data, fraction=fraction, n=n_obs, rng=rng, copy=copy, replace=False, axis=0 ) diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py index e168f66e14..1df495cf3a 100644 --- a/tests/test_preprocessing.py +++ b/tests/test_preprocessing.py @@ -249,8 +249,8 @@ def test_sample_copy_backed(tmp_path): assert sc.pp.sample(adata_d, n=40, copy=True).shape == (40, 10) np.testing.assert_array_equal( - sc.pp.sample(adata_m, n=40, copy=True).X, - sc.pp.sample(adata_d, n=40, copy=True).X, + sc.pp.sample(adata_m, n=40, copy=True, rng=0).X, + sc.pp.sample(adata_d, n=40, copy=True, rng=0).X, ) From 1b8c81e7aaeb4d00abf3af58c217706f4cffa6fc Mon Sep 17 00:00:00 2001 From: Phil Schaf Date: Tue, 19 Nov 2024 17:01:10 +0100 Subject: [PATCH 17/28] Use fake Generator --- docs/release-notes/943.feature.md | 2 +- src/scanpy/_compat.py | 31 ++++++++++++++++++++++++++--- src/scanpy/preprocessing/_simple.py | 10 +++------- 3 files changed, 32 insertions(+), 11 deletions(-) diff --git a/docs/release-notes/943.feature.md b/docs/release-notes/943.feature.md index f47662ef4e..4f5474d762 100644 --- a/docs/release-notes/943.feature.md +++ b/docs/release-notes/943.feature.md @@ -1 +1 @@ -{func}`~scanpy.pp.sample` supports both upsampling and downsampling of observations and variables. {func}`~scanpy.pp.subsample` is now deprecated. {smaller}`G Eraslan` +{func}`~scanpy.pp.sample` supports both upsampling and downsampling of observations and variables. {func}`~scanpy.pp.subsample` is now deprecated. {smaller}`G Eraslan` & {smaller}`P Angerer` diff --git a/src/scanpy/_compat.py b/src/scanpy/_compat.py index 1301d2c1b9..b8ad666906 100644 --- a/src/scanpy/_compat.py +++ b/src/scanpy/_compat.py @@ -202,16 +202,41 @@ def _numba_threading_layer() -> Layer: def _legacy_numpy_gen( random_state: _LegacyRandom | None = None, -) -> np.random.RandomState: +) -> np.random.Generator: """Return a random generator that behaves like the legacy one.""" if random_state is not None: if isinstance(random_state, np.random.RandomState): np.random.set_state(random_state.get_state(legacy=False)) - return random_state + return _FakeRandomGen(random_state) np.random.seed(random_state) state = np.random.get_state(legacy=True) assert isinstance(state, tuple) bit_gen = np.random.MT19937() bit_gen.state = state - return np.random.RandomState(bit_gen) + return _FakeRandomGen(np.random.RandomState(bit_gen)) + + +class _FakeRandomGen(np.random.Generator): + _state: np.random.RandomState + + def __init__(self, random_state: np.random.RandomState) -> None: + self._state = random_state + + @classmethod + def _delegate(cls) -> None: + for name, meth in np.random.Generator.__dict__.items(): + if name.startswith("_") or not callable(meth): + continue + + def mk_wrapper(name: str): + @wraps(meth) + def wrapper(self: _FakeRandomGen, *args, **kwargs): + return getattr(self._state, name)(*args, **kwargs) + + return wrapper + + setattr(cls, name, mk_wrapper(name)) + + +_FakeRandomGen._delegate() diff --git a/src/scanpy/preprocessing/_simple.py b/src/scanpy/preprocessing/_simple.py index 5ad611d419..2b26b162a8 100644 --- a/src/scanpy/preprocessing/_simple.py +++ b/src/scanpy/preprocessing/_simple.py @@ -8,7 +8,7 @@ import warnings from functools import singledispatch from itertools import repeat -from typing import TYPE_CHECKING, TypeVar, cast, overload +from typing import TYPE_CHECKING, TypeVar, overload import numba import numpy as np @@ -927,10 +927,7 @@ def sample( raise TypeError(msg) del fraction - # Our backwards compat code passes a `RandomState` here - rng: RNGLike | SeedLike | np.random.RandomState | None - if not isinstance(rng, np.random.RandomState): - rng = np.random.default_rng(rng) + rng = np.random.default_rng(rng) indices = rng.choice(old_n, size=n, replace=replace) subset = data[indices] if axis_name == "obs" else data[:, indices] @@ -990,8 +987,7 @@ def subsample( subsamples the passed :class:`~anndata.AnnData` (`copy == False`) or returns a subsampled copy of it (`copy == True`). """ - # `sample` can use `RandomState`, but we don’t want to advertise that - rng = cast(np.random.Generator, _legacy_numpy_gen(random_state)) + rng = _legacy_numpy_gen(random_state) return sample( data=data, fraction=fraction, n=n_obs, rng=rng, copy=copy, replace=False, axis=0 ) From 594d961eb895288970df7f5a6478f047f807243a Mon Sep 17 00:00:00 2001 From: Phil Schaf Date: Tue, 19 Nov 2024 17:08:59 +0100 Subject: [PATCH 18/28] backwards compat test --- tests/test_preprocessing.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py index 1df495cf3a..4bf2d128aa 100644 --- a/tests/test_preprocessing.py +++ b/tests/test_preprocessing.py @@ -242,6 +242,15 @@ def test_sample_error(args: dict[str, Any], exc: type[Exception], pattern: str): sc.pp.sample(adata, **args) +def test_sample_backwards_compat(): + expected = np.array( + [26, 86, 2, 55, 75, 93, 16, 73, 54, 95, 53, 92, 78, 13, 7, 30, 22, 24, 33, 8] + ) + legacy_result, indices = sc.pp.subsample(np.arange(100), n_obs=20) + assert np.array_equal(indices, legacy_result), "arange choices should match indices" + assert np.array_equal(legacy_result, expected) + + def test_sample_copy_backed(tmp_path): adata_m = AnnData(np.random.rand(200, 10).astype(np.float32)) adata_d = adata_m.copy() From 59a171cce5e97e9f81daa20686bd776ac8beb2af Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Thu, 21 Nov 2024 13:03:58 +0100 Subject: [PATCH 19/28] Fix tests for old Pythons --- src/scanpy/_compat.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/scanpy/_compat.py b/src/scanpy/_compat.py index b8ad666906..720bebc6e2 100644 --- a/src/scanpy/_compat.py +++ b/src/scanpy/_compat.py @@ -4,7 +4,7 @@ import sys import warnings from dataclasses import dataclass, field -from functools import cache, partial, wraps +from functools import WRAPPER_ASSIGNMENTS, cache, partial, wraps from importlib.util import find_spec from pathlib import Path from typing import TYPE_CHECKING, Literal, ParamSpec, TypeVar, cast, overload @@ -230,7 +230,8 @@ def _delegate(cls) -> None: continue def mk_wrapper(name: str): - @wraps(meth) + # Old pytest versions try to run the doctests + @wraps(meth, assigned=set(WRAPPER_ASSIGNMENTS) - {"__doc__"}) def wrapper(self: _FakeRandomGen, *args, **kwargs): return getattr(self._state, name)(*args, **kwargs) From 59adc76818b7a005a35ee4b4f2783c7e86eab632 Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Thu, 21 Nov 2024 13:31:51 +0100 Subject: [PATCH 20/28] test that random state is modified --- src/scanpy/_compat.py | 17 ++++++----------- tests/test_utils.py | 25 +++++++++++++++++++------ 2 files changed, 25 insertions(+), 17 deletions(-) diff --git a/src/scanpy/_compat.py b/src/scanpy/_compat.py index 720bebc6e2..9c89f94f27 100644 --- a/src/scanpy/_compat.py +++ b/src/scanpy/_compat.py @@ -204,17 +204,12 @@ def _legacy_numpy_gen( random_state: _LegacyRandom | None = None, ) -> np.random.Generator: """Return a random generator that behaves like the legacy one.""" - - if random_state is not None: - if isinstance(random_state, np.random.RandomState): - np.random.set_state(random_state.get_state(legacy=False)) - return _FakeRandomGen(random_state) - np.random.seed(random_state) - state = np.random.get_state(legacy=True) - assert isinstance(state, tuple) - bit_gen = np.random.MT19937() - bit_gen.state = state - return _FakeRandomGen(np.random.RandomState(bit_gen)) + if random_state is None: + return _FakeRandomGen(np.random.RandomState(np.random.get_bit_generator())) + if isinstance(random_state, np.random.RandomState): + np.random.set_state(random_state.get_state(legacy=False)) + return _FakeRandomGen(random_state) + np.random.seed(random_state) class _FakeRandomGen(np.random.Generator): diff --git a/tests/test_utils.py b/tests/test_utils.py index aebd6b7ec5..aba6456089 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -252,14 +252,27 @@ def test_is_constant_dask(request: pytest.FixtureRequest, axis, expected, block_ @pytest.mark.parametrize("seed", [0, 1, 1256712675]) @pytest.mark.parametrize("func", ["choice"]) def test_legacy_numpy_gen(seed: int, func: str): - arr_module = _mk_random(seed, func, legacy=True) - arr_generator = _mk_random(seed, func, legacy=False) - np.testing.assert_array_equal(arr_module, arr_generator) + np.random.seed(seed) + state_before = np.random.get_state(legacy=False) + + arrs = {} + states_after = {} + for direct in [True, False]: + np.random.seed(seed) + arrs[direct] = _mk_random(func, direct=direct) + states_after[direct] = np.random.get_state(legacy=False) + + np.testing.assert_array_equal(arrs[True], arrs[False]) + np.testing.assert_equal( + *states_after.values(), err_msg="both should affect global state the same" + ) + # they should affect the global state + with pytest.raises(AssertionError): + np.testing.assert_equal(states_after[True], state_before) -def _mk_random(seed: int, func: str, *, legacy: bool) -> np.ndarray: - np.random.seed(seed) - gen = np.random if legacy else _legacy_numpy_gen() +def _mk_random(func: str, *, direct: bool) -> np.ndarray: + gen = np.random if direct else _legacy_numpy_gen() match func: case "choice": arr = np.arange(1000) From ef27db0de5f948df25fecc39e16058ad949cf70f Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Thu, 21 Nov 2024 13:41:00 +0100 Subject: [PATCH 21/28] Fix util --- src/scanpy/_compat.py | 13 +++++++------ tests/test_utils.py | 14 +++++++++----- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/src/scanpy/_compat.py b/src/scanpy/_compat.py index 9c89f94f27..bf13cd2320 100644 --- a/src/scanpy/_compat.py +++ b/src/scanpy/_compat.py @@ -204,12 +204,13 @@ def _legacy_numpy_gen( random_state: _LegacyRandom | None = None, ) -> np.random.Generator: """Return a random generator that behaves like the legacy one.""" - if random_state is None: - return _FakeRandomGen(np.random.RandomState(np.random.get_bit_generator())) - if isinstance(random_state, np.random.RandomState): - np.random.set_state(random_state.get_state(legacy=False)) - return _FakeRandomGen(random_state) - np.random.seed(random_state) + + if random_state is not None: + if isinstance(random_state, np.random.RandomState): + np.random.set_state(random_state.get_state(legacy=False)) + return _FakeRandomGen(random_state) + np.random.seed(random_state) + return _FakeRandomGen(np.random.RandomState(np.random.get_bit_generator())) class _FakeRandomGen(np.random.Generator): diff --git a/tests/test_utils.py b/tests/test_utils.py index aba6456089..2cee7a3e30 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -250,16 +250,18 @@ def test_is_constant_dask(request: pytest.FixtureRequest, axis, expected, block_ @pytest.mark.parametrize("seed", [0, 1, 1256712675]) +@pytest.mark.parametrize("pass_seed", [True, False], ids=["pass_seed", "set_seed"]) @pytest.mark.parametrize("func", ["choice"]) -def test_legacy_numpy_gen(seed: int, func: str): +def test_legacy_numpy_gen(*, seed: int, pass_seed: bool, func: str): np.random.seed(seed) state_before = np.random.get_state(legacy=False) arrs = {} states_after = {} for direct in [True, False]: - np.random.seed(seed) - arrs[direct] = _mk_random(func, direct=direct) + if not pass_seed: + np.random.seed(seed) + arrs[direct] = _mk_random(func, direct=direct, seed=seed if pass_seed else None) states_after[direct] = np.random.get_state(legacy=False) np.testing.assert_array_equal(arrs[True], arrs[False]) @@ -271,8 +273,10 @@ def test_legacy_numpy_gen(seed: int, func: str): np.testing.assert_equal(states_after[True], state_before) -def _mk_random(func: str, *, direct: bool) -> np.ndarray: - gen = np.random if direct else _legacy_numpy_gen() +def _mk_random(func: str, *, direct: bool, seed: int | None) -> np.ndarray: + if direct and seed is not None: + np.random.seed(seed) + gen = np.random if direct else _legacy_numpy_gen(seed) match func: case "choice": arr = np.arange(1000) From c471e94d4edf39002a1b352afa2597c9f0a8a8d4 Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Thu, 21 Nov 2024 13:42:05 +0100 Subject: [PATCH 22/28] types --- tests/test_utils.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index 2cee7a3e30..81369a6938 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -2,6 +2,7 @@ from operator import mul, truediv from types import ModuleType +from typing import TYPE_CHECKING import numpy as np import pytest @@ -26,6 +27,9 @@ ARRAY_TYPES_SPARSE_DASK_UNSUPPORTED, ) +if TYPE_CHECKING: + from typing import Any + def test_descend_classes_and_funcs(): # create module hierarchy @@ -256,8 +260,8 @@ def test_legacy_numpy_gen(*, seed: int, pass_seed: bool, func: str): np.random.seed(seed) state_before = np.random.get_state(legacy=False) - arrs = {} - states_after = {} + arrs: dict[bool, np.ndarray] = {} + states_after: dict[bool, dict[str, Any]] = {} for direct in [True, False]: if not pass_seed: np.random.seed(seed) From 3028dff59435f94a1fa90eae7a2f8bcb746ddbc9 Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Thu, 21 Nov 2024 13:49:48 +0100 Subject: [PATCH 23/28] move deprecated stuff --- src/scanpy/preprocessing/__init__.py | 2 +- .../preprocessing/_deprecated/sampling.py | 60 +++++++++++++++++++ src/scanpy/preprocessing/_simple.py | 48 +-------------- 3 files changed, 62 insertions(+), 48 deletions(-) create mode 100644 src/scanpy/preprocessing/_deprecated/sampling.py diff --git a/src/scanpy/preprocessing/__init__.py b/src/scanpy/preprocessing/__init__.py index c407850282..4307cbb6c9 100644 --- a/src/scanpy/preprocessing/__init__.py +++ b/src/scanpy/preprocessing/__init__.py @@ -3,6 +3,7 @@ from ..neighbors import neighbors from ._combat import combat from ._deprecated.highly_variable_genes import filter_genes_dispersion +from ._deprecated.sampling import subsample from ._highly_variable_genes import highly_variable_genes from ._normalization import normalize_total from ._pca import pca @@ -19,7 +20,6 @@ regress_out, sample, sqrt, - subsample, ) __all__ = [ diff --git a/src/scanpy/preprocessing/_deprecated/sampling.py b/src/scanpy/preprocessing/_deprecated/sampling.py new file mode 100644 index 0000000000..02619a2364 --- /dev/null +++ b/src/scanpy/preprocessing/_deprecated/sampling.py @@ -0,0 +1,60 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from ..._compat import _legacy_numpy_gen, old_positionals +from .._simple import sample + +if TYPE_CHECKING: + import numpy as np + from anndata import AnnData + from numpy.typing import NDArray + from scipy.sparse import csc_matrix, csr_matrix + + from ..._compat import _LegacyRandom + + CSMatrix = csr_matrix | csc_matrix + + +@old_positionals("n_obs", "random_state", "copy") +def subsample( + data: AnnData | np.ndarray | CSMatrix, + fraction: float | None = None, + *, + n_obs: int | None = None, + random_state: _LegacyRandom = 0, + copy: bool = False, +) -> AnnData | tuple[np.ndarray | CSMatrix, NDArray[np.int64]] | None: + """\ + Subsample to a fraction of the number of observations. + + .. deprecated:: 1.11.0 + + Use :func:`~scanpy.pp.sample` instead. + + Parameters + ---------- + data + The (annotated) data matrix of shape `n_obs` × `n_vars`. + Rows correspond to cells and columns to genes. + fraction + Subsample to this `fraction` of the number of observations. + n_obs + Subsample to this number of observations. + random_state + Random seed to change subsampling. + copy + If an :class:`~anndata.AnnData` is passed, + determines whether a copy is returned. + + Returns + ------- + Returns `X[obs_indices], obs_indices` if data is array-like, otherwise + subsamples the passed :class:`~anndata.AnnData` (`copy == False`) or + returns a subsampled copy of it (`copy == True`). + """ + + rng = _legacy_numpy_gen(random_state) + return sample( + data=data, fraction=fraction, n=n_obs, rng=rng, copy=copy, replace=False, axis=0 + ) diff --git a/src/scanpy/preprocessing/_simple.py b/src/scanpy/preprocessing/_simple.py index 2b26b162a8..b9d548c771 100644 --- a/src/scanpy/preprocessing/_simple.py +++ b/src/scanpy/preprocessing/_simple.py @@ -18,7 +18,7 @@ from sklearn.utils import check_array, sparsefuncs from .. import logging as logg -from .._compat import _legacy_numpy_gen, njit, old_positionals +from .._compat import njit, old_positionals from .._settings import settings as sett from .._utils import ( _check_array_function_arguments, @@ -39,9 +39,6 @@ except ImportError: da = None -# backwards compat -from ._deprecated.highly_variable_genes import filter_genes_dispersion # noqa: F401 - if TYPE_CHECKING: from collections.abc import Collection, Iterable, Sequence from numbers import Number @@ -950,49 +947,6 @@ def sample( data._inplace_subset_var(indices) -@old_positionals("n_obs", "random_state", "copy") -def subsample( - data: AnnData | np.ndarray | CSMatrix, - fraction: float | None = None, - *, - n_obs: int | None = None, - random_state: _LegacyRandom = 0, - copy: bool = False, -) -> AnnData | tuple[np.ndarray | CSMatrix, NDArray[np.int64]] | None: - """\ - Subsample to a fraction of the number of observations. - - .. deprecated:: 1.11.0 - - Use :func:`~scanpy.pp.sample` instead. - - Parameters - ---------- - data - The (annotated) data matrix of shape `n_obs` × `n_vars`. - Rows correspond to cells and columns to genes. - fraction - Subsample to this `fraction` of the number of observations. - n_obs - Subsample to this number of observations. - random_state - Random seed to change subsampling. - copy - If an :class:`~anndata.AnnData` is passed, - determines whether a copy is returned. - - Returns - ------- - Returns `X[obs_indices], obs_indices` if data is array-like, otherwise - subsamples the passed :class:`~anndata.AnnData` (`copy == False`) or - returns a subsampled copy of it (`copy == True`). - """ - rng = _legacy_numpy_gen(random_state) - return sample( - data=data, fraction=fraction, n=n_obs, rng=rng, copy=copy, replace=False, axis=0 - ) - - @renamed_arg("target_counts", "counts_per_cell") def downsample_counts( adata: AnnData, From f11b6ba93edbdd763acd824cb176cb64a9a63e39 Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Thu, 21 Nov 2024 14:04:06 +0100 Subject: [PATCH 24/28] Use deprecation decorator --- pyproject.toml | 1 + src/scanpy/_compat.py | 13 ++++++++++ src/scanpy/plotting/_preprocessing.py | 3 ++- .../_deprecated/highly_variable_genes.py | 24 +++++++++---------- src/scanpy/preprocessing/_simple.py | 21 ++++++++-------- 5 files changed, 39 insertions(+), 23 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index cfb7ffd28a..324c4c4262 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -66,6 +66,7 @@ dependencies = [ "packaging>=21.3", "session-info", "legacy-api-wrap>=1.4", # for positional API deprecations + "typing-extensions; python_version < '3.13'", ] dynamic = ["version"] diff --git a/src/scanpy/_compat.py b/src/scanpy/_compat.py index dca6c84c4e..b97b1a8603 100644 --- a/src/scanpy/_compat.py +++ b/src/scanpy/_compat.py @@ -48,6 +48,10 @@ class ZappyArray: "fullname", "pkg_metadata", "pkg_version", + "old_positionals", + "deprecated", + "njit", + "_numba_threading_layer", ] @@ -102,6 +106,15 @@ def old_positionals(*old_positionals: str): return lambda func: func +if sys.version_info >= (3, 13): + from warnings import deprecated as _deprecated +else: + from typing_extensions import deprecated as _deprecated + + +deprecated = partial(_deprecated, category=FutureWarning) + + @overload def njit(fn: Callable[P, R], /) -> Callable[P, R]: ... @overload diff --git a/src/scanpy/plotting/_preprocessing.py b/src/scanpy/plotting/_preprocessing.py index e6c7808be1..b51688082e 100644 --- a/src/scanpy/plotting/_preprocessing.py +++ b/src/scanpy/plotting/_preprocessing.py @@ -6,7 +6,7 @@ from matplotlib import pyplot as plt from matplotlib import rcParams -from .._compat import old_positionals +from .._compat import deprecated, old_positionals from .._settings import settings from . import _utils @@ -103,6 +103,7 @@ def highly_variable_genes( # backwards compat +@deprecated("Use sc.pl.highly_variable_genes instead") @old_positionals("log", "show", "save") def filter_genes_dispersion( result: np.recarray, diff --git a/src/scanpy/preprocessing/_deprecated/highly_variable_genes.py b/src/scanpy/preprocessing/_deprecated/highly_variable_genes.py index f2c3ce971b..27e8f1f846 100644 --- a/src/scanpy/preprocessing/_deprecated/highly_variable_genes.py +++ b/src/scanpy/preprocessing/_deprecated/highly_variable_genes.py @@ -9,7 +9,7 @@ from scipy.sparse import issparse from ... import logging as logg -from ..._compat import old_positionals +from ..._compat import deprecated, old_positionals from .._distributed import materialize_as_ndarray from .._utils import _get_mean_var @@ -19,6 +19,7 @@ from scipy.sparse import spmatrix +@deprecated("Use sc.pp.highly_variable_genes instead") @old_positionals( "flavor", "min_disp", @@ -48,18 +49,17 @@ def filter_genes_dispersion( """\ Extract highly variable genes :cite:p:`Satija2015,Zheng2017`. - .. warning:: - .. deprecated:: 1.3.6 - Use :func:`~scanpy.pp.highly_variable_genes` - instead. The new function is equivalent to the present - function, except that + .. deprecated:: 1.3.6 - * the new function always expects logarithmized data - * `subset=False` in the new function, it suffices to - merely annotate the genes, tools like `pp.pca` will - detect the annotation - * you can now call: `sc.pl.highly_variable_genes(adata)` - * `copy` is replaced by `inplace` + Use :func:`~scanpy.pp.highly_variable_genes` instead. + The new function is equivalent to the present function, except that + + * the new function always expects logarithmized data + * `subset=False` in the new function, it suffices to + merely annotate the genes, tools like `pp.pca` will + detect the annotation + * you can now call: `sc.pl.highly_variable_genes(adata)` + * `copy` is replaced by `inplace` If trying out parameters, pass the data matrix instead of AnnData. diff --git a/src/scanpy/preprocessing/_simple.py b/src/scanpy/preprocessing/_simple.py index 01936414a5..eaf9648690 100644 --- a/src/scanpy/preprocessing/_simple.py +++ b/src/scanpy/preprocessing/_simple.py @@ -18,7 +18,7 @@ from sklearn.utils import check_array, sparsefuncs from .. import logging as logg -from .._compat import njit, old_positionals +from .._compat import deprecated, njit, old_positionals from .._settings import settings as sett from .._utils import ( _check_array_function_arguments, @@ -474,6 +474,7 @@ def sqrt( return X.sqrt() +@deprecated("Use sc.pp.normalize_total instead") @old_positionals( "counts_per_cell_after", "counts_per_cell", @@ -497,16 +498,16 @@ def normalize_per_cell( """\ Normalize total counts per cell. - .. warning:: - .. deprecated:: 1.3.7 - Use :func:`~scanpy.pp.normalize_total` instead. - The new function is equivalent to the present - function, except that + .. deprecated:: 1.3.7 - * the new function doesn't filter cells based on `min_counts`, - use :func:`~scanpy.pp.filter_cells` if filtering is needed. - * some arguments were renamed - * `copy` is replaced by `inplace` + Use :func:`~scanpy.pp.normalize_total` instead. + The new function is equivalent to the present + function, except that + + * the new function doesn't filter cells based on `min_counts`, + use :func:`~scanpy.pp.filter_cells` if filtering is needed. + * some arguments were renamed + * `copy` is replaced by `inplace` Normalize each cell by total counts over all genes, so that every cell has the same total count after normalization. From 735f00aaa8622fcc3c3c3fff872088d9f060b13c Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Thu, 21 Nov 2024 14:15:09 +0100 Subject: [PATCH 25/28] relnote --- docs/release-notes/3380.bugfix.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 docs/release-notes/3380.bugfix.md diff --git a/docs/release-notes/3380.bugfix.md b/docs/release-notes/3380.bugfix.md new file mode 100644 index 0000000000..633ce346af --- /dev/null +++ b/docs/release-notes/3380.bugfix.md @@ -0,0 +1 @@ +Raise {exc}`FutureWarning` when calling deprecated {mod}`scanpy.pp` functions {smaller}`P Angerer` From 0a5b284bbe75a21d76ad41a793de041eac08edf1 Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Thu, 21 Nov 2024 15:24:57 +0100 Subject: [PATCH 26/28] fix dask warning stuff --- tests/test_preprocessing.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py index 4bf2d128aa..835e8ec394 100644 --- a/tests/test_preprocessing.py +++ b/tests/test_preprocessing.py @@ -1,6 +1,7 @@ from __future__ import annotations import warnings +from importlib.util import find_spec from itertools import product from pathlib import Path from typing import TYPE_CHECKING @@ -22,7 +23,7 @@ maybe_dask_process_context, ) from testing.scanpy._helpers.data import pbmc3k, pbmc68k_reduced -from testing.scanpy._pytest.params import ARRAY_TYPES, param_with +from testing.scanpy._pytest.params import ARRAY_TYPES if TYPE_CHECKING: from collections.abc import Callable @@ -143,16 +144,7 @@ def test_normalize_per_cell(): assert adata.X.sum(axis=1).tolist() == adata_sparse.X.sum(axis=1).A1.tolist() -ignore_dask_perf = pytest.mark.filterwarnings("ignore::dask.array.PerformanceWarning") - - -@pytest.mark.parametrize( - "array_type", - [ - param_with(p, marks=[ignore_dask_perf] if "dask" in (p.id or "") else []) - for p in ARRAY_TYPES - ], -) +@pytest.mark.parametrize("array_type", ARRAY_TYPES) @pytest.mark.parametrize("which", ["copy", "inplace", "array"]) @pytest.mark.parametrize( ("axis", "fraction", "n", "replace", "expected"), @@ -177,6 +169,11 @@ def test_sample( ): adata = AnnData(array_type(np.ones((200, 10)))) + # ignoring this warning declaratively is a pain so do it here + if find_spec("dask"): + import dask.array as da + + warnings.filterwarnings(category=da.PerformanceWarning) # can’t guarantee that duplicates are drawn when `replace=True`, # so we just ignore the warning instead using `with pytest.warns(...)` warnings.filterwarnings( From 0ca941111997a7592267584ba92699fb96c677ab Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Thu, 21 Nov 2024 15:56:19 +0100 Subject: [PATCH 27/28] oops --- tests/test_preprocessing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py index 835e8ec394..15318f2c52 100644 --- a/tests/test_preprocessing.py +++ b/tests/test_preprocessing.py @@ -173,7 +173,7 @@ def test_sample( if find_spec("dask"): import dask.array as da - warnings.filterwarnings(category=da.PerformanceWarning) + warnings.filterwarnings("ignore", category=da.PerformanceWarning) # can’t guarantee that duplicates are drawn when `replace=True`, # so we just ignore the warning instead using `with pytest.warns(...)` warnings.filterwarnings( From 396b21a217711249e1803a01357a14da54bc6106 Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Fri, 22 Nov 2024 16:04:13 +0100 Subject: [PATCH 28/28] Bump numpy to version that has get_bit_generator --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 324c4c4262..b882604268 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,7 +47,7 @@ classifiers = [ ] dependencies = [ "anndata>=0.8", - "numpy>=1.23", + "numpy>=1.24", "matplotlib>=3.6", "pandas >=1.5", "scipy>=1.8",