Add denoise tests
asistradition committed Feb 29, 2024
1 parent 2e6dfb7 commit 0b36e10
Showing 7 changed files with 299 additions and 53 deletions.
100 changes: 92 additions & 8 deletions inferelator_velocity/denoise.py
@@ -1,5 +1,7 @@
import numpy as np
import scipy.sparse as sps

from inferelator_velocity.utils.math import array_sum
from inferelator_velocity.utils.noise2self import (
_dist_to_row_stochastic,
dot
@@ -15,7 +17,10 @@ def denoise(
layer='X',
graph_key=NOISE2SELF_DIST_KEY,
output_layer=NOISE2SELF_DENOISED_KEY,
dense=True
dense=True,
chunk_size=10000,
zero_threshold=None,
obs_count_key=None
):

lref = data.X if layer == 'X' else data.layers[layer]
@@ -26,13 +31,92 @@
f"run global_graph() first"
)

data.layers[output_layer] = np.zeros(lref.shape, dtype=np.float32)
if data.obsp[graph_key].dtype != lref.dtype:
raise RuntimeError(
f"Graph dtype {data.obsp[graph_key].dtype} is not the "
f"same as data dtype {lref.dtype}; "
"these must match and be float32 or float64"
)

dot(
_dist_to_row_stochastic(data.obsp[graph_key]),
lref,
dense=dense,
out=data.layers[output_layer]
)
_n_obs = lref.shape[0]

if chunk_size is not None:
_n_chunks = int(np.ceil(_n_obs / chunk_size))
else:
_n_chunks = 1

if _n_chunks == 1:
data.layers[output_layer] = _denoise_chunk(
lref,
_dist_to_row_stochastic(data.obsp[graph_key]),
dense=dense,
zero_threshold=zero_threshold
)

elif dense or not sps.issparse(lref):
data.layers[output_layer] = np.zeros(lref.shape, dtype=np.float32)

for i in range(_n_chunks):
_start, _stop = i * chunk_size, min((i + 1) * chunk_size, _n_obs)

_denoise_chunk(
lref,
_dist_to_row_stochastic(
data.obsp[graph_key][_start:_stop, :]
),
dense=True,
out=data.layers[output_layer][_start:_stop, :],
zero_threshold=zero_threshold
)

else:
data.layers[output_layer] = sps.vstack(
tuple(
_denoise_chunk(
lref,
_dist_to_row_stochastic(
data.obsp[graph_key][
i * chunk_size:min((i + 1) * chunk_size, _n_obs),
:
]
),
zero_threshold=zero_threshold
)
for i in range(_n_chunks)
)
)

if obs_count_key is not None:
data.obs[obs_count_key] = array_sum(
data.layers[output_layer],
axis=1
)

return data


def _denoise_chunk(
x,
graph,
zero_threshold=None,
out=None,
dense=False
):

if not sps.issparse(x):
dense = True

out = dot(
graph,
x,
out=out,
dense=dense
)

if zero_threshold is not None and dense:
out[out < zero_threshold] = 0
elif zero_threshold:
out.data[out.data < zero_threshold] = 0
out.eliminate_zeros()

return out
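
For context, a minimal usage sketch of the new chunked interface (not part of the commit): the AnnData object, the 'graph', 'denoised', and 'denoised_counts' key names, and the randomly generated kNN graph below are illustrative stand-ins; a real workflow would build the graph with the package's own graph step before calling denoise.

import anndata as ad
import numpy as np
import scipy.sparse as sps

from inferelator_velocity.denoise import denoise

N_OBS, N_GENES, K = 500, 20, 10
rng = np.random.default_rng(0)

# illustrative float32 expression matrix
X = rng.poisson(2.0, (N_OBS, N_GENES)).astype(np.float32)

# placeholder kNN distance graph with exactly K neighbours per row,
# built the same way as the EXPR_KNN test stub
graph = sps.csr_matrix(
    (
        rng.uniform(0.1, 2.0, size=N_OBS * K),
        (
            np.repeat(np.arange(N_OBS), K),
            np.concatenate(
                tuple(
                    rng.choice(N_OBS, size=(K, ), replace=False)
                    for _ in range(N_OBS)
                )
            )
        )
    ),
    shape=(N_OBS, N_OBS),
    dtype=np.float32
)

adata = ad.AnnData(X)
adata.obsp['graph'] = graph

denoise(
    adata,
    graph_key='graph',               # where the kNN graph lives in .obsp
    output_layer='denoised',         # result written to .layers['denoised']
    chunk_size=200,                  # process 200 observations at a time
    zero_threshold=0.5,              # zero out denoised values below 0.5
    obs_count_key='denoised_counts'  # per-observation sums stored in .obs
)

Chunking the row-stochastic multiply bounds peak memory: only chunk_size rows of the output are materialised per pass, and the sparse-to-sparse path stacks the per-chunk results with sps.vstack instead of allocating a dense matrix.
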
63 changes: 63 additions & 0 deletions inferelator_velocity/tests/_stubs.py
@@ -0,0 +1,63 @@
import numpy as np
import anndata as ad
import scipy.sparse as sps

KNN_N = 10

DIST = np.tile(np.arange(KNN_N), (KNN_N, 1)).astype(float)
CONN = 1 / (DIST + 1)

KNN = np.diag(np.ones(KNN_N - 1), -1) + np.diag(np.ones(KNN_N - 1), 1)
KNN[0, KNN_N - 1] = 1
KNN[KNN_N - 1, 0] = 1

rng = np.random.default_rng(100)
COUNTS = rng.negative_binomial(5, 0.5, (1000, 10))

N = 1000
BINS = 10

EXPRESSION = np.zeros((N, 6), dtype=int)
EXPRESSION[:, 0] = (100 * np.random.default_rng(222222).random(N)).astype(int)
EXPRESSION[:, 1] = EXPRESSION[:, 0] * 1.75 - 0.5
EXPRESSION[:, 2] = EXPRESSION[:, 0] ** 2
EXPRESSION[:, 3] = 0
EXPRESSION[:, 4] = np.arange(N)
EXPRESSION[:, 5] = np.arange(N) * 2 + 10

K = 15
EXPR_KNN = sps.csr_matrix(
(
rng.uniform(low=0.1, high=2., size=K * N),
(
np.repeat(np.arange(N), K),
np.concatenate(
tuple(
rng.choice(np.arange(1000), size=(K, ), replace=False)
for _ in range(N)
)
)
)
),
dtype=np.float32
)

EXPRESSION_ADATA = ad.AnnData(EXPRESSION.astype(int))

ADATA_UNS_PROGRAM_KEYS = [
'metric',
'leiden_correlation',
'metric_genes',
'information_distance',
'cluster_program_map',
'n_comps',
'n_programs',
'program_names',
'molecular_cv_loss'
]

PROGRAMS = ['0', '0', '0', '-1', '1', '1']
PROGRAMS_ASSIGNED = ['0', '0', '0', '0', '1', '1']

TIMES_0 = EXPRESSION[:, 0] / 100
TIMES_1 = np.arange(N).astype(float)
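
As a quick hedged sanity check (not part of the commit, and assuming the tests directory is importable as a package), the sparse stub above gives every one of the N observations exactly K stored neighbour weights, all drawn from U(0.1, 2):

import numpy as np

from inferelator_velocity.tests._stubs import EXPR_KNN, K, N

assert EXPR_KNN.shape == (N, N)

# CSR indptr differences give the number of stored entries per row
nnz_per_row = np.diff(EXPR_KNN.indptr)
assert (nnz_per_row == K).all()

# weights are strictly positive, so no row of the graph sums to zero
assert EXPR_KNN.data.min() > 0
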
119 changes: 119 additions & 0 deletions inferelator_velocity/tests/test_denoise.py
@@ -0,0 +1,119 @@
import unittest
import numpy as np
import numpy.testing as npt
import scipy.sparse as sps

from ._stubs import (
EXPR_KNN,
EXPRESSION_ADATA
)

from inferelator_velocity.denoise import (
_dist_to_row_stochastic,
denoise
)

DENOISE_EXPR = np.dot(
_dist_to_row_stochastic(EXPR_KNN).A,
EXPRESSION_ADATA.X
)


class TestDenoise(unittest.TestCase):

threshold = None

@classmethod
def setUpClass(cls) -> None:
cls.expect = DENOISE_EXPR.copy()

if cls.threshold is not None:
cls.expect[cls.expect < cls.threshold] = 0

return super().setUpClass()

def setUp(self) -> None:
self.data = EXPRESSION_ADATA.copy()
self.data.X = self.data.X.astype(np.float32)
self.data.obsp['graph'] = EXPR_KNN

return super().setUp()

def test_errors(self):

with self.assertRaises(RuntimeError):
denoise(
self.data,
chunk_size=500,
output_layer='abc',
zero_threshold=self.threshold
)

self.data.X = self.data.X.astype(int)
with self.assertRaises(RuntimeError):
denoise(
self.data,
chunk_size=500,
graph_key='graph',
output_layer='abc',
zero_threshold=self.threshold
)

def test_denoise_dense_to_dense(self):

denoise(
self.data,
chunk_size=500,
graph_key='graph',
output_layer='abc',
zero_threshold=self.threshold
)

npt.assert_almost_equal(
self.expect,
self.data.layers['abc'],
decimal=2
)

def test_denoise_sparse_to_dense(self):

self.data.X = sps.csr_array(self.data.X)

denoise(
self.data,
chunk_size=500,
graph_key='graph',
output_layer='abc',
dense=True,
zero_threshold=self.threshold
)

npt.assert_almost_equal(
self.expect,
self.data.layers['abc'],
decimal=2
)

def test_denoise_sparse_to_sparse(self):

self.data.X = sps.csr_array(self.data.X)

denoise(
self.data,
chunk_size=500,
graph_key='graph',
output_layer='abc',
dense=False,
zero_threshold=self.threshold
)

npt.assert_almost_equal(
self.expect,
self.data.layers['abc'].A,
decimal=2
)


class TestDenoiseThreshold(TestDenoise):

threshold = 100
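
In other words (a sketch mirroring the module-level expectation above rather than adding anything new): the tests expect the denoised layer to equal the row-stochastic graph applied to the expression matrix, and TestDenoiseThreshold expects the same product with entries below 100 zeroed out.

import numpy as np

from inferelator_velocity.denoise import _dist_to_row_stochastic
from inferelator_velocity.tests._stubs import EXPR_KNN, EXPRESSION_ADATA

# weighted average of each observation's neighbours, with weights taken
# from the row-normalised kNN graph
graph_row_stochastic = _dist_to_row_stochastic(EXPR_KNN).A
expected = graph_row_stochastic @ EXPRESSION_ADATA.X

# TestDenoiseThreshold compares against the same product with small
# values clipped to zero
expected_thresholded = expected.copy()
expected_thresholded[expected_thresholded < 100] = 0
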
21 changes: 11 additions & 10 deletions inferelator_velocity/tests/test_graph.py
@@ -4,16 +4,17 @@
import numpy as np
import numpy.testing as npt

from inferelator_velocity.utils.graph import (get_shortest_paths, get_total_path, local_optimal_knn)

N = 10

DIST = np.tile(np.arange(N), (N, 1)).astype(float)
CONN = 1 / (DIST + 1)

KNN = np.diag(np.ones(N - 1), -1) + np.diag(np.ones(N - 1), 1)
KNN[0, N - 1] = 1
KNN[N - 1, 0] = 1
from inferelator_velocity.utils.graph import (
get_shortest_paths,
get_total_path,
local_optimal_knn
)
from ._stubs import (
KNN_N as N,
DIST,
CONN,
KNN
)

COL = {
'a': ('b', 0, 0.5),
6 changes: 3 additions & 3 deletions inferelator_velocity/tests/test_mcv.py
@@ -4,9 +4,9 @@

from inferelator_velocity.utils.mcv import mcv_pcs


rng = np.random.default_rng(100)
COUNTS = rng.negative_binomial(5, 0.5, (1000, 10))
from ._stubs import (
COUNTS
)


class TestMCV(unittest.TestCase):
Empty file.
