
Commit

Merge pull request #53 from cytomining/v0.4.0
Bump version
johnarevalo authored Feb 6, 2024
2 parents 91a282b + bd17afb commit 89b2c62
Showing 20 changed files with 836 additions and 763 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "copairs"
-version = "0.3.3"
+version = "0.4.0"
 description = "Find pairs and compute metrics between them"
 readme = "README.md"
 requires-python = ">=3.8"
4 changes: 2 additions & 2 deletions src/copairs/__init__.py
@@ -1,4 +1,4 @@
-'''
+"""
 Package to create pairwise lists based on sameby and diffby criteria
-'''
+"""
 from .matching import Matcher, MatcherMultilabel
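The re-exported Matcher is the package's entry point for the sameby/diffby pairing this docstring describes. A minimal usage sketch follows, with the caveat that the constructor arguments and the get_all_pairs method are recalled from the project README rather than shown in this diff, so treat them as assumptions:

import pandas as pd
from copairs import Matcher

df = pd.DataFrame({
    "compound": ["a", "a", "b", "b"],
    "plate": ["p1", "p2", "p1", "p2"],
})
# Assumed API: pair rows that share a compound but come from different plates.
matcher = Matcher(df, columns=["compound", "plate"], seed=0)
pairs = matcher.get_all_pairs(sameby=["compound"], diffby=["plate"])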
61 changes: 34 additions & 27 deletions src/copairs/compute.py
@@ -7,8 +7,9 @@
 import numpy as np
 from tqdm.autonotebook import tqdm

+
 def parallel_map(par_func, items):
-    '''Execute par_func(i) for every i in items using ThreadPool and tqdm.'''
+    """Execute par_func(i) for every i in items using ThreadPool and tqdm."""
     num_items = len(items)
     pool_size = min(num_items, os.cpu_count())
     chunksize = num_items // pool_size
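Everything below leans on this helper in the same way: preallocate an output array, then let each threaded call fill a disjoint slot or slice of it. A minimal sketch of the pattern, with an illustrative squaring task that is not part of the library:

import numpy as np

items = np.arange(10)
out = np.empty(len(items), dtype=np.float32)

def par_func(i):
    # Each call writes its own slot, so threads never collide.
    out[i] = items[i] ** 2

parallel_map(par_func, items)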
@@ -18,29 +19,34 @@ def parallel_map(par_func, items):
             pass


-def batch_processing(pairwise_op: Callable[[np.ndarray, np.ndarray],np.ndarray],):
-    '''Decorator adding the batch_size param to run the function with multithreading using a list of paired indices'''
+def batch_processing(
+    pairwise_op: Callable[[np.ndarray, np.ndarray], np.ndarray],
+):
+    """Decorator adding the batch_size param to run the function with
+    multithreading using a list of paired indices"""

     def batched_fn(feats: np.ndarray, pair_ix: np.ndarray, batch_size: int):
         num_pairs = len(pair_ix)
         result = np.empty(num_pairs, dtype=np.float32)

         def par_func(i):
-            x_sample = feats[pair_ix[i:i + batch_size, 0]]
-            y_sample = feats[pair_ix[i:i + batch_size, 1]]
-            result[i:i + len(x_sample)] = pairwise_op(x_sample, y_sample)
+            x_sample = feats[pair_ix[i : i + batch_size, 0]]
+            y_sample = feats[pair_ix[i : i + batch_size, 1]]
+            result[i : i + len(x_sample)] = pairwise_op(x_sample, y_sample)

         parallel_map(par_func, np.arange(0, num_pairs, batch_size))

         return result

     return batched_fn


 @batch_processing
 def pairwise_corr(x_sample: np.ndarray, y_sample: np.ndarray) -> np.ndarray:
-    '''
+    """
     Compute pearson correlation between two matrices in a paired row-wise
     fashion. `x_sample` and `y_sample` must be of the same shape.
-    '''
+    """
     x_mean = x_sample.mean(axis=1, keepdims=True)
     y_mean = y_sample.mean(axis=1, keepdims=True)
@@ -60,8 +66,8 @@ def pairwise_corr(x_sample: np.ndarray, y_sample: np.ndarray) -> np.ndarray:
 def pairwise_cosine(x_sample: np.ndarray, y_sample: np.ndarray) -> np.ndarray:
     x_norm = x_sample / np.linalg.norm(x_sample, axis=1)[:, np.newaxis]
     y_norm = y_sample / np.linalg.norm(y_sample, axis=1)[:, np.newaxis]
-    c_dist = np.sum(x_norm * y_norm, axis=1)
-    return c_dist
+    c_sim = np.sum(x_norm * y_norm, axis=1)
+    return c_sim


 def random_binary_matrix(n, m, k, rng):
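The rename from c_dist to c_sim is the substantive fix here: the dot product of L2-normalized rows is a cosine similarity, not a distance. A quick check, assuming pairwise_cosine carries the same @batch_processing decorator as pairwise_corr (the decorator line falls outside this hunk):

import numpy as np

feats = np.array([[1.0, 0.0], [2.0, 0.0], [0.0, 3.0]], dtype=np.float32)
pair_ix = np.array([[0, 1], [0, 2]])
sims = pairwise_cosine(feats, pair_ix, batch_size=2)
# Expected: ~1.0 for the parallel rows (0, 1), ~0.0 for the orthogonal rows (0, 2).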
@@ -80,8 +86,8 @@ def random_binary_matrix(n, m, k, rng):
     return matrix


-def compute_ap(rel_k) -> np.ndarray:
-    '''Compute average precision based on binary list sorted by relevance'''
+def average_precision(rel_k) -> np.ndarray:
+    """Compute average precision based on binary list sorted by relevance"""
     tp = np.cumsum(rel_k, axis=1)
     num_pos = tp[:, -1]
     k = np.arange(1, rel_k.shape[1] + 1)
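For intuition: average precision over a binary relevance list is the mean of precision-at-k evaluated at each hit. The function's final line is elided above, so this worked example follows the textbook formula rather than quoting the code:

import numpy as np

rel_k = np.array([[1, 0, 1, 0]])          # hits at ranks 1 and 3
tp = np.cumsum(rel_k, axis=1)             # [[1, 1, 2, 2]]
k = np.arange(1, rel_k.shape[1] + 1)      # [1, 2, 3, 4]
precision_at_k = tp / k                   # [[1.0, 0.5, 0.667, 0.5]]
ap = (precision_at_k * rel_k).sum(axis=1) / tp[:, -1]  # (1.0 + 0.667) / 2 = 0.833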
@@ -90,8 +96,8 @@ def compute_ap(rel_k) -> np.ndarray:
     return ap


-def compute_ap_contiguous(rel_k_list, counts):
-    '''Compute average precision from a list of contiguous values'''
+def ap_contiguous(rel_k_list, counts):
+    """Compute average precision from a list of contiguous values"""
     cutoffs = to_cutoffs(counts)

     num_pos = np.add.reduceat(rel_k_list, cutoffs)
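The contiguous variant scores many ragged relevance lists without padding them into one matrix: np.add.reduceat sums each list's slice in a single call. The primitive in isolation:

import numpy as np

rel_k_list = np.array([1, 0, 1, 1, 1, 0, 0, 1])  # two lists stored back to back
counts = np.array([5, 3])                        # lengths of the two lists
cutoffs = np.array([0, 5])                       # start offsets (see to_cutoffs below)
num_pos = np.add.reduceat(rel_k_list, cutoffs)   # array([4, 1]) positives per list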
@@ -108,17 +114,16 @@ def compute_ap_contiguous(rel_k_list, counts):


 def random_ap(num_perm: int, num_pos: int, total: int, seed) -> np.ndarray:
-    '''Compute multiple average_precision scores generated at random'''
+    """Compute multiple average_precision scores generated at random"""
     rng = np.random.default_rng(seed)
     rel_k = random_binary_matrix(num_perm, total, num_pos, rng)
-    null_dist = compute_ap(rel_k)
-    null_dist.sort()
+    null_dist = average_precision(rel_k)
     return null_dist


 def null_dist_cached(num_pos, total, seed, null_size, cache_dir):
     if seed is not None:
-        cache_file = cache_dir / f'n{total}_k{num_pos}.npy'
+        cache_file = cache_dir / f"n{total}_k{num_pos}.npy"
         if cache_file.is_file():
             null_dist = np.load(cache_file)
         else:
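random_ap draws num_perm random rankings with num_pos positives among total ranked items and scores each one; null_dist_cached memoizes that per (total, num_pos) configuration under the n{total}_k{num_pos}.npy naming above. For example:

# A null distribution of 1000 APs for 5 positives among 50 ranked items.
null_dist = random_ap(num_perm=1000, num_pos=5, total=50, seed=42)
# After this change the result is no longer pre-sorted here;
# p_values sorts the null distributions just before its lookup.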
@@ -130,34 +135,36 @@ def null_dist_cached(num_pos, total, seed, null_size, cache_dir):


 def get_null_dists(confs, null_size, seed):
-    cache_dir = Path.home() / f'.copairs/seed{seed}/ns{null_size}'
+    cache_dir = Path.home() / f".copairs/seed{seed}/ns{null_size}"
     cache_dir.mkdir(parents=True, exist_ok=True)
     num_confs = len(confs)
     rng = np.random.default_rng(seed)
     seeds = rng.integers(8096, size=num_confs)

     null_dists = np.empty([len(confs), null_size], dtype=np.float32)

     def par_func(i):
         num_pos, total = confs[i]
-        null_dists[i] = null_dist_cached(num_pos, total, seeds[i],
-                                         null_size, cache_dir)
+        null_dists[i] = null_dist_cached(num_pos, total, seeds[i], null_size, cache_dir)

     parallel_map(par_func, np.arange(num_confs))
     return null_dists


-def compute_p_values(ap_scores, null_confs, null_size: int, seed):
+def p_values(ap_scores, null_confs, null_size: int, seed):
     confs, rev_ix = np.unique(null_confs, axis=0, return_inverse=True)
     null_dists = get_null_dists(confs, null_size, seed)
-    p_values = np.empty(len(ap_scores), dtype=np.float32)
+    null_dists.sort(axis=1)
+    pvals = np.empty(len(ap_scores), dtype=np.float32)
     for i, (ap_score, ix) in enumerate(zip(ap_scores, rev_ix)):
         # Reverse to get from hi to low
         num = null_size - np.searchsorted(null_dists[ix], ap_score)
-        p_values[i] = (num + 1) / (null_size + 1)
-    return p_values
+        pvals[i] = (num + 1) / (null_size + 1)
+    return pvals


 def concat_ranges(start: np.ndarray, end: np.ndarray) -> np.ndarray:
-    '''Create a 1-d array concatenating multiple ranges'''
+    """Create a 1-d array concatenating multiple ranges"""
     slices = map(range, start, end)
     slices = itertools.chain.from_iterable(slices)
     count = (end - start).sum()
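Two things happen in p_values above: null_dists.sort(axis=1) restores the ordering that np.searchsorted requires (sorting moved here from random_ap), and the add-one estimate (num + 1) / (null_size + 1) keeps p-values strictly positive. The lookup in isolation:

import numpy as np

null_dist = np.array([0.10, 0.20, 0.30, 0.40], dtype=np.float32)  # already sorted
ap_score = 0.35
num = len(null_dist) - np.searchsorted(null_dist, ap_score)  # 1 null AP >= 0.35
p_value = (num + 1) / (len(null_dist) + 1)                   # (1 + 1) / 5 = 0.4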
@@ -166,7 +173,7 @@ def concat_ranges(start: np.ndarray, end: np.ndarray) -> np.ndarray:


 def to_cutoffs(counts: np.ndarray):
-    '''Convert a list of counts into cutoff indices.'''
+    """Convert a list of counts into cutoff indices."""
     cutoffs = np.empty_like(counts)
     cutoffs[0], cutoffs[1:] = 0, counts.cumsum()[:-1]
     return cutoffs
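to_cutoffs is the bookkeeping that the contiguous layout relies on: it turns per-list lengths into the start offsets that np.add.reduceat consumes. A worked example:

import numpy as np

counts = np.array([5, 3, 4])
# cumsum -> [5, 8, 12]; drop the last, prepend 0.
cutoffs = to_cutoffs(counts)  # array([0, 5, 8])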