
Commit

Merge pull request #53 from cytomining/v0.4.0
Bump version
johnarevalo authored Feb 6, 2024
2 parents 91a282b + bd17afb commit 89b2c62
Showing 20 changed files with 836 additions and 763 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "copairs"
-version = "0.3.3"
+version = "0.4.0"
 description = "Find pairs and compute metrics between them"
 readme = "README.md"
 requires-python = ">=3.8"
4 changes: 2 additions & 2 deletions src/copairs/__init__.py
@@ -1,4 +1,4 @@
-'''
+"""
 Package to create pairwise lists based on sameby and diffby criteria
-'''
+"""
 from .matching import Matcher, MatcherMultilabel
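The re-exported Matcher is the package's entry point for the sameby/diffby pairing this docstring describes. A minimal usage sketch follows, with the caveat that the constructor arguments and the get_all_pairs method are recalled from the project README rather than shown in this diff, so treat them as assumptions:

import pandas as pd
from copairs import Matcher

df = pd.DataFrame({
    "compound": ["a", "a", "b", "b"],
    "plate": ["p1", "p2", "p1", "p2"],
})
# Assumed API: pair rows that share a compound but come from different plates.
matcher = Matcher(df, columns=["compound", "plate"], seed=0)
pairs = matcher.get_all_pairs(sameby=["compound"], diffby=["plate"])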
61 changes: 34 additions & 27 deletions src/copairs/compute.py
@@ -7,8 +7,9 @@
 import numpy as np
 from tqdm.autonotebook import tqdm

+
 def parallel_map(par_func, items):
-    '''Execute par_func(i) for every i in items using ThreadPool and tqdm.'''
+    """Execute par_func(i) for every i in items using ThreadPool and tqdm."""
     num_items = len(items)
     pool_size = min(num_items, os.cpu_count())
     chunksize = num_items // pool_size
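Everything below leans on this helper in the same way: preallocate an output array, then let each threaded call fill a disjoint slot or slice of it. A minimal sketch of the pattern, with an illustrative squaring task that is not part of the library:

import numpy as np

items = np.arange(10)
out = np.empty(len(items), dtype=np.float32)

def par_func(i):
    # Each call writes its own slot, so threads never collide.
    out[i] = items[i] ** 2

parallel_map(par_func, items)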
@@ -18,29 +19,34 @@ def parallel_map(par_func, items):
             pass


-def batch_processing(pairwise_op: Callable[[np.ndarray, np.ndarray],np.ndarray],):
-    '''Decorator adding the batch_size param to run the function with multithreading using a list of paired indices'''
+def batch_processing(
+    pairwise_op: Callable[[np.ndarray, np.ndarray], np.ndarray],
+):
+    """Decorator adding the batch_size param to run the function with
+    multithreading using a list of paired indices"""

     def batched_fn(feats: np.ndarray, pair_ix: np.ndarray, batch_size: int):
         num_pairs = len(pair_ix)
         result = np.empty(num_pairs, dtype=np.float32)

         def par_func(i):
-            x_sample = feats[pair_ix[i:i + batch_size, 0]]
-            y_sample = feats[pair_ix[i:i + batch_size, 1]]
-            result[i:i + len(x_sample)] = pairwise_op(x_sample, y_sample)
+            x_sample = feats[pair_ix[i : i + batch_size, 0]]
+            y_sample = feats[pair_ix[i : i + batch_size, 1]]
+            result[i : i + len(x_sample)] = pairwise_op(x_sample, y_sample)

         parallel_map(par_func, np.arange(0, num_pairs, batch_size))

         return result

     return batched_fn


 @batch_processing
 def pairwise_corr(x_sample: np.ndarray, y_sample: np.ndarray) -> np.ndarray:
-    '''
+    """
     Compute pearson correlation between two matrices in a paired row-wise
     fashion. `x_sample` and `y_sample` must be of the same shape.
-    '''
+    """
     x_mean = x_sample.mean(axis=1, keepdims=True)
     y_mean = y_sample.mean(axis=1, keepdims=True)
@@ -60,8 +66,8 @@ def pairwise_corr(x_sample: np.ndarray, y_sample: np.ndarray) -> np.ndarray:
 def pairwise_cosine(x_sample: np.ndarray, y_sample: np.ndarray) -> np.ndarray:
     x_norm = x_sample / np.linalg.norm(x_sample, axis=1)[:, np.newaxis]
     y_norm = y_sample / np.linalg.norm(y_sample, axis=1)[:, np.newaxis]
-    c_dist = np.sum(x_norm * y_norm, axis=1)
-    return c_dist
+    c_sim = np.sum(x_norm * y_norm, axis=1)
+    return c_sim


 def random_binary_matrix(n, m, k, rng):
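The rename from c_dist to c_sim is the substantive fix here: the dot product of L2-normalized rows is a cosine similarity, not a distance. A quick check, assuming pairwise_cosine carries the same @batch_processing decorator as pairwise_corr (the decorator line falls outside this hunk):

import numpy as np

feats = np.array([[1.0, 0.0], [2.0, 0.0], [0.0, 3.0]], dtype=np.float32)
pair_ix = np.array([[0, 1], [0, 2]])
sims = pairwise_cosine(feats, pair_ix, batch_size=2)
# Expected: ~1.0 for the parallel rows (0, 1), ~0.0 for the orthogonal rows (0, 2).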
@@ -80,8 +86,8 @@ def random_binary_matrix(n, m, k, rng):
     return matrix


-def compute_ap(rel_k) -> np.ndarray:
-    '''Compute average precision based on binary list sorted by relevance'''
+def average_precision(rel_k) -> np.ndarray:
+    """Compute average precision based on binary list sorted by relevance"""
     tp = np.cumsum(rel_k, axis=1)
     num_pos = tp[:, -1]
     k = np.arange(1, rel_k.shape[1] + 1)
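For intuition: average precision over a binary relevance list is the mean of precision-at-k evaluated at each hit. The function's final line is elided above, so this worked example follows the textbook formula rather than quoting the code:

import numpy as np

rel_k = np.array([[1, 0, 1, 0]])          # hits at ranks 1 and 3
tp = np.cumsum(rel_k, axis=1)             # [[1, 1, 2, 2]]
k = np.arange(1, rel_k.shape[1] + 1)      # [1, 2, 3, 4]
precision_at_k = tp / k                   # [[1.0, 0.5, 0.667, 0.5]]
ap = (precision_at_k * rel_k).sum(axis=1) / tp[:, -1]  # (1.0 + 0.667) / 2 = 0.833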
@@ -90,8 +96,8 @@ def compute_ap(rel_k) -> np.ndarray:
     return ap


-def compute_ap_contiguous(rel_k_list, counts):
-    '''Compute average precision from a list of contiguous values'''
+def ap_contiguous(rel_k_list, counts):
+    """Compute average precision from a list of contiguous values"""
     cutoffs = to_cutoffs(counts)

     num_pos = np.add.reduceat(rel_k_list, cutoffs)
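The contiguous variant scores many ragged relevance lists without padding them into one matrix: np.add.reduceat sums each list's slice in a single call. The primitive in isolation:

import numpy as np

rel_k_list = np.array([1, 0, 1, 1, 1, 0, 0, 1])  # two lists stored back to back
counts = np.array([5, 3])                        # lengths of the two lists
cutoffs = np.array([0, 5])                       # start offsets (see to_cutoffs below)
num_pos = np.add.reduceat(rel_k_list, cutoffs)   # array([4, 1]) positives per list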
@@ -108,17 +114,16 @@ def compute_ap_contiguous(rel_k_list, counts):


 def random_ap(num_perm: int, num_pos: int, total: int, seed) -> np.ndarray:
-    '''Compute multiple average_precision scores generated at random'''
+    """Compute multiple average_precision scores generated at random"""
     rng = np.random.default_rng(seed)
     rel_k = random_binary_matrix(num_perm, total, num_pos, rng)
-    null_dist = compute_ap(rel_k)
-    null_dist.sort()
+    null_dist = average_precision(rel_k)
     return null_dist


 def null_dist_cached(num_pos, total, seed, null_size, cache_dir):
     if seed is not None:
-        cache_file = cache_dir / f'n{total}_k{num_pos}.npy'
+        cache_file = cache_dir / f"n{total}_k{num_pos}.npy"
         if cache_file.is_file():
             null_dist = np.load(cache_file)
         else:
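random_ap draws num_perm random rankings with num_pos positives among total ranked items and scores each one; null_dist_cached memoizes that per (total, num_pos) configuration under the n{total}_k{num_pos}.npy naming above. For example:

# A null distribution of 1000 APs for 5 positives among 50 ranked items.
null_dist = random_ap(num_perm=1000, num_pos=5, total=50, seed=42)
# After this change the result is no longer pre-sorted here;
# p_values sorts the null distributions just before its lookup.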
@@ -130,34 +135,36 @@ def null_dist_cached(num_pos, total, seed, null_size, cache_dir):


 def get_null_dists(confs, null_size, seed):
-    cache_dir = Path.home() / f'.copairs/seed{seed}/ns{null_size}'
+    cache_dir = Path.home() / f".copairs/seed{seed}/ns{null_size}"
     cache_dir.mkdir(parents=True, exist_ok=True)
     num_confs = len(confs)
     rng = np.random.default_rng(seed)
     seeds = rng.integers(8096, size=num_confs)

     null_dists = np.empty([len(confs), null_size], dtype=np.float32)

     def par_func(i):
         num_pos, total = confs[i]
-        null_dists[i] = null_dist_cached(num_pos, total, seeds[i],
-                                         null_size, cache_dir)
+        null_dists[i] = null_dist_cached(num_pos, total, seeds[i], null_size, cache_dir)

     parallel_map(par_func, np.arange(num_confs))
     return null_dists


-def compute_p_values(ap_scores, null_confs, null_size: int, seed):
+def p_values(ap_scores, null_confs, null_size: int, seed):
     confs, rev_ix = np.unique(null_confs, axis=0, return_inverse=True)
     null_dists = get_null_dists(confs, null_size, seed)
-    p_values = np.empty(len(ap_scores), dtype=np.float32)
+    null_dists.sort(axis=1)
+    pvals = np.empty(len(ap_scores), dtype=np.float32)
     for i, (ap_score, ix) in enumerate(zip(ap_scores, rev_ix)):
         # Reverse to get from hi to low
         num = null_size - np.searchsorted(null_dists[ix], ap_score)
-        p_values[i] = (num + 1) / (null_size + 1)
-    return p_values
+        pvals[i] = (num + 1) / (null_size + 1)
+    return pvals


 def concat_ranges(start: np.ndarray, end: np.ndarray) -> np.ndarray:
-    '''Create a 1-d array concatenating multiple ranges'''
+    """Create a 1-d array concatenating multiple ranges"""
     slices = map(range, start, end)
     slices = itertools.chain.from_iterable(slices)
     count = (end - start).sum()
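Two things happen in p_values above: null_dists.sort(axis=1) restores the ordering that np.searchsorted requires (sorting moved here from random_ap), and the add-one estimate (num + 1) / (null_size + 1) keeps p-values strictly positive. The lookup in isolation:

import numpy as np

null_dist = np.array([0.10, 0.20, 0.30, 0.40], dtype=np.float32)  # already sorted
ap_score = 0.35
num = len(null_dist) - np.searchsorted(null_dist, ap_score)  # 1 null AP >= 0.35
p_value = (num + 1) / (len(null_dist) + 1)                   # (1 + 1) / 5 = 0.4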
@@ -166,7 +173,7 @@ def concat_ranges(start: np.ndarray, end: np.ndarray) -> np.ndarray:


 def to_cutoffs(counts: np.ndarray):
-    '''Convert a list of counts into cutoff indices.'''
+    """Convert a list of counts into cutoff indices."""
     cutoffs = np.empty_like(counts)
     cutoffs[0], cutoffs[1:] = 0, counts.cumsum()[:-1]
     return cutoffs
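to_cutoffs is the bookkeeping that the contiguous layout relies on: it turns per-list lengths into the start offsets that np.add.reduceat consumes. A worked example:

import numpy as np

counts = np.array([5, 3, 4])
# cumsum -> [5, 8, 12]; drop the last, prepend 0.
cutoffs = to_cutoffs(counts)  # array([0, 5, 8])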