Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ambiguous greedy & parallel dissimilarity computation #226

Merged
merged 28 commits into from
Oct 11, 2023
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
7104bc5
updated state freq computation to account for ambiguous alleles
mattjones315 Sep 26, 2023
cbfea70
added tests for computing frequencies in ambiguous setting
mattjones315 Sep 26, 2023
58e3a05
robust duplicate dropping for ambiguous characters
mattjones315 Sep 26, 2023
2cfba2b
formatting
mattjones315 Sep 26, 2023
98a786b
implemented greedy splitting with ambiguous states
mattjones315 Sep 26, 2023
66dbc33
full support for greedy with ambiguous states
mattjones315 Sep 27, 2023
085a409
updated when ccphylo tests would be run
mattjones315 Sep 27, 2023
ac3deb4
added mixin utilities test
mattjones315 Sep 27, 2023
a58480e
updated duplicated leaf addition to account for ambiguity
mattjones315 Sep 27, 2023
6418ba5
added test to catch ambiguous states where they're not supposed to be
mattjones315 Sep 27, 2023
7c22236
Merge branch 'master' into ambiguous_greedy
mattjones315 Sep 27, 2023
1097521
updated missing data classification when ambig states are present
mattjones315 Sep 27, 2023
e641932
formatting
mattjones315 Sep 27, 2023
ccfd54e
added breaking test for collapsing ambiguous edges
mattjones315 Sep 27, 2023
d27cd57
updated ancestral reconstruction with ambiguous states
mattjones315 Sep 27, 2023
54bb343
updated branch length calculation with ambiguous states
mattjones315 Sep 27, 2023
bfb1ad1
parallel distance computation
mattjones315 Oct 5, 2023
d493646
appending duplicates correctly for hybridsolver
mattjones315 Oct 6, 2023
2b3d554
updated ccphylo config and gitignore
mattjones315 Oct 10, 2023
4196559
added shared memory buffer
mattjones315 Oct 10, 2023
2071fc4
updated docstring in DistanceSolver for threads
mattjones315 Oct 10, 2023
c2a56ba
updated docstring in NeighborJoiningSolver
mattjones315 Oct 10, 2023
fbeb105
updated neighborjoining_solve_tests with new threads parameter
mattjones315 Oct 10, 2023
97656e0
updated ccphylo search
mattjones315 Oct 10, 2023
877c9bd
updated docstrings
mattjones315 Oct 10, 2023
1368c49
updated README with ccphylo instructions
mattjones315 Oct 10, 2023
9964571
added more tests for cluster linkage
mattjones315 Oct 10, 2023
60a914e
Merge branch 'master' into ambiguous_greedy
mattjones315 Oct 10, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cassiopeia/config.ini
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should be removed from the commit. I've also had issues with cassiopeia/config.ini changes being included even though it's in the .gitignore. Do you know how to fix this?

Copy link
Collaborator Author

@mattjones315 mattjones315 Oct 10, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, that is weird. I agree, this should be removed from the commit.

I think the issue here is that the config is tracked, but only as a default. So .gitignore gets confused because the file does exist. I propose we put the default config in ./data and specify in the README how to utilize it.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That solution sounds good to me

Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
[Paths]
ccphylo_path = /path/to/ccphylo/ccphylo
ccphylo_path = /Users/mgjones/software/ccphylo/ccphylo
9 changes: 8 additions & 1 deletion cassiopeia/data/CassiopeiaTree.py
Original file line number Diff line number Diff line change
Expand Up @@ -1401,7 +1401,11 @@ def get_mutations_along_edge(

mutations = []
for i in range(self.n_character):
if parent_states[i] != child_states[i]:

mattjones315 marked this conversation as resolved.
Show resolved Hide resolved
parent_state = (list(parent_states[i]) if is_ambiguous_state(parent_states[i]) else [parent_states[i]])
child_state = (list(child_states[i]) if is_ambiguous_state(child_states[i]) else [child_states[i]])

if len(np.intersect1d(parent_state, child_state)) < 1:
if treat_missing_as_mutations:
mutations.append((i, child_states[i]))
elif (
Expand Down Expand Up @@ -1828,6 +1832,7 @@ def compute_dissimilarity_map(
] = None,
prior_transformation: str = "negative_log",
layer: Optional[str] = None,
threads: int = 1
) -> None:
"""Computes a dissimilarity map.

Expand All @@ -1854,6 +1859,7 @@ def compute_dissimilarity_map(
the square root of 1/p
layer: Character matrix layer to use. If not specified, use the
default :attr:`character_matrix`.
threads: Number of threads to use for dissimilarity map computation.
"""

if layer is not None:
Expand Down Expand Up @@ -1884,6 +1890,7 @@ def compute_dissimilarity_map(
dissimilarity_function,
weights,
self.missing_state_indicator,
threads=threads,
)

dissimilarity_map = scipy.spatial.distance.squareform(dissimilarity_map)
Expand Down
208 changes: 166 additions & 42 deletions cassiopeia/data/utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,14 @@
General utilities for the datasets encountered in Cassiopeia.
"""
import collections
from joblib import delayed
import multiprocessing
from typing import Callable, Dict, List, Optional, Tuple, Union
import warnings

import ete3
import networkx as nx
import ngs_tools
import numba
import numpy as np
import pandas as pd
Expand All @@ -29,6 +32,12 @@ def get_lca_characters(
character if only one of the constituent vectors has a missing value at that
index, and imputes missing value if all have a missing value at that index.

Importantly, this method will infer ancestral characters for an ambiguous
state. If the intersection between two states (even ambiguous) is non-zero
and not the missing state, and has length exactly 1, we assign the ancestral
state this value. Else, if the intersection length is greater than 1, the
value '0' is assigned.

Args:
vecs: A list of character vectors to generate an LCA for
missing_state_indicator: The character representing missing values
Expand All @@ -42,19 +51,26 @@ def get_lca_characters(
assert len(i) == k
lca_vec = [0] * len(vecs[0])
for i in range(k):
chars = set()
for vec in vecs:
if is_ambiguous_state(vec[i]):
chars = chars.union(vec[i])
else:
chars.add(vec[i])
if len(chars) == 1:
lca_vec[i] = list(chars)[0]
if np.all(
np.array([vec[i] for vec in vecs], dtype=object)
== missing_state_indicator
):
lca_vec[i] = missing_state_indicator
else:
if missing_state_indicator in chars:
chars.remove(missing_state_indicator)
if len(chars) == 1:
lca_vec[i] = list(chars)[0]
all_states = [
vec[i] for vec in vecs if vec[i] != missing_state_indicator
]
chars = set.intersection(
*map(
set,
[
state if is_ambiguous_state(state) else [state]
for state in all_states
],
)
)
if len(chars) == 1:
lca_vec[i] = list(chars)[0]
return lca_vec


Expand Down Expand Up @@ -136,11 +152,12 @@ def _to_newick_str(g, node):


def compute_dissimilarity_map(
cm: np.array,
cm: np.array([[]]),
C: int,
dissimilarity_function: Callable,
weights: Optional[Dict[int, Dict[int, float]]] = None,
missing_state_indicator: int = -1,
threads: int = 1,
) -> np.array:
"""Compute the dissimilarity between all samples

Expand All @@ -151,15 +168,35 @@ def compute_dissimilarity_map(
cm: Character matrix
C: Number of samples
weights: Weights to use for comparing states.
dissimilarity_function: Dissimilarity function that returns the distance
between two character states.
missing_state_indicator: State indicating missing data
threads: Number of threads to use for distance computation.

Returns:
A dissimilarity mapping as a flattened array.
"""
# check to see if any ambiguous characters are present
ambiguous_present = np.any(
[(cm[:, i].dtype == "object") for i in range(cm.shape[1])]
)

# Try to numbaize the dissimilarity function, but fallback to python
numbaize = True
try:
dissimilarity_func = numba.jit(dissimilarity_function, nopython=True)
if not ambiguous_present:
dissimilarity_func = numba.jit(
dissimilarity_function, nopython=True
)
else:
dissimilarity_func = numba.jit(
dissimilarity_function,
nopython=False,
forceobj=True,
parallel=True,
)
numbaize = False

# When cluster_dissimilarity is used, the dissimilarity_function is wrapped
# in a partial, which raises a TypeError when trying to numbaize.
except TypeError:
Expand All @@ -170,12 +207,84 @@ def compute_dissimilarity_map(
numbaize = False
dissimilarity_func = dissimilarity_function

if threads > 1:
dm = np.zeros(C * (C - 1) // 2, dtype=np.float64)
k, m = divmod(len(dm), threads)
batches = [
np.arange(len(dm))[i * k + min(i, m) : (i + 1) * k + min(i + 1, m)]
for i in range(threads)
]

with multiprocessing.Pool(processes=threads) as pool:
colganwi marked this conversation as resolved.
Show resolved Hide resolved
results = list(
pool.starmap(
__compute_dissimilarity_map_wrapper,
[
(
dissimilarity_func,
cm,
batch,
weights,
missing_state_indicator,
numbaize,
ambiguous_present,
)
for batch in batches
],
),
)

for batch_indices, batch_results in results:
dm[batch_indices] = batch_results
else:
_, dm = __compute_dissimilarity_map_wrapper(dissimilarity_func, cm, np.arange(C*(C-1) // 2), weights, missing_state_indicator, numbaize, ambiguous_present)
mattjones315 marked this conversation as resolved.
Show resolved Hide resolved

return dm


def __compute_dissimilarity_map_wrapper(
colganwi marked this conversation as resolved.
Show resolved Hide resolved
dissimilarity_func: Callable,
cm: np.array([[]]),
batch_indices: np.array([]),
weights: Optional[Dict[int, Dict[int, float]]] = None,
missing_state_indicator: int = -1,
numbaize: bool = True,
ambiguous_present: bool = False,
) -> Tuple[np.array, np.array]:
"""Wrapper function for parallel computation of dissimilarity maps.

This is a wrapper function that is intended to interface with
compute_dissimilarity_map. The reason why this is necessary is because
specific numba objects are not compatible with the multiprocessing library
used for parallel computation of the dissimilarity matrix.

While there is a minor hit when using multiple threads for a function
that can be jit compiled with numba, this effect is negligible and the
benefits of a parallel dissimilarity matrix computation for
non-jit-compatible function far outweighs the minor slow down.

Args:
dissimilarity_func: A pre-compiled dissimilarity function.
cm: Character matrix
batch_indices: Batch indicies. These refer to a set of compressed
indices for a final square dissimilarity matrix. This function
will only compute the dissimilarity for these indices.
missing_state_indicator: Integer value representing missing states.
weights: Weights to use for comparing states.
numbaize: Whether or not to numbaize the final dissimilarity map
computation, based on whether or not the dissimilarity function
was compatible with jit-compilation.
ambiguous_present: Whether or not ambiguous states are present.

Returns:
A tuple of (batch_indices, batch_results) indicating the dissimilarities
for the comparisons specified by batch_indices.
"""
nb_weights = numba.typed.Dict.empty(
numba.types.int64,
numba.types.DictType(numba.types.int64, numba.types.float64),
)
if weights:

for k, v in weights.items():
nb_char_weights = numba.typed.Dict.empty(
numba.types.int64, numba.types.float64
Expand All @@ -184,34 +293,49 @@ def compute_dissimilarity_map(
nb_char_weights[state] = prior
nb_weights[k] = nb_char_weights

def _compute_dissimilarity_map(cm, C, missing_state_indicator, nb_weights):

dm = np.zeros(C * (C - 1) // 2, dtype=np.float64)
def _compute_dissimilarity_map(
cm=np.array([[]]),
batch_indices=np.array([]),
missing_state_indicator=-1,
nb_weights={},
):
batch_results = np.zeros(len(batch_indices), dtype=np.float64)
k = 0
for i in range(C - 1):
for j in range(i + 1, C):

s1 = cm[i, :]
s2 = cm[j, :]
dm[k] = dissimilarity_func(
s1, s2, missing_state_indicator, nb_weights
)
k += 1

return dm
n = cm.shape[0]
b = 1 - 2 * n
for index in batch_indices:
i = int(np.floor((-b - np.sqrt(b**2 - 8 * index)) / 2))
j = int(index + i * (b + i + 2) / 2 + 1)
s1 = cm[i, :]
s2 = cm[j, :]
batch_results[k] = dissimilarity_func(
s1, s2, missing_state_indicator, nb_weights
)
k += 1
return batch_indices, batch_results

# Numbaize _compute_dissimilarity_map in nopython mode only if the
# dissimilarity function has been successfully numbaized. Otherwise,
# numbaize in object mode.
with warnings.catch_warnings():
warnings.simplefilter("ignore", category=numba.NumbaDeprecationWarning)
warnings.simplefilter("ignore", category=numba.NumbaWarning)
_compute_dissimilarity_map = numba.jit(
_compute_dissimilarity_map, nopython=numbaize
)

if not ambiguous_present:
_compute_dissimilarity_map = numba.jit(
_compute_dissimilarity_map, nopython=numbaize
)
else:
_compute_dissimilarity_map = numba.jit(
_compute_dissimilarity_map,
nopython=False,
forceobj=True,
parallel=True,
)

return _compute_dissimilarity_map(
cm, C, missing_state_indicator, nb_weights
cm,
batch_indices,
missing_state_indicator,
nb_weights,
)


Expand Down Expand Up @@ -244,7 +368,6 @@ def sample_bootstrap_character_matrices(
bootstrap_samples = []
M = character_matrix.shape[1]
for _ in range(num_bootstraps):

if random_state:
sampled_cut_sites = random_state.choice(M, M, replace=True)
else:
Expand Down Expand Up @@ -309,8 +432,10 @@ def sample_bootstrap_allele_tables(
allele_table
)

lineage_profile = preprocessing_utilities.convert_alleletable_to_lineage_profile(
allele_table, cut_sites
lineage_profile = (
preprocessing_utilities.convert_alleletable_to_lineage_profile(
allele_table, cut_sites
)
)

intbcs = allele_table["intBC"].unique()
Expand All @@ -319,7 +444,6 @@ def sample_bootstrap_allele_tables(
bootstrap_samples = []

for _ in range(num_bootstraps):

if random_state:
sampled_intbcs = random_state.choice(intbcs, M, replace=True)
else:
Expand Down Expand Up @@ -401,10 +525,8 @@ def compute_phylogenetic_weight_matrix(
W = pd.DataFrame(np.zeros((N, N)), index=tree.leaves, columns=tree.leaves)

for leaf1 in tree.leaves:

distances = tree.get_distances(leaf1, leaves_only=True)
for leaf2, _d in distances.items():

if inverse:
_d = inverse_fn(_d) if _d > 0 else np.inf

Expand All @@ -413,7 +535,8 @@ def compute_phylogenetic_weight_matrix(
np.fill_diagonal(W.values, 0)

return W



@numba.jit(nopython=True)
def net_relatedness_index(
dissimilarity_map: np.array, indices_1: np.array, indices_2: np.array
Expand Down Expand Up @@ -441,6 +564,7 @@ def net_relatedness_index(

return nri / (len(indices_1) * len(indices_2))


def compute_inter_cluster_distances(
tree: CassiopeiaTree,
meta_item: Optional[str] = None,
Expand Down
Loading