Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ambiguous greedy & parallel dissimilarity computation #226

Merged
merged 28 commits into from
Oct 11, 2023
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
7104bc5
updated state freq computation to account for ambiguous alleles
mattjones315 Sep 26, 2023
cbfea70
added tests for computing frequencies in ambiguous setting
mattjones315 Sep 26, 2023
58e3a05
robust duplicate dropping for ambiguous characters
mattjones315 Sep 26, 2023
2cfba2b
formatting
mattjones315 Sep 26, 2023
98a786b
implemented greedy splitting with ambiguous states
mattjones315 Sep 26, 2023
66dbc33
full support for greedy with ambiguous states
mattjones315 Sep 27, 2023
085a409
updated when ccphylo tests would be run
mattjones315 Sep 27, 2023
ac3deb4
added mixin utilities test
mattjones315 Sep 27, 2023
a58480e
updated duplicated leaf addition to account for ambiguity
mattjones315 Sep 27, 2023
6418ba5
added test to catch ambiguous states where they're not supposed to be
mattjones315 Sep 27, 2023
7c22236
Merge branch 'master' into ambiguous_greedy
mattjones315 Sep 27, 2023
1097521
updated missing data classification when ambig states are present
mattjones315 Sep 27, 2023
e641932
formatting
mattjones315 Sep 27, 2023
ccfd54e
added breaking test for collapsing ambiguous edges
mattjones315 Sep 27, 2023
d27cd57
updated ancestral reconstruction with ambiguous states
mattjones315 Sep 27, 2023
54bb343
updated branch length calculation with ambiguous states
mattjones315 Sep 27, 2023
bfb1ad1
parallel distance computation
mattjones315 Oct 5, 2023
d493646
appending duplicates correctly for hybridsolver
mattjones315 Oct 6, 2023
2b3d554
updated ccphylo config and gitignore
mattjones315 Oct 10, 2023
4196559
added shared memory buffer
mattjones315 Oct 10, 2023
2071fc4
updated docstring in DistanceSolver for threads
mattjones315 Oct 10, 2023
c2a56ba
updated docstring in NeighborJoiningSolver
mattjones315 Oct 10, 2023
fbeb105
updated neighborjoining_solve_tests with new threads parameter
mattjones315 Oct 10, 2023
97656e0
updated ccphylo search
mattjones315 Oct 10, 2023
877c9bd
updated docstrings
mattjones315 Oct 10, 2023
1368c49
updated README with ccphylo instructions
mattjones315 Oct 10, 2023
9964571
added more tests for cluster linkage
mattjones315 Oct 10, 2023
60a914e
Merge branch 'master' into ambiguous_greedy
mattjones315 Oct 10, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cassiopeia/config.ini
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should be removed from the commit. I've also had issues with cassiopeia/config.ini changes being included even though it's in the .gitignore. Do you know how to fix this?

Copy link
Collaborator Author

@mattjones315 mattjones315 Oct 10, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, that is weird. I agree, this should be removed from the commit.

I think the issue here is that the config is tracked, but only as a default. So .gitignore gets confused because the file does exist. I propose we put the default config in ./data and specify in the README how to utilize it.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That solution sounds good to me

Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
[Paths]
ccphylo_path = /path/to/ccphylo/ccphylo
ccphylo_path = /Users/mgjones/software/ccphylo/ccphylo
9 changes: 8 additions & 1 deletion cassiopeia/data/CassiopeiaTree.py
Original file line number Diff line number Diff line change
Expand Up @@ -1401,7 +1401,11 @@ def get_mutations_along_edge(

mutations = []
for i in range(self.n_character):
if parent_states[i] != child_states[i]:

mattjones315 marked this conversation as resolved.
Show resolved Hide resolved
parent_state = (list(parent_states[i]) if is_ambiguous_state(parent_states[i]) else [parent_states[i]])
child_state = (list(child_states[i]) if is_ambiguous_state(child_states[i]) else [child_states[i]])

if len(np.intersect1d(parent_state, child_state)) < 1:
if treat_missing_as_mutations:
mutations.append((i, child_states[i]))
elif (
Expand Down Expand Up @@ -1828,6 +1832,7 @@ def compute_dissimilarity_map(
] = None,
prior_transformation: str = "negative_log",
layer: Optional[str] = None,
threads: int = 1
) -> None:
"""Computes a dissimilarity map.

Expand All @@ -1854,6 +1859,7 @@ def compute_dissimilarity_map(
the square root of 1/p
layer: Character matrix layer to use. If not specified, use the
default :attr:`character_matrix`.
threads: Number of threads to use for dissimilarity map computation.
"""

if layer is not None:
Expand Down Expand Up @@ -1884,6 +1890,7 @@ def compute_dissimilarity_map(
dissimilarity_function,
weights,
self.missing_state_indicator,
threads=threads,
)

dissimilarity_map = scipy.spatial.distance.squareform(dissimilarity_map)
Expand Down
208 changes: 166 additions & 42 deletions cassiopeia/data/utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,14 @@
General utilities for the datasets encountered in Cassiopeia.
"""
import collections
from joblib import delayed
import multiprocessing
from typing import Callable, Dict, List, Optional, Tuple, Union
import warnings

import ete3
import networkx as nx
import ngs_tools
import numba
import numpy as np
import pandas as pd
Expand All @@ -29,6 +32,12 @@ def get_lca_characters(
character if only one of the constituent vectors has a missing value at that
index, and imputes missing value if all have a missing value at that index.

Importantly, this method will infer ancestral characters for an ambiguous
state. If the intersection between two states (even ambiguous) is non-zero
and not the missing state, and has length exactly 1, we assign the ancestral
state this value. Else, if the intersection length is greater than 1, the
value '0' is assigned.

Args:
vecs: A list of character vectors to generate an LCA for
missing_state_indicator: The character representing missing values
Expand All @@ -42,19 +51,26 @@ def get_lca_characters(
assert len(i) == k
lca_vec = [0] * len(vecs[0])
for i in range(k):
chars = set()
for vec in vecs:
if is_ambiguous_state(vec[i]):
chars = chars.union(vec[i])
else:
chars.add(vec[i])
if len(chars) == 1:
lca_vec[i] = list(chars)[0]
if np.all(
np.array([vec[i] for vec in vecs], dtype=object)
== missing_state_indicator
):
lca_vec[i] = missing_state_indicator
else:
if missing_state_indicator in chars:
chars.remove(missing_state_indicator)
if len(chars) == 1:
lca_vec[i] = list(chars)[0]
all_states = [
vec[i] for vec in vecs if vec[i] != missing_state_indicator
]
chars = set.intersection(
*map(
set,
[
state if is_ambiguous_state(state) else [state]
for state in all_states
],
)
)
if len(chars) == 1:
lca_vec[i] = list(chars)[0]
return lca_vec


Expand Down Expand Up @@ -136,11 +152,12 @@ def _to_newick_str(g, node):


def compute_dissimilarity_map(
cm: np.array,
cm: np.array([[]]),
C: int,
dissimilarity_function: Callable,
weights: Optional[Dict[int, Dict[int, float]]] = None,
missing_state_indicator: int = -1,
threads: int = 1,
) -> np.array:
"""Compute the dissimilarity between all samples

Expand All @@ -151,15 +168,35 @@ def compute_dissimilarity_map(
cm: Character matrix
C: Number of samples
weights: Weights to use for comparing states.
dissimilarity_function: Dissimilarity function that returns the distance
between two character states.
missing_state_indicator: State indicating missing data
threads: Number of threads to use for distance computation.

Returns:
A dissimilarity mapping as a flattened array.
"""
# check to see if any ambiguous characters are present
ambiguous_present = np.any(
[(cm[:, i].dtype == "object") for i in range(cm.shape[1])]
)

# Try to numbaize the dissimilarity function, but fallback to python
numbaize = True
try:
dissimilarity_func = numba.jit(dissimilarity_function, nopython=True)
if not ambiguous_present:
dissimilarity_func = numba.jit(
dissimilarity_function, nopython=True
)
else:
dissimilarity_func = numba.jit(
dissimilarity_function,
nopython=False,
forceobj=True,
parallel=True,
)
numbaize = False

# When cluster_dissimilarity is used, the dissimilarity_function is wrapped
# in a partial, which raises a TypeError when trying to numbaize.
except TypeError:
Expand All @@ -170,12 +207,84 @@ def compute_dissimilarity_map(
numbaize = False
dissimilarity_func = dissimilarity_function

if threads > 1:
dm = np.zeros(C * (C - 1) // 2, dtype=np.float64)
k, m = divmod(len(dm), threads)
batches = [
np.arange(len(dm))[i * k + min(i, m) : (i + 1) * k + min(i + 1, m)]
for i in range(threads)
]

with multiprocessing.Pool(processes=threads) as pool:
colganwi marked this conversation as resolved.
Show resolved Hide resolved
results = list(
pool.starmap(
__compute_dissimilarity_map_wrapper,
[
(
dissimilarity_func,
cm,
batch,
weights,
missing_state_indicator,
numbaize,
ambiguous_present,
)
for batch in batches
],
),
)

for batch_indices, batch_results in results:
dm[batch_indices] = batch_results
else:
_, dm = __compute_dissimilarity_map_wrapper(dissimilarity_func, cm, np.arange(C*(C-1) // 2), weights, missing_state_indicator, numbaize, ambiguous_present)
mattjones315 marked this conversation as resolved.
Show resolved Hide resolved

return dm


def __compute_dissimilarity_map_wrapper(
colganwi marked this conversation as resolved.
Show resolved Hide resolved
dissimilarity_func: Callable,
cm: np.array([[]]),
batch_indices: np.array([]),
weights: Optional[Dict[int, Dict[int, float]]] = None,
missing_state_indicator: int = -1,
numbaize: bool = True,
ambiguous_present: bool = False,
) -> Tuple[np.array, np.array]:
"""Wrapper function for parallel computation of dissimilarity maps.

This is a wrapper function that is intended to interface with
compute_dissimilarity_map. The reason why this is necessary is because
specific numba objects are not compatible with the multiprocessing library
used for parallel computation of the dissimilarity matrix.

While there is a minor hit when using multiple threads for a function
that can be jit compiled with numba, this effect is negligible and the
benefits of a parallel dissimilarity matrix computation for
non-jit-compatible function far outweighs the minor slow down.

Args:
dissimilarity_func: A pre-compiled dissimilarity function.
cm: Character matrix
batch_indices: Batch indicies. These refer to a set of compressed
indices for a final square dissimilarity matrix. This function
will only compute the dissimilarity for these indices.
missing_state_indicator: Integer value representing missing states.
weights: Weights to use for comparing states.
numbaize: Whether or not to numbaize the final dissimilarity map
computation, based on whether or not the dissimilarity function
was compatible with jit-compilation.
ambiguous_present: Whether or not ambiguous states are present.

Returns:
A tuple of (batch_indices, batch_results) indicating the dissimilarities
for the comparisons specified by batch_indices.
"""
nb_weights = numba.typed.Dict.empty(
numba.types.int64,
numba.types.DictType(numba.types.int64, numba.types.float64),
)
if weights:

for k, v in weights.items():
nb_char_weights = numba.typed.Dict.empty(
numba.types.int64, numba.types.float64
Expand All @@ -184,34 +293,49 @@ def compute_dissimilarity_map(
nb_char_weights[state] = prior
nb_weights[k] = nb_char_weights

def _compute_dissimilarity_map(cm, C, missing_state_indicator, nb_weights):

dm = np.zeros(C * (C - 1) // 2, dtype=np.float64)
def _compute_dissimilarity_map(
cm=np.array([[]]),
batch_indices=np.array([]),
missing_state_indicator=-1,
nb_weights={},
):
batch_results = np.zeros(len(batch_indices), dtype=np.float64)
k = 0
for i in range(C - 1):
for j in range(i + 1, C):

s1 = cm[i, :]
s2 = cm[j, :]
dm[k] = dissimilarity_func(
s1, s2, missing_state_indicator, nb_weights
)
k += 1

return dm
n = cm.shape[0]
b = 1 - 2 * n
for index in batch_indices:
i = int(np.floor((-b - np.sqrt(b**2 - 8 * index)) / 2))
j = int(index + i * (b + i + 2) / 2 + 1)
s1 = cm[i, :]
s2 = cm[j, :]
batch_results[k] = dissimilarity_func(
s1, s2, missing_state_indicator, nb_weights
)
k += 1
return batch_indices, batch_results

# Numbaize _compute_dissimilarity_map in nopython mode only if the
# dissimilarity function has been successfully numbaized. Otherwise,
# numbaize in object mode.
with warnings.catch_warnings():
warnings.simplefilter("ignore", category=numba.NumbaDeprecationWarning)
warnings.simplefilter("ignore", category=numba.NumbaWarning)
_compute_dissimilarity_map = numba.jit(
_compute_dissimilarity_map, nopython=numbaize
)

if not ambiguous_present:
_compute_dissimilarity_map = numba.jit(
_compute_dissimilarity_map, nopython=numbaize
)
else:
_compute_dissimilarity_map = numba.jit(
_compute_dissimilarity_map,
nopython=False,
forceobj=True,
parallel=True,
)

return _compute_dissimilarity_map(
cm, C, missing_state_indicator, nb_weights
cm,
batch_indices,
missing_state_indicator,
nb_weights,
)


Expand Down Expand Up @@ -244,7 +368,6 @@ def sample_bootstrap_character_matrices(
bootstrap_samples = []
M = character_matrix.shape[1]
for _ in range(num_bootstraps):

if random_state:
sampled_cut_sites = random_state.choice(M, M, replace=True)
else:
Expand Down Expand Up @@ -309,8 +432,10 @@ def sample_bootstrap_allele_tables(
allele_table
)

lineage_profile = preprocessing_utilities.convert_alleletable_to_lineage_profile(
allele_table, cut_sites
lineage_profile = (
preprocessing_utilities.convert_alleletable_to_lineage_profile(
allele_table, cut_sites
)
)

intbcs = allele_table["intBC"].unique()
Expand All @@ -319,7 +444,6 @@ def sample_bootstrap_allele_tables(
bootstrap_samples = []

for _ in range(num_bootstraps):

if random_state:
sampled_intbcs = random_state.choice(intbcs, M, replace=True)
else:
Expand Down Expand Up @@ -401,10 +525,8 @@ def compute_phylogenetic_weight_matrix(
W = pd.DataFrame(np.zeros((N, N)), index=tree.leaves, columns=tree.leaves)

for leaf1 in tree.leaves:

distances = tree.get_distances(leaf1, leaves_only=True)
for leaf2, _d in distances.items():

if inverse:
_d = inverse_fn(_d) if _d > 0 else np.inf

Expand All @@ -413,7 +535,8 @@ def compute_phylogenetic_weight_matrix(
np.fill_diagonal(W.values, 0)

return W



@numba.jit(nopython=True)
def net_relatedness_index(
dissimilarity_map: np.array, indices_1: np.array, indices_2: np.array
Expand Down Expand Up @@ -441,6 +564,7 @@ def net_relatedness_index(

return nri / (len(indices_1) * len(indices_2))


def compute_inter_cluster_distances(
tree: CassiopeiaTree,
meta_item: Optional[str] = None,
Expand Down
Loading