Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Evolutionary coupling #154

Merged
merged 14 commits into from
Nov 11, 2021
2 changes: 2 additions & 0 deletions cassiopeia/data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,10 @@
from .CassiopeiaTree import CassiopeiaTree
from .utilities import (
compute_dissimilarity_map,
compute_inter_cluster_distances,
compute_phylogenetic_weight_matrix,
get_lca_characters,
net_relatedness_index,
sample_bootstrap_allele_tables,
sample_bootstrap_character_matrices,
to_newick,
Expand Down
95 changes: 95 additions & 0 deletions cassiopeia/data/utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

from cassiopeia.data import CassiopeiaTree
from cassiopeia.mixins import CassiopeiaTreeWarning, is_ambiguous_state
from cassiopeia.mixins.errors import CassiopeiaError
from cassiopeia.preprocess import utilities as preprocessing_utilities


Expand Down Expand Up @@ -412,3 +413,97 @@ def compute_phylogenetic_weight_matrix(
np.fill_diagonal(W.values, 0)

return W

@numba.jit(nopython=True)
def net_relatedness_index(
dissimilarity_map: np.array, indices_1: np.array, indices_2: np.array
) -> float:
"""Computes the net relatedness index between indices.
Using the dissimilarity map specified and the indices of samples, compute
the net relatedness index, defined as:
sum(distances over i,j in indices_1,indices_2) / (|indices_1| x |indices_2|)
Args:
dissimilarity_map: Dissimilarity map between all samples.
indices_1: Indices corresponding to the first group.
indices_2: Indices corresponding to the second group.
Returns:
The Net Relatedness Index (NRI)
"""

nri = 0
for i in indices_1:
for j in indices_2:
nri += dissimilarity_map[i, j]

return nri / (len(indices_1) * len(indices_2))

def compute_inter_cluster_distances(
tree: CassiopeiaTree,
meta_item: Optional[str] = None,
meta_data: Optional[pd.DataFrame] = None,
dissimilarity_map: Optional[pd.DataFrame] = None,
distance_function: Callable = net_relatedness_index,
**kwargs,
) -> pd.DataFrame:
"""Computes mean distance between clusters.
Compute the mean distance between categories in a categorical variable. By
default, the phylogenetic weight matrix will be computed and used for this
distance calculation, but a user can optionally provide a dissimilarity
map instead.
This function performs the computation in O(K^2)*O(distance_function) time
for a variable with K categories.
Args:
tree: CassiopeiaTree
meta_item: Column in the cell meta data of the tree. If `meta_data` is
specified, this is ignored.
meta_data: Meta data to use for this calculation. This argument takes
priority over meta_item.
dissimilarity_map: Dissimilarity map to use for distances. If this is
specified, the phylogenetic weight matrix is not computed.
number_of_neighbors: Number of nearest neighbors to use for computing
the mean distances. If this is not specified, then all cells are
used.
**kwargs: Arguments to pass to the distance function.
Returns:
A K x K distance matrix.
"""
meta_data = tree.cell_meta[meta_item] if (meta_data is None) else meta_data

# ensure that the meta data is categorical
if not pd.api.types.is_string_dtype(meta_data):
raise CassiopeiaError("Meta data must be categorical or a string.")

D = (
compute_phylogenetic_weight_matrix(tree)
if (dissimilarity_map is None)
else dissimilarity_map
)

unique_states = meta_data.unique()
K = len(unique_states)
inter_cluster_distances = pd.DataFrame(
np.zeros((K, K)), index=unique_states, columns=unique_states
)

# align distance matrix and meta_data
D = D.loc[meta_data.index.values, meta_data.index.values]

for state1 in unique_states:
indices_1 = np.where(np.array(meta_data) == state1)[0]
for state2 in unique_states:
indices_2 = np.where(np.array(meta_data) == state2)[0]

distance = distance_function(
D.values, indices_1, indices_2, **kwargs
)
inter_cluster_distances.loc[state1, state2] = distance

return inter_cluster_distances
1 change: 1 addition & 0 deletions cassiopeia/tools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,6 @@

from .autocorrelation import compute_morans_i
from .branch_length_estimator import IIDExponentialBayesian, IIDExponentialMLE
from .coupling import compute_evolutionary_coupling
from .small_parsimony import fitch_count, fitch_hartigan, score_small_parsimony
from .topology import compute_cophenetic_correlation, compute_expansion_pvalues
125 changes: 125 additions & 0 deletions cassiopeia/tools/coupling.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
"""
File storing functionality for computing coupling statistics between meta
variables on a tree.
"""
from typing import Callable, Optional

from collections import defaultdict
import numpy as np
import pandas as pd
from tqdm import tqdm

from cassiopeia.data import CassiopeiaTree
from cassiopeia.data import utilities as data_utilities


def compute_evolutionary_coupling(
tree: CassiopeiaTree,
meta_variable: str,
minimum_proportion: float = 0.05,
number_of_shuffles: int = 500,
random_state: Optional[np.random.RandomState] = None,
dissimilarity_map: Optional[pd.DataFrame] = None,
cluster_comparison_function: Callable = data_utilities.net_relatedness_index,
**comparison_kwargs,
) -> pd.DataFrame:
"""Computes Evolutionary Coupling of categorical variables.
Using the methodology described in Yang, Jones et al, BioRxiv (2021), this
function will compute the "evolutionary coupling" statistic between values
that a categorical variable can take on with the tree. For example, this
categorical variable can be a "cell type", and this function will compute
the evolutionary couplings between all types of cell types. This indicates
how closely related these cell types are to one another.
Briefly, this statistic is the Z-normalized mean distance between categories
in the specified categorical variable. Note that empirical nulls that have a
standard deviation of 0 lead to NaNs in the resulting evolutionary coupling
matrix.
The computational complexity of this function is
O(n^2 log n + (B+1)(K^2 * O(distance_function)) for a tree with n leaves, a
variable with K categories, and B random shuffles.
Args:
tree: CassiopeiaTree
meta_variable: Column in `tree.cell_meta` that stores a categorical
variable with K categories.
minimum_proportion: Minimum proportion of cells that a category needs
to appear in to be considered.
number_of_shuffles: Number of times to shuffle the data to compute the
empirical Z score.
random_state: Numpy random state to parameterize the shuffling.
dissimilarity_map: A precomputed dissimilarity map between all leaves.
cluster_comparison_function: A function for comparing the mean distance
between groups. By default, this is the Net Relatedness Index.
**comparison_kwargs: Extra arguments to pass to the cluster comparison
function.
Returns:
A K x K evolutionary coupling dataframe.
"""

W = (
data_utilities.compute_phylogenetic_weight_matrix(tree)
if (dissimilarity_map is None)
else dissimilarity_map
)

meta_data = tree.cell_meta[meta_variable]

# subset meta data by minimum proportion
if minimum_proportion > 0:
filter_threshold = int(len(tree.leaves) * minimum_proportion)
category_frequencies = meta_data.value_counts()
passing_categories = category_frequencies[
category_frequencies > filter_threshold
].index.values
meta_data = meta_data[meta_data.isin(passing_categories)]
W = W.loc[meta_data.index.values, meta_data.index.values]

# compute inter-cluster distances
inter_cluster_distances = data_utilities.compute_inter_cluster_distances(
tree,
meta_data=meta_data,
dissimilarity_map=W,
distance_function=cluster_comparison_function,
**comparison_kwargs,
)

# compute background for Z-scoring
background = defaultdict(list)
for _ in tqdm(
range(number_of_shuffles), desc="Creating empirical background"
):
permuted_assignments = meta_data.copy()
if random_state:
permuted_assignments.index = random_state.permutation(
meta_data.index.values
)
else:
permuted_assignments.index = np.random.permutation(
meta_data.index.values
)
background_distances = data_utilities.compute_inter_cluster_distances(
tree,
meta_data=permuted_assignments,
dissimilarity_map=W,
distance_function=cluster_comparison_function,
**comparison_kwargs,
)
for s1 in background_distances.index:
for s2 in background_distances.columns:
background[(s1, s2)].append(background_distances.loc[s1, s2])

Z_scores = inter_cluster_distances.copy()
for s1 in Z_scores.index:
for s2 in Z_scores.columns:
mean = np.mean(background[(s1, s2)])
sd = np.std(background[(s1, s2)])

Z_scores.loc[s1, s2] = (
inter_cluster_distances.loc[s1, s2] - mean
) / sd

return Z_scores
1 change: 0 additions & 1 deletion cassiopeia/tools/topology.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
import pandas as pd
from scipy import spatial, stats


from cassiopeia.data import CassiopeiaTree, compute_phylogenetic_weight_matrix
from cassiopeia.mixins import CassiopeiaError
from cassiopeia.solver import dissimilarity_functions
Expand Down
4 changes: 1 addition & 3 deletions docs/api/plotting.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,4 @@ Currently, our plotting functionality is linked to the rich iTOL framework:
.. autosummary::
:toctree: reference/

pl.upload_and_export_itol


pl.upload_and_export_itol
13 changes: 0 additions & 13 deletions docs/api/reference/cassiopeia.data.CassiopeiaTree.rst

This file was deleted.

This file was deleted.

This file was deleted.

6 changes: 0 additions & 6 deletions docs/api/reference/cassiopeia.data.to_newick.rst

This file was deleted.

6 changes: 0 additions & 6 deletions docs/api/reference/cassiopeia.pl.upload_and_export_itol.rst

This file was deleted.

6 changes: 0 additions & 6 deletions docs/api/reference/cassiopeia.pp.align_sequences.rst

This file was deleted.

6 changes: 0 additions & 6 deletions docs/api/reference/cassiopeia.pp.call_alleles.rst

This file was deleted.

6 changes: 0 additions & 6 deletions docs/api/reference/cassiopeia.pp.call_lineage_groups.rst

This file was deleted.

6 changes: 0 additions & 6 deletions docs/api/reference/cassiopeia.pp.collapse_umis.rst

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

6 changes: 0 additions & 6 deletions docs/api/reference/cassiopeia.pp.error_correct_umis.rst

This file was deleted.

6 changes: 0 additions & 6 deletions docs/api/reference/cassiopeia.pp.filter_cells.rst

This file was deleted.

Loading