diff --git a/cassiopeia/data/__init__.py b/cassiopeia/data/__init__.py index c48b9e2b..026cfd77 100644 --- a/cassiopeia/data/__init__.py +++ b/cassiopeia/data/__init__.py @@ -3,8 +3,10 @@ from .CassiopeiaTree import CassiopeiaTree from .utilities import ( compute_dissimilarity_map, + compute_inter_cluster_distances, compute_phylogenetic_weight_matrix, get_lca_characters, + net_relatedness_index, sample_bootstrap_allele_tables, sample_bootstrap_character_matrices, to_newick, diff --git a/cassiopeia/data/utilities.py b/cassiopeia/data/utilities.py index eaa06562..77e4c127 100755 --- a/cassiopeia/data/utilities.py +++ b/cassiopeia/data/utilities.py @@ -14,6 +14,7 @@ from cassiopeia.data import CassiopeiaTree from cassiopeia.mixins import CassiopeiaTreeWarning, is_ambiguous_state +from cassiopeia.mixins.errors import CassiopeiaError from cassiopeia.preprocess import utilities as preprocessing_utilities @@ -412,3 +413,97 @@ def compute_phylogenetic_weight_matrix( np.fill_diagonal(W.values, 0) return W + +@numba.jit(nopython=True) +def net_relatedness_index( + dissimilarity_map: np.array, indices_1: np.array, indices_2: np.array +) -> float: + """Computes the net relatedness index between indices. + + Using the dissimilarity map specified and the indices of samples, compute + the net relatedness index, defined as: + + sum(distances over i,j in indices_1,indices_2) / (|indices_1| x |indices_2|) + + Args: + dissimilarity_map: Dissimilarity map between all samples. + indices_1: Indices corresponding to the first group. + indices_2: Indices corresponding to the second group. + + Returns: + The Net Relatedness Index (NRI) + """ + + nri = 0 + for i in indices_1: + for j in indices_2: + nri += dissimilarity_map[i, j] + + return nri / (len(indices_1) * len(indices_2)) + +def compute_inter_cluster_distances( + tree: CassiopeiaTree, + meta_item: Optional[str] = None, + meta_data: Optional[pd.DataFrame] = None, + dissimilarity_map: Optional[pd.DataFrame] = None, + distance_function: Callable = net_relatedness_index, + **kwargs, +) -> pd.DataFrame: + """Computes mean distance between clusters. + + Compute the mean distance between categories in a categorical variable. By + default, the phylogenetic weight matrix will be computed and used for this + distance calculation, but a user can optionally provide a dissimilarity + map instead. + + This function performs the computation in O(K^2)*O(distance_function) time + for a variable with K categories. + + Args: + tree: CassiopeiaTree + meta_item: Column in the cell meta data of the tree. If `meta_data` is + specified, this is ignored. + meta_data: Meta data to use for this calculation. This argument takes + priority over meta_item. + dissimilarity_map: Dissimilarity map to use for distances. If this is + specified, the phylogenetic weight matrix is not computed. + number_of_neighbors: Number of nearest neighbors to use for computing + the mean distances. If this is not specified, then all cells are + used. + **kwargs: Arguments to pass to the distance function. + + Returns: + A K x K distance matrix. + """ + meta_data = tree.cell_meta[meta_item] if (meta_data is None) else meta_data + + # ensure that the meta data is categorical + if not pd.api.types.is_string_dtype(meta_data): + raise CassiopeiaError("Meta data must be categorical or a string.") + + D = ( + compute_phylogenetic_weight_matrix(tree) + if (dissimilarity_map is None) + else dissimilarity_map + ) + + unique_states = meta_data.unique() + K = len(unique_states) + inter_cluster_distances = pd.DataFrame( + np.zeros((K, K)), index=unique_states, columns=unique_states + ) + + # align distance matrix and meta_data + D = D.loc[meta_data.index.values, meta_data.index.values] + + for state1 in unique_states: + indices_1 = np.where(np.array(meta_data) == state1)[0] + for state2 in unique_states: + indices_2 = np.where(np.array(meta_data) == state2)[0] + + distance = distance_function( + D.values, indices_1, indices_2, **kwargs + ) + inter_cluster_distances.loc[state1, state2] = distance + + return inter_cluster_distances diff --git a/cassiopeia/tools/__init__.py b/cassiopeia/tools/__init__.py index 792a38ce..de98134f 100644 --- a/cassiopeia/tools/__init__.py +++ b/cassiopeia/tools/__init__.py @@ -2,5 +2,6 @@ from .autocorrelation import compute_morans_i from .branch_length_estimator import IIDExponentialBayesian, IIDExponentialMLE +from .coupling import compute_evolutionary_coupling from .small_parsimony import fitch_count, fitch_hartigan, score_small_parsimony from .topology import compute_cophenetic_correlation, compute_expansion_pvalues diff --git a/cassiopeia/tools/coupling.py b/cassiopeia/tools/coupling.py new file mode 100644 index 00000000..2ba982c7 --- /dev/null +++ b/cassiopeia/tools/coupling.py @@ -0,0 +1,125 @@ +""" +File storing functionality for computing coupling statistics between meta +variables on a tree. +""" +from typing import Callable, Optional + +from collections import defaultdict +import numpy as np +import pandas as pd +from tqdm import tqdm + +from cassiopeia.data import CassiopeiaTree +from cassiopeia.data import utilities as data_utilities + + +def compute_evolutionary_coupling( + tree: CassiopeiaTree, + meta_variable: str, + minimum_proportion: float = 0.05, + number_of_shuffles: int = 500, + random_state: Optional[np.random.RandomState] = None, + dissimilarity_map: Optional[pd.DataFrame] = None, + cluster_comparison_function: Callable = data_utilities.net_relatedness_index, + **comparison_kwargs, +) -> pd.DataFrame: + """Computes Evolutionary Coupling of categorical variables. + + Using the methodology described in Yang, Jones et al, BioRxiv (2021), this + function will compute the "evolutionary coupling" statistic between values + that a categorical variable can take on with the tree. For example, this + categorical variable can be a "cell type", and this function will compute + the evolutionary couplings between all types of cell types. This indicates + how closely related these cell types are to one another. + + Briefly, this statistic is the Z-normalized mean distance between categories + in the specified categorical variable. Note that empirical nulls that have a + standard deviation of 0 lead to NaNs in the resulting evolutionary coupling + matrix. + + The computational complexity of this function is + O(n^2 log n + (B+1)(K^2 * O(distance_function)) for a tree with n leaves, a + variable with K categories, and B random shuffles. + + Args: + tree: CassiopeiaTree + meta_variable: Column in `tree.cell_meta` that stores a categorical + variable with K categories. + minimum_proportion: Minimum proportion of cells that a category needs + to appear in to be considered. + number_of_shuffles: Number of times to shuffle the data to compute the + empirical Z score. + random_state: Numpy random state to parameterize the shuffling. + dissimilarity_map: A precomputed dissimilarity map between all leaves. + cluster_comparison_function: A function for comparing the mean distance + between groups. By default, this is the Net Relatedness Index. + **comparison_kwargs: Extra arguments to pass to the cluster comparison + function. + + Returns: + A K x K evolutionary coupling dataframe. + """ + + W = ( + data_utilities.compute_phylogenetic_weight_matrix(tree) + if (dissimilarity_map is None) + else dissimilarity_map + ) + + meta_data = tree.cell_meta[meta_variable] + + # subset meta data by minimum proportion + if minimum_proportion > 0: + filter_threshold = int(len(tree.leaves) * minimum_proportion) + category_frequencies = meta_data.value_counts() + passing_categories = category_frequencies[ + category_frequencies > filter_threshold + ].index.values + meta_data = meta_data[meta_data.isin(passing_categories)] + W = W.loc[meta_data.index.values, meta_data.index.values] + + # compute inter-cluster distances + inter_cluster_distances = data_utilities.compute_inter_cluster_distances( + tree, + meta_data=meta_data, + dissimilarity_map=W, + distance_function=cluster_comparison_function, + **comparison_kwargs, + ) + + # compute background for Z-scoring + background = defaultdict(list) + for _ in tqdm( + range(number_of_shuffles), desc="Creating empirical background" + ): + permuted_assignments = meta_data.copy() + if random_state: + permuted_assignments.index = random_state.permutation( + meta_data.index.values + ) + else: + permuted_assignments.index = np.random.permutation( + meta_data.index.values + ) + background_distances = data_utilities.compute_inter_cluster_distances( + tree, + meta_data=permuted_assignments, + dissimilarity_map=W, + distance_function=cluster_comparison_function, + **comparison_kwargs, + ) + for s1 in background_distances.index: + for s2 in background_distances.columns: + background[(s1, s2)].append(background_distances.loc[s1, s2]) + + Z_scores = inter_cluster_distances.copy() + for s1 in Z_scores.index: + for s2 in Z_scores.columns: + mean = np.mean(background[(s1, s2)]) + sd = np.std(background[(s1, s2)]) + + Z_scores.loc[s1, s2] = ( + inter_cluster_distances.loc[s1, s2] - mean + ) / sd + + return Z_scores diff --git a/cassiopeia/tools/topology.py b/cassiopeia/tools/topology.py index 39f5fc9f..33af2250 100644 --- a/cassiopeia/tools/topology.py +++ b/cassiopeia/tools/topology.py @@ -9,7 +9,6 @@ import pandas as pd from scipy import spatial, stats - from cassiopeia.data import CassiopeiaTree, compute_phylogenetic_weight_matrix from cassiopeia.mixins import CassiopeiaError from cassiopeia.solver import dissimilarity_functions diff --git a/docs/api/plotting.rst b/docs/api/plotting.rst index c738dba3..bc065672 100644 --- a/docs/api/plotting.rst +++ b/docs/api/plotting.rst @@ -13,6 +13,4 @@ Currently, our plotting functionality is linked to the rich iTOL framework: .. autosummary:: :toctree: reference/ - pl.upload_and_export_itol - - \ No newline at end of file + pl.upload_and_export_itol \ No newline at end of file diff --git a/docs/api/reference/cassiopeia.data.CassiopeiaTree.rst b/docs/api/reference/cassiopeia.data.CassiopeiaTree.rst deleted file mode 100644 index d0d90838..00000000 --- a/docs/api/reference/cassiopeia.data.CassiopeiaTree.rst +++ /dev/null @@ -1,13 +0,0 @@ -cassiopeia.data.CassiopeiaTree -============================== - -.. currentmodule:: cassiopeia.data - -.. autoclass:: CassiopeiaTree - :members: - :undoc-members: - - .. rubric:: Methods - - .. autoautosummary:: CassiopeiaTree - :methods: \ No newline at end of file diff --git a/docs/api/reference/cassiopeia.data.sample_bootstrap_allele_tables.rst b/docs/api/reference/cassiopeia.data.sample_bootstrap_allele_tables.rst deleted file mode 100644 index 1639e437..00000000 --- a/docs/api/reference/cassiopeia.data.sample_bootstrap_allele_tables.rst +++ /dev/null @@ -1,6 +0,0 @@ -cassiopeia.data.sample\_bootstrap\_allele\_tables -================================================= - -.. currentmodule:: cassiopeia.data - -.. autofunction:: sample_bootstrap_allele_tables \ No newline at end of file diff --git a/docs/api/reference/cassiopeia.data.sample_bootstrap_character_matrices.rst b/docs/api/reference/cassiopeia.data.sample_bootstrap_character_matrices.rst deleted file mode 100644 index 45ba28e1..00000000 --- a/docs/api/reference/cassiopeia.data.sample_bootstrap_character_matrices.rst +++ /dev/null @@ -1,6 +0,0 @@ -cassiopeia.data.sample\_bootstrap\_character\_matrices -====================================================== - -.. currentmodule:: cassiopeia.data - -.. autofunction:: sample_bootstrap_character_matrices \ No newline at end of file diff --git a/docs/api/reference/cassiopeia.data.to_newick.rst b/docs/api/reference/cassiopeia.data.to_newick.rst deleted file mode 100644 index 6b0ff7e3..00000000 --- a/docs/api/reference/cassiopeia.data.to_newick.rst +++ /dev/null @@ -1,6 +0,0 @@ -cassiopeia.data.to\_newick -========================== - -.. currentmodule:: cassiopeia.data - -.. autofunction:: to_newick \ No newline at end of file diff --git a/docs/api/reference/cassiopeia.pl.upload_and_export_itol.rst b/docs/api/reference/cassiopeia.pl.upload_and_export_itol.rst deleted file mode 100644 index a8e70af8..00000000 --- a/docs/api/reference/cassiopeia.pl.upload_and_export_itol.rst +++ /dev/null @@ -1,6 +0,0 @@ -cassiopeia.pl.upload\_and\_export\_itol -======================================= - -.. currentmodule:: cassiopeia.pl - -.. autofunction:: upload_and_export_itol \ No newline at end of file diff --git a/docs/api/reference/cassiopeia.pp.align_sequences.rst b/docs/api/reference/cassiopeia.pp.align_sequences.rst deleted file mode 100644 index 11927b42..00000000 --- a/docs/api/reference/cassiopeia.pp.align_sequences.rst +++ /dev/null @@ -1,6 +0,0 @@ -cassiopeia.pp.align\_sequences -============================== - -.. currentmodule:: cassiopeia.pp - -.. autofunction:: align_sequences \ No newline at end of file diff --git a/docs/api/reference/cassiopeia.pp.call_alleles.rst b/docs/api/reference/cassiopeia.pp.call_alleles.rst deleted file mode 100644 index 9bea85ee..00000000 --- a/docs/api/reference/cassiopeia.pp.call_alleles.rst +++ /dev/null @@ -1,6 +0,0 @@ -cassiopeia.pp.call\_alleles -=========================== - -.. currentmodule:: cassiopeia.pp - -.. autofunction:: call_alleles \ No newline at end of file diff --git a/docs/api/reference/cassiopeia.pp.call_lineage_groups.rst b/docs/api/reference/cassiopeia.pp.call_lineage_groups.rst deleted file mode 100644 index 8df5914d..00000000 --- a/docs/api/reference/cassiopeia.pp.call_lineage_groups.rst +++ /dev/null @@ -1,6 +0,0 @@ -cassiopeia.pp.call\_lineage\_groups -=================================== - -.. currentmodule:: cassiopeia.pp - -.. autofunction:: call_lineage_groups \ No newline at end of file diff --git a/docs/api/reference/cassiopeia.pp.collapse_umis.rst b/docs/api/reference/cassiopeia.pp.collapse_umis.rst deleted file mode 100644 index 020d2ee7..00000000 --- a/docs/api/reference/cassiopeia.pp.collapse_umis.rst +++ /dev/null @@ -1,6 +0,0 @@ -cassiopeia.pp.collapse\_umis -============================ - -.. currentmodule:: cassiopeia.pp - -.. autofunction:: collapse_umis \ No newline at end of file diff --git a/docs/api/reference/cassiopeia.pp.compute_empirical_indel_priors.rst b/docs/api/reference/cassiopeia.pp.compute_empirical_indel_priors.rst deleted file mode 100644 index 1b0f4e5c..00000000 --- a/docs/api/reference/cassiopeia.pp.compute_empirical_indel_priors.rst +++ /dev/null @@ -1,6 +0,0 @@ -cassiopeia.pp.compute\_empirical\_indel\_priors -=============================================== - -.. currentmodule:: cassiopeia.pp - -.. autofunction:: compute_empirical_indel_priors \ No newline at end of file diff --git a/docs/api/reference/cassiopeia.pp.convert_alleletable_to_character_matrix.rst b/docs/api/reference/cassiopeia.pp.convert_alleletable_to_character_matrix.rst deleted file mode 100644 index 5e11c83b..00000000 --- a/docs/api/reference/cassiopeia.pp.convert_alleletable_to_character_matrix.rst +++ /dev/null @@ -1,6 +0,0 @@ -cassiopeia.pp.convert\_alleletable\_to\_character\_matrix -========================================================= - -.. currentmodule:: cassiopeia.pp - -.. autofunction:: convert_alleletable_to_character_matrix \ No newline at end of file diff --git a/docs/api/reference/cassiopeia.pp.convert_alleletable_to_lineage_profile.rst b/docs/api/reference/cassiopeia.pp.convert_alleletable_to_lineage_profile.rst deleted file mode 100644 index 370d0832..00000000 --- a/docs/api/reference/cassiopeia.pp.convert_alleletable_to_lineage_profile.rst +++ /dev/null @@ -1,6 +0,0 @@ -cassiopeia.pp.convert\_alleletable\_to\_lineage\_profile -======================================================== - -.. currentmodule:: cassiopeia.pp - -.. autofunction:: convert_alleletable_to_lineage_profile \ No newline at end of file diff --git a/docs/api/reference/cassiopeia.pp.convert_lineage_profile_to_character_matrix.rst b/docs/api/reference/cassiopeia.pp.convert_lineage_profile_to_character_matrix.rst deleted file mode 100644 index 893a4f42..00000000 --- a/docs/api/reference/cassiopeia.pp.convert_lineage_profile_to_character_matrix.rst +++ /dev/null @@ -1,6 +0,0 @@ -cassiopeia.pp.convert\_lineage\_profile\_to\_character\_matrix -============================================================== - -.. currentmodule:: cassiopeia.pp - -.. autofunction:: convert_lineage_profile_to_character_matrix \ No newline at end of file diff --git a/docs/api/reference/cassiopeia.pp.error_correct_umis.rst b/docs/api/reference/cassiopeia.pp.error_correct_umis.rst deleted file mode 100644 index 95101d0d..00000000 --- a/docs/api/reference/cassiopeia.pp.error_correct_umis.rst +++ /dev/null @@ -1,6 +0,0 @@ -cassiopeia.pp.error\_correct\_umis -================================== - -.. currentmodule:: cassiopeia.pp - -.. autofunction:: error_correct_umis \ No newline at end of file diff --git a/docs/api/reference/cassiopeia.pp.filter_cells.rst b/docs/api/reference/cassiopeia.pp.filter_cells.rst deleted file mode 100644 index db1c5704..00000000 --- a/docs/api/reference/cassiopeia.pp.filter_cells.rst +++ /dev/null @@ -1,6 +0,0 @@ -cassiopeia.pp.filter\_cells -=========================== - -.. currentmodule:: cassiopeia.pp - -.. autofunction:: filter_cells \ No newline at end of file diff --git a/docs/api/reference/cassiopeia.pp.filter_molecule_table.rst b/docs/api/reference/cassiopeia.pp.filter_molecule_table.rst deleted file mode 100644 index d062d345..00000000 --- a/docs/api/reference/cassiopeia.pp.filter_molecule_table.rst +++ /dev/null @@ -1,6 +0,0 @@ -cassiopeia.pp.filter\_molecule\_table -===================================== - -.. currentmodule:: cassiopeia.pp - -.. autofunction:: filter_molecule_table \ No newline at end of file diff --git a/docs/api/reference/cassiopeia.pp.filter_umis.rst b/docs/api/reference/cassiopeia.pp.filter_umis.rst deleted file mode 100644 index 962eb79c..00000000 --- a/docs/api/reference/cassiopeia.pp.filter_umis.rst +++ /dev/null @@ -1,6 +0,0 @@ -cassiopeia.pp.filter\_umis -========================== - -.. currentmodule:: cassiopeia.pp - -.. autofunction:: filter_umis \ No newline at end of file diff --git a/docs/api/reference/cassiopeia.pp.resolve_umi_sequence.rst b/docs/api/reference/cassiopeia.pp.resolve_umi_sequence.rst deleted file mode 100644 index a1a1ddc6..00000000 --- a/docs/api/reference/cassiopeia.pp.resolve_umi_sequence.rst +++ /dev/null @@ -1,6 +0,0 @@ -cassiopeia.pp.resolve\_umi\_sequence -==================================== - -.. currentmodule:: cassiopeia.pp - -.. autofunction:: resolve_umi_sequence \ No newline at end of file diff --git a/docs/api/reference/cassiopeia.solver.HybridSolver.rst b/docs/api/reference/cassiopeia.solver.HybridSolver.rst deleted file mode 100644 index 9646d98f..00000000 --- a/docs/api/reference/cassiopeia.solver.HybridSolver.rst +++ /dev/null @@ -1,13 +0,0 @@ -cassiopeia.solver.HybridSolver -============================== - -.. currentmodule:: cassiopeia.solver - -.. autoclass:: HybridSolver - :members: - :undoc-members: - - .. rubric:: Methods - - .. autoautosummary:: HybridSolver - :methods: \ No newline at end of file diff --git a/docs/api/reference/cassiopeia.solver.ILPSolver.rst b/docs/api/reference/cassiopeia.solver.ILPSolver.rst deleted file mode 100644 index eebcd985..00000000 --- a/docs/api/reference/cassiopeia.solver.ILPSolver.rst +++ /dev/null @@ -1,13 +0,0 @@ -cassiopeia.solver.ILPSolver -=========================== - -.. currentmodule:: cassiopeia.solver - -.. autoclass:: ILPSolver - :members: - :undoc-members: - - .. rubric:: Methods - - .. autoautosummary:: ILPSolver - :methods: \ No newline at end of file diff --git a/docs/api/reference/cassiopeia.solver.MaxCutGreedySolver.rst b/docs/api/reference/cassiopeia.solver.MaxCutGreedySolver.rst deleted file mode 100644 index 726bb6a1..00000000 --- a/docs/api/reference/cassiopeia.solver.MaxCutGreedySolver.rst +++ /dev/null @@ -1,13 +0,0 @@ -cassiopeia.solver.MaxCutGreedySolver -==================================== - -.. currentmodule:: cassiopeia.solver - -.. autoclass:: MaxCutGreedySolver - :members: - :undoc-members: - - .. rubric:: Methods - - .. autoautosummary:: MaxCutGreedySolver - :methods: \ No newline at end of file diff --git a/docs/api/reference/cassiopeia.solver.MaxCutSolver.rst b/docs/api/reference/cassiopeia.solver.MaxCutSolver.rst deleted file mode 100644 index 218c45fe..00000000 --- a/docs/api/reference/cassiopeia.solver.MaxCutSolver.rst +++ /dev/null @@ -1,13 +0,0 @@ -cassiopeia.solver.MaxCutSolver -============================== - -.. currentmodule:: cassiopeia.solver - -.. autoclass:: MaxCutSolver - :members: - :undoc-members: - - .. rubric:: Methods - - .. autoautosummary:: MaxCutSolver - :methods: \ No newline at end of file diff --git a/docs/api/reference/cassiopeia.solver.NeighborJoiningSolver.rst b/docs/api/reference/cassiopeia.solver.NeighborJoiningSolver.rst deleted file mode 100644 index 17c83adf..00000000 --- a/docs/api/reference/cassiopeia.solver.NeighborJoiningSolver.rst +++ /dev/null @@ -1,13 +0,0 @@ -cassiopeia.solver.NeighborJoiningSolver -======================================= - -.. currentmodule:: cassiopeia.solver - -.. autoclass:: NeighborJoiningSolver - :members: - :undoc-members: - - .. rubric:: Methods - - .. autoautosummary:: NeighborJoiningSolver - :methods: \ No newline at end of file diff --git a/docs/api/reference/cassiopeia.solver.PercolationSolver.rst b/docs/api/reference/cassiopeia.solver.PercolationSolver.rst deleted file mode 100644 index 8ddcc3c7..00000000 --- a/docs/api/reference/cassiopeia.solver.PercolationSolver.rst +++ /dev/null @@ -1,13 +0,0 @@ -cassiopeia.solver.PercolationSolver -=================================== - -.. currentmodule:: cassiopeia.solver - -.. autoclass:: PercolationSolver - :members: - :undoc-members: - - .. rubric:: Methods - - .. autoautosummary:: PercolationSolver - :methods: \ No newline at end of file diff --git a/docs/api/reference/cassiopeia.solver.SharedMutationJoiningSolver.rst b/docs/api/reference/cassiopeia.solver.SharedMutationJoiningSolver.rst deleted file mode 100644 index 2b8081c3..00000000 --- a/docs/api/reference/cassiopeia.solver.SharedMutationJoiningSolver.rst +++ /dev/null @@ -1,13 +0,0 @@ -cassiopeia.solver.SharedMutationJoiningSolver -============================================= - -.. currentmodule:: cassiopeia.solver - -.. autoclass:: SharedMutationJoiningSolver - :members: - :undoc-members: - - .. rubric:: Methods - - .. autoautosummary:: SharedMutationJoiningSolver - :methods: \ No newline at end of file diff --git a/docs/api/reference/cassiopeia.solver.SpectralGreedySolver.rst b/docs/api/reference/cassiopeia.solver.SpectralGreedySolver.rst deleted file mode 100644 index f9f4cfb6..00000000 --- a/docs/api/reference/cassiopeia.solver.SpectralGreedySolver.rst +++ /dev/null @@ -1,13 +0,0 @@ -cassiopeia.solver.SpectralGreedySolver -====================================== - -.. currentmodule:: cassiopeia.solver - -.. autoclass:: SpectralGreedySolver - :members: - :undoc-members: - - .. rubric:: Methods - - .. autoautosummary:: SpectralGreedySolver - :methods: \ No newline at end of file diff --git a/docs/api/reference/cassiopeia.solver.SpectralSolver.rst b/docs/api/reference/cassiopeia.solver.SpectralSolver.rst deleted file mode 100644 index 7d152bf9..00000000 --- a/docs/api/reference/cassiopeia.solver.SpectralSolver.rst +++ /dev/null @@ -1,13 +0,0 @@ -cassiopeia.solver.SpectralSolver -================================ - -.. currentmodule:: cassiopeia.solver - -.. autoclass:: SpectralSolver - :members: - :undoc-members: - - .. rubric:: Methods - - .. autoautosummary:: SpectralSolver - :methods: \ No newline at end of file diff --git a/docs/api/reference/cassiopeia.solver.UPGMASolver.rst b/docs/api/reference/cassiopeia.solver.UPGMASolver.rst deleted file mode 100644 index 691ac333..00000000 --- a/docs/api/reference/cassiopeia.solver.UPGMASolver.rst +++ /dev/null @@ -1,13 +0,0 @@ -cassiopeia.solver.UPGMASolver -============================= - -.. currentmodule:: cassiopeia.solver - -.. autoclass:: UPGMASolver - :members: - :undoc-members: - - .. rubric:: Methods - - .. autoautosummary:: UPGMASolver - :methods: \ No newline at end of file diff --git a/docs/api/reference/cassiopeia.solver.VanillaGreedySolver.rst b/docs/api/reference/cassiopeia.solver.VanillaGreedySolver.rst deleted file mode 100644 index 7e761f6f..00000000 --- a/docs/api/reference/cassiopeia.solver.VanillaGreedySolver.rst +++ /dev/null @@ -1,13 +0,0 @@ -cassiopeia.solver.VanillaGreedySolver -===================================== - -.. currentmodule:: cassiopeia.solver - -.. autoclass:: VanillaGreedySolver - :members: - :undoc-members: - - .. rubric:: Methods - - .. autoautosummary:: VanillaGreedySolver - :methods: \ No newline at end of file diff --git a/docs/api/tools.rst b/docs/api/tools.rst index 11f6b212..0ded5a5e 100644 --- a/docs/api/tools.rst +++ b/docs/api/tools.rst @@ -14,10 +14,39 @@ Small-Parsimony .. autosummary:: :toctree: reference/ - - tl.compute_cophenetic_correlation - tl.compute_expansion_pvalues - tl.compute_morans_i + tl.fitch_count tl.fitch_hartigan tl.score_small_parsimony + +Branch Length Estimation (BLE) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autosummary:: + :toctree: reference/ + + tl.IIDExponentialBayesian + tl.IIDExponentialMLE + +Autocorrelation +~~~~~~~~~~~~~~~~~~~ + +.. autosummary:: + :toctree: reference/ + + tl.compute_morans_i + +Coupling +~~~~~~~~~~~ + +.. autosummary:: + :toctree: reference/ + + tl.compute_evolutionary_coupling + +Topology +~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: reference/ + + tl.compute_expansion_pvalues \ No newline at end of file diff --git a/test/data_tests/data_utilities_test.py b/test/data_tests/data_utilities_test.py index 8a9d4402..b05dde58 100755 --- a/test/data_tests/data_utilities_test.py +++ b/test/data_tests/data_utilities_test.py @@ -3,7 +3,6 @@ """ import unittest -from typing import Dict, Optional import networkx as nx import numpy as np @@ -11,6 +10,7 @@ from cassiopeia.data import CassiopeiaTree from cassiopeia.data import utilities as data_utilities +from cassiopeia.mixins.errors import CassiopeiaError from cassiopeia.preprocess import utilities as preprocessing_utilities @@ -405,6 +405,116 @@ def test_phylogenetic_weights_matrix_inverse_fn(self): pd.testing.assert_frame_equal(weight_matrix, expected_weight_matrix) + def test_net_relatedness_index(self): + + distances = np.array( + [[0, 1, 2, 4], [1, 0, 3, 6], [2, 3, 0, 5], [4, 6, 5, 0]] + ) + indices_1 = np.array([0, 1]) + indices_2 = np.array([2, 3]) + + nri = data_utilities.net_relatedness_index( + distances, indices_1, indices_2 + ) + self.assertAlmostEqual(15.0 / 4.0, nri, delta=0.0001) + + def test_inter_cluster_distance_basic(self): + + tree = nx.DiGraph() + tree.add_nodes_from(["A", "B", "C", "D", "E", "F"]) + tree.add_edge("F", "A", length=0.1) + tree.add_edge("F", "B", length=0.2) + tree.add_edge("F", "E", length=0.5) + tree.add_edge("E", "C", length=0.3) + tree.add_edge("E", "D", length=0.4) + + meta_data = pd.DataFrame.from_dict( + { + "A": ["TypeA", 10], + "B": ["TypeA", 5], + "C": ["TypeB", 3], + "D": ["TypeB", 22], + }, + orient="index", + columns=["CellType", "nUMI"], + ) + + tree = CassiopeiaTree(tree=tree, cell_meta=meta_data) + + inter_cluster_distances = data_utilities.compute_inter_cluster_distances( + tree, meta_item="CellType" + ) + + expected_distances = pd.DataFrame.from_dict( + {"TypeA": [0.15, 1.0], "TypeB": [1.0, 0.35]}, + orient="index", + columns=["TypeA", "TypeB"], + ) + + pd.testing.assert_frame_equal( + expected_distances, inter_cluster_distances + ) + + self.assertRaises( + CassiopeiaError, + data_utilities.compute_inter_cluster_distances, + tree, + "nUMI", + ) + + def test_inter_cluster_distance_custom_input(self): + + tree = nx.DiGraph() + tree.add_nodes_from(["A", "B", "C", "D", "E", "F"]) + tree.add_edge("F", "A", length=0.1) + tree.add_edge("F", "B", length=0.2) + tree.add_edge("F", "E", length=0.5) + tree.add_edge("E", "C", length=0.3) + tree.add_edge("E", "D", length=0.4) + + meta_data = pd.DataFrame.from_dict( + { + "A": ["TypeA", 10], + "B": ["TypeA", 5], + "C": ["TypeB", 3], + "D": ["TypeB", 22], + }, + orient="index", + columns=["CellType", "nUMI"], + ) + + weight_matrix = pd.DataFrame.from_dict( + { + "A": [0.0, 0.5, 1.2, 0.4], + "B": [0.5, 0.0, 3.0, 1.1], + "C": [1.2, 3.0, 0.0, 0.8], + "D": [0.4, 1.1, 0.8, 0.0], + }, + orient="index", + columns=["A", "B", "C", "D"], + ) + + tree = CassiopeiaTree(tree=tree) + + inter_cluster_distances = data_utilities.compute_inter_cluster_distances( + tree, + meta_data=meta_data["CellType"], + dissimilarity_map=weight_matrix, + ) + + expected_distances = pd.DataFrame.from_dict( + {"TypeA": [0.25, 1.425], "TypeB": [1.425, 0.4]}, + orient="index", + columns=["TypeA", "TypeB"], + ) + + pd.testing.assert_frame_equal( + expected_distances, + inter_cluster_distances, + check_exact=False, + atol=0.001, + ) + if __name__ == "__main__": unittest.main() diff --git a/test/tools_tests/coupling_test.py b/test/tools_tests/coupling_test.py new file mode 100644 index 00000000..b59b39f7 --- /dev/null +++ b/test/tools_tests/coupling_test.py @@ -0,0 +1,213 @@ +""" +Tests for the coupling estimators implemented in cassiopeia/tools/coupling.py +""" +import unittest + +import networkx as nx +import numpy as np +import pandas as pd + +import cassiopeia as cas +from cassiopeia.data import CassiopeiaTree +from cassiopeia.data import utilities as data_utilities +from cassiopeia.mixins import CassiopeiaError + + +class TestDataUtilities(unittest.TestCase): + def setUp(self) -> None: + + tree = nx.DiGraph() + tree.add_edges_from( + [ + ("A", "B"), + ("A", "C"), + ("B", "D"), + ("B", "E"), + ("B", "F"), + ("E", "G"), + ("E", "H"), + ("C", "I"), + ("C", "J"), + ] + ) + + meta_data = pd.DataFrame.from_dict( + { + "D": ["TypeB", 10], + "F": ["TypeA", 5], + "G": ["TypeA", 3], + "H": ["TypeB", 22], + "I": ["TypeC", 2], + "J": ["TypeC", 11], + }, + orient="index", + columns=["CellType", "nUMI"], + ) + + self.tree = CassiopeiaTree(tree=tree, cell_meta=meta_data) + + def test_evolutionary_coupling_basic(self): + + random_state = np.random.RandomState(1231234) + + evolutionary_coupling = cas.tl.compute_evolutionary_coupling( + self.tree, + meta_variable="CellType", + random_state=random_state, + minimum_proportion=0.0, + number_of_shuffles=10, + ) + + inter_cluster_distances = data_utilities.compute_inter_cluster_distances( + self.tree, meta_item="CellType" + ) + + # background computed with random seed set above and 10 shuffles + # (state1, state2): (mean, sd) + expected_summary_stats = { + ("TypeA", "TypeA"): (1.7, 0.6000000000000001), + ("TypeA", "TypeB"): (3.55, 0.4716990566028302), + ("TypeA", "TypeC"): (3.55, 0.4716990566028302), + ("TypeB", "TypeA"): (3.55, 0.4716990566028302), + ("TypeB", "TypeB"): (2.0, 0.5), + ("TypeB", "TypeC"): (3.65, 0.45), + ("TypeC", "TypeA"): (3.55, 0.4716990566028302), + ("TypeC", "TypeB"): (3.65, 0.45), + ("TypeC", "TypeC"): (1.8, 0.5567764362830022), + } + + expected_coupling = inter_cluster_distances.copy() + for s1 in expected_coupling.index: + for s2 in expected_coupling.columns: + mean = expected_summary_stats[(s1, s2)][0] + sd = expected_summary_stats[(s1, s2)][1] + + expected_coupling.loc[s1, s2] = ( + inter_cluster_distances.loc[s1, s2] - mean + ) / sd + + pd.testing.assert_frame_equal( + expected_coupling, evolutionary_coupling, atol=0.001 + ) + + # make sure errors are raised for numerical data + self.assertRaises( + CassiopeiaError, + cas.tl.compute_evolutionary_coupling, + self.tree, + "nUMI", + ) + + def test_evolutionary_coupling_custom_dissimilarity_map(self): + + weight_matrix = pd.DataFrame.from_dict( + { + "D": [0.0, 0.5, 1.2, 0.4, 0.5, 0.6], + "F": [0.5, 0.0, 3.0, 1.1, 3.0, 0.1], + "G": [1.2, 3.0, 0.0, 0.8, 0.2, 0.8], + "H": [0.4, 1.1, 0.8, 0.0, 2.0, 2.1], + "I": [0.5, 3.0, 0.2, 2.0, 0.0, 0.1], + "J": [0.6, 0.1, 1.8, 2.1, 0.1, 0.0], + }, + orient="index", + columns=["D", "F", "G", "H", "I", "J"], + ) + + random_state = np.random.RandomState(1231234) + + evolutionary_coupling = cas.tl.compute_evolutionary_coupling( + self.tree, + meta_variable="CellType", + random_state=random_state, + minimum_proportion=0.0, + number_of_shuffles=10, + dissimilarity_map=weight_matrix, + ) + + inter_cluster_distances = data_utilities.compute_inter_cluster_distances( + self.tree, meta_item="CellType", dissimilarity_map=weight_matrix + ) + + # background computed with random seed set above and 10 shuffles + # (state1, state2): (mean, sd) + expected_summary_stats = { + ("TypeB", "TypeB"): (0.695, 0.5456418239101545), + ("TypeB", "TypeA"): (1.0000000000000002, 0.281291663580704), + ("TypeB", "TypeC"): (1.0925, 0.44763964301656745), + ("TypeA", "TypeB"): (1.0000000000000002, 0.3148412298286232), + ("TypeA", "TypeA"): (0.63, 0.4550824101193101), + ("TypeA", "TypeC"): (1.2349999999999999, 0.391503512117069), + ("TypeC", "TypeB"): (1.0675000000000001, 0.4493119740225047), + ("TypeC", "TypeA"): (1.26, 0.41791147387933725), + ("TypeC", "TypeC"): (0.4699999999999999, 0.41424630354415953), + } + + expected_coupling = inter_cluster_distances.copy() + for s1 in expected_coupling.index: + for s2 in expected_coupling.columns: + mean = expected_summary_stats[(s1, s2)][0] + sd = expected_summary_stats[(s1, s2)][1] + + expected_coupling.loc[s1, s2] = ( + inter_cluster_distances.loc[s1, s2] - mean + ) / sd + + pd.testing.assert_frame_equal( + expected_coupling, evolutionary_coupling, atol=0.001 + ) + + def test_evolutionary_coupling_minimum_proportion(self): + + self.tree.cell_meta.loc["J", "CellType"] = "TypeD" + + random_state = np.random.RandomState(1231234) + + evolutionary_coupling = cas.tl.compute_evolutionary_coupling( + self.tree, + meta_variable="CellType", + random_state=random_state, + minimum_proportion=1 / 6, # This will drop types C and D + number_of_shuffles=10, + ) + + # make sure TypeC and TypeD are not in the evolutionary coupling matrix + expected_types = ["TypeA", "TypeB"] + self.assertCountEqual(expected_types, evolutionary_coupling.index) + self.assertCountEqual(expected_types, evolutionary_coupling.columns) + + # make sure couplings are correct + inter_cluster_distances = data_utilities.compute_inter_cluster_distances( + self.tree, meta_item="CellType" + ) + + inter_cluster_distances = inter_cluster_distances.loc[ + expected_types, expected_types + ] + + expected_summary_stats = { + ("TypeB", "TypeB"): (1.4, 0.19999999999999998), + ("TypeB", "TypeA"): (2.6, 0.19999999999999998), + ("TypeA", "TypeB"): (2.6, 0.19999999999999998), + ("TypeA", "TypeA"): (1.4, 0.19999999999999998), + } + + expected_coupling = inter_cluster_distances.copy() + for s1 in expected_coupling.index: + for s2 in expected_coupling.columns: + mean = expected_summary_stats[(s1, s2)][0] + sd = expected_summary_stats[(s1, s2)][1] + + expected_coupling.loc[s1, s2] = ( + inter_cluster_distances.loc[s1, s2] - mean + ) / sd + + evolutionary_coupling = evolutionary_coupling.loc[ + expected_types, expected_types + ] + pd.testing.assert_frame_equal( + expected_coupling, evolutionary_coupling, atol=0.001 + ) + + +if __name__ == "__main__": + unittest.main()