YosefLab · mattjones315 · Oct 26, 2021 · Oct 25, 2021 · Oct 25, 2021 · Oct 25, 2021
diff --git a/cassiopeia/tools/__init__.py b/cassiopeia/tools/__init__.py
@@ -3,3 +3,4 @@
 from .autocorrelation import compute_morans_i
 from .branch_length_estimator import IIDExponentialBayesian, IIDExponentialMLE
 from .small_parsimony import fitch_count, fitch_hartigan, score_small_parsimony
+from .topology import compute_expansion_probabilities
diff --git a/cassiopeia/tools/topology.py b/cassiopeia/tools/topology.py
@@ -0,0 +1,121 @@
+"""
+Utilities to assess topological properties of a phylogeny, such as balance
+and expansion.
+"""
+from typing import Union
+
+import math
+import numpy as np
+import pandas as pd
+
+from cassiopeia.data import CassiopeiaTree
+from cassiopeia.mixins import CassiopeiaError
+
+
+def compute_expansion_probabilities(
+    tree: CassiopeiaTree,
+    min_clade_size: int = 10,
+    min_depth: int = 1,
+    copy: bool = False,
+) -> Union[CassiopeiaTree, None]:
+    """Call expansion probabilities on a tree.
+
+    Uses the methodology described in Yang, Jones et al, BioRxiv (2021) to
+    assess the expansion probability of a given subclade of a phylogeny.
+    Mathematical treatment of the coalescent probability is described in
+    Griffiths and Tavare, Stochastic Models (1998).
+
+    The probability corresponds to the probability that a given subclade
+    contains the number of cells as would be expected under a simple coalescent
+    model. Often, if the probability is less than some threshold (e.g., 0.05),
+    this might indicate that there exists some subclade under this node that
+    to which this expansion probability can be attributed.  
+
+    This function will add an attribute "expansion_probability" to the tree, and
+    return None unless :param:`copy` is set to True.
+
+    Args:
+        tree: CassiopeiaTree
+        min_clade_size: Minimum number of leaves in a subtree to be considered.
+        min_depth: Minimum depth of clade to be considered. Depth is measured
+            in number of nodes from the root, not branch lengths.
+        copy: Return copy.
+
+    Returns:
+        If copy is set to False, returns the tree with attributes added
+            in place. Else, returns a new CassiopeiaTree.
+    """
+
+    tree = tree.copy() if copy else tree
+
+    # instantiate attributes
+    for node in tree.depth_first_traverse_nodes(postorder=False):
+        tree.set_attribute(node, "expansion_probability", 1.0)
+
+        if tree.is_root(node):
+            tree.set_attribute(node, "depth", 0)
+        else:
+            tree.set_attribute(
+                node, "depth", tree.get_attribute(tree.parent(node), "depth") + 1
+            )
+
+    for node in tree.depth_first_traverse_nodes(postorder=False):
+
+        n = len(tree.leaves_in_subtree(node))
+
+        k = len(tree.children(node))
+        for c in tree.children(node):
+
+            if len(tree.leaves_in_subtree(c)) < min_clade_size:
+                continue
+
+            depth = tree.get_attribute(c, "depth")
+            if depth < min_depth:
+                continue
+
+            b = len(tree.leaves_in_subtree(c))
+            p = np.sum(
+                [
+                    simple_coalescent_probability(n, b2, k)
+                    for b2 in range(b, n - k + 2)
+                ]
+            )
+            tree.set_attribute(c, "expansion_probability", p)
+
+    return tree if copy else None
+
+
+def simple_coalescent_probability(n: int, b: int, k: int) -> float:
+    """Simple coalescent probability of imbalance.
+
+    Assuming a simple coalescent model, compute the probability that a given
+    lineage has exactly b samples, given there are n cells and k lineages
+    overall.
+
+    Args:
+        n: Number of leaves in subtree
+        b: Number of leaves in one lineage
+        k: Number of lineages
+    Returns:
+        Probability of observing b leaves on one lineage in a tree of n total 
+            leaves
+    """
+    return nCk(n - b - 1, k - 2) / nCk(n - 1, k - 1)
+
+
+def nCk(n: int, k: int) -> float:
+    """Compute the quantity n choose k.
+
+    Args:
+        n: Number of items total.
+        k: Number of items to choose.
+
+    Returns:
+        The number of ways to choose k items from n.
+    """
+
+    if k > n:
+        raise CassiopeiaError("Argument k cannot be larger than n.")
+
+    f = math.factorial
+    return f(n) // f(k) // f(n - k)
diff --git a/docs/api/tools.rst b/docs/api/tools.rst
@@ -15,6 +15,7 @@ Small-Parsimony
 .. autosummary::
    :toctree: reference/
 
+   tl.compute_expansion_probabilities
    tl.compute_morans_i
    tl.fitch_count
    tl.fitch_hartigan

diff --git a/test/tools_tests/topology_test.py b/test/tools_tests/topology_test.py
@@ -0,0 +1,180 @@
+"""
+Test suite for the topology functions in
+cassiopeia/tools/topology.py
+"""
+import unittest
+
+import networkx as nx
+import numpy as np
+import pandas as pd
+
+import cassiopeia as cas
+from cassiopeia.mixins import CassiopeiaError, CassiopeiaTreeError
+from cassiopeia.tools import topology
+
+
+class TestTopology(unittest.TestCase):
+    def setUp(self) -> None:
+
+        tree = nx.DiGraph()
+        tree.add_edge("0", "1")
+        tree.add_edge("0", "2")
+        tree.add_edge("1", "3")
+        tree.add_edge("1", "4")
+        tree.add_edge("1", "5")
+        tree.add_edge("2", "6")
+        tree.add_edge("2", "7")
+        tree.add_edge("3", "8")
+        tree.add_edge("3", "9")
+        tree.add_edge("7", "10")
+        tree.add_edge("7", "11")
+        tree.add_edge("8", "12")
+        tree.add_edge("8", "13")
+        tree.add_edge("9", "14")
+        tree.add_edge("9", "15")
+        tree.add_edge("3", "16")
+        tree.add_edge("16", "17")
+        tree.add_edge("16", "18")
+
+        self.tree = cas.data.CassiopeiaTree(tree=tree)
+
+    def test_simple_choose_function(self):
+
+        num_choices = topology.nCk(10, 2)
+
+        self.assertEqual(num_choices, 45)
+
+        self.assertRaises(CassiopeiaError, topology.nCk, 5, 7)
+
+    def test_simple_coalescent_probability(self):
+
+        N = 100
+        B = 2
+        K = 60
+        coalescent_probability = topology.simple_coalescent_probability(N, B, K)
+        self.assertAlmostEqual(coalescent_probability, 0.24, delta=0.01)
+
+        self.assertRaises(
+            CassiopeiaError, topology.simple_coalescent_probability, 50, 2, 60
+        )
+
+    def test_expansion_probability(self):
+
+        # make sure attributes are instantiated correctly
+        cas.tl.compute_expansion_probabilities(self.tree, min_clade_size=20)
+        for node in self.tree.depth_first_traverse_nodes(postorder=False):
+            self.assertEqual(
+                1.0, self.tree.get_attribute(node, "expansion_probability")
+            )
+
+        cas.tl.compute_expansion_probabilities(self.tree, min_clade_size=2)
+        expected_probabilities = {
+            "0": 1.0,
+            "1": 0.3,
+            "2": 0.8,
+            "3": 0.047,
+            "4": 1.0,
+            "5": 1.0,
+            "6": 1.0,
+            "7": 0.5,
+            "8": 0.6,
+            "9": 0.6,
+            "10": 1.0,
+            "11": 1.0,
+            "12": 1.0,
+            "13": 1.0,
+            "14": 1.0,
+            "15": 1.0,
+            "16": 0.6,
+            "17": 1.0,
+            "18": 1.0,
+        }
+
+        for node in self.tree.depth_first_traverse_nodes(postorder=False):
+            expected = expected_probabilities[node]
+            self.assertAlmostEqual(
+                expected,
+                self.tree.get_attribute(node, "expansion_probability"),
+                delta=0.01,
+            )
+
+    def test_expansion_probability_variable_depths(self):
+
+        cas.tl.compute_expansion_probabilities(self.tree, min_clade_size=2, min_depth=3)
+        expected_probabilities = {
+            "0": 1.0,
+            "1": 1.0,
+            "2": 1.0,
+            "3": 1.0,
+            "4": 1.0,
+            "5": 1.0,
+            "6": 1.0,
+            "7": 1.0,
+            "8": 0.6,
+            "9": 0.6,
+            "10": 1.0,
+            "11": 1.0,
+            "12": 1.0,
+            "13": 1.0,
+            "14": 1.0,
+            "15": 1.0,
+            "16": 0.6,
+            "17": 1.0,
+            "18": 1.0,
+        }
+
+        for node in self.tree.depth_first_traverse_nodes(postorder=False):
+            expected = expected_probabilities[node]
+            self.assertAlmostEqual(
+                expected,
+                self.tree.get_attribute(node, "expansion_probability"),
+                delta=0.01,
+            )
+
+    def test_expansion_probability_copy_tree(self):
+
+        tree = cas.tl.compute_expansion_probabilities(
+            self.tree, min_clade_size=2, min_depth=1, copy=True
+        )
+
+        expected_probabilities = {
+            "0": 1.0,
+            "1": 0.3,
+            "2": 0.8,
+            "3": 0.047,
+            "4": 1.0,
+            "5": 1.0,
+            "6": 1.0,
+            "7": 0.5,
+            "8": 0.6,
+            "9": 0.6,
+            "10": 1.0,
+            "11": 1.0,
+            "12": 1.0,
+            "13": 1.0,
+            "14": 1.0,
+            "15": 1.0,
+            "16": 0.6,
+            "17": 1.0,
+            "18": 1.0,
+        }
+
+        for node in self.tree.depth_first_traverse_nodes(postorder=False):
+            expected_copy = expected_probabilities[node]
+
+            self.assertAlmostEqual(
+                expected_copy,
+                tree.get_attribute(node, "expansion_probability"),
+                delta=0.01,
+            )
+
+            self.assertRaises(
+                CassiopeiaTreeError,
+                self.tree.get_attribute,
+                node,
+                "expansion_probability",
+            )
+
+
+if __name__ == "__main__":
+    unittest.main()