Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fast UPGMA and NJ with CCPhylo #205

Merged
merged 9 commits into from
Aug 18, 2023
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,9 @@ For developers:
* Run the command ``gurobi.sh`` from a terminal window
* From the Gurobi installation directory (where there is a setup.py file), use ``python setup.py install --user``

4. Install Cassiopeia by first changing into the Cassiopeia directory and then `pip3 install .`. To install dev and docs requirements, you can run `pip3 install .[dev,docs]`.
4. [Optional] To use fast versions of Neighbor-Joining and UPGMA, install [CCPhylo](https://bitbucket.org/genomicepidemiology/ccphylo/src/master/) then set ccphylo_path in the config.ini file in the cassiopeia directory.

5. Install Cassiopeia by first changing into the Cassiopeia directory and then `pip3 install .`. To install dev and docs requirements, you can run `pip3 install .[dev,docs]`.

To verify that it installed correctly, try running our tests with `pytest`.

Expand Down
2 changes: 2 additions & 0 deletions cassiopeia/config.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[Paths]
ccphylo_path = /path/to/ccphylo/ccphylo
138 changes: 136 additions & 2 deletions cassiopeia/solver/DistanceSolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,17 @@
import networkx as nx
import numpy as np
import pandas as pd
import configparser
import subprocess
import os
import tempfile
import ete3

colganwi marked this conversation as resolved.
Show resolved Hide resolved
from cassiopeia.data import CassiopeiaTree
from cassiopeia.mixins import DistanceSolverError
from cassiopeia.solver import CassiopeiaSolver, solver_utilities

from cassiopeia.solver import (CassiopeiaSolver,
solver_utilities)
from cassiopeia.data.utilities import ete3_to_networkx
colganwi marked this conversation as resolved.
Show resolved Hide resolved

class DistanceSolver(CassiopeiaSolver.CassiopeiaSolver):
"""
Expand Down Expand Up @@ -50,13 +56,17 @@ class DistanceSolver(CassiopeiaSolver.CassiopeiaSolver):
"inverse": Transforms each probability p by taking 1/p
"square_root_inverse": Transforms each probability by the
the square root of 1/p
fast: Whether to use fast implementation of the solver. Currently
Neighbor Joining, Dynamic Neighbor Joining, Heuristic Neighbor
Joining, and UPGMA have fast implementations which use CCPhylo.

Attributes:
dissimilarity_function: Function used to compute dissimilarity between
samples.
add_root: Whether or not to add an implicit root the tree.
prior_transformation: Function to use when transforming priors into
weights.
fast: Whether to use fast implementation of the solver.
"""

def __init__(
Expand All @@ -68,12 +78,24 @@ def __init__(
] = None,
add_root: bool = False,
prior_transformation: str = "negative_log",
fast: bool = False,
):

super().__init__(prior_transformation)

self.dissimilarity_function = dissimilarity_function
self.add_root = add_root
self.fast = fast

# Select fast solver to use
if self.fast:
if self.fast_solver == "ccphylo":
self._setup_ccphylo()
self.fast_solve = self._ccphylo_solve
else:
raise DistanceSolverError(
f"Fast solver is not available for {self.__class__}."
)

def get_dissimilarity_map(
self,
Expand Down Expand Up @@ -135,6 +157,12 @@ def solve(
removes artifacts caused by arbitrarily resolving polytomies.
logfile: File location to log output. Not currently used.
"""

# Use fast solver if selected
if self.fast:
self.fast_solve(cassiopeia_tree,layer,collapse_mutationless_edges,logfile)
colganwi marked this conversation as resolved.
Show resolved Hide resolved
return

node_name_generator = solver_utilities.node_name_generator()

dissimilarity_map = self.get_dissimilarity_map(cassiopeia_tree, layer)
Expand Down Expand Up @@ -197,6 +225,112 @@ def solve(
infer_ancestral_characters=True
)

def _ccphylo_solve(
self,
cassiopeia_tree: CassiopeiaTree,
layer: Optional[str] = None,
collapse_mutationless_edges: bool = False,
logfile: str = "stdout.log",
) -> None:
"""Solves a tree using fast distance-based algorithms implemented by
CCPhylo. To call this method the CCPhlyo package must be installed
and the ccphylo_path must be set in the config file. The method attribute
specifies which algorithm to use. The function will update the `tree`.

Args:
cassiopeia_tree: CassiopeiaTree object to be populated
layer: Layer storing the character matrix for solving. If None, the
default character matrix is used in the CassiopeiaTree.
collapse_mutationless_edges: Indicates if the final reconstructed
tree should collapse mutationless edges based on internal states
inferred by Camin-Sokal parsimony. In scoring accuracy, this
removes artifacts caused by arbitrarily resolving polytomies.
logfile: File location to log output. Not currently used.
"""

dissimilarity_map = self.get_dissimilarity_map(cassiopeia_tree, layer)

with tempfile.TemporaryDirectory() as temp_dir:

# save dissimilarity map as phylip file
dis_path = os.path.join(temp_dir, "dist.phylip")
tree_path = os.path.join(temp_dir, "tree.nwk")
solver_utilities.save_dissimilarity_as_phylip(dissimilarity_map, dis_path)
colganwi marked this conversation as resolved.
Show resolved Hide resolved

# run ccphylo
command = f"{self.ccphylo_path} tree -i {dis_path} -o {tree_path} -m {self.fast_method}"
process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout, stderr = process.communicate()
T = ete3.Tree(tree_path, format=1)

# remove temporary files
os.remove(dis_path)
os.remove(tree_path)

# Covert to networkx
tree = ete3_to_networkx(T).to_undirected()

# find last split
midpoint = T.get_midpoint_outgroup()
root = T.get_tree_root()
if midpoint in root.children:
last_split = [root.name,midpoint.name]
else:
last_split = [root.name,root.children[0].name]
tree.remove_edge(last_split[0],last_split[1])

# root tree
tree = self.root_tree(tree,cassiopeia_tree.root_sample_name,last_split)

# remove root from character matrix before populating tree
if (
cassiopeia_tree.root_sample_name
in cassiopeia_tree.character_matrix.index
):
cassiopeia_tree.character_matrix = (
cassiopeia_tree.character_matrix.drop(
index=cassiopeia_tree.root_sample_name
)
)

# populate tree
cassiopeia_tree.populate_tree(tree,layer=layer)
cassiopeia_tree.collapse_unifurcations()

# collapse mutationless edges
if collapse_mutationless_edges:
cassiopeia_tree.collapse_mutationless_edges(
infer_ancestral_characters=True
)

def _setup_ccphylo(self) -> None:
"""Sets up the ccphylo solver by getting the ccphylo_path from the
config file and checking that it is valid.
"""

# get ccphylo path
config = configparser.ConfigParser()
config.read(os.path.join(os.path.dirname(__file__),"..","config.ini"))
self.ccphylo_path = config.get("Paths","ccphylo_path")

#check that ccphylo_path is valid
if not os.path.exists(self.ccphylo_path):
raise DistanceSolverError(
f"ccphylo_path {self.ccphylo_path} does not exist. To use fast "
"versions of Neighbor-Joining and UPGMA please install "
"CCPhylo (https://bitbucket.org/genomicepidemiology/ccphylo/src/master/) "
"set the ccphylo_path in the config.ini file then reinstall Cassiopeia."
)

#check that ccphylo_path is executable
if not os.access(self.ccphylo_path, os.X_OK):
raise DistanceSolverError(
f"ccphylo_path {self.ccphylo_path} is not executable. To use fast "
"versions of Neighbor-Joining and UPGMA please install "
"CCPhylo (https://bitbucket.org/genomicepidemiology/ccphylo/src/master/) "
"set the ccphylo_path in the config.ini file then reinstall Cassiopeia."
)

def setup_dissimilarity_map(
self, cassiopeia_tree: CassiopeiaTree, layer: Optional[str] = None
) -> None:
Expand Down
67 changes: 67 additions & 0 deletions cassiopeia/solver/DynamicNeighborJoiningSolver.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
"""
This file stores a subclass of NeighborJoiningSolver, DynamicNeighborJoining.
The inference procedure is the Dymanic Neighbor-Joining algorithm proposed by
Clausen (2023). https://pubmed.ncbi.nlm.nih.gov/36453849/
"""
from typing import Callable, Dict, List, Optional, Tuple, Union

import numpy as np

from cassiopeia.mixins import DistanceSolverError
from cassiopeia.solver import (
NeighborJoiningSolver,
dissimilarity_functions,
)

class DynamicNeighborJoiningSolver(NeighborJoiningSolver):
"""
Dynamic Neighbor-Joining class for Cassiopeia.

Implements the Dynamic Neighbor-Joining algorithm described by Clausen (2023)
as a derived class of CCPhyloSolver. This class inherits the generic
`solve` method. This algorithm is guaranteed to return an exact solution.

Args:
dissimilarity_function: A function by which to compute the dissimilarity
map. Optional if a dissimilarity map is already provided.
add_root: Whether or not to add an implicit root the tree, i.e. a root
with unmutated characters. If set to False, and no explicit root is
provided in the CassiopeiaTree, then will return an unrooted,
undirected tree
prior_transformation: Function to use when transforming priors into
weights. Supports the following transformations:
"negative_log": Transforms each probability by the negative
log (default)
"inverse": Transforms each probability p by taking 1/p
"square_root_inverse": Transforms each probability by the
the square root of 1/p

Attributes:
dissimilarity_function: Function used to compute dissimilarity between
samples.
add_root: Whether or not to add an implicit root the tree.
prior_transformation: Function to use when transforming priors into
weights.

"""

def __init__(
self,
dissimilarity_function: Optional[
Callable[
[np.array, np.array, int, Dict[int, Dict[int, float]]], float
]
] = dissimilarity_functions.weighted_hamming_distance,
add_root: bool = False,
prior_transformation: str = "negative_log",
):
# setup fast solver
self.fast_solver = "ccphylo"
self.fast_method = "dnj"

super().__init__(
dissimilarity_function=dissimilarity_function,
add_root=add_root,
prior_transformation=prior_transformation,
fast = True,
)
68 changes: 68 additions & 0 deletions cassiopeia/solver/HeuristicNeighborJoiningSolver.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
"""
This file stores a subclass of NeighborJoiningSolver, HeuristicNeighborJoiningSolver.
The inference procedure is the Heuristic Neighbor-Joining algorithm proposed by
Clausen (2023). https://pubmed.ncbi.nlm.nih.gov/36453849/
"""
from typing import Callable, Dict, List, Optional, Tuple, Union

import numpy as np

from cassiopeia.data import CassiopeiaTree
from cassiopeia.solver import (
NeighborJoiningSolver,
dissimilarity_functions,
)

class HeuristicNeighborJoiningSolver(NeighborJoiningSolver):
"""
Heuristic Neighbor-Joining class for Cassiopeia.

Implements the Heuristic Neighbor-Joining algorithm described by Clausen (2023)
as a derived class of NeighborJoiningSolver. This class inherits the generic
`solve` method. This algorithm is not guaranteed to return an exact solution.


Args:
dissimilarity_function: A function by which to compute the dissimilarity
map. Optional if a dissimilarity map is already provided.
add_root: Whether or not to add an implicit root the tree, i.e. a root
with unmutated characters. If set to False, and no explicit root is
provided in the CassiopeiaTree, then will return an unrooted,
undirected tree
prior_transformation: Function to use when transforming priors into
weights. Supports the following transformations:
"negative_log": Transforms each probability by the negative
log (default)
"inverse": Transforms each probability p by taking 1/p
"square_root_inverse": Transforms each probability by the
the square root of 1/p

Attributes:
dissimilarity_function: Function used to compute dissimilarity between
samples.
add_root: Whether or not to add an implicit root the tree.
prior_transformation: Function to use when transforming priors into
weights.

"""

def __init__(
mattjones315 marked this conversation as resolved.
Show resolved Hide resolved
self,
dissimilarity_function: Optional[
Callable[
[np.array, np.array, int, Dict[int, Dict[int, float]]], float
]
] = dissimilarity_functions.weighted_hamming_distance,
add_root: bool = False,
prior_transformation: str = "negative_log",
):
# setup fast solver
self.fast_solver = "ccphylo"
self.fast_method = "dnj"

super().__init__(
dissimilarity_function=dissimilarity_function,
add_root=add_root,
prior_transformation=prior_transformation,
fast = True,
)
14 changes: 11 additions & 3 deletions cassiopeia/solver/NeighborJoiningSolver.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
This file stores a subclass of DistanceSolver, NeighborJoining. The
This file stores a subclass of CCPhyloSolver, NeighborJoiningSolver. The
colganwi marked this conversation as resolved.
Show resolved Hide resolved
inference procedure is the Neighbor-Joining algorithm proposed by Saitou and
Nei (1987) that iteratively joins together samples that minimize the Q-criterion
on the dissimilarity map.
Expand All @@ -19,15 +19,15 @@
solver_utilities,
)


class NeighborJoiningSolver(DistanceSolver.DistanceSolver):
"""
Neighbor-Joining class for Cassiopeia.

Implements the Neighbor-Joining algorithm described by Saitou and Nei (1987)
as a derived class of DistanceSolver. This class inherits the generic
`solve` method, but implements its own procedure for finding cherries by
minimizing the Q-criterion between samples.
minimizing the Q-criterion between samples. If fast is set to True, then
the fast Neighbor-Joining implementation from CCPhylo of is used.

Args:
dissimilarity_function: A function by which to compute the dissimilarity
Expand All @@ -43,6 +43,7 @@ class NeighborJoiningSolver(DistanceSolver.DistanceSolver):
"inverse": Transforms each probability p by taking 1/p
"square_root_inverse": Transforms each probability by the
the square root of 1/p
fast: Whether to use the fast CCPhylo implementation of Neighbor-Joining.

Attributes:
dissimilarity_function: Function used to compute dissimilarity between
Expand All @@ -62,12 +63,19 @@ def __init__(
] = dissimilarity_functions.weighted_hamming_distance,
add_root: bool = False,
prior_transformation: str = "negative_log",
fast: bool = False,
):

# setup fast solver
if fast:
self.fast_solver = "ccphylo"
self.fast_method = "nj"

super().__init__(
dissimilarity_function=dissimilarity_function,
add_root=add_root,
prior_transformation=prior_transformation,
fast = fast,
)

def root_tree(
Expand Down
Loading