Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fast UPGMA and NJ with CCPhylo #205

Merged
merged 9 commits into from
Aug 18, 2023
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,9 @@ For developers:
* Run the command ``gurobi.sh`` from a terminal window
* From the Gurobi installation directory (where there is a setup.py file), use ``python setup.py install --user``

4. Install Cassiopeia by first changing into the Cassiopeia directory and then `pip3 install .`. To install dev and docs requirements, you can run `pip3 install .[dev,docs]`.
4. [Optional] To use fast versions of Neighbor-Joining and UPGMA, install [CCPhylo](https://bitbucket.org/genomicepidemiology/ccphylo/src/master/) then set ccphylo_path in the config.ini file in the cassiopeia directory.

5. Install Cassiopeia by first changing into the Cassiopeia directory and then `pip3 install .`. To install dev and docs requirements, you can run `pip3 install .[dev,docs]`.

To verify that it installed correctly, try running our tests with `pytest`.

Expand Down
2 changes: 2 additions & 0 deletions cassiopeia/config.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[Paths]
ccphylo_path = /path/to/ccphylo/ccphylo
223 changes: 223 additions & 0 deletions cassiopeia/solver/CCPhyloSolver.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,223 @@
"""
This file stores a subclass of CassiopeiaSolver, the CCPhyloSolver. This is
a wrapper around optimized agglomerative algorithms implemented by CCPhylo
(https://bitbucket.org/genomicepidemiology/ccphylo/src/master/). Methods
that will inherit from this class by default are DynamicNeighborJoiningSolver,
HeuristicNeighborJoiningSolver, NeighborJoiningSolver and UPGMASolver.
"""

import abc
from typing import Callable, Dict, List, Optional, Tuple

import networkx as nx
import numpy as np
import os
import tempfile
import ete3
import subprocess
import pandas as pd
import configparser

from cassiopeia.data import CassiopeiaTree, utilities
from cassiopeia.mixins import DistanceSolverError
from cassiopeia.solver import (
DistanceSolver,
dissimilarity_functions,
solver_utilities,
)


class CCPhyloSolver(DistanceSolver.DistanceSolver):
"""
Distance based solver class. This solver serves as a wrapper around
CCPhylo algorithms.

Args:
colganwi marked this conversation as resolved.
Show resolved Hide resolved
dissimilarity_function: Function that can be used to compute the
dissimilarity between samples.
add_root: Whether or not to add an implicit root the tree. Only
pertinent in algorithms that return an unrooted tree, by default
(e.g. Neighbor Joining). Will not override an explicitly defined
root, specified by the 'root_sample_name' attribute in the
CassiopeiaTree
prior_transformation: Function to use when transforming priors into
weights. Supports the following transformations:
"negative_log": Transforms each probability by the negative
log (default)
"inverse": Transforms each probability p by taking 1/p
"square_root_inverse": Transforms each probability by the
the square root of 1/p

Attributes:
dissimilarity_function: Function used to compute dissimilarity between
samples.
add_root: Whether or not to add an implicit root the tree.
prior_transformation: Function to use when transforming priors into
weights.
"""

def __init__(
self,
dissimilarity_function: Optional[
Callable[
[np.array, np.array, int, Dict[int, Dict[int, float]]], float
]
] = dissimilarity_functions.weighted_hamming_distance,
add_root: bool = False,
prior_transformation: str = "negative_log",
fast = True,
method: str = "nj",
):

super().__init__(prior_transformation)

self.dissimilarity_function = dissimilarity_function
self.add_root = add_root
self.fast = fast
mattjones315 marked this conversation as resolved.
Show resolved Hide resolved
self.method = method

def _setup_ccphylo(self) -> None:
colganwi marked this conversation as resolved.
Show resolved Hide resolved

# get ccphylo path
config = configparser.ConfigParser()
config.read(os.path.join(os.path.dirname(__file__),"..","config.ini"))
self.ccphylo_path = config.get("Paths","ccphylo_path")

#check that ccphylo_path is valid
if not os.path.exists(self.ccphylo_path):
raise DistanceSolverError(
f"ccphylo_path {self.ccphylo_path} does not exist. To use fast "
"versions of Neighbor-Joining and UPGMA please install "
"CCPhylo (https://bitbucket.org/genomicepidemiology/ccphylo/src/master/) "
"set the ccphylo_path in the config.ini file then reinstall Cassiopeia."
)

#check that ccphylo_path is executable
if not os.access(self.ccphylo_path, os.X_OK):
raise DistanceSolverError(
f"ccphylo_path {self.ccphylo_path} is not executable. To use fast "
"versions of Neighbor-Joining and UPGMA please install "
"CCPhylo (https://bitbucket.org/genomicepidemiology/ccphylo/src/master/) "
"set the ccphylo_path in the config.ini file then reinstall Cassiopeia."
)

def _fast_solve(
self,
cassiopeia_tree: CassiopeiaTree,
layer: Optional[str] = None,
collapse_mutationless_edges: bool = False,
logfile: str = "stdout.log",
) -> None:
"""Solves a tree using fast distance-based algorithms implemented by
CCPhylo. To call this method the CCPhlyo package must be installed
and the ccphylo_path must be set in the config file. The method attribute
specifies which algorithm to use. The function will update the `tree`.

Args:
cassiopeia_tree: CassiopeiaTree object to be populated
layer: Layer storing the character matrix for solving. If None, the
default character matrix is used in the CassiopeiaTree.
collapse_mutationless_edges: Indicates if the final reconstructed
tree should collapse mutationless edges based on internal states
inferred by Camin-Sokal parsimony. In scoring accuracy, this
removes artifacts caused by arbitrarily resolving polytomies.
logfile: File location to log output. Not currently used.
"""

self._setup_ccphylo()

dissimilarity_map = self.get_dissimilarity_map(cassiopeia_tree, layer)

with tempfile.TemporaryDirectory() as temp_dir:

# save dissimilarity map as phylip file
dis_path = os.path.join(temp_dir, "dist.phylip")
tree_path = os.path.join(temp_dir, "tree.nwk")
solver_utilities.save_dissimilarity_as_phylip(dissimilarity_map, dis_path)

# run ccphylo
command = f"{self.ccphylo_path} tree -i {dis_path} -o {tree_path} -m {self.method}"
process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout, stderr = process.communicate()
T = ete3.Tree(tree_path, format=1)
mattjones315 marked this conversation as resolved.
Show resolved Hide resolved

# remove temporary files
os.remove(dis_path)
os.remove(tree_path)

# Covert to networkx
tree = nx.Graph()
internal_node_iter = 0
for n in T.traverse():
if n.name == "":
n.name = f"cassiopeia_internal_node{internal_node_iter}"
internal_node_iter += 1
if not n.is_root():
tree.add_edge(n.up.name,n.name)

# find last split
midpoint = T.get_midpoint_outgroup()
root = T.get_tree_root()
if midpoint in root.children:
last_split = [root.name,midpoint.name]
else:
last_split = [root.name,root.children[0].name]
tree.remove_edge(last_split[0],last_split[1])

# root tree
tree = self.root_tree(tree,cassiopeia_tree.root_sample_name,last_split)

# remove root from character matrix before populating tree
if (
cassiopeia_tree.root_sample_name
in cassiopeia_tree.character_matrix.index
):
cassiopeia_tree.character_matrix = (
cassiopeia_tree.character_matrix.drop(
index=cassiopeia_tree.root_sample_name
)
)

# populate tree
cassiopeia_tree.populate_tree(tree,layer=layer)
cassiopeia_tree.collapse_unifurcations()

# collapse mutationless edges
if collapse_mutationless_edges:
cassiopeia_tree.collapse_mutationless_edges(
infer_ancestral_characters=True
)

def solve(
self,
cassiopeia_tree: CassiopeiaTree,
layer: Optional[str] = None,
collapse_mutationless_edges: bool = False,
logfile: str = "stdout.log",
) -> None:
"""Solves a tree for a general bottom-up distance-based solver routine.
If fast is set to True, this function will use the fast_solve method which
is wrapper around CCPhylo algorithms. Otherwise, it will default to the
generic solve method in DistanceSolver. The function will update the `tree`
attribute of the input CassiopeiaTree.

Args:
cassiopeia_tree: CassiopeiaTree object to be populated
layer: Layer storing the character matrix for solving. If None, the
default character matrix is used in the CassiopeiaTree.
collapse_mutationless_edges: Indicates if the final reconstructed
tree should collapse mutationless edges based on internal states
inferred by Camin-Sokal parsimony. In scoring accuracy, this
removes artifacts caused by arbitrarily resolving polytomies.
logfile: File location to log output. Not currently used.
"""

if self.fast:
self._fast_solve(cassiopeia_tree, layer, collapse_mutationless_edges, logfile)
else:
if self.__class__ == CCPhyloSolver:
raise NotImplementedError(
"CCPhyloSolver does not implement solve. Please set fast to True"
"to use the fast_solve method. Or use a subclass of CCPhyloSolver."
mattjones315 marked this conversation as resolved.
Show resolved Hide resolved
)
super().solve(cassiopeia_tree, layer, collapse_mutationless_edges, logfile)
66 changes: 66 additions & 0 deletions cassiopeia/solver/DynamicNeighborJoiningSolver.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
"""
This file stores a subclass of NeighborJoiningSolver, DynamicNeighborJoining.
The inference procedure is the Dymanic Neighbor-Joining algorithm proposed by
Clausen (2023). https://pubmed.ncbi.nlm.nih.gov/36453849/
"""
from typing import Callable, Dict, List, Optional, Tuple, Union

import numpy as np

from cassiopeia.data import CassiopeiaTree
from cassiopeia.solver import (
NeighborJoiningSolver,
dissimilarity_functions,
)

class DynamicNeighborJoiningSolver(NeighborJoiningSolver):
"""
Dynamic Neighbor-Joining class for Cassiopeia.

Implements the Dynamic Neighbor-Joining algorithm described by Clausen (2023)
as a derived class of CCPhyloSolver. This class inherits the generic
`solve` method. This algorithm is guaranteed to return an exact solution.

Args:
dissimilarity_function: A function by which to compute the dissimilarity
map. Optional if a dissimilarity map is already provided.
add_root: Whether or not to add an implicit root the tree, i.e. a root
with unmutated characters. If set to False, and no explicit root is
provided in the CassiopeiaTree, then will return an unrooted,
undirected tree
prior_transformation: Function to use when transforming priors into
weights. Supports the following transformations:
"negative_log": Transforms each probability by the negative
log (default)
"inverse": Transforms each probability p by taking 1/p
"square_root_inverse": Transforms each probability by the
the square root of 1/p

Attributes:
dissimilarity_function: Function used to compute dissimilarity between
samples.
add_root: Whether or not to add an implicit root the tree.
prior_transformation: Function to use when transforming priors into
weights.

"""

def __init__(
self,
dissimilarity_function: Optional[
Callable[
[np.array, np.array, int, Dict[int, Dict[int, float]]], float
]
] = dissimilarity_functions.weighted_hamming_distance,
add_root: bool = False,
prior_transformation: str = "negative_log",
):

super().__init__(
dissimilarity_function=dissimilarity_function,
add_root=add_root,
prior_transformation=prior_transformation,
fast = True,
)

self.method = "dnj"
67 changes: 67 additions & 0 deletions cassiopeia/solver/HeuristicNeighborJoiningSolver.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
"""
This file stores a subclass of NeighborJoiningSolver, HeuristicNeighborJoiningSolver.
The inference procedure is the Heuristic Neighbor-Joining algorithm proposed by
Clausen (2023). https://pubmed.ncbi.nlm.nih.gov/36453849/
"""
from typing import Callable, Dict, List, Optional, Tuple, Union

import numpy as np

from cassiopeia.data import CassiopeiaTree
from cassiopeia.solver import (
NeighborJoiningSolver,
dissimilarity_functions,
)

class HeuristicNeighborJoiningSolver(NeighborJoiningSolver):
"""
Heuristic Neighbor-Joining class for Cassiopeia.

Implements the Heuristic Neighbor-Joining algorithm described by Clausen (2023)
as a derived class of NeighborJoiningSolver. This class inherits the generic
`solve` method. This algorithm is not guaranteed to return an exact solution.


Args:
dissimilarity_function: A function by which to compute the dissimilarity
map. Optional if a dissimilarity map is already provided.
add_root: Whether or not to add an implicit root the tree, i.e. a root
with unmutated characters. If set to False, and no explicit root is
provided in the CassiopeiaTree, then will return an unrooted,
undirected tree
prior_transformation: Function to use when transforming priors into
weights. Supports the following transformations:
"negative_log": Transforms each probability by the negative
log (default)
"inverse": Transforms each probability p by taking 1/p
"square_root_inverse": Transforms each probability by the
the square root of 1/p

Attributes:
dissimilarity_function: Function used to compute dissimilarity between
samples.
add_root: Whether or not to add an implicit root the tree.
prior_transformation: Function to use when transforming priors into
weights.

"""

def __init__(
mattjones315 marked this conversation as resolved.
Show resolved Hide resolved
self,
dissimilarity_function: Optional[
Callable[
[np.array, np.array, int, Dict[int, Dict[int, float]]], float
]
] = dissimilarity_functions.weighted_hamming_distance,
add_root: bool = False,
prior_transformation: str = "negative_log",
):

super().__init__(
dissimilarity_function=dissimilarity_function,
add_root=add_root,
prior_transformation=prior_transformation,
fast = True,
)

self.method = "hnj"
Loading