Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fast UPGMA and NJ with CCPhylo #205

Merged
merged 9 commits into from
Aug 18, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -164,4 +164,5 @@ build
stdout.log
notebooks/.ipynb_checkpoints
cassiopeia/tools/branch_length_estimator/_iid_exponential_bayesian.cpp
docs/api/reference/**
docs/api/reference/**
cassiopeia/config.ini
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,9 @@ For developers:
* Run the command ``gurobi.sh`` from a terminal window
* From the Gurobi installation directory (where there is a setup.py file), use ``python setup.py install --user``

4. Install Cassiopeia by first changing into the Cassiopeia directory and then `pip3 install .`. To install dev and docs requirements, you can run `pip3 install .[dev,docs]`.
4. [Optional] To use fast versions of Neighbor-Joining and UPGMA, install [CCPhylo](https://bitbucket.org/genomicepidemiology/ccphylo/src/master/) then set ccphylo_path in the config.ini file in the cassiopeia directory.

5. Install Cassiopeia by first changing into the Cassiopeia directory and then `pip3 install .`. To install dev and docs requirements, you can run `pip3 install .[dev,docs]`.

To verify that it installed correctly, try running our tests with `pytest`.

Expand Down
2 changes: 2 additions & 0 deletions cassiopeia/config.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[Paths]
ccphylo_path = /path/to/ccphylo/ccphylo
156 changes: 150 additions & 6 deletions cassiopeia/solver/DistanceSolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,28 @@
the inference procedures that inherit from this method will need to implement
methods for selecting "cherries" and updating the dissimilarity map. Methods
that will inherit from this class by default are Neighbor-Joining and UPGMA.
There may be other subclasses of this.
There may be other subclasses of this. Currently also implements a method for
solving trees with CCPhylo but this will be moved with switch to compositional
framework.
"""
import os

import abc
import subprocess
import tempfile
from typing import Callable, Dict, List, Optional, Tuple

import configparser
import ete3
import networkx as nx
import numpy as np
import pandas as pd

from cassiopeia.data import CassiopeiaTree
from cassiopeia.mixins import DistanceSolverError
from cassiopeia.solver import CassiopeiaSolver, solver_utilities

from cassiopeia.solver import (CassiopeiaSolver,
solver_utilities)
from cassiopeia.data import utilities

class DistanceSolver(CassiopeiaSolver.CassiopeiaSolver):
"""
Expand Down Expand Up @@ -74,6 +83,9 @@ def __init__(

self.dissimilarity_function = dissimilarity_function
self.add_root = add_root

if "ccphylo" in self._implementation:
self._setup_ccphylo()

def get_dissimilarity_map(
self,
Expand Down Expand Up @@ -108,7 +120,6 @@ def get_dissimilarity_map(

return dissimilarity_map


def solve(
self,
cassiopeia_tree: CassiopeiaTree,
Expand All @@ -135,6 +146,24 @@ def solve(
removes artifacts caused by arbitrarily resolving polytomies.
logfile: File location to log output. Not currently used.
"""

if self._implementation == "ccphylo_dnj":
self._ccphylo_solve(cassiopeia_tree,layer,
collapse_mutationless_edges,logfile,method="dnj")
return
elif self._implementation == "ccphylo_nj":
self._ccphylo_solve(cassiopeia_tree,layer,
collapse_mutationless_edges,logfile,method="nj")
return
elif self._implementation == "ccphylo_hnj":
self._ccphylo_solve(cassiopeia_tree,layer,
collapse_mutationless_edges,logfile,method="hnj")
return
elif self._implementation == "ccphylo_upgma":
self._ccphylo_solve(cassiopeia_tree,layer,
collapse_mutationless_edges,logfile,method="upgma")
return

node_name_generator = solver_utilities.node_name_generator()

dissimilarity_map = self.get_dissimilarity_map(cassiopeia_tree, layer)
Expand Down Expand Up @@ -197,6 +226,121 @@ def solve(
infer_ancestral_characters=True
)

def _ccphylo_solve(
self,
cassiopeia_tree: CassiopeiaTree,
layer: Optional[str] = None,
collapse_mutationless_edges: bool = False,
logfile: str = "stdout.log",
method: str = "dnj"
) -> None:
"""Solves a tree using fast distance-based algorithms implemented by
CCPhylo. To call this method the CCPhlyo package must be installed
and the ccphylo_path must be set in the config file. The method
attribute specifies which algorithm to use. The function will update the
`tree`.

Args:
cassiopeia_tree: CassiopeiaTree object to be populated
layer: Layer storing the character matrix for solving. If None, the
default character matrix is used in the CassiopeiaTree.
collapse_mutationless_edges: Indicates if the final reconstructed
tree should collapse mutationless edges based on internal states
inferred by Camin-Sokal parsimony. In scoring accuracy, this
removes artifacts caused by arbitrarily resolving polytomies.
logfile: File location to log output. Not currently used.
"""

dissimilarity_map = self.get_dissimilarity_map(cassiopeia_tree, layer)

with tempfile.TemporaryDirectory() as temp_dir:

# save dissimilarity map as phylip file
dis_path = os.path.join(temp_dir, "dist.phylip")
tree_path = os.path.join(temp_dir, "tree.nwk")
solver_utilities.save_dissimilarity_as_phylip(dissimilarity_map,
dis_path)

# run ccphylo
command = (f"{self._ccphylo_path} tree -i {dis_path} -o "
f"{tree_path} -m {method}")

process = subprocess.Popen(command, shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
stdout, stderr = process.communicate()
T = ete3.Tree(tree_path, format=1)

# remove temporary files
os.remove(dis_path)
os.remove(tree_path)

# Covert to networkx
tree = utilities.ete3_to_networkx(T).to_undirected()

# find last split
midpoint = T.get_midpoint_outgroup()
root = T.get_tree_root()
if midpoint in root.children:
last_split = [root.name,midpoint.name]
else:
last_split = [root.name,root.children[0].name]
tree.remove_edge(last_split[0],last_split[1])

# root tree
tree = self.root_tree(tree,cassiopeia_tree.root_sample_name,last_split)

# remove root from character matrix before populating tree
if (
cassiopeia_tree.root_sample_name
in cassiopeia_tree.character_matrix.index
):
cassiopeia_tree.character_matrix = (
cassiopeia_tree.character_matrix.drop(
index=cassiopeia_tree.root_sample_name
)
)

# populate tree
cassiopeia_tree.populate_tree(tree,layer=layer)
cassiopeia_tree.collapse_unifurcations()

# collapse mutationless edges
if collapse_mutationless_edges:
cassiopeia_tree.collapse_mutationless_edges(
infer_ancestral_characters=True
)

def _setup_ccphylo(self) -> None:
"""Sets up the ccphylo solver by getting the ccphylo_path from the
config file and checking that it is valid.
"""

# get ccphylo path
config = configparser.ConfigParser()
config.read(os.path.join(os.path.dirname(__file__),"..","config.ini"))
self._ccphylo_path = config.get("Paths","ccphylo_path")

#check that ccphylo_path is valid
if not os.path.exists(self._ccphylo_path):
raise DistanceSolverError(
f"ccphylo_path {self._ccphylo_path} does not exist. To use fast "
"versions of Neighbor-Joining and UPGMA please install CCPhylo "
"(https://bitbucket.org/genomicepidemiology/ccphylo/src/master/)"
"set the ccphylo_path in the config.ini file then reinstall "
"Cassiopeia."
)

#check that ccphylo_path is executable
if not os.access(self._ccphylo_path, os.X_OK):
raise DistanceSolverError(
f"ccphylo_path {self._ccphylo_path} is not executable. To use "
"fast versions of Neighbor-Joining and UPGMA please install CCPhylo"
" (https://bitbucket.org/genomicepidemiology/ccphylo/src/master/) "
"set the ccphylo_path in the config.ini file then reinstall "
"Cassiopeia."
)

def setup_dissimilarity_map(
self, cassiopeia_tree: CassiopeiaTree, layer: Optional[str] = None
) -> None:
Expand Down Expand Up @@ -228,8 +372,8 @@ def setup_dissimilarity_map(

else:
raise DistanceSolverError(
"Please specify an explicit root sample in the Cassiopeia Tree"
" or specify the solver to add an implicit root"
"Please specify an explicit root sample in the Cassiopeia"
" Tree or specify the solver to add an implicit root"
)

if cassiopeia_tree.get_dissimilarity_map() is None:
Expand Down
29 changes: 26 additions & 3 deletions cassiopeia/solver/NeighborJoiningSolver.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
This file stores a subclass of DistanceSolver, NeighborJoining. The
This file stores a subclass of DistanceSolver, NeighborJoiningSolver. The
inference procedure is the Neighbor-Joining algorithm proposed by Saitou and
Nei (1987) that iteratively joins together samples that minimize the Q-criterion
on the dissimilarity map.
Expand All @@ -13,21 +13,22 @@
import pandas as pd

from cassiopeia.data import CassiopeiaTree
from cassiopeia.mixins import DistanceSolverError
from cassiopeia.solver import (
DistanceSolver,
dissimilarity_functions,
solver_utilities,
)


class NeighborJoiningSolver(DistanceSolver.DistanceSolver):
"""
Neighbor-Joining class for Cassiopeia.

Implements the Neighbor-Joining algorithm described by Saitou and Nei (1987)
as a derived class of DistanceSolver. This class inherits the generic
`solve` method, but implements its own procedure for finding cherries by
minimizing the Q-criterion between samples.
minimizing the Q-criterion between samples. If fast is set to True,
a fast NJ implementation of is used.

Args:
dissimilarity_function: A function by which to compute the dissimilarity
Expand All @@ -43,6 +44,15 @@ class NeighborJoiningSolver(DistanceSolver.DistanceSolver):
"inverse": Transforms each probability p by taking 1/p
"square_root_inverse": Transforms each probability by the
the square root of 1/p
fast: Whether to use a fast implementation of Neighbor-Joining.
implementation: Which fast implementation to use. Options are:
"ccphylo_dnj": CCPhylo implementation the Dynamic Neighbor-Joining
algorithm described by Clausen (2023). Solution in guaranteed
to be exact.
"ccphylo_hnj": CCPhylo implementation of the Heuristic
Neighbor-Joining algorithm described by Clausen (2023).
Solution is not guaranteed to be exact.
"ccphylo_nj": CCPhylo implementation of the Neighbor-Joining.

Attributes:
dissimilarity_function: Function used to compute dissimilarity between
Expand All @@ -62,8 +72,21 @@ def __init__(
] = dissimilarity_functions.weighted_hamming_distance,
add_root: bool = False,
prior_transformation: str = "negative_log",
fast: bool = False,
implementation: str = "ccphylo_dnj",
):

if fast:
if implementation in ["ccphylo_dnj", "ccphylo_hnj", "ccphylo_nj"]:
self._implementation = implementation
else:
raise DistanceSolverError(
"Invalid fast implementation of Neighbor-Joining. Options "
"are: 'ccphylo_dnj', 'ccphylo_hnj', 'ccphylo_nj'"
)
else:
self._implementation = "generic_nj"

super().__init__(
dissimilarity_function=dissimilarity_function,
add_root=add_root,
Expand Down
3 changes: 3 additions & 0 deletions cassiopeia/solver/SpectralNeighborJoiningSolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,8 @@ def __init__(
add_root: bool = False,
prior_transformation: str = "negative_log",
):
self._implementation = "generic_spectral_nj"

super().__init__(
dissimilarity_function=similarity_function,
add_root=add_root,
Expand All @@ -80,6 +82,7 @@ def __init__(
self._similarity_map = None
self._lambda_indices = None


def get_dissimilarity_map(
self, cassiopeia_tree: CassiopeiaTree, layer: Optional[str] = None
) -> pd.DataFrame:
Expand Down
19 changes: 18 additions & 1 deletion cassiopeia/solver/UPGMASolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import pandas as pd

from cassiopeia.data import CassiopeiaTree
from cassiopeia.mixins import DistanceSolverError
from cassiopeia.solver import DistanceSolver, dissimilarity_functions


Expand All @@ -26,7 +27,7 @@ class UPGMASolver(DistanceSolver.DistanceSolver):
dissimilarity between samples. After joining nodes, the dissimilarities
are updated by averaging the distances of elements in the new cluster
with each existing node. Produces a rooted tree that is assumed to be
ultrametric.
ultrametric. If fast is set to True, a fast UPGMA implementation of is used.

Args:
dissimilarity_function: A function by which to compute the dissimilarity
Expand All @@ -38,6 +39,9 @@ class UPGMASolver(DistanceSolver.DistanceSolver):
"inverse": Transforms each probability p by taking 1/p
"square_root_inverse": Transforms each probability by the
the square root of 1/p
fast: Whether to use a fast implementation of UPGMA.
implementation: Which fast implementation to use. Options are:
"ccphylo_upgma": Uses the fast UPGMA implementation from CCPhylo.
Attributes:
dissimilarity_function: Function used to compute dissimilarity between
samples.
Expand All @@ -54,8 +58,21 @@ def __init__(
]
] = dissimilarity_functions.weighted_hamming_distance,
prior_transformation: str = "negative_log",
fast: bool = False,
implementation: str = "ccphylo_upgma",
):

if fast:
if implementation in ["ccphylo_upgma"]:
self._implementation = implementation
else:
raise DistanceSolverError(
"Invalid fast implementation of UPGMA. Options are: "
"'ccphylo_upgma'"
)
else:
self._implementation = "generic_upgma"

super().__init__(
dissimilarity_function=dissimilarity_function,
add_root=True,
Expand Down
22 changes: 22 additions & 0 deletions cassiopeia/solver/solver_utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import ete3
from hashlib import blake2b
import numpy as np
import pandas as pd
import time

from cassiopeia.mixins import PriorTransformationError
Expand Down Expand Up @@ -122,3 +123,24 @@ def convert_sample_names_to_indices(
name_to_index = dict(zip(names, range(len(names))))

return list(map(lambda x: name_to_index[x], samples))

def save_dissimilarity_as_phylip(
mattjones315 marked this conversation as resolved.
Show resolved Hide resolved
dissimilarity_map: pd.DataFrame, path: str
) -> None:
"""Saves a dissimilarity map as a phylip file.

Args:
dissimilarity_map: A dissimilarity map
path: The path to save the phylip file

Returns:
None
"""
dissimilarity_np = dissimilarity_map.to_numpy()
n = dissimilarity_np.shape[0]
with open(path, "w") as f:
f.write("{}\n".format(n))
for i in range(n):
row = dissimilarity_np[i, :i+1]
formatted_values = '\t'.join(map('{:.4f}'.format, row))
f.write("{}\t{}\n".format(dissimilarity_map.index[i], formatted_values))
Loading