From 918491cebfc893832c1a1aef0d4a53e9b5a3ecfc Mon Sep 17 00:00:00 2001 From: colganwi Date: Tue, 13 Jun 2023 11:20:54 -0400 Subject: [PATCH 1/8] Added method to reorder children in the CassiopeiaTree --- cassiopeia/data/CassiopeiaTree.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/cassiopeia/data/CassiopeiaTree.py b/cassiopeia/data/CassiopeiaTree.py index 62f3f5fb..6fa855d5 100755 --- a/cassiopeia/data/CassiopeiaTree.py +++ b/cassiopeia/data/CassiopeiaTree.py @@ -766,9 +766,32 @@ def __add_edge(self, u, v) -> None: self.__network.add_edge(u, v) - # this will change the topology of the tree, so reset the cache + # this will change the topology of the tree, so reset the cache self.__cache = {} + def reorder_children(self, node: str, new_order: List[str]) -> None: + """Reorders the children of a particular node. + + Args: + node: Node in the tree + new_order: A list of the new order of children for the node. + + Raises: + CassiopeiaTreeError if the node of interest is a leaf that has not been + instantiated, or if the new order of children is not a permutation + of the original children. + """ + self.__check_network_initialized() + + if self.is_leaf(node): + raise CassiopeiaTreeError("Cannot reorder children of a leaf node.") + + if set(new_order) != set(self.children(node)): + raise CassiopeiaTreeWarning("New order of children is not a permutation of the original children.") + + self.__network.remove_edges_from([(node, child) for child in self.children(node)]) + self.__network.add_edges_from([(node, child) for child in new_order]) + def set_time(self, node: str, new_time: float) -> None: """Sets the time of a node. From 7b504057aa4e68d5e86d0b6e101027b641394059 Mon Sep 17 00:00:00 2001 From: colganwi Date: Fri, 14 Jul 2023 18:26:35 -0400 Subject: [PATCH 2/8] Added CCphylo solvers --- cassiopeia/solver/CCPhyloSolver.py | 209 ++++++++++++++++++++++++++ cassiopeia/solver/__init__.py | 1 + cassiopeia/solver/solver_utilities.py | 22 +++ docs/notebooks | 2 +- 4 files changed, 233 insertions(+), 1 deletion(-) create mode 100755 cassiopeia/solver/CCPhyloSolver.py diff --git a/cassiopeia/solver/CCPhyloSolver.py b/cassiopeia/solver/CCPhyloSolver.py new file mode 100755 index 00000000..edf24cb0 --- /dev/null +++ b/cassiopeia/solver/CCPhyloSolver.py @@ -0,0 +1,209 @@ +""" +This file stores a subclass of CassiopeiaSolver, the CCPhyloSolver. This is +a wrapper around optimized agglomerative algorithms implemented by CCPhylo +(https://bitbucket.org/genomicepidemiology/ccphylo/src/master/). Methods +that will inherit from this class by default are DynamicNeighborJoiningSolver, +HeuristicNeighborJoiningSolver, and OptimizedUPGMASolver. +""" + +import abc +from typing import Callable, Dict, List, Optional, Tuple + +import networkx as nx +import numpy as np +import os +import tempfile +import ete3 +import subprocess +import pandas as pd + +from cassiopeia.data import CassiopeiaTree +from cassiopeia.mixins import DistanceSolverError +from cassiopeia.solver import CassiopeiaSolver, solver_utilities + + +class CCPhyloSolver(CassiopeiaSolver.CassiopeiaSolver): + """ + Distance based solver class. + + This solver serves as a wrapper around CCPhylo algorithms. + + Args: + dissimilarity_function: Function that can be used to compute the + dissimilarity between samples. + add_root: Whether or not to add an implicit root the tree. Only + pertinent in algorithms that return an unrooted tree, by default + (e.g. Neighbor Joining). Will not override an explicitly defined + root, specified by the 'root_sample_name' attribute in the + CassiopeiaTree + prior_transformation: Function to use when transforming priors into + weights. Supports the following transformations: + "negative_log": Transforms each probability by the negative + log (default) + "inverse": Transforms each probability p by taking 1/p + "square_root_inverse": Transforms each probability by the + the square root of 1/p + + Attributes: + dissimilarity_function: Function used to compute dissimilarity between + samples. + add_root: Whether or not to add an implicit root the tree. + prior_transformation: Function to use when transforming priors into + weights. + """ + + def __init__( + self, + dissimilarity_function: Optional[ + Callable[ + [np.array, np.array, int, Dict[int, Dict[int, float]]], float + ] + ] = None, + add_root: bool = False, + prior_transformation: str = "negative_log", + method: str = "nj", + ): + + super().__init__(prior_transformation) + + self.dissimilarity_function = dissimilarity_function + self.add_root = add_root + self.method = method + + def get_dissimilarity_map( + self, + cassiopeia_tree: CassiopeiaTree, + layer: Optional[str] = None + ) -> pd.DataFrame: + """Obtains or generates a matrix that is updated throughout the solver. + + The highest-level method to obtain a dissimilarity map, which + will be the matrix primarily used throughout the solve method. This + matrix contains the pairwise dissimilarity between samples which is used + for identifying sample pairs to merge, and will be updated at every + iteration within the solve method. This method is not limited to + outputting dissimilarity maps, but is instead deliberately + designed to be overwritten to allow for use of similarity maps or other + algorithm-specific sample to sample comparison maps in derived classes. + + Args: + cassiopeia_tree: Tree object from which the + dissimilarity map is generated from + layer: Layer storing the character matrix + for solving. If None, the default character matrix is used in + the CassiopeiaTree. + + Returns: + pd.DataFrame: The matrix that will be used throughout the solve + method. + """ + + self.setup_dissimilarity_map(cassiopeia_tree, layer) + dissimilarity_map = cassiopeia_tree.get_dissimilarity_map() + + return dissimilarity_map + + + def solve( + self, + cassiopeia_tree: CassiopeiaTree, + layer: Optional[str] = None, + collapse_mutationless_edges: bool = False, + logfile: str = "stdout.log", + ) -> None: + """Solves a tree for a general bottom-up distance-based solver routine. + + The general solver routine proceeds by iteratively finding pairs of + samples to join together into a "cherry" and then reform the + dissimilarity matrix with respect to this new cherry. The implementation + of how to find cherries and update the dissimilarity map is left to + subclasses of DistanceSolver. The function will update the `tree` + attribute of the input CassiopeiaTree. + + Args: + cassiopeia_tree: CassiopeiaTree object to be populated + layer: Layer storing the character matrix for solving. If None, the + default character matrix is used in the CassiopeiaTree. + collapse_mutationless_edges: Indicates if the final reconstructed + tree should collapse mutationless edges based on internal states + inferred by Camin-Sokal parsimony. In scoring accuracy, this + removes artifacts caused by arbitrarily resolving polytomies. + logfile: File location to log output. Not currently used. + """ + + dissimilarity_map = self.get_dissimilarity_map(cassiopeia_tree, layer) + + with tempfile.TemporaryDirectory() as temp_dir: + + # save dissimilarity map as phylip file + dis_path = os.path.join(temp_dir, "dist.phylip") + tree_path = os.path.join(temp_dir, "tree.nwk") + solver_utilities.save_dissimilarity_as_phylip(dissimilarity_map, dis_path) + + # run ccphylo + command = f". ~/.bashrc && ccphylo tree -i {dis_path} -o {tree_path} -m {self.method}" + process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + stdout, stderr = process.communicate() + T = ete3.Tree(tree_path, format=1) + + # remove temporary files + os.remove(dis_path) + os.remove(tree_path) + + # remove root from character matrix before populating tree + if (cassiopeia_tree.root_sample_name + in cassiopeia_tree.character_matrix.index): + cassiopeia_tree.character_matrix = ( + cassiopeia_tree.character_matrix.drop( + index=cassiopeia_tree.root_sample_name + ) + ) + + # root tree + if (self.add_root): + T.set_outgroup(T.get_midpoint_outgroup()) + root = ete3.TreeNode(name="root") + root.add_child(T) + + # populate tree + T.ladderize(direction=1) + cassiopeia_tree.populate_tree(T,layer=layer) + print(stderr.decode("utf-8")) + + # collapse mutationless edges + if collapse_mutationless_edges: + cassiopeia_tree.collapse_mutationless_edges( + infer_ancestral_characters=True + ) + + def setup_dissimilarity_map( + self, cassiopeia_tree: CassiopeiaTree, layer: Optional[str] = None + ) -> None: + """Sets up the solver. + + Sets up the solver with respect to the input CassiopeiaTree by + creating the dissimilarity map. + + Args: + cassiopeia_tree: Input CassiopeiaTree to `solve`. + layer: Layer storing the character matrix for solving. If None, the + default character matrix is used in the CassiopeiaTree. + + Raises: + A `DistanceSolverError` if rooting parameters are not passed in + correctly (i.e. no root is specified and the user has not + asked to find a root) or when a dissimilarity map cannot + be found or computed. + """ + + if cassiopeia_tree.get_dissimilarity_map() is None: + if self.dissimilarity_function is None: + raise DistanceSolverError( + "Please specify a dissimilarity function or populate the " + "CassiopeiaTree object with a dissimilarity map" + ) + + cassiopeia_tree.compute_dissimilarity_map( + self.dissimilarity_function, self.prior_transformation, layer + ) + diff --git a/cassiopeia/solver/__init__.py b/cassiopeia/solver/__init__.py index d6c0146b..8b2930e9 100755 --- a/cassiopeia/solver/__init__.py +++ b/cassiopeia/solver/__init__.py @@ -12,4 +12,5 @@ from .UPGMASolver import UPGMASolver from .VanillaGreedySolver import VanillaGreedySolver from .SpectralNeighborJoiningSolver import SpectralNeighborJoiningSolver +from .CCPhyloSolver import CCPhyloSolver from . import dissimilarity_functions as dissimilarity diff --git a/cassiopeia/solver/solver_utilities.py b/cassiopeia/solver/solver_utilities.py index c0fd7f51..f79036de 100755 --- a/cassiopeia/solver/solver_utilities.py +++ b/cassiopeia/solver/solver_utilities.py @@ -7,6 +7,7 @@ import ete3 from hashlib import blake2b import numpy as np +import pandas as pd import time from cassiopeia.mixins import PriorTransformationError @@ -122,3 +123,24 @@ def convert_sample_names_to_indices( name_to_index = dict(zip(names, range(len(names)))) return list(map(lambda x: name_to_index[x], samples)) + +def save_dissimilarity_as_phylip( + dissimilarity_map: pd.DataFrame, path: str + ) -> None: + """Saves a dissimilarity map as a phylip file. + + Args: + dissimilarity_map: A dissimilarity map + path: The path to save the phylip file + + Returns: + None + """ + dissimilarity_np = dissimilarity_map.to_numpy() + n = dissimilarity_np.shape[0] + with open(path, "w") as f: + f.write("{}\n".format(n)) + for i in range(n): + row = dissimilarity_np[i, :i+1] + formatted_values = '\t'.join(map('{:.4f}'.format, row)) + f.write("{}\t{}\n".format(dissimilarity_map.index[i], formatted_values)) diff --git a/docs/notebooks b/docs/notebooks index edb8f02d..8f9a5b2e 120000 --- a/docs/notebooks +++ b/docs/notebooks @@ -1 +1 @@ -../notebooks/ \ No newline at end of file +../notebooks \ No newline at end of file From 3e12dcd09aa6bbea8a383231b89f46770bbf3372 Mon Sep 17 00:00:00 2001 From: colganwi Date: Thu, 27 Jul 2023 14:18:58 -0400 Subject: [PATCH 3/8] Implemented fast versions of NJ and UPGMA using CCPhlyo --- README.md | 4 +- cassiopeia/config.ini | 2 + cassiopeia/solver/CCPhyloSolver.py | 188 ++++++++------- .../solver/DynamicNeighborJoiningSolver.py | 66 +++++ .../solver/HeuristicNeighborJoiningSolver.py | 67 ++++++ cassiopeia/solver/NeighborJoiningSolver.py | 12 +- cassiopeia/solver/UPGMASolver.py | 11 +- cassiopeia/solver/__init__.py | 2 + test/solver_tests/ccphylo_solver_test.py | 225 ++++++++++++++++++ 9 files changed, 483 insertions(+), 94 deletions(-) create mode 100755 cassiopeia/config.ini create mode 100755 cassiopeia/solver/DynamicNeighborJoiningSolver.py create mode 100755 cassiopeia/solver/HeuristicNeighborJoiningSolver.py create mode 100755 test/solver_tests/ccphylo_solver_test.py diff --git a/README.md b/README.md index f2882800..ddbb201d 100755 --- a/README.md +++ b/README.md @@ -45,7 +45,9 @@ For developers: * Run the command ``gurobi.sh`` from a terminal window * From the Gurobi installation directory (where there is a setup.py file), use ``python setup.py install --user`` -4. Install Cassiopeia by first changing into the Cassiopeia directory and then `pip3 install .`. To install dev and docs requirements, you can run `pip3 install .[dev,docs]`. +4. [Optional] To use fast versions of Neighbor-Joining and UPGMA, install [CCPhylo](https://bitbucket.org/genomicepidemiology/ccphylo/src/master/) then set ccphylo_path in the config.ini file in the cassiopeia directory. + +5. Install Cassiopeia by first changing into the Cassiopeia directory and then `pip3 install .`. To install dev and docs requirements, you can run `pip3 install .[dev,docs]`. To verify that it installed correctly, try running our tests with `pytest`. diff --git a/cassiopeia/config.ini b/cassiopeia/config.ini new file mode 100755 index 00000000..cc497800 --- /dev/null +++ b/cassiopeia/config.ini @@ -0,0 +1,2 @@ +[Paths] +ccphylo_path = /path/to/ccphylo/ccphylo diff --git a/cassiopeia/solver/CCPhyloSolver.py b/cassiopeia/solver/CCPhyloSolver.py index edf24cb0..1715e574 100755 --- a/cassiopeia/solver/CCPhyloSolver.py +++ b/cassiopeia/solver/CCPhyloSolver.py @@ -3,7 +3,7 @@ a wrapper around optimized agglomerative algorithms implemented by CCPhylo (https://bitbucket.org/genomicepidemiology/ccphylo/src/master/). Methods that will inherit from this class by default are DynamicNeighborJoiningSolver, -HeuristicNeighborJoiningSolver, and OptimizedUPGMASolver. +HeuristicNeighborJoiningSolver, NeighborJoiningSolver and UPGMASolver. """ import abc @@ -16,17 +16,21 @@ import ete3 import subprocess import pandas as pd +import configparser -from cassiopeia.data import CassiopeiaTree +from cassiopeia.data import CassiopeiaTree, utilities from cassiopeia.mixins import DistanceSolverError -from cassiopeia.solver import CassiopeiaSolver, solver_utilities +from cassiopeia.solver import ( + DistanceSolver, + dissimilarity_functions, + solver_utilities, +) -class CCPhyloSolver(CassiopeiaSolver.CassiopeiaSolver): +class CCPhyloSolver(DistanceSolver.DistanceSolver): """ - Distance based solver class. - - This solver serves as a wrapper around CCPhylo algorithms. + Distance based solver class. This solver serves as a wrapper around + CCPhylo algorithms. Args: dissimilarity_function: Function that can be used to compute the @@ -58,9 +62,10 @@ def __init__( Callable[ [np.array, np.array, int, Dict[int, Dict[int, float]]], float ] - ] = None, + ] = dissimilarity_functions.weighted_hamming_distance, add_root: bool = False, prior_transformation: str = "negative_log", + fast = True, method: str = "nj", ): @@ -68,57 +73,45 @@ def __init__( self.dissimilarity_function = dissimilarity_function self.add_root = add_root + self.fast = fast self.method = method - - def get_dissimilarity_map( - self, - cassiopeia_tree: CassiopeiaTree, - layer: Optional[str] = None - ) -> pd.DataFrame: - """Obtains or generates a matrix that is updated throughout the solver. - - The highest-level method to obtain a dissimilarity map, which - will be the matrix primarily used throughout the solve method. This - matrix contains the pairwise dissimilarity between samples which is used - for identifying sample pairs to merge, and will be updated at every - iteration within the solve method. This method is not limited to - outputting dissimilarity maps, but is instead deliberately - designed to be overwritten to allow for use of similarity maps or other - algorithm-specific sample to sample comparison maps in derived classes. - - Args: - cassiopeia_tree: Tree object from which the - dissimilarity map is generated from - layer: Layer storing the character matrix - for solving. If None, the default character matrix is used in - the CassiopeiaTree. - - Returns: - pd.DataFrame: The matrix that will be used throughout the solve - method. - """ - - self.setup_dissimilarity_map(cassiopeia_tree, layer) - dissimilarity_map = cassiopeia_tree.get_dissimilarity_map() - - return dissimilarity_map - - - def solve( + + def _setup_ccphylo(self) -> None: + + # get ccphylo path + config = configparser.ConfigParser() + config.read(os.path.join(os.path.dirname(__file__),"..","config.ini")) + self.ccphylo_path = config.get("Paths","ccphylo_path") + + #check that ccphylo_path is valid + if not os.path.exists(self.ccphylo_path): + raise DistanceSolverError( + f"ccphylo_path {self.ccphylo_path} does not exist. To use fast " + "versions of Neighbor-Joining and UPGMA please install " + "CCPhylo (https://bitbucket.org/genomicepidemiology/ccphylo/src/master/) " + "set the ccphylo_path in the config.ini file then reinstall Cassiopeia." + ) + + #check that ccphylo_path is executable + if not os.access(self.ccphylo_path, os.X_OK): + raise DistanceSolverError( + f"ccphylo_path {self.ccphylo_path} is not executable. To use fast " + "versions of Neighbor-Joining and UPGMA please install " + "CCPhylo (https://bitbucket.org/genomicepidemiology/ccphylo/src/master/) " + "set the ccphylo_path in the config.ini file then reinstall Cassiopeia." + ) + + def _fast_solve( self, cassiopeia_tree: CassiopeiaTree, layer: Optional[str] = None, collapse_mutationless_edges: bool = False, logfile: str = "stdout.log", ) -> None: - """Solves a tree for a general bottom-up distance-based solver routine. - - The general solver routine proceeds by iteratively finding pairs of - samples to join together into a "cherry" and then reform the - dissimilarity matrix with respect to this new cherry. The implementation - of how to find cherries and update the dissimilarity map is left to - subclasses of DistanceSolver. The function will update the `tree` - attribute of the input CassiopeiaTree. + """Solves a tree using fast distance-based algorithms implemented by + CCPhylo. To call this method the CCPhlyo package must be installed + and the ccphylo_path must be set in the config file. The method attribute + specifies which algorithm to use. The function will update the `tree`. Args: cassiopeia_tree: CassiopeiaTree object to be populated @@ -131,6 +124,8 @@ def solve( logfile: File location to log output. Not currently used. """ + self._setup_ccphylo() + dissimilarity_map = self.get_dissimilarity_map(cassiopeia_tree, layer) with tempfile.TemporaryDirectory() as temp_dir: @@ -141,7 +136,7 @@ def solve( solver_utilities.save_dissimilarity_as_phylip(dissimilarity_map, dis_path) # run ccphylo - command = f". ~/.bashrc && ccphylo tree -i {dis_path} -o {tree_path} -m {self.method}" + command = f"{self.ccphylo_path} tree -i {dis_path} -o {tree_path} -m {self.method}" process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) stdout, stderr = process.communicate() T = ete3.Tree(tree_path, format=1) @@ -150,25 +145,42 @@ def solve( os.remove(dis_path) os.remove(tree_path) + # Covert to networkx + tree = nx.Graph() + internal_node_iter = 0 + for n in T.traverse(): + if n.name == "": + n.name = f"cassiopeia_internal_node{internal_node_iter}" + internal_node_iter += 1 + if not n.is_root(): + tree.add_edge(n.up.name,n.name) + + # find last split + midpoint = T.get_midpoint_outgroup() + root = T.get_tree_root() + if midpoint in root.children: + last_split = [root.name,midpoint.name] + else: + last_split = [root.name,root.children[0].name] + tree.remove_edge(last_split[0],last_split[1]) + + # root tree + tree = self.root_tree(tree,cassiopeia_tree.root_sample_name,last_split) + # remove root from character matrix before populating tree - if (cassiopeia_tree.root_sample_name - in cassiopeia_tree.character_matrix.index): + if ( + cassiopeia_tree.root_sample_name + in cassiopeia_tree.character_matrix.index + ): cassiopeia_tree.character_matrix = ( cassiopeia_tree.character_matrix.drop( index=cassiopeia_tree.root_sample_name ) ) - # root tree - if (self.add_root): - T.set_outgroup(T.get_midpoint_outgroup()) - root = ete3.TreeNode(name="root") - root.add_child(T) - # populate tree - T.ladderize(direction=1) - cassiopeia_tree.populate_tree(T,layer=layer) - print(stderr.decode("utf-8")) + cassiopeia_tree.populate_tree(tree,layer=layer) + cassiopeia_tree.collapse_unifurcations() # collapse mutationless edges if collapse_mutationless_edges: @@ -176,34 +188,36 @@ def solve( infer_ancestral_characters=True ) - def setup_dissimilarity_map( - self, cassiopeia_tree: CassiopeiaTree, layer: Optional[str] = None + def solve( + self, + cassiopeia_tree: CassiopeiaTree, + layer: Optional[str] = None, + collapse_mutationless_edges: bool = False, + logfile: str = "stdout.log", ) -> None: - """Sets up the solver. - - Sets up the solver with respect to the input CassiopeiaTree by - creating the dissimilarity map. + """Solves a tree for a general bottom-up distance-based solver routine. + If fast is set to True, this function will use the fast_solve method which + is wrapper around CCPhylo algorithms. Otherwise, it will default to the + generic solve method in DistanceSolver. The function will update the `tree` + attribute of the input CassiopeiaTree. Args: - cassiopeia_tree: Input CassiopeiaTree to `solve`. + cassiopeia_tree: CassiopeiaTree object to be populated layer: Layer storing the character matrix for solving. If None, the default character matrix is used in the CassiopeiaTree. - - Raises: - A `DistanceSolverError` if rooting parameters are not passed in - correctly (i.e. no root is specified and the user has not - asked to find a root) or when a dissimilarity map cannot - be found or computed. + collapse_mutationless_edges: Indicates if the final reconstructed + tree should collapse mutationless edges based on internal states + inferred by Camin-Sokal parsimony. In scoring accuracy, this + removes artifacts caused by arbitrarily resolving polytomies. + logfile: File location to log output. Not currently used. """ - if cassiopeia_tree.get_dissimilarity_map() is None: - if self.dissimilarity_function is None: - raise DistanceSolverError( - "Please specify a dissimilarity function or populate the " - "CassiopeiaTree object with a dissimilarity map" + if self.fast: + self._fast_solve(cassiopeia_tree, layer, collapse_mutationless_edges, logfile) + else: + if self.__class__ == CCPhyloSolver: + raise NotImplementedError( + "CCPhyloSolver does not implement solve. Please set fast to True" + "to use the fast_solve method. Or use a subclass of CCPhyloSolver." ) - - cassiopeia_tree.compute_dissimilarity_map( - self.dissimilarity_function, self.prior_transformation, layer - ) - + super().solve(cassiopeia_tree, layer, collapse_mutationless_edges, logfile) \ No newline at end of file diff --git a/cassiopeia/solver/DynamicNeighborJoiningSolver.py b/cassiopeia/solver/DynamicNeighborJoiningSolver.py new file mode 100755 index 00000000..13b06090 --- /dev/null +++ b/cassiopeia/solver/DynamicNeighborJoiningSolver.py @@ -0,0 +1,66 @@ +""" +This file stores a subclass of NeighborJoiningSolver, DynamicNeighborJoining. +The inference procedure is the Dymanic Neighbor-Joining algorithm proposed by +Clausen (2023). https://pubmed.ncbi.nlm.nih.gov/36453849/ +""" +from typing import Callable, Dict, List, Optional, Tuple, Union + +import numpy as np + +from cassiopeia.data import CassiopeiaTree +from cassiopeia.solver import ( + NeighborJoiningSolver, + dissimilarity_functions, +) + +class DynamicNeighborJoiningSolver(NeighborJoiningSolver): + """ + Dynamic Neighbor-Joining class for Cassiopeia. + + Implements the Dynamic Neighbor-Joining algorithm described by Clausen (2023) + as a derived class of CCPhyloSolver. This class inherits the generic + `solve` method. This algorithm is guaranteed to return an exact solution. + + Args: + dissimilarity_function: A function by which to compute the dissimilarity + map. Optional if a dissimilarity map is already provided. + add_root: Whether or not to add an implicit root the tree, i.e. a root + with unmutated characters. If set to False, and no explicit root is + provided in the CassiopeiaTree, then will return an unrooted, + undirected tree + prior_transformation: Function to use when transforming priors into + weights. Supports the following transformations: + "negative_log": Transforms each probability by the negative + log (default) + "inverse": Transforms each probability p by taking 1/p + "square_root_inverse": Transforms each probability by the + the square root of 1/p + + Attributes: + dissimilarity_function: Function used to compute dissimilarity between + samples. + add_root: Whether or not to add an implicit root the tree. + prior_transformation: Function to use when transforming priors into + weights. + + """ + + def __init__( + self, + dissimilarity_function: Optional[ + Callable[ + [np.array, np.array, int, Dict[int, Dict[int, float]]], float + ] + ] = dissimilarity_functions.weighted_hamming_distance, + add_root: bool = False, + prior_transformation: str = "negative_log", + ): + + super().__init__( + dissimilarity_function=dissimilarity_function, + add_root=add_root, + prior_transformation=prior_transformation, + fast = True, + ) + + self.method = "dnj" \ No newline at end of file diff --git a/cassiopeia/solver/HeuristicNeighborJoiningSolver.py b/cassiopeia/solver/HeuristicNeighborJoiningSolver.py new file mode 100755 index 00000000..7eb19a6e --- /dev/null +++ b/cassiopeia/solver/HeuristicNeighborJoiningSolver.py @@ -0,0 +1,67 @@ +""" +This file stores a subclass of NeighborJoiningSolver, HeuristicNeighborJoiningSolver. +The inference procedure is the Heuristic Neighbor-Joining algorithm proposed by +Clausen (2023). https://pubmed.ncbi.nlm.nih.gov/36453849/ +""" +from typing import Callable, Dict, List, Optional, Tuple, Union + +import numpy as np + +from cassiopeia.data import CassiopeiaTree +from cassiopeia.solver import ( + NeighborJoiningSolver, + dissimilarity_functions, +) + +class HeuristicNeighborJoiningSolver(NeighborJoiningSolver): + """ + Heuristic Neighbor-Joining class for Cassiopeia. + + Implements the Heuristic Neighbor-Joining algorithm described by Clausen (2023) + as a derived class of NeighborJoiningSolver. This class inherits the generic + `solve` method. This algorithm is not guaranteed to return an exact solution. + + + Args: + dissimilarity_function: A function by which to compute the dissimilarity + map. Optional if a dissimilarity map is already provided. + add_root: Whether or not to add an implicit root the tree, i.e. a root + with unmutated characters. If set to False, and no explicit root is + provided in the CassiopeiaTree, then will return an unrooted, + undirected tree + prior_transformation: Function to use when transforming priors into + weights. Supports the following transformations: + "negative_log": Transforms each probability by the negative + log (default) + "inverse": Transforms each probability p by taking 1/p + "square_root_inverse": Transforms each probability by the + the square root of 1/p + + Attributes: + dissimilarity_function: Function used to compute dissimilarity between + samples. + add_root: Whether or not to add an implicit root the tree. + prior_transformation: Function to use when transforming priors into + weights. + + """ + + def __init__( + self, + dissimilarity_function: Optional[ + Callable[ + [np.array, np.array, int, Dict[int, Dict[int, float]]], float + ] + ] = dissimilarity_functions.weighted_hamming_distance, + add_root: bool = False, + prior_transformation: str = "negative_log", + ): + + super().__init__( + dissimilarity_function=dissimilarity_function, + add_root=add_root, + prior_transformation=prior_transformation, + fast = True, + ) + + self.method = "hnj" \ No newline at end of file diff --git a/cassiopeia/solver/NeighborJoiningSolver.py b/cassiopeia/solver/NeighborJoiningSolver.py index 848d9a7d..720418c1 100755 --- a/cassiopeia/solver/NeighborJoiningSolver.py +++ b/cassiopeia/solver/NeighborJoiningSolver.py @@ -1,5 +1,5 @@ """ -This file stores a subclass of DistanceSolver, NeighborJoining. The +This file stores a subclass of CCPhyloSolver, NeighborJoiningSolver. The inference procedure is the Neighbor-Joining algorithm proposed by Saitou and Nei (1987) that iteratively joins together samples that minimize the Q-criterion on the dissimilarity map. @@ -15,19 +15,21 @@ from cassiopeia.data import CassiopeiaTree from cassiopeia.solver import ( DistanceSolver, + CCPhyloSolver, dissimilarity_functions, solver_utilities, ) -class NeighborJoiningSolver(DistanceSolver.DistanceSolver): +class NeighborJoiningSolver(CCPhyloSolver.CCPhyloSolver): """ Neighbor-Joining class for Cassiopeia. Implements the Neighbor-Joining algorithm described by Saitou and Nei (1987) as a derived class of DistanceSolver. This class inherits the generic `solve` method, but implements its own procedure for finding cherries by - minimizing the Q-criterion between samples. + minimizing the Q-criterion between samples. If fast is set to True, then + the fast Neighbor-Joining implementation from CCPhylo of is used. Args: dissimilarity_function: A function by which to compute the dissimilarity @@ -43,6 +45,7 @@ class NeighborJoiningSolver(DistanceSolver.DistanceSolver): "inverse": Transforms each probability p by taking 1/p "square_root_inverse": Transforms each probability by the the square root of 1/p + fast: Whether to use the fast CCPhylo implementation of Neighbor-Joining. Attributes: dissimilarity_function: Function used to compute dissimilarity between @@ -62,12 +65,15 @@ def __init__( ] = dissimilarity_functions.weighted_hamming_distance, add_root: bool = False, prior_transformation: str = "negative_log", + fast: bool = False, ): super().__init__( dissimilarity_function=dissimilarity_function, add_root=add_root, prior_transformation=prior_transformation, + fast = fast, + method = "nj" ) def root_tree( diff --git a/cassiopeia/solver/UPGMASolver.py b/cassiopeia/solver/UPGMASolver.py index ba535de2..0d4053b1 100644 --- a/cassiopeia/solver/UPGMASolver.py +++ b/cassiopeia/solver/UPGMASolver.py @@ -13,10 +13,10 @@ import pandas as pd from cassiopeia.data import CassiopeiaTree -from cassiopeia.solver import DistanceSolver, dissimilarity_functions +from cassiopeia.solver import DistanceSolver, dissimilarity_functions, CCPhyloSolver -class UPGMASolver(DistanceSolver.DistanceSolver): +class UPGMASolver(CCPhyloSolver.CCPhyloSolver): """ UPGMA CassiopeiaSolver. @@ -26,7 +26,8 @@ class UPGMASolver(DistanceSolver.DistanceSolver): dissimilarity between samples. After joining nodes, the dissimilarities are updated by averaging the distances of elements in the new cluster with each existing node. Produces a rooted tree that is assumed to be - ultrametric. + ultrametric. If fast is set to True, then + the fast UPGMA implementation from CCPhylo is used. Args: dissimilarity_function: A function by which to compute the dissimilarity @@ -38,6 +39,7 @@ class UPGMASolver(DistanceSolver.DistanceSolver): "inverse": Transforms each probability p by taking 1/p "square_root_inverse": Transforms each probability by the the square root of 1/p + fast: Whether to use the fast CCPhylo implementation of UPGMA. Attributes: dissimilarity_function: Function used to compute dissimilarity between samples. @@ -54,12 +56,15 @@ def __init__( ] ] = dissimilarity_functions.weighted_hamming_distance, prior_transformation: str = "negative_log", + fast: bool = False, ): super().__init__( dissimilarity_function=dissimilarity_function, add_root=True, prior_transformation=prior_transformation, + fast = fast, + method = "upgma" ) self.__cluster_to_cluster_size = defaultdict(int) diff --git a/cassiopeia/solver/__init__.py b/cassiopeia/solver/__init__.py index 8b2930e9..804bcb1c 100755 --- a/cassiopeia/solver/__init__.py +++ b/cassiopeia/solver/__init__.py @@ -13,4 +13,6 @@ from .VanillaGreedySolver import VanillaGreedySolver from .SpectralNeighborJoiningSolver import SpectralNeighborJoiningSolver from .CCPhyloSolver import CCPhyloSolver +from .DynamicNeighborJoiningSolver import DynamicNeighborJoiningSolver +from .HeuristicNeighborJoiningSolver import HeuristicNeighborJoiningSolver from . import dissimilarity_functions as dissimilarity diff --git a/test/solver_tests/ccphylo_solver_test.py b/test/solver_tests/ccphylo_solver_test.py new file mode 100755 index 00000000..9ac03bdd --- /dev/null +++ b/test/solver_tests/ccphylo_solver_test.py @@ -0,0 +1,225 @@ +""" +Test CCPhyloSolver in Cassiopeia.solver. +""" +import unittest +from typing import Dict, Optional +from unittest import mock + +import configparser +import os +import itertools +import networkx as nx +import numpy as np +import pandas as pd + +import cassiopeia as cas + + +def find_triplet_structure(triplet, T): + a, b, c = triplet[0], triplet[1], triplet[2] + a_ancestors = [node for node in nx.ancestors(T, a)] + b_ancestors = [node for node in nx.ancestors(T, b)] + c_ancestors = [node for node in nx.ancestors(T, c)] + ab_common = len(set(a_ancestors) & set(b_ancestors)) + ac_common = len(set(a_ancestors) & set(c_ancestors)) + bc_common = len(set(b_ancestors) & set(c_ancestors)) + structure = "-" + if ab_common > bc_common and ab_common > ac_common: + structure = "ab" + elif ac_common > bc_common and ac_common > ab_common: + structure = "ac" + elif bc_common > ab_common and bc_common > ac_common: + structure = "bc" + return structure + +# specify dissimilarity function for solvers to use +def delta_fn( + x: np.array, + y: np.array, + missing_state: int, + priors: Optional[Dict[int, Dict[int, float]]], +): + d = 0 + for i in range(len(x)): + if x[i] != y[i]: + d += 1 + return d + +# only run test if ccphylo_path is specified in config.ini +config = configparser.ConfigParser() +config.read(os.path.join(os.path.dirname(__file__),"..","..","cassiopeia","config.ini")) +path_set = config.get("Paths","ccphylo_path") != "/path/to/ccphylo/ccphylo" + + +class TestCCPhyloSolver(unittest.TestCase): + def setUp(self): + if path_set: + + # --------------------- General NJ --------------------- + cm = pd.DataFrame.from_dict( + { + "a": [0, 1, 2, 1, 0, 0, 2, 0, 0, 0], + "b": [1, 1, 2, 1, 0, 0, 2, 0, 0, 0], + "c": [2, 2, 2, 1, 0, 0, 2, 0, 0, 0], + "d": [1, 1, 1, 1, 0, 0, 2, 0, 0, 0], + "e": [0, 0, 0, 0, 1, 2, 1, 0, 2, 0], + "f": [0, 0, 0, 0, 2, 2, 1, 0, 2, 0], + "g": [0, 2, 0, 0, 1, 1, 1, 0, 2, 0], + "h": [0, 2, 0, 0, 1, 0, 0, 1, 2, 1], + "i": [1, 2, 0, 0, 1, 0, 0, 2, 2, 1], + "j": [1, 2, 0, 0, 1, 0, 0, 1, 1, 1], + }, + orient="index", + columns=["x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10"], + ) + + self.cm = cm + self.basic_tree = cas.data.CassiopeiaTree( + character_matrix=cm + ) + + self.fast_nj_solver = cas.solver.NeighborJoiningSolver(add_root=True,fast=True) + self.nj_solver = cas.solver.NeighborJoiningSolver(add_root=True,fast=False) + self.fast_upgma_solver = cas.solver.UPGMASolver(fast=True) + self.upgma_solver = cas.solver.UPGMASolver(fast=False) + self.dnj_solver = cas.solver.DynamicNeighborJoiningSolver(add_root=True) + self.hnj_solver = cas.solver.HeuristicNeighborJoiningSolver(add_root=True) + + # ------------- CM with Duplictes ----------------------- + duplicates_cm = pd.DataFrame.from_dict( + { + "a": [1, 1, 0], + "b": [1, 2, 0], + "c": [1, 2, 1], + "d": [2, 0, 0], + "e": [2, 0, 2], + "f": [2, 0, 2], + }, + orient="index", + columns=["x1", "x2", "x3"], + ) + + self.duplicate_tree = cas.data.CassiopeiaTree( + character_matrix=duplicates_cm + ) + + def test_fast_nj_solver(self): + if path_set: + # NJ Solver + nj_tree = self.basic_tree.copy() + self.nj_solver.solve(nj_tree) + + # CCPhylo Fast NJ Solver + fast_nj_tree = self.basic_tree.copy() + self.fast_nj_solver.solve(fast_nj_tree) + + # test for expected number of edges + self.assertEqual(len(nj_tree.edges), len(fast_nj_tree.edges)) + + triplets = itertools.combinations(["a", "c", "d", "e"], 3) + for triplet in triplets: + expected_triplet = find_triplet_structure(triplet, nj_tree.get_tree_topology()) + observed_triplet = find_triplet_structure(triplet, fast_nj_tree.get_tree_topology()) + self.assertEqual(expected_triplet, observed_triplet) + + def test_dnj_solver(self): + if path_set: + # NJ Solver + nj_tree = self.basic_tree.copy() + self.nj_solver.solve(nj_tree) + + # CCPhylo DNJ Solver + dnj_tree = self.basic_tree.copy() + self.dnj_solver.solve(dnj_tree) + + # test for expected number of edges + self.assertEqual(len(nj_tree.edges), len(dnj_tree.edges)) + + triplets = itertools.combinations(["a", "c", "d", "e"], 3) + for triplet in triplets: + expected_triplet = find_triplet_structure(triplet, nj_tree.get_tree_topology()) + observed_triplet = find_triplet_structure(triplet, dnj_tree.get_tree_topology()) + self.assertEqual(expected_triplet, observed_triplet) + + def test_hnj_solver(self): + if path_set: + # NJ Solver + nj_tree = self.basic_tree.copy() + self.nj_solver.solve(nj_tree) + + # CCPhylo HNJ Solver + hnj_tree = self.basic_tree.copy() + self.hnj_solver.solve(hnj_tree) + + # test for expected number of edges + self.assertEqual(len(nj_tree.edges), len(hnj_tree.edges)) + + + triplets = itertools.combinations(["a", "c", "d", "e"], 3) + for triplet in triplets: + expected_triplet = find_triplet_structure(triplet, nj_tree.get_tree_topology()) + observed_triplet = find_triplet_structure(triplet, hnj_tree.get_tree_topology()) + self.assertEqual(expected_triplet, observed_triplet) + + def test_fast_upgma_solver(self): + if path_set: + # UPGMA Solver + upgma_tree = self.basic_tree.copy() + self.upgma_solver.solve(upgma_tree) + + # CCPhylo Fast UPGMA Solver + fast_upgma_tree = self.basic_tree.copy() + self.fast_upgma_solver.solve(fast_upgma_tree) + + # test for expected number of edges + self.assertEqual(len(upgma_tree.edges), len(fast_upgma_tree.edges)) + + + triplets = itertools.combinations(["a", "c", "d", "e"], 3) + for triplet in triplets: + expected_triplet = find_triplet_structure(triplet, upgma_tree.get_tree_topology()) + observed_triplet = find_triplet_structure(triplet, fast_upgma_tree.get_tree_topology()) + self.assertEqual(expected_triplet, observed_triplet) + + #test collapse mutationless edges working + def test_collapse_mutationless_edges_ccphylo(self): + if path_set: + # NJ Solver + nj_tree = self.basic_tree.copy() + self.nj_solver.solve(nj_tree, collapse_mutationless_edges=True) + + # Fast NJ Solver + fast_nj_tree = self.basic_tree.copy() + self.fast_nj_solver.solve(fast_nj_tree, collapse_mutationless_edges=True) + + # test for expected number of edges + self.assertEqual(len(nj_tree.edges), len(fast_nj_tree.edges)) + + triplets = itertools.combinations(["a", "c", "d", "e"], 3) + for triplet in triplets: + expected_triplet = find_triplet_structure(triplet, nj_tree.get_tree_topology()) + observed_triplet = find_triplet_structure(triplet, fast_nj_tree.get_tree_topology()) + self.assertEqual(expected_triplet, observed_triplet) + + # test duplicate samples + def test_duplicate_sample_ccphylo(self): + if path_set: + # NJ Solver + nj_tree = self.duplicate_tree.copy() + self.nj_solver.solve(nj_tree) + + # Fast NJ Solver + fast_nj_tree = self.duplicate_tree.copy() + self.fast_nj_solver.solve(fast_nj_tree) + + # test for expected number of edges + self.assertEqual(len(nj_tree.edges), len(fast_nj_tree.edges)) + + triplets = itertools.combinations(["a", "b", "c", "d", "e", "f"], 3) + for triplet in triplets: + expected_triplet = find_triplet_structure(triplet, nj_tree.get_tree_topology()) + observed_triplet = find_triplet_structure(triplet, fast_nj_tree.get_tree_topology()) + self.assertEqual(expected_triplet, observed_triplet) + +if __name__ == "__main__": + unittest.main() \ No newline at end of file From 73cd868c2b0130312653ae2bd556a213bd3e1ced Mon Sep 17 00:00:00 2001 From: colganwi Date: Thu, 27 Jul 2023 15:18:43 -0400 Subject: [PATCH 4/8] Revert "Added method to reorder children in the" This reverts commit 918491cebfc893832c1a1aef0d4a53e9b5a3ecfc. --- cassiopeia/data/CassiopeiaTree.py | 25 +------------------------ 1 file changed, 1 insertion(+), 24 deletions(-) diff --git a/cassiopeia/data/CassiopeiaTree.py b/cassiopeia/data/CassiopeiaTree.py index 6fa855d5..62f3f5fb 100755 --- a/cassiopeia/data/CassiopeiaTree.py +++ b/cassiopeia/data/CassiopeiaTree.py @@ -766,32 +766,9 @@ def __add_edge(self, u, v) -> None: self.__network.add_edge(u, v) - # this will change the topology of the tree, so reset the cache + # this will change the topology of the tree, so reset the cache self.__cache = {} - def reorder_children(self, node: str, new_order: List[str]) -> None: - """Reorders the children of a particular node. - - Args: - node: Node in the tree - new_order: A list of the new order of children for the node. - - Raises: - CassiopeiaTreeError if the node of interest is a leaf that has not been - instantiated, or if the new order of children is not a permutation - of the original children. - """ - self.__check_network_initialized() - - if self.is_leaf(node): - raise CassiopeiaTreeError("Cannot reorder children of a leaf node.") - - if set(new_order) != set(self.children(node)): - raise CassiopeiaTreeWarning("New order of children is not a permutation of the original children.") - - self.__network.remove_edges_from([(node, child) for child in self.children(node)]) - self.__network.add_edges_from([(node, child) for child in new_order]) - def set_time(self, node: str, new_time: float) -> None: """Sets the time of a node. From c0c5ca926624067026e26d5483e49a18b3bc3f80 Mon Sep 17 00:00:00 2001 From: colganwi Date: Wed, 16 Aug 2023 11:32:58 -0400 Subject: [PATCH 5/8] reorg to remove CCPhyloSolver class --- cassiopeia/solver/CCPhyloSolver.py | 223 ------------------ cassiopeia/solver/DistanceSolver.py | 138 ++++++++++- .../solver/DynamicNeighborJoiningSolver.py | 9 +- .../solver/HeuristicNeighborJoiningSolver.py | 7 +- cassiopeia/solver/NeighborJoiningSolver.py | 10 +- cassiopeia/solver/UPGMASolver.py | 10 +- cassiopeia/solver/__init__.py | 1 - 7 files changed, 158 insertions(+), 240 deletions(-) delete mode 100755 cassiopeia/solver/CCPhyloSolver.py diff --git a/cassiopeia/solver/CCPhyloSolver.py b/cassiopeia/solver/CCPhyloSolver.py deleted file mode 100755 index 1715e574..00000000 --- a/cassiopeia/solver/CCPhyloSolver.py +++ /dev/null @@ -1,223 +0,0 @@ -""" -This file stores a subclass of CassiopeiaSolver, the CCPhyloSolver. This is -a wrapper around optimized agglomerative algorithms implemented by CCPhylo -(https://bitbucket.org/genomicepidemiology/ccphylo/src/master/). Methods -that will inherit from this class by default are DynamicNeighborJoiningSolver, -HeuristicNeighborJoiningSolver, NeighborJoiningSolver and UPGMASolver. -""" - -import abc -from typing import Callable, Dict, List, Optional, Tuple - -import networkx as nx -import numpy as np -import os -import tempfile -import ete3 -import subprocess -import pandas as pd -import configparser - -from cassiopeia.data import CassiopeiaTree, utilities -from cassiopeia.mixins import DistanceSolverError -from cassiopeia.solver import ( - DistanceSolver, - dissimilarity_functions, - solver_utilities, -) - - -class CCPhyloSolver(DistanceSolver.DistanceSolver): - """ - Distance based solver class. This solver serves as a wrapper around - CCPhylo algorithms. - - Args: - dissimilarity_function: Function that can be used to compute the - dissimilarity between samples. - add_root: Whether or not to add an implicit root the tree. Only - pertinent in algorithms that return an unrooted tree, by default - (e.g. Neighbor Joining). Will not override an explicitly defined - root, specified by the 'root_sample_name' attribute in the - CassiopeiaTree - prior_transformation: Function to use when transforming priors into - weights. Supports the following transformations: - "negative_log": Transforms each probability by the negative - log (default) - "inverse": Transforms each probability p by taking 1/p - "square_root_inverse": Transforms each probability by the - the square root of 1/p - - Attributes: - dissimilarity_function: Function used to compute dissimilarity between - samples. - add_root: Whether or not to add an implicit root the tree. - prior_transformation: Function to use when transforming priors into - weights. - """ - - def __init__( - self, - dissimilarity_function: Optional[ - Callable[ - [np.array, np.array, int, Dict[int, Dict[int, float]]], float - ] - ] = dissimilarity_functions.weighted_hamming_distance, - add_root: bool = False, - prior_transformation: str = "negative_log", - fast = True, - method: str = "nj", - ): - - super().__init__(prior_transformation) - - self.dissimilarity_function = dissimilarity_function - self.add_root = add_root - self.fast = fast - self.method = method - - def _setup_ccphylo(self) -> None: - - # get ccphylo path - config = configparser.ConfigParser() - config.read(os.path.join(os.path.dirname(__file__),"..","config.ini")) - self.ccphylo_path = config.get("Paths","ccphylo_path") - - #check that ccphylo_path is valid - if not os.path.exists(self.ccphylo_path): - raise DistanceSolverError( - f"ccphylo_path {self.ccphylo_path} does not exist. To use fast " - "versions of Neighbor-Joining and UPGMA please install " - "CCPhylo (https://bitbucket.org/genomicepidemiology/ccphylo/src/master/) " - "set the ccphylo_path in the config.ini file then reinstall Cassiopeia." - ) - - #check that ccphylo_path is executable - if not os.access(self.ccphylo_path, os.X_OK): - raise DistanceSolverError( - f"ccphylo_path {self.ccphylo_path} is not executable. To use fast " - "versions of Neighbor-Joining and UPGMA please install " - "CCPhylo (https://bitbucket.org/genomicepidemiology/ccphylo/src/master/) " - "set the ccphylo_path in the config.ini file then reinstall Cassiopeia." - ) - - def _fast_solve( - self, - cassiopeia_tree: CassiopeiaTree, - layer: Optional[str] = None, - collapse_mutationless_edges: bool = False, - logfile: str = "stdout.log", - ) -> None: - """Solves a tree using fast distance-based algorithms implemented by - CCPhylo. To call this method the CCPhlyo package must be installed - and the ccphylo_path must be set in the config file. The method attribute - specifies which algorithm to use. The function will update the `tree`. - - Args: - cassiopeia_tree: CassiopeiaTree object to be populated - layer: Layer storing the character matrix for solving. If None, the - default character matrix is used in the CassiopeiaTree. - collapse_mutationless_edges: Indicates if the final reconstructed - tree should collapse mutationless edges based on internal states - inferred by Camin-Sokal parsimony. In scoring accuracy, this - removes artifacts caused by arbitrarily resolving polytomies. - logfile: File location to log output. Not currently used. - """ - - self._setup_ccphylo() - - dissimilarity_map = self.get_dissimilarity_map(cassiopeia_tree, layer) - - with tempfile.TemporaryDirectory() as temp_dir: - - # save dissimilarity map as phylip file - dis_path = os.path.join(temp_dir, "dist.phylip") - tree_path = os.path.join(temp_dir, "tree.nwk") - solver_utilities.save_dissimilarity_as_phylip(dissimilarity_map, dis_path) - - # run ccphylo - command = f"{self.ccphylo_path} tree -i {dis_path} -o {tree_path} -m {self.method}" - process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - stdout, stderr = process.communicate() - T = ete3.Tree(tree_path, format=1) - - # remove temporary files - os.remove(dis_path) - os.remove(tree_path) - - # Covert to networkx - tree = nx.Graph() - internal_node_iter = 0 - for n in T.traverse(): - if n.name == "": - n.name = f"cassiopeia_internal_node{internal_node_iter}" - internal_node_iter += 1 - if not n.is_root(): - tree.add_edge(n.up.name,n.name) - - # find last split - midpoint = T.get_midpoint_outgroup() - root = T.get_tree_root() - if midpoint in root.children: - last_split = [root.name,midpoint.name] - else: - last_split = [root.name,root.children[0].name] - tree.remove_edge(last_split[0],last_split[1]) - - # root tree - tree = self.root_tree(tree,cassiopeia_tree.root_sample_name,last_split) - - # remove root from character matrix before populating tree - if ( - cassiopeia_tree.root_sample_name - in cassiopeia_tree.character_matrix.index - ): - cassiopeia_tree.character_matrix = ( - cassiopeia_tree.character_matrix.drop( - index=cassiopeia_tree.root_sample_name - ) - ) - - # populate tree - cassiopeia_tree.populate_tree(tree,layer=layer) - cassiopeia_tree.collapse_unifurcations() - - # collapse mutationless edges - if collapse_mutationless_edges: - cassiopeia_tree.collapse_mutationless_edges( - infer_ancestral_characters=True - ) - - def solve( - self, - cassiopeia_tree: CassiopeiaTree, - layer: Optional[str] = None, - collapse_mutationless_edges: bool = False, - logfile: str = "stdout.log", - ) -> None: - """Solves a tree for a general bottom-up distance-based solver routine. - If fast is set to True, this function will use the fast_solve method which - is wrapper around CCPhylo algorithms. Otherwise, it will default to the - generic solve method in DistanceSolver. The function will update the `tree` - attribute of the input CassiopeiaTree. - - Args: - cassiopeia_tree: CassiopeiaTree object to be populated - layer: Layer storing the character matrix for solving. If None, the - default character matrix is used in the CassiopeiaTree. - collapse_mutationless_edges: Indicates if the final reconstructed - tree should collapse mutationless edges based on internal states - inferred by Camin-Sokal parsimony. In scoring accuracy, this - removes artifacts caused by arbitrarily resolving polytomies. - logfile: File location to log output. Not currently used. - """ - - if self.fast: - self._fast_solve(cassiopeia_tree, layer, collapse_mutationless_edges, logfile) - else: - if self.__class__ == CCPhyloSolver: - raise NotImplementedError( - "CCPhyloSolver does not implement solve. Please set fast to True" - "to use the fast_solve method. Or use a subclass of CCPhyloSolver." - ) - super().solve(cassiopeia_tree, layer, collapse_mutationless_edges, logfile) \ No newline at end of file diff --git a/cassiopeia/solver/DistanceSolver.py b/cassiopeia/solver/DistanceSolver.py index cfcdd099..2ecdca9f 100644 --- a/cassiopeia/solver/DistanceSolver.py +++ b/cassiopeia/solver/DistanceSolver.py @@ -11,11 +11,17 @@ import networkx as nx import numpy as np import pandas as pd +import configparser +import subprocess +import os +import tempfile +import ete3 from cassiopeia.data import CassiopeiaTree from cassiopeia.mixins import DistanceSolverError -from cassiopeia.solver import CassiopeiaSolver, solver_utilities - +from cassiopeia.solver import (CassiopeiaSolver, + solver_utilities) +from cassiopeia.data.utilities import ete3_to_networkx class DistanceSolver(CassiopeiaSolver.CassiopeiaSolver): """ @@ -50,6 +56,9 @@ class DistanceSolver(CassiopeiaSolver.CassiopeiaSolver): "inverse": Transforms each probability p by taking 1/p "square_root_inverse": Transforms each probability by the the square root of 1/p + fast: Whether to use fast implementation of the solver. Currently + Neighbor Joining, Dynamic Neighbor Joining, Heuristic Neighbor + Joining, and UPGMA have fast implementations which use CCPhylo. Attributes: dissimilarity_function: Function used to compute dissimilarity between @@ -57,6 +66,7 @@ class DistanceSolver(CassiopeiaSolver.CassiopeiaSolver): add_root: Whether or not to add an implicit root the tree. prior_transformation: Function to use when transforming priors into weights. + fast: Whether to use fast implementation of the solver. """ def __init__( @@ -68,12 +78,24 @@ def __init__( ] = None, add_root: bool = False, prior_transformation: str = "negative_log", + fast: bool = False, ): super().__init__(prior_transformation) self.dissimilarity_function = dissimilarity_function self.add_root = add_root + self.fast = fast + + # Select fast solver to use + if self.fast: + if self.fast_solver == "ccphylo": + self._setup_ccphylo() + self.fast_solve = self._ccphylo_solve + else: + raise DistanceSolverError( + f"Fast solver is not available for {self.__class__}." + ) def get_dissimilarity_map( self, @@ -135,6 +157,12 @@ def solve( removes artifacts caused by arbitrarily resolving polytomies. logfile: File location to log output. Not currently used. """ + + # Use fast solver if selected + if self.fast: + self.fast_solve(cassiopeia_tree,layer,collapse_mutationless_edges,logfile) + return + node_name_generator = solver_utilities.node_name_generator() dissimilarity_map = self.get_dissimilarity_map(cassiopeia_tree, layer) @@ -197,6 +225,112 @@ def solve( infer_ancestral_characters=True ) + def _ccphylo_solve( + self, + cassiopeia_tree: CassiopeiaTree, + layer: Optional[str] = None, + collapse_mutationless_edges: bool = False, + logfile: str = "stdout.log", + ) -> None: + """Solves a tree using fast distance-based algorithms implemented by + CCPhylo. To call this method the CCPhlyo package must be installed + and the ccphylo_path must be set in the config file. The method attribute + specifies which algorithm to use. The function will update the `tree`. + + Args: + cassiopeia_tree: CassiopeiaTree object to be populated + layer: Layer storing the character matrix for solving. If None, the + default character matrix is used in the CassiopeiaTree. + collapse_mutationless_edges: Indicates if the final reconstructed + tree should collapse mutationless edges based on internal states + inferred by Camin-Sokal parsimony. In scoring accuracy, this + removes artifacts caused by arbitrarily resolving polytomies. + logfile: File location to log output. Not currently used. + """ + + dissimilarity_map = self.get_dissimilarity_map(cassiopeia_tree, layer) + + with tempfile.TemporaryDirectory() as temp_dir: + + # save dissimilarity map as phylip file + dis_path = os.path.join(temp_dir, "dist.phylip") + tree_path = os.path.join(temp_dir, "tree.nwk") + solver_utilities.save_dissimilarity_as_phylip(dissimilarity_map, dis_path) + + # run ccphylo + command = f"{self.ccphylo_path} tree -i {dis_path} -o {tree_path} -m {self.fast_method}" + process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + stdout, stderr = process.communicate() + T = ete3.Tree(tree_path, format=1) + + # remove temporary files + os.remove(dis_path) + os.remove(tree_path) + + # Covert to networkx + tree = ete3_to_networkx(T).to_undirected() + + # find last split + midpoint = T.get_midpoint_outgroup() + root = T.get_tree_root() + if midpoint in root.children: + last_split = [root.name,midpoint.name] + else: + last_split = [root.name,root.children[0].name] + tree.remove_edge(last_split[0],last_split[1]) + + # root tree + tree = self.root_tree(tree,cassiopeia_tree.root_sample_name,last_split) + + # remove root from character matrix before populating tree + if ( + cassiopeia_tree.root_sample_name + in cassiopeia_tree.character_matrix.index + ): + cassiopeia_tree.character_matrix = ( + cassiopeia_tree.character_matrix.drop( + index=cassiopeia_tree.root_sample_name + ) + ) + + # populate tree + cassiopeia_tree.populate_tree(tree,layer=layer) + cassiopeia_tree.collapse_unifurcations() + + # collapse mutationless edges + if collapse_mutationless_edges: + cassiopeia_tree.collapse_mutationless_edges( + infer_ancestral_characters=True + ) + + def _setup_ccphylo(self) -> None: + """Sets up the ccphylo solver by getting the ccphylo_path from the + config file and checking that it is valid. + """ + + # get ccphylo path + config = configparser.ConfigParser() + config.read(os.path.join(os.path.dirname(__file__),"..","config.ini")) + self.ccphylo_path = config.get("Paths","ccphylo_path") + + #check that ccphylo_path is valid + if not os.path.exists(self.ccphylo_path): + raise DistanceSolverError( + f"ccphylo_path {self.ccphylo_path} does not exist. To use fast " + "versions of Neighbor-Joining and UPGMA please install " + "CCPhylo (https://bitbucket.org/genomicepidemiology/ccphylo/src/master/) " + "set the ccphylo_path in the config.ini file then reinstall Cassiopeia." + ) + + #check that ccphylo_path is executable + if not os.access(self.ccphylo_path, os.X_OK): + raise DistanceSolverError( + f"ccphylo_path {self.ccphylo_path} is not executable. To use fast " + "versions of Neighbor-Joining and UPGMA please install " + "CCPhylo (https://bitbucket.org/genomicepidemiology/ccphylo/src/master/) " + "set the ccphylo_path in the config.ini file then reinstall Cassiopeia." + ) + def setup_dissimilarity_map( self, cassiopeia_tree: CassiopeiaTree, layer: Optional[str] = None ) -> None: diff --git a/cassiopeia/solver/DynamicNeighborJoiningSolver.py b/cassiopeia/solver/DynamicNeighborJoiningSolver.py index 13b06090..4acb0ed2 100755 --- a/cassiopeia/solver/DynamicNeighborJoiningSolver.py +++ b/cassiopeia/solver/DynamicNeighborJoiningSolver.py @@ -7,7 +7,7 @@ import numpy as np -from cassiopeia.data import CassiopeiaTree +from cassiopeia.mixins import DistanceSolverError from cassiopeia.solver import ( NeighborJoiningSolver, dissimilarity_functions, @@ -55,12 +55,13 @@ def __init__( add_root: bool = False, prior_transformation: str = "negative_log", ): + # setup fast solver + self.fast_solver = "ccphylo" + self.fast_method = "dnj" super().__init__( dissimilarity_function=dissimilarity_function, add_root=add_root, prior_transformation=prior_transformation, fast = True, - ) - - self.method = "dnj" \ No newline at end of file + ) \ No newline at end of file diff --git a/cassiopeia/solver/HeuristicNeighborJoiningSolver.py b/cassiopeia/solver/HeuristicNeighborJoiningSolver.py index 7eb19a6e..f4c71f16 100755 --- a/cassiopeia/solver/HeuristicNeighborJoiningSolver.py +++ b/cassiopeia/solver/HeuristicNeighborJoiningSolver.py @@ -56,12 +56,13 @@ def __init__( add_root: bool = False, prior_transformation: str = "negative_log", ): + # setup fast solver + self.fast_solver = "ccphylo" + self.fast_method = "dnj" super().__init__( dissimilarity_function=dissimilarity_function, add_root=add_root, prior_transformation=prior_transformation, fast = True, - ) - - self.method = "hnj" \ No newline at end of file + ) \ No newline at end of file diff --git a/cassiopeia/solver/NeighborJoiningSolver.py b/cassiopeia/solver/NeighborJoiningSolver.py index 720418c1..d68ea806 100755 --- a/cassiopeia/solver/NeighborJoiningSolver.py +++ b/cassiopeia/solver/NeighborJoiningSolver.py @@ -15,13 +15,11 @@ from cassiopeia.data import CassiopeiaTree from cassiopeia.solver import ( DistanceSolver, - CCPhyloSolver, dissimilarity_functions, solver_utilities, ) - -class NeighborJoiningSolver(CCPhyloSolver.CCPhyloSolver): +class NeighborJoiningSolver(DistanceSolver.DistanceSolver): """ Neighbor-Joining class for Cassiopeia. @@ -68,12 +66,16 @@ def __init__( fast: bool = False, ): + # setup fast solver + if fast: + self.fast_solver = "ccphylo" + self.fast_method = "nj" + super().__init__( dissimilarity_function=dissimilarity_function, add_root=add_root, prior_transformation=prior_transformation, fast = fast, - method = "nj" ) def root_tree( diff --git a/cassiopeia/solver/UPGMASolver.py b/cassiopeia/solver/UPGMASolver.py index 0d4053b1..bcaa10ec 100644 --- a/cassiopeia/solver/UPGMASolver.py +++ b/cassiopeia/solver/UPGMASolver.py @@ -13,10 +13,10 @@ import pandas as pd from cassiopeia.data import CassiopeiaTree -from cassiopeia.solver import DistanceSolver, dissimilarity_functions, CCPhyloSolver +from cassiopeia.solver import DistanceSolver, dissimilarity_functions -class UPGMASolver(CCPhyloSolver.CCPhyloSolver): +class UPGMASolver(DistanceSolver.DistanceSolver): """ UPGMA CassiopeiaSolver. @@ -59,12 +59,16 @@ def __init__( fast: bool = False, ): + # setup fast solver + if fast: + self.fast_solver = "ccphylo" + self.fast_method = "upgma" + super().__init__( dissimilarity_function=dissimilarity_function, add_root=True, prior_transformation=prior_transformation, fast = fast, - method = "upgma" ) self.__cluster_to_cluster_size = defaultdict(int) diff --git a/cassiopeia/solver/__init__.py b/cassiopeia/solver/__init__.py index 804bcb1c..24175c0c 100755 --- a/cassiopeia/solver/__init__.py +++ b/cassiopeia/solver/__init__.py @@ -12,7 +12,6 @@ from .UPGMASolver import UPGMASolver from .VanillaGreedySolver import VanillaGreedySolver from .SpectralNeighborJoiningSolver import SpectralNeighborJoiningSolver -from .CCPhyloSolver import CCPhyloSolver from .DynamicNeighborJoiningSolver import DynamicNeighborJoiningSolver from .HeuristicNeighborJoiningSolver import HeuristicNeighborJoiningSolver from . import dissimilarity_functions as dissimilarity From 75b945c036e42d1d3dbc2ddc91925d4e52953a8a Mon Sep 17 00:00:00 2001 From: colganwi Date: Fri, 18 Aug 2023 15:49:39 -0400 Subject: [PATCH 6/8] simplified by removing DNJ and HNJ classes --- cassiopeia/solver/DistanceSolver.py | 101 ++--- .../solver/DynamicNeighborJoiningSolver.py | 67 ---- .../solver/HeuristicNeighborJoiningSolver.py | 68 ---- cassiopeia/solver/NeighborJoiningSolver.py | 31 +- .../solver/SpectralNeighborJoiningSolver.py | 3 + cassiopeia/solver/UPGMASolver.py | 22 +- cassiopeia/solver/__init__.py | 2 - test/solver_tests/ccphylo_solver_test.py | 370 ++++++++++-------- .../dissimilarity_functions_test.py | 22 ++ 9 files changed, 320 insertions(+), 366 deletions(-) delete mode 100755 cassiopeia/solver/DynamicNeighborJoiningSolver.py delete mode 100755 cassiopeia/solver/HeuristicNeighborJoiningSolver.py diff --git a/cassiopeia/solver/DistanceSolver.py b/cassiopeia/solver/DistanceSolver.py index 2ecdca9f..f3caee04 100644 --- a/cassiopeia/solver/DistanceSolver.py +++ b/cassiopeia/solver/DistanceSolver.py @@ -3,25 +3,29 @@ the inference procedures that inherit from this method will need to implement methods for selecting "cherries" and updating the dissimilarity map. Methods that will inherit from this class by default are Neighbor-Joining and UPGMA. -There may be other subclasses of this. +There may be other subclasses of this. Currently also implements a method for +solving trees with CCPhylo but this will be moved with switch to compositional +framework. """ import abc from typing import Callable, Dict, List, Optional, Tuple -import networkx as nx -import numpy as np -import pandas as pd -import configparser -import subprocess import os + +import subprocess import tempfile + +import configparser import ete3 +import networkx as nx +import numpy as np +import pandas as pd from cassiopeia.data import CassiopeiaTree from cassiopeia.mixins import DistanceSolverError from cassiopeia.solver import (CassiopeiaSolver, solver_utilities) -from cassiopeia.data.utilities import ete3_to_networkx +from cassiopeia.data import utilities class DistanceSolver(CassiopeiaSolver.CassiopeiaSolver): """ @@ -56,9 +60,6 @@ class DistanceSolver(CassiopeiaSolver.CassiopeiaSolver): "inverse": Transforms each probability p by taking 1/p "square_root_inverse": Transforms each probability by the the square root of 1/p - fast: Whether to use fast implementation of the solver. Currently - Neighbor Joining, Dynamic Neighbor Joining, Heuristic Neighbor - Joining, and UPGMA have fast implementations which use CCPhylo. Attributes: dissimilarity_function: Function used to compute dissimilarity between @@ -66,7 +67,6 @@ class DistanceSolver(CassiopeiaSolver.CassiopeiaSolver): add_root: Whether or not to add an implicit root the tree. prior_transformation: Function to use when transforming priors into weights. - fast: Whether to use fast implementation of the solver. """ def __init__( @@ -78,24 +78,15 @@ def __init__( ] = None, add_root: bool = False, prior_transformation: str = "negative_log", - fast: bool = False, ): super().__init__(prior_transformation) self.dissimilarity_function = dissimilarity_function self.add_root = add_root - self.fast = fast - # Select fast solver to use - if self.fast: - if self.fast_solver == "ccphylo": - self._setup_ccphylo() - self.fast_solve = self._ccphylo_solve - else: - raise DistanceSolverError( - f"Fast solver is not available for {self.__class__}." - ) + if "ccphylo" in self._implementation: + self._setup_ccphylo() def get_dissimilarity_map( self, @@ -130,7 +121,6 @@ def get_dissimilarity_map( return dissimilarity_map - def solve( self, cassiopeia_tree: CassiopeiaTree, @@ -158,9 +148,21 @@ def solve( logfile: File location to log output. Not currently used. """ - # Use fast solver if selected - if self.fast: - self.fast_solve(cassiopeia_tree,layer,collapse_mutationless_edges,logfile) + if self._implementation == "ccphylo_dnj": + self._ccphylo_solve(cassiopeia_tree,layer, + collapse_mutationless_edges,logfile,method="dnj") + return + elif self._implementation == "ccphylo_nj": + self._ccphylo_solve(cassiopeia_tree,layer, + collapse_mutationless_edges,logfile,method="nj") + return + elif self._implementation == "ccphylo_hnj": + self._ccphylo_solve(cassiopeia_tree,layer, + collapse_mutationless_edges,logfile,method="hnj") + return + elif self._implementation == "ccphylo_upgma": + self._ccphylo_solve(cassiopeia_tree,layer, + collapse_mutationless_edges,logfile,method="upgma") return node_name_generator = solver_utilities.node_name_generator() @@ -231,11 +233,13 @@ def _ccphylo_solve( layer: Optional[str] = None, collapse_mutationless_edges: bool = False, logfile: str = "stdout.log", + method: str = "dnj" ) -> None: """Solves a tree using fast distance-based algorithms implemented by CCPhylo. To call this method the CCPhlyo package must be installed - and the ccphylo_path must be set in the config file. The method attribute - specifies which algorithm to use. The function will update the `tree`. + and the ccphylo_path must be set in the config file. The method + attribute specifies which algorithm to use. The function will update the + `tree`. Args: cassiopeia_tree: CassiopeiaTree object to be populated @@ -255,11 +259,16 @@ def _ccphylo_solve( # save dissimilarity map as phylip file dis_path = os.path.join(temp_dir, "dist.phylip") tree_path = os.path.join(temp_dir, "tree.nwk") - solver_utilities.save_dissimilarity_as_phylip(dissimilarity_map, dis_path) + solver_utilities.save_dissimilarity_as_phylip(dissimilarity_map, + dis_path) # run ccphylo - command = f"{self.ccphylo_path} tree -i {dis_path} -o {tree_path} -m {self.fast_method}" - process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + command = (f"{self._ccphylo_path} tree -i {dis_path} -o " + f"{tree_path} -m {method}") + + process = subprocess.Popen(command, shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) stdout, stderr = process.communicate() T = ete3.Tree(tree_path, format=1) @@ -268,7 +277,7 @@ def _ccphylo_solve( os.remove(tree_path) # Covert to networkx - tree = ete3_to_networkx(T).to_undirected() + tree = utilities.ete3_to_networkx(T).to_undirected() # find last split midpoint = T.get_midpoint_outgroup() @@ -311,24 +320,26 @@ def _setup_ccphylo(self) -> None: # get ccphylo path config = configparser.ConfigParser() config.read(os.path.join(os.path.dirname(__file__),"..","config.ini")) - self.ccphylo_path = config.get("Paths","ccphylo_path") + self._ccphylo_path = config.get("Paths","ccphylo_path") #check that ccphylo_path is valid - if not os.path.exists(self.ccphylo_path): + if not os.path.exists(self._ccphylo_path): raise DistanceSolverError( - f"ccphylo_path {self.ccphylo_path} does not exist. To use fast " - "versions of Neighbor-Joining and UPGMA please install " - "CCPhylo (https://bitbucket.org/genomicepidemiology/ccphylo/src/master/) " - "set the ccphylo_path in the config.ini file then reinstall Cassiopeia." + f"ccphylo_path {self._ccphylo_path} does not exist. To use fast " + "versions of Neighbor-Joining and UPGMA please install CCPhylo " + "(https://bitbucket.org/genomicepidemiology/ccphylo/src/master/)" + "set the ccphylo_path in the config.ini file then reinstall " + "Cassiopeia." ) #check that ccphylo_path is executable - if not os.access(self.ccphylo_path, os.X_OK): + if not os.access(self._ccphylo_path, os.X_OK): raise DistanceSolverError( - f"ccphylo_path {self.ccphylo_path} is not executable. To use fast " - "versions of Neighbor-Joining and UPGMA please install " - "CCPhylo (https://bitbucket.org/genomicepidemiology/ccphylo/src/master/) " - "set the ccphylo_path in the config.ini file then reinstall Cassiopeia." + f"ccphylo_path {self._ccphylo_path} is not executable. To use " + "fast versions of Neighbor-Joining and UPGMA please install CCPhylo" + " (https://bitbucket.org/genomicepidemiology/ccphylo/src/master/) " + "set the ccphylo_path in the config.ini file then reinstall " + "Cassiopeia." ) def setup_dissimilarity_map( @@ -362,8 +373,8 @@ def setup_dissimilarity_map( else: raise DistanceSolverError( - "Please specify an explicit root sample in the Cassiopeia Tree" - " or specify the solver to add an implicit root" + "Please specify an explicit root sample in the Cassiopeia" + " Tree or specify the solver to add an implicit root" ) if cassiopeia_tree.get_dissimilarity_map() is None: diff --git a/cassiopeia/solver/DynamicNeighborJoiningSolver.py b/cassiopeia/solver/DynamicNeighborJoiningSolver.py deleted file mode 100755 index 4acb0ed2..00000000 --- a/cassiopeia/solver/DynamicNeighborJoiningSolver.py +++ /dev/null @@ -1,67 +0,0 @@ -""" -This file stores a subclass of NeighborJoiningSolver, DynamicNeighborJoining. -The inference procedure is the Dymanic Neighbor-Joining algorithm proposed by -Clausen (2023). https://pubmed.ncbi.nlm.nih.gov/36453849/ -""" -from typing import Callable, Dict, List, Optional, Tuple, Union - -import numpy as np - -from cassiopeia.mixins import DistanceSolverError -from cassiopeia.solver import ( - NeighborJoiningSolver, - dissimilarity_functions, -) - -class DynamicNeighborJoiningSolver(NeighborJoiningSolver): - """ - Dynamic Neighbor-Joining class for Cassiopeia. - - Implements the Dynamic Neighbor-Joining algorithm described by Clausen (2023) - as a derived class of CCPhyloSolver. This class inherits the generic - `solve` method. This algorithm is guaranteed to return an exact solution. - - Args: - dissimilarity_function: A function by which to compute the dissimilarity - map. Optional if a dissimilarity map is already provided. - add_root: Whether or not to add an implicit root the tree, i.e. a root - with unmutated characters. If set to False, and no explicit root is - provided in the CassiopeiaTree, then will return an unrooted, - undirected tree - prior_transformation: Function to use when transforming priors into - weights. Supports the following transformations: - "negative_log": Transforms each probability by the negative - log (default) - "inverse": Transforms each probability p by taking 1/p - "square_root_inverse": Transforms each probability by the - the square root of 1/p - - Attributes: - dissimilarity_function: Function used to compute dissimilarity between - samples. - add_root: Whether or not to add an implicit root the tree. - prior_transformation: Function to use when transforming priors into - weights. - - """ - - def __init__( - self, - dissimilarity_function: Optional[ - Callable[ - [np.array, np.array, int, Dict[int, Dict[int, float]]], float - ] - ] = dissimilarity_functions.weighted_hamming_distance, - add_root: bool = False, - prior_transformation: str = "negative_log", - ): - # setup fast solver - self.fast_solver = "ccphylo" - self.fast_method = "dnj" - - super().__init__( - dissimilarity_function=dissimilarity_function, - add_root=add_root, - prior_transformation=prior_transformation, - fast = True, - ) \ No newline at end of file diff --git a/cassiopeia/solver/HeuristicNeighborJoiningSolver.py b/cassiopeia/solver/HeuristicNeighborJoiningSolver.py deleted file mode 100755 index f4c71f16..00000000 --- a/cassiopeia/solver/HeuristicNeighborJoiningSolver.py +++ /dev/null @@ -1,68 +0,0 @@ -""" -This file stores a subclass of NeighborJoiningSolver, HeuristicNeighborJoiningSolver. -The inference procedure is the Heuristic Neighbor-Joining algorithm proposed by -Clausen (2023). https://pubmed.ncbi.nlm.nih.gov/36453849/ -""" -from typing import Callable, Dict, List, Optional, Tuple, Union - -import numpy as np - -from cassiopeia.data import CassiopeiaTree -from cassiopeia.solver import ( - NeighborJoiningSolver, - dissimilarity_functions, -) - -class HeuristicNeighborJoiningSolver(NeighborJoiningSolver): - """ - Heuristic Neighbor-Joining class for Cassiopeia. - - Implements the Heuristic Neighbor-Joining algorithm described by Clausen (2023) - as a derived class of NeighborJoiningSolver. This class inherits the generic - `solve` method. This algorithm is not guaranteed to return an exact solution. - - - Args: - dissimilarity_function: A function by which to compute the dissimilarity - map. Optional if a dissimilarity map is already provided. - add_root: Whether or not to add an implicit root the tree, i.e. a root - with unmutated characters. If set to False, and no explicit root is - provided in the CassiopeiaTree, then will return an unrooted, - undirected tree - prior_transformation: Function to use when transforming priors into - weights. Supports the following transformations: - "negative_log": Transforms each probability by the negative - log (default) - "inverse": Transforms each probability p by taking 1/p - "square_root_inverse": Transforms each probability by the - the square root of 1/p - - Attributes: - dissimilarity_function: Function used to compute dissimilarity between - samples. - add_root: Whether or not to add an implicit root the tree. - prior_transformation: Function to use when transforming priors into - weights. - - """ - - def __init__( - self, - dissimilarity_function: Optional[ - Callable[ - [np.array, np.array, int, Dict[int, Dict[int, float]]], float - ] - ] = dissimilarity_functions.weighted_hamming_distance, - add_root: bool = False, - prior_transformation: str = "negative_log", - ): - # setup fast solver - self.fast_solver = "ccphylo" - self.fast_method = "dnj" - - super().__init__( - dissimilarity_function=dissimilarity_function, - add_root=add_root, - prior_transformation=prior_transformation, - fast = True, - ) \ No newline at end of file diff --git a/cassiopeia/solver/NeighborJoiningSolver.py b/cassiopeia/solver/NeighborJoiningSolver.py index d68ea806..274be39e 100755 --- a/cassiopeia/solver/NeighborJoiningSolver.py +++ b/cassiopeia/solver/NeighborJoiningSolver.py @@ -1,5 +1,5 @@ """ -This file stores a subclass of CCPhyloSolver, NeighborJoiningSolver. The +This file stores a subclass of DistanceSolver, NeighborJoiningSolver. The inference procedure is the Neighbor-Joining algorithm proposed by Saitou and Nei (1987) that iteratively joins together samples that minimize the Q-criterion on the dissimilarity map. @@ -13,6 +13,7 @@ import pandas as pd from cassiopeia.data import CassiopeiaTree +from cassiopeia.mixins import DistanceSolverError from cassiopeia.solver import ( DistanceSolver, dissimilarity_functions, @@ -26,8 +27,8 @@ class NeighborJoiningSolver(DistanceSolver.DistanceSolver): Implements the Neighbor-Joining algorithm described by Saitou and Nei (1987) as a derived class of DistanceSolver. This class inherits the generic `solve` method, but implements its own procedure for finding cherries by - minimizing the Q-criterion between samples. If fast is set to True, then - the fast Neighbor-Joining implementation from CCPhylo of is used. + minimizing the Q-criterion between samples. If fast is set to True, + a fast NJ implementation of is used. Args: dissimilarity_function: A function by which to compute the dissimilarity @@ -43,7 +44,15 @@ class NeighborJoiningSolver(DistanceSolver.DistanceSolver): "inverse": Transforms each probability p by taking 1/p "square_root_inverse": Transforms each probability by the the square root of 1/p - fast: Whether to use the fast CCPhylo implementation of Neighbor-Joining. + fast: Whether to use a fast implementation of Neighbor-Joining. + implementation: Which fast implementation to use. Options are: + "ccphylo_dnj": CCPhylo implementation the Dynamic Neighbor-Joining + algorithm described by Clausen (2023). Solution in guaranteed + to be exact. + "ccphylo_hnj": CCPhylo implementation of the Heuristic + Neighbor-Joining algorithm described by Clausen (2023). + Solution is not guaranteed to be exact. + "ccphylo_nj": CCPhylo implementation of the Neighbor-Joining. Attributes: dissimilarity_function: Function used to compute dissimilarity between @@ -64,18 +73,24 @@ def __init__( add_root: bool = False, prior_transformation: str = "negative_log", fast: bool = False, + implementation: str = "ccphylo_dnj", ): - # setup fast solver if fast: - self.fast_solver = "ccphylo" - self.fast_method = "nj" + if implementation in ["ccphylo_dnj", "ccphylo_hnj", "ccphylo_nj"]: + self._implementation = implementation + else: + raise DistanceSolverError( + "Invalid fast implementation of Neighbor-Joining. Options " + "are: 'ccphylo_dnj', 'ccphylo_hnj', 'ccphylo_nj'" + ) + else: + self._implementation = "generic_nj" super().__init__( dissimilarity_function=dissimilarity_function, add_root=add_root, prior_transformation=prior_transformation, - fast = fast, ) def root_tree( diff --git a/cassiopeia/solver/SpectralNeighborJoiningSolver.py b/cassiopeia/solver/SpectralNeighborJoiningSolver.py index ec1c8ca0..acc6d431 100644 --- a/cassiopeia/solver/SpectralNeighborJoiningSolver.py +++ b/cassiopeia/solver/SpectralNeighborJoiningSolver.py @@ -71,6 +71,8 @@ def __init__( add_root: bool = False, prior_transformation: str = "negative_log", ): + self._implementation = "generic_spectral_nj" + super().__init__( dissimilarity_function=similarity_function, add_root=add_root, @@ -80,6 +82,7 @@ def __init__( self._similarity_map = None self._lambda_indices = None + def get_dissimilarity_map( self, cassiopeia_tree: CassiopeiaTree, layer: Optional[str] = None ) -> pd.DataFrame: diff --git a/cassiopeia/solver/UPGMASolver.py b/cassiopeia/solver/UPGMASolver.py index bcaa10ec..cc87b648 100644 --- a/cassiopeia/solver/UPGMASolver.py +++ b/cassiopeia/solver/UPGMASolver.py @@ -13,6 +13,7 @@ import pandas as pd from cassiopeia.data import CassiopeiaTree +from cassiopeia.mixins import DistanceSolverError from cassiopeia.solver import DistanceSolver, dissimilarity_functions @@ -26,8 +27,7 @@ class UPGMASolver(DistanceSolver.DistanceSolver): dissimilarity between samples. After joining nodes, the dissimilarities are updated by averaging the distances of elements in the new cluster with each existing node. Produces a rooted tree that is assumed to be - ultrametric. If fast is set to True, then - the fast UPGMA implementation from CCPhylo is used. + ultrametric. If fast is set to True, a fast UPGMA implementation of is used. Args: dissimilarity_function: A function by which to compute the dissimilarity @@ -39,7 +39,9 @@ class UPGMASolver(DistanceSolver.DistanceSolver): "inverse": Transforms each probability p by taking 1/p "square_root_inverse": Transforms each probability by the the square root of 1/p - fast: Whether to use the fast CCPhylo implementation of UPGMA. + fast: Whether to use a fast implementation of UPGMA. + implementation: Which fast implementation to use. Options are: + "ccphylo_upgma": Uses the fast UPGMA implementation from CCPhylo. Attributes: dissimilarity_function: Function used to compute dissimilarity between samples. @@ -57,18 +59,24 @@ def __init__( ] = dissimilarity_functions.weighted_hamming_distance, prior_transformation: str = "negative_log", fast: bool = False, + implementation: str = "ccphylo_upgma", ): - # setup fast solver if fast: - self.fast_solver = "ccphylo" - self.fast_method = "upgma" + if implementation in ["ccphylo_upgma"]: + self._implementation = implementation + else: + raise DistanceSolverError( + "Invalid fast implementation of UPGMA. Options are: " + "'ccphylo_upgma'" + ) + else: + self._implementation = "generic_upgma" super().__init__( dissimilarity_function=dissimilarity_function, add_root=True, prior_transformation=prior_transformation, - fast = fast, ) self.__cluster_to_cluster_size = defaultdict(int) diff --git a/cassiopeia/solver/__init__.py b/cassiopeia/solver/__init__.py index 24175c0c..d6c0146b 100755 --- a/cassiopeia/solver/__init__.py +++ b/cassiopeia/solver/__init__.py @@ -12,6 +12,4 @@ from .UPGMASolver import UPGMASolver from .VanillaGreedySolver import VanillaGreedySolver from .SpectralNeighborJoiningSolver import SpectralNeighborJoiningSolver -from .DynamicNeighborJoiningSolver import DynamicNeighborJoiningSolver -from .HeuristicNeighborJoiningSolver import HeuristicNeighborJoiningSolver from . import dissimilarity_functions as dissimilarity diff --git a/test/solver_tests/ccphylo_solver_test.py b/test/solver_tests/ccphylo_solver_test.py index 9ac03bdd..759231de 100755 --- a/test/solver_tests/ccphylo_solver_test.py +++ b/test/solver_tests/ccphylo_solver_test.py @@ -1,12 +1,13 @@ """ -Test CCPhyloSolver in Cassiopeia.solver. +Test the ccphylo solver implementations against the standard NJ and UPGMA """ import unittest from typing import Dict, Optional from unittest import mock -import configparser import os + +import configparser import itertools import networkx as nx import numpy as np @@ -47,179 +48,210 @@ def delta_fn( # only run test if ccphylo_path is specified in config.ini config = configparser.ConfigParser() -config.read(os.path.join(os.path.dirname(__file__),"..","..","cassiopeia","config.ini")) -path_set = config.get("Paths","ccphylo_path") != "/path/to/ccphylo/ccphylo" +config.read(os.path.join(os.path.dirname(__file__), + "..","..","cassiopeia","config.ini")) +CCPHYLO_CONFIGURED = (config.get("Paths","ccphylo_path") != + "/path/to/ccphylo/ccphylo") +print(CCPHYLO_CONFIGURED) class TestCCPhyloSolver(unittest.TestCase): + @unittest.skipUnless(CCPHYLO_CONFIGURED, "CCPhylo not configured.") def setUp(self): - if path_set: - - # --------------------- General NJ --------------------- - cm = pd.DataFrame.from_dict( - { - "a": [0, 1, 2, 1, 0, 0, 2, 0, 0, 0], - "b": [1, 1, 2, 1, 0, 0, 2, 0, 0, 0], - "c": [2, 2, 2, 1, 0, 0, 2, 0, 0, 0], - "d": [1, 1, 1, 1, 0, 0, 2, 0, 0, 0], - "e": [0, 0, 0, 0, 1, 2, 1, 0, 2, 0], - "f": [0, 0, 0, 0, 2, 2, 1, 0, 2, 0], - "g": [0, 2, 0, 0, 1, 1, 1, 0, 2, 0], - "h": [0, 2, 0, 0, 1, 0, 0, 1, 2, 1], - "i": [1, 2, 0, 0, 1, 0, 0, 2, 2, 1], - "j": [1, 2, 0, 0, 1, 0, 0, 1, 1, 1], - }, - orient="index", - columns=["x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10"], - ) - - self.cm = cm - self.basic_tree = cas.data.CassiopeiaTree( - character_matrix=cm - ) - - self.fast_nj_solver = cas.solver.NeighborJoiningSolver(add_root=True,fast=True) - self.nj_solver = cas.solver.NeighborJoiningSolver(add_root=True,fast=False) - self.fast_upgma_solver = cas.solver.UPGMASolver(fast=True) - self.upgma_solver = cas.solver.UPGMASolver(fast=False) - self.dnj_solver = cas.solver.DynamicNeighborJoiningSolver(add_root=True) - self.hnj_solver = cas.solver.HeuristicNeighborJoiningSolver(add_root=True) - - # ------------- CM with Duplictes ----------------------- - duplicates_cm = pd.DataFrame.from_dict( - { - "a": [1, 1, 0], - "b": [1, 2, 0], - "c": [1, 2, 1], - "d": [2, 0, 0], - "e": [2, 0, 2], - "f": [2, 0, 2], - }, - orient="index", - columns=["x1", "x2", "x3"], - ) - - self.duplicate_tree = cas.data.CassiopeiaTree( - character_matrix=duplicates_cm - ) - - def test_fast_nj_solver(self): - if path_set: - # NJ Solver - nj_tree = self.basic_tree.copy() - self.nj_solver.solve(nj_tree) - - # CCPhylo Fast NJ Solver - fast_nj_tree = self.basic_tree.copy() - self.fast_nj_solver.solve(fast_nj_tree) - - # test for expected number of edges - self.assertEqual(len(nj_tree.edges), len(fast_nj_tree.edges)) - - triplets = itertools.combinations(["a", "c", "d", "e"], 3) - for triplet in triplets: - expected_triplet = find_triplet_structure(triplet, nj_tree.get_tree_topology()) - observed_triplet = find_triplet_structure(triplet, fast_nj_tree.get_tree_topology()) - self.assertEqual(expected_triplet, observed_triplet) - - def test_dnj_solver(self): - if path_set: - # NJ Solver - nj_tree = self.basic_tree.copy() - self.nj_solver.solve(nj_tree) - - # CCPhylo DNJ Solver - dnj_tree = self.basic_tree.copy() - self.dnj_solver.solve(dnj_tree) - - # test for expected number of edges - self.assertEqual(len(nj_tree.edges), len(dnj_tree.edges)) - - triplets = itertools.combinations(["a", "c", "d", "e"], 3) - for triplet in triplets: - expected_triplet = find_triplet_structure(triplet, nj_tree.get_tree_topology()) - observed_triplet = find_triplet_structure(triplet, dnj_tree.get_tree_topology()) - self.assertEqual(expected_triplet, observed_triplet) - - def test_hnj_solver(self): - if path_set: - # NJ Solver - nj_tree = self.basic_tree.copy() - self.nj_solver.solve(nj_tree) - - # CCPhylo HNJ Solver - hnj_tree = self.basic_tree.copy() - self.hnj_solver.solve(hnj_tree) - - # test for expected number of edges - self.assertEqual(len(nj_tree.edges), len(hnj_tree.edges)) - - - triplets = itertools.combinations(["a", "c", "d", "e"], 3) - for triplet in triplets: - expected_triplet = find_triplet_structure(triplet, nj_tree.get_tree_topology()) - observed_triplet = find_triplet_structure(triplet, hnj_tree.get_tree_topology()) - self.assertEqual(expected_triplet, observed_triplet) - - def test_fast_upgma_solver(self): - if path_set: - # UPGMA Solver - upgma_tree = self.basic_tree.copy() - self.upgma_solver.solve(upgma_tree) - - # CCPhylo Fast UPGMA Solver - fast_upgma_tree = self.basic_tree.copy() - self.fast_upgma_solver.solve(fast_upgma_tree) - - # test for expected number of edges - self.assertEqual(len(upgma_tree.edges), len(fast_upgma_tree.edges)) - - - triplets = itertools.combinations(["a", "c", "d", "e"], 3) - for triplet in triplets: - expected_triplet = find_triplet_structure(triplet, upgma_tree.get_tree_topology()) - observed_triplet = find_triplet_structure(triplet, fast_upgma_tree.get_tree_topology()) - self.assertEqual(expected_triplet, observed_triplet) - - #test collapse mutationless edges working - def test_collapse_mutationless_edges_ccphylo(self): - if path_set: - # NJ Solver - nj_tree = self.basic_tree.copy() - self.nj_solver.solve(nj_tree, collapse_mutationless_edges=True) - # Fast NJ Solver - fast_nj_tree = self.basic_tree.copy() - self.fast_nj_solver.solve(fast_nj_tree, collapse_mutationless_edges=True) - - # test for expected number of edges - self.assertEqual(len(nj_tree.edges), len(fast_nj_tree.edges)) - - triplets = itertools.combinations(["a", "c", "d", "e"], 3) - for triplet in triplets: - expected_triplet = find_triplet_structure(triplet, nj_tree.get_tree_topology()) - observed_triplet = find_triplet_structure(triplet, fast_nj_tree.get_tree_topology()) - self.assertEqual(expected_triplet, observed_triplet) - - # test duplicate samples + # --------------------- General NJ --------------------- + cm = pd.DataFrame.from_dict( + { + "a": [0, 1, 2, 1, 0, 0, 2, 0, 0, 0], + "b": [1, 1, 2, 1, 0, 0, 2, 0, 0, 0], + "c": [2, 2, 2, 1, 0, 0, 2, 0, 0, 0], + "d": [1, 1, 1, 1, 0, 0, 2, 0, 0, 0], + "e": [0, 0, 0, 0, 1, 2, 1, 0, 2, 0], + "f": [0, 0, 0, 0, 2, 2, 1, 0, 2, 0], + "g": [0, 2, 0, 0, 1, 1, 1, 0, 2, 0], + "h": [0, 2, 0, 0, 1, 0, 0, 1, 2, 1], + "i": [1, 2, 0, 0, 1, 0, 0, 2, 2, 1], + "j": [1, 2, 0, 0, 1, 0, 0, 1, 1, 1], + }, + orient="index", + columns=["x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", + "x9", "x10"], + ) + + self.cm = cm + self.basic_tree = cas.data.CassiopeiaTree( + character_matrix=cm + ) + + self.nj_solver = cas.solver.NeighborJoiningSolver( + add_root=True,fast=False) + self.ccphylo_nj_solver = cas.solver.NeighborJoiningSolver( + add_root=True,fast=True,implementation="ccphylo_nj") + self.ccphylo_dnj_solver = cas.solver.NeighborJoiningSolver( + add_root=True,fast = True, implementation="ccphylo_dnj") + self.ccphylo_hnj_solver = cas.solver.NeighborJoiningSolver( + add_root=True, fast = True, implementation="ccphylo_hnj") + + self.ccphylo_upgma_solver = cas.solver.UPGMASolver(fast=True) + self.upgma_solver = cas.solver.UPGMASolver(fast=False) + + + # ------------- CM with Duplictes ----------------------- + duplicates_cm = pd.DataFrame.from_dict( + { + "a": [1, 1, 0], + "b": [1, 2, 0], + "c": [1, 2, 1], + "d": [2, 0, 0], + "e": [2, 0, 2], + "f": [2, 0, 2], + }, + orient="index", + columns=["x1", "x2", "x3"], + ) + + self.duplicate_tree = cas.data.CassiopeiaTree( + character_matrix=duplicates_cm + ) + + @unittest.skipUnless(CCPHYLO_CONFIGURED, "CCPhylo not configured.") + def test_ccphylo_invalid_input(self): + with self.assertRaises(cas.solver.DistanceSolver.DistanceSolverError): + nothing_solver = cas.solver.NeighborJoiningSolver(fast = True, + implementation="invalid") + + with self.assertRaises(cas.solver.DistanceSolver.DistanceSolverError): + nothing_solver = cas.solver.UPGMASolver(fast = True, + implementation="invalid") + + @unittest.skipUnless(CCPHYLO_CONFIGURED, "CCPhylo not configured.") + def test_ccphylo_nj_solver(self): + # NJ Solver + nj_tree = self.basic_tree.copy() + self.nj_solver.solve(nj_tree) + + # CCPhylo Fast NJ Solver + ccphylo_nj_tree = self.basic_tree.copy() + self.ccphylo_nj_solver.solve(ccphylo_nj_tree) + + # test for expected number of edges + self.assertEqual(len(nj_tree.edges), len(ccphylo_nj_tree.edges)) + + triplets = itertools.combinations(["a", "c", "d", "e"], 3) + for triplet in triplets: + expected_triplet = find_triplet_structure(triplet, + nj_tree.get_tree_topology()) + observed_triplet = find_triplet_structure(triplet, + ccphylo_nj_tree.get_tree_topology()) + self.assertEqual(expected_triplet, observed_triplet) + + @unittest.skipUnless(CCPHYLO_CONFIGURED, "CCPhylo not configured.") + def test_ccphylo_dnj_solver(self): + # NJ Solver + nj_tree = self.basic_tree.copy() + self.nj_solver.solve(nj_tree) + + # CCPhylo DNJ Solver + dnj_tree = self.basic_tree.copy() + self.ccphylo_dnj_solver.solve(dnj_tree) + + # test for expected number of edges + self.assertEqual(len(nj_tree.edges), len(dnj_tree.edges)) + + triplets = itertools.combinations(["a", "c", "d", "e"], 3) + for triplet in triplets: + expected_triplet = find_triplet_structure(triplet, + nj_tree.get_tree_topology()) + observed_triplet = find_triplet_structure(triplet, + dnj_tree.get_tree_topology()) + self.assertEqual(expected_triplet, observed_triplet) + + @unittest.skipUnless(CCPHYLO_CONFIGURED, "CCPhylo not configured.") + def test_ccphylo_hnj_solver(self): + # NJ Solver + nj_tree = self.basic_tree.copy() + self.nj_solver.solve(nj_tree) + + # CCPhylo HNJ Solver + hnj_tree = self.basic_tree.copy() + self.ccphylo_hnj_solver.solve(hnj_tree) + + # test for expected number of edges + self.assertEqual(len(nj_tree.edges), len(hnj_tree.edges)) + + + triplets = itertools.combinations(["a", "c", "d", "e"], 3) + for triplet in triplets: + expected_triplet = find_triplet_structure(triplet, + nj_tree.get_tree_topology()) + observed_triplet = find_triplet_structure(triplet, + hnj_tree.get_tree_topology()) + self.assertEqual(expected_triplet, observed_triplet) + + @unittest.skipUnless(CCPHYLO_CONFIGURED, "CCPhylo not configured.") + def test_ccphylo_upgma_solver(self): + # UPGMA Solver + upgma_tree = self.basic_tree.copy() + self.upgma_solver.solve(upgma_tree) + + # CCPhylo Fast UPGMA Solver + ccphylo_upgma_tree = self.basic_tree.copy() + self.ccphylo_upgma_solver.solve(ccphylo_upgma_tree) + + # test for expected number of edges + self.assertEqual(len(upgma_tree.edges), len(ccphylo_upgma_tree.edges)) + + + triplets = itertools.combinations(["a", "c", "d", "e"], 3) + for triplet in triplets: + expected_triplet = find_triplet_structure(triplet, + upgma_tree.get_tree_topology()) + observed_triplet = find_triplet_structure(triplet, + ccphylo_upgma_tree.get_tree_topology()) + self.assertEqual(expected_triplet, observed_triplet) + + @unittest.skipUnless(CCPHYLO_CONFIGURED, "CCPhylo not configured.") + def test_collapse_mutationless_edges_ccphylo(self): + # NJ Solver + nj_tree = self.basic_tree.copy() + self.nj_solver.solve(nj_tree, collapse_mutationless_edges=True) + + # Fast NJ Solver + ccphylo_nj_tree = self.basic_tree.copy() + self.ccphylo_nj_solver.solve(ccphylo_nj_tree, + collapse_mutationless_edges=True) + + # test for expected number of edges + self.assertEqual(len(nj_tree.edges), len(ccphylo_nj_tree.edges)) + + triplets = itertools.combinations(["a", "c", "d", "e"], 3) + for triplet in triplets: + expected_triplet = find_triplet_structure(triplet, + nj_tree.get_tree_topology()) + observed_triplet = find_triplet_structure(triplet, + ccphylo_nj_tree.get_tree_topology()) + self.assertEqual(expected_triplet, observed_triplet) + + @unittest.skipUnless(CCPHYLO_CONFIGURED, "CCPhylo not configured.") def test_duplicate_sample_ccphylo(self): - if path_set: - # NJ Solver - nj_tree = self.duplicate_tree.copy() - self.nj_solver.solve(nj_tree) - - # Fast NJ Solver - fast_nj_tree = self.duplicate_tree.copy() - self.fast_nj_solver.solve(fast_nj_tree) - - # test for expected number of edges - self.assertEqual(len(nj_tree.edges), len(fast_nj_tree.edges)) - - triplets = itertools.combinations(["a", "b", "c", "d", "e", "f"], 3) - for triplet in triplets: - expected_triplet = find_triplet_structure(triplet, nj_tree.get_tree_topology()) - observed_triplet = find_triplet_structure(triplet, fast_nj_tree.get_tree_topology()) - self.assertEqual(expected_triplet, observed_triplet) + # NJ Solver + nj_tree = self.duplicate_tree.copy() + self.nj_solver.solve(nj_tree) + + # Fast NJ Solver + ccphylo_nj_tree = self.duplicate_tree.copy() + self.ccphylo_nj_solver.solve(ccphylo_nj_tree) + + # test for expected number of edges + self.assertEqual(len(nj_tree.edges), len(ccphylo_nj_tree.edges)) + + triplets = itertools.combinations(["a", "b", "c", "d", "e", "f"], 3) + for triplet in triplets: + expected_triplet = find_triplet_structure(triplet, + nj_tree.get_tree_topology()) + observed_triplet = find_triplet_structure(triplet, + ccphylo_nj_tree.get_tree_topology()) + self.assertEqual(expected_triplet, observed_triplet) if __name__ == "__main__": unittest.main() \ No newline at end of file diff --git a/test/solver_tests/dissimilarity_functions_test.py b/test/solver_tests/dissimilarity_functions_test.py index e3209c57..e9db7a82 100755 --- a/test/solver_tests/dissimilarity_functions_test.py +++ b/test/solver_tests/dissimilarity_functions_test.py @@ -6,6 +6,7 @@ from unittest import mock import numpy as np +import pandas as pd from cassiopeia.solver import dissimilarity_functions from cassiopeia.solver import solver_utilities @@ -298,6 +299,27 @@ def test_hamming_distance_ignore_missing(self): self.assertEqual(distance, 0) + def test_save_dissimilarity_as_phylip(self): + # Create a sample dissimilarity map + data = [[0.0, 0.5, 0.7], [0.5, 0.0, 0.3], [0.7, 0.3, 0.0]] + index = ['A', 'B', 'C'] + dissimilarity_map = pd.DataFrame(data, index=index) + + # Expected content in the mock file + expected_content = ("3\n" + "A\t0.0000\n" + "B\t0.5000\t0.0000\n" + "C\t0.7000\t0.3000\t0.0000\n") + + # Mock the open function to use a mock file object + with mock.patch("builtins.open", mock.mock_open()) as mock_file: + solver_utilities.save_dissimilarity_as_phylip(dissimilarity_map, + "dummy_path") + mock_file.assert_called_once_with("dummy_path", "w") + mock_file().write.assert_called() + self.assertIn(expected_content, "".join(call[0][0] for + call in mock_file().write.call_args_list)) + if __name__ == "__main__": unittest.main() From 8c25ed5630d619a48eec65c8fff50d507d76e1bf Mon Sep 17 00:00:00 2001 From: colganwi Date: Fri, 18 Aug 2023 16:24:20 -0400 Subject: [PATCH 7/8] Added config to .gitignore --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 4cb7d7a2..1c3ff383 100755 --- a/.gitignore +++ b/.gitignore @@ -164,4 +164,5 @@ build stdout.log notebooks/.ipynb_checkpoints cassiopeia/tools/branch_length_estimator/_iid_exponential_bayesian.cpp -docs/api/reference/** \ No newline at end of file +docs/api/reference/** +cassiopeia/config.ini From 7f96e865ec8861859a9816d870847449f908204a Mon Sep 17 00:00:00 2001 From: Matthew Gregory Jones Date: Fri, 18 Aug 2023 13:50:57 -0700 Subject: [PATCH 8/8] reordered imports in DistanceSolver --- cassiopeia/solver/DistanceSolver.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cassiopeia/solver/DistanceSolver.py b/cassiopeia/solver/DistanceSolver.py index f3caee04..9fdbaa30 100644 --- a/cassiopeia/solver/DistanceSolver.py +++ b/cassiopeia/solver/DistanceSolver.py @@ -7,13 +7,12 @@ solving trees with CCPhylo but this will be moved with switch to compositional framework. """ -import abc -from typing import Callable, Dict, List, Optional, Tuple - import os +import abc import subprocess import tempfile +from typing import Callable, Dict, List, Optional, Tuple import configparser import ete3