diff --git a/brokit/cd_hit.py b/brokit/cd_hit.py index a323ca2..84d27fc 100644 --- a/brokit/cd_hit.py +++ b/brokit/cd_hit.py @@ -108,7 +108,7 @@ class CD_HIT(CommandLineApplication): '-p':ValuedParameter('-',Name='p',Delimiter=' '), # 1 or 0, default 0 - # by cd-hit's default algorithm, a sequence is clustered to the first + # by cd-hit's default algorithm, a sequence is clustered to the first # cluster that meet the threshold (fast cluster). If set to 1, the program # will cluster it into the most similar cluster that meet the threshold # (accurate but slow mode) @@ -119,7 +119,7 @@ class CD_HIT(CommandLineApplication): '-h':ValuedParameter('-',Name='h',Delimiter=' ') } _synonyms = {'Similarity':'-c'} - + def getHelp(self): """Method that points to documentation""" help_str =\ @@ -129,12 +129,12 @@ def getHelp(self): The following papers should be cited if this resource is used: - Clustering of highly homologous sequences to reduce thesize of large + Clustering of highly homologous sequences to reduce thesize of large protein database", Weizhong Li, Lukasz Jaroszewski & Adam Godzik Bioinformatics, (2001) 17:282-283 Tolerating some redundancy significantly speeds up clustering of large - protein databases", Weizhong Li, Lukasz Jaroszewski & Adam Godzik + protein databases", Weizhong Li, Lukasz Jaroszewski & Adam Godzik Bioinformatics, (2002) 18:77-82 """ return help_str @@ -213,7 +213,7 @@ class CD_HIT_EST(CD_HIT): '-r':ValuedParameter('-',Name='r',Delimiter=' ') }) -def cdhit_clusters_from_seqs(seqs, moltype, params=None): +def cdhit_clusters_from_seqs(seqs, moltype=DNA, params=None): """Returns the CD-HIT clusters given seqs seqs : dict like collection of sequences @@ -230,7 +230,7 @@ def cdhit_clusters_from_seqs(seqs, moltype, params=None): int_map, int_keys = seqs.getIntMap() #Create SequenceCollection from int_map. int_map = SequenceCollection(int_map,MolType=moltype) - + # setup params and make sure the output argument is set if params is None: params = {} @@ -332,4 +332,3 @@ def parse_cdhit_clstr_file(lines): clusters.append(curr_cluster) return clusters - diff --git a/brokit/clearcut.py b/brokit/clearcut.py index 1bc8611..57138c4 100644 --- a/brokit/clearcut.py +++ b/brokit/clearcut.py @@ -22,10 +22,10 @@ class Clearcut(CommandLineApplication): - """ clearcut application controller - - The parameters are organized by function to give some idea of how the - program works. However, no restrictions are put on any combinations + """ clearcut application controller + + The parameters are organized by function to give some idea of how the + program works. However, no restrictions are put on any combinations of parameters. Misuse of parameters can lead to errors or otherwise strange results. """ @@ -43,16 +43,16 @@ class Clearcut(CommandLineApplication): '-S':FlagParameter('-',Name='S'), #--neighbor. Use traditional Neighbor-Joining algorithm. (Default: OFF) '-N':FlagParameter('-',Name='N'), - + } - + # Input file is distance matrix or alignment. Default expects distance # matrix. Output file is tree created by clearcut. _input = {\ # --in=. Input file '--in':ValuedParameter('--',Name='in',Delimiter='=',IsPath=True), - # --stdin. Read input from STDIN. + # --stdin. Read input from STDIN. '-I':FlagParameter('-',Name='I'), # --distance. Input file is a distance matrix. (Default: ON) '-d':FlagParameter('-',Name='d',Value=True), @@ -64,17 +64,17 @@ class Clearcut(CommandLineApplication): # --protein. Input alignment are protein sequences. '-P':FlagParameter('-',Name='P'), } - - + + #Correction model for computing distance matrix (Default: NO Correction): _correction={\ # --jukes. Use Jukes-Cantor correction for computing distance matrix. '-j':FlagParameter('-',Name='j'), # --kimura. Use Kimura correction for distance matrix. '-k':FlagParameter('-',Name='k'), - + } - + _output={\ # --out=. Output file '--out':ValuedParameter('--',Name='out',Delimiter='=',IsPath=True), @@ -88,10 +88,10 @@ class Clearcut(CommandLineApplication): '-e':FlagParameter('-',Name='e'), # --expdist. Exponential notation in distance output. (Default: OFF) '-E':FlagParameter('-',Name='E'), - + } - + #NOT SUPPORTED #'-h':FlagParameter('-','h'), #Help #'-V':FlagParameter('-','V'), #Version @@ -102,9 +102,9 @@ class Clearcut(CommandLineApplication): _parameters.update(_input) _parameters.update(_correction) _parameters.update(_output) - + _command = 'clearcut' - + def getHelp(self): """Method that points to the Clearcut documentation.""" help_str =\ @@ -113,7 +113,7 @@ def getHelp(self): http://bioinformatics.hungry.com/clearcut/ """ return help_str - + def _input_as_multiline_string(self, data): """Writes data to tempfile and sets -infile parameter @@ -150,17 +150,17 @@ def _input_as_seqs(self,data): def _input_as_string(self,data): """Makes data the value of a specific parameter - + This method returns the empty string. The parameter will be printed automatically once set. """ if data: self.Parameters['--in'].on(data) return '' - + def _tree_filename(self): """Return name of file containing the alignment - + prefix -- str, prefix of alignment file. """ if self.Parameters['--out']: @@ -176,32 +176,32 @@ def _get_result_paths(self,data): if self.Parameters['--out'].isOn(): out_name = self._tree_filename() result['Tree'] = ResultPath(Path=out_name,IsWritten=True) - return result + return result + - #SOME FUNCTIONS TO EXECUTE THE MOST COMMON TASKS -def align_unaligned_seqs(seqs, moltype, params=None): +def align_unaligned_seqs(seqs, moltype=DNA, params=None): """Returns an Alignment object from seqs. seqs: SequenceCollection object, or data that can be used to build one. - + moltype: a MolType object. DNA, RNA, or PROTEIN. params: dict of parameters to pass in to the Clearcut app controller. - + Result will be an Alignment object. """ #Clearcut does not support alignment raise NotImplementedError, """Clearcut does not support alignment.""" - + def align_and_build_tree(seqs, moltype, best_tree=False, params={}): """Returns an alignment and a tree from Sequences object seqs. - + seqs: SequenceCollection object, or data that can be used to build one. - + best_tree: if True (default:False), uses a slower but more accurate algorithm to build the tree. @@ -213,8 +213,8 @@ def align_and_build_tree(seqs, moltype, best_tree=False, params={}): """ #Clearcut does not support alignment raise NotImplementedError, """Clearcut does not support alignment.""" - -def build_tree_from_alignment(aln, moltype, best_tree=False, params={},\ + +def build_tree_from_alignment(aln, moltype=DNA, best_tree=False, params={},\ working_dir='/tmp'): """Returns a tree from Alignment object aln. @@ -222,10 +222,10 @@ def build_tree_from_alignment(aln, moltype, best_tree=False, params={},\ to build one. - Clearcut only accepts aligned sequences. Alignment object used to handle unaligned sequences. - + moltype: a cogent.core.moltype object. - NOTE: If moltype = RNA, we must convert to DNA since Clearcut v1.0.8 - gives incorrect results if RNA is passed in. 'U' is treated as an + gives incorrect results if RNA is passed in. 'U' is treated as an incorrect character and is excluded from distance calculations. best_tree: if True (default:False), uses a slower but more accurate @@ -237,7 +237,7 @@ def build_tree_from_alignment(aln, moltype, best_tree=False, params={},\ fails. """ params['--out'] = get_tmp_filename(working_dir) - + # Create instance of app controller, enable tree, disable alignment app = Clearcut(InputHandler='_input_as_multiline_string', params=params, \ WorkingDir=working_dir, SuppressStdout=True,\ @@ -246,17 +246,17 @@ def build_tree_from_alignment(aln, moltype, best_tree=False, params={},\ app.Parameters['-a'].on() #Turn off input as distance matrix app.Parameters['-d'].off() - + #If moltype = RNA, we must convert to DNA. if moltype == RNA: moltype = DNA - + if best_tree: app.Parameters['-N'].on() - + #Turn on correct moltype moltype_string = moltype.label.upper() - app.Parameters[MOLTYPE_MAP[moltype_string]].on() + app.Parameters[MOLTYPE_MAP[moltype_string]].on() # Setup mapping. Clearcut clips identifiers. We will need to remap them. # Clearcut only accepts aligned sequences. Let Alignment object handle @@ -269,7 +269,7 @@ def build_tree_from_alignment(aln, moltype, best_tree=False, params={},\ # Collect result result = app(int_map.toFasta()) - + # Build tree tree = DndParser(result['Tree'].read(), constructor=PhyloNode) for node in tree.tips(): @@ -280,7 +280,7 @@ def build_tree_from_alignment(aln, moltype, best_tree=False, params={},\ del(seq_aln, app, result, int_map, int_keys, params) return tree - + def add_seqs_to_alignment(seqs, aln, params=None): """Returns an Alignment object from seqs and existing Alignment. @@ -306,13 +306,13 @@ def align_two_alignments(aln1, aln2, params=None): #Clearcut does not support alignment raise NotImplementedError, """Clearcut does not support alignment.""" - + def build_tree_from_distance_matrix(matrix, best_tree=False, params={},\ working_dir='/tmp'): """Returns a tree from a distance matrix. matrix: a square Dict2D object (cogent.util.dict2d) - + best_tree: if True (default:False), uses a slower but more accurate algorithm to build the tree. @@ -322,7 +322,7 @@ def build_tree_from_distance_matrix(matrix, best_tree=False, params={},\ fails. """ params['--out'] = get_tmp_filename(working_dir) - + # Create instance of app controller, enable tree, disable alignment app = Clearcut(InputHandler='_input_as_multiline_string', params=params, \ WorkingDir=working_dir, SuppressStdout=True,\ @@ -331,16 +331,16 @@ def build_tree_from_distance_matrix(matrix, best_tree=False, params={},\ app.Parameters['-a'].off() #Input is a distance matrix app.Parameters['-d'].on() - + if best_tree: app.Parameters['-N'].on() - + # Turn the dict2d object into the expected input format matrix_input, int_keys = _matrix_input_from_dict2d(matrix) # Collect result result = app(matrix_input) - + # Build tree tree = DndParser(result['Tree'].read(), constructor=PhyloNode) @@ -356,9 +356,9 @@ def build_tree_from_distance_matrix(matrix, best_tree=False, params={},\ def _matrix_input_from_dict2d(matrix): """makes input for running clearcut on a matrix from a dict2D object""" - #clearcut truncates names to 10 char- need to rename before and + #clearcut truncates names to 10 char- need to rename before and #reassign after - + #make a dict of env_index:full name int_keys = dict([('env_' + str(i), k) for i,k in \ enumerate(sorted(matrix.keys()))]) @@ -374,7 +374,7 @@ def _matrix_input_from_dict2d(matrix): for env2 in matrix[env1]: new_dists.append((int_map[env1], int_map[env2], matrix[env1][env2])) int_map_dists = Dict2D(new_dists) - + #names will be fed into the phylipTable function - it is the int map names names = sorted(int_map_dists.keys()) rows = [] @@ -388,6 +388,5 @@ def _matrix_input_from_dict2d(matrix): input_matrix = phylipMatrix(rows, names) #input needs a trailing whitespace or it will fail! input_matrix += '\n' - - return input_matrix, int_keys + return input_matrix, int_keys diff --git a/brokit/clustalw.py b/brokit/clustalw.py index 85d5a6e..2b1c627 100644 --- a/brokit/clustalw.py +++ b/brokit/clustalw.py @@ -15,19 +15,19 @@ class Clustalw(CommandLineApplication): - """ clustalw application controller - - The parameters are organized by function to give some idea of how the - program works. However, no restrictions are put on any combinations + """ clustalw application controller + + The parameters are organized by function to give some idea of how the + program works. However, no restrictions are put on any combinations of parameters. Misuse of parameters can lead to errors or otherwise strange results. - You are supposed to choose one action for the program to perform. (align, + You are supposed to choose one action for the program to perform. (align, profile, sequences, tree, or bootstrap). If you choose multiple, only the dominant action (see order above) will be executed. By DEFAULT, the -align - parameter is turned on. If you decide to turn another one on, you should + parameter is turned on. If you decide to turn another one on, you should turn '-align' off IN ADDITION! - + Some references to help pages are available in the 'getHelp' method. Some might be useful to you. """ @@ -41,7 +41,7 @@ class Clustalw(CommandLineApplication): #sequence file for alignment, or alignment file for bootstrap and tree #actions _input = {'-infile':ValuedParameter('-','infile',Delimiter='=',IsPath=True)} - + # matrix and dnamatrix can be filenames as well, but not always. # They won't be treated as filenames and thus not quoted. # Therefore filepaths containing spaces might result in errors. @@ -69,7 +69,7 @@ class Clustalw(CommandLineApplication): '-window':ValuedParameter('-',Name='window',Delimiter='='), '-pairgap':ValuedParameter('-',Name='pairgap',Delimiter='='), '-score':ValuedParameter('-',Name='score',Delimiter='=')} - + # pwmatrix and pwdnamatrix can be filenames as well, but not always. # They won't be treated as filenames and thus not quoted. # Therefore filepaths containing spaces might result in errors. @@ -102,7 +102,7 @@ class Clustalw(CommandLineApplication): '-usetree2':ValuedParameter('-','usetree2',Delimiter='=',IsPath=True), '-newtree1':ValuedParameter('-','newtree1',Delimiter='=',IsPath=True), '-newtree2':ValuedParameter('-','newtree2',Delimiter='=',IsPath=True)} - + _structure_alignment={\ '-nosecstr1':FlagParameter('-',Name='nosecstr1'), '-nosecstr2':FlagParameter('-',Name='nosecstr2'), @@ -115,7 +115,7 @@ class Clustalw(CommandLineApplication): '-strandendin':ValuedParameter('-',Name='strandendin',Delimiter='='), '-strandendout':ValuedParameter('-',Name='strandendout',Delimiter='='), '-secstrout':ValuedParameter('-',Name='secstrout',Delimiter='=')} - + #NOT SUPPORTED #'-help':FlagParameter('-','help'), #'-check':FlagParameter('-','check'), @@ -136,9 +136,9 @@ class Clustalw(CommandLineApplication): _parameters.update(_output) _parameters.update(_profile_alignment) _parameters.update(_structure_alignment) - + _command = 'clustalw' - + def getHelp(self): """Methods that points to the documentation""" help_str =\ @@ -148,12 +148,12 @@ def getHelp(self): clustalw_help_1.8.html http://hypernig.nig.ac.jp/homology/clustalw-e_help.html http://www.genebee.msu.su/clustal/help.html - + A page that give reasonable insight in use of the parameters: http://bioweb.pasteur.fr/seqanal/interfaces/clustalw.html """ return help_str - + def _input_as_multiline_string(self, data): """Writes data to tempfile and sets -infile parameter @@ -190,7 +190,7 @@ def _input_as_seqs(self,data): def _input_as_string(self,data): """Makes data the value of a specific parameter - + This method returns the empty string. The parameter will be printed automatically once set. """ @@ -210,10 +210,10 @@ def _suffix(self): return _output_formats[self.Parameters['-output'].Value] else: return '.aln' - + def _aln_filename(self,prefix): """Return name of file containing the alignment - + prefix -- str, prefix of alignment file. """ if self.Parameters['-outfile'].isOn(): @@ -221,13 +221,13 @@ def _aln_filename(self,prefix): else: aln_filename = prefix + self._suffix() return aln_filename - + def _tempfile_as_multiline_string(self, data): """Write a multiline string to a temp file and return the filename. data: a multiline string to be written to a file. - * Note: the result will be the filename as a FilePath object + * Note: the result will be the filename as a FilePath object (which is a string subclass). """ @@ -240,7 +240,7 @@ def _tempfile_as_multiline_string(self, data): def _get_result_paths(self,data): """Return dict of {key: ResultPath} """ - + #clustalw .aln is used when no or unkown output type specified _treeinfo_formats = {'nj':'.nj', 'dist':'.dst', @@ -249,7 +249,7 @@ def _get_result_paths(self,data): result = {} par = self.Parameters abs = self._absolute - + if par['-align'].isOn(): prefix = par['-infile'].Value.rsplit('.', 1)[0] #prefix = par['-infile'].Value.split('.')[0] @@ -300,7 +300,7 @@ def _get_result_paths(self,data): #prefix2 = par['-profile2'].Value.split('.')[0] #sequences aln_filename = ''; aln_written = True dnd_filename = ''; dnd_written = True - + aln_filename = self._aln_filename(prefix2) if par['-usetree'].isOn(): dnd_written = False @@ -308,7 +308,7 @@ def _get_result_paths(self,data): aln_written = False dnd_filename = abs(par['-newtree'].Value) else: - dnd_filename = prefix2 + '.dnd' + dnd_filename = prefix2 + '.dnd' result['Align'] = ResultPath(Path=aln_filename,\ IsWritten=aln_written) result['Dendro'] = ResultPath(Path=dnd_filename,\ @@ -328,16 +328,16 @@ def _get_result_paths(self,data): IsWritten=tree_written) result['TreeInfo'] = ResultPath(Path=treeinfo_filename,\ IsWritten=treeinfo_written) - + elif par['-bootstrap'].isOn(): prefix = par['-infile'].Value.rsplit('.', 1)[0] - #prefix = par['-infile'].Value.split('.')[0] + #prefix = par['-infile'].Value.split('.')[0] boottree_filename = prefix + '.phb' result['Tree'] = ResultPath(Path=boottree_filename,IsWritten=True) - + return result - + #SOME FUNCTIONS TO EXECUTE THE MOST COMMON TASKS def alignUnalignedSeqs(seqs,add_seq_names=True,WorkingDir=None,\ SuppressStderr=None,SuppressStdout=None): @@ -372,13 +372,13 @@ def alignUnalignedSeqsFromFile(filename,WorkingDir=None,SuppressStderr=None,\ def alignTwoAlignments(aln1,aln2,outfile,WorkingDir=None,SuppressStderr=None,\ SuppressStdout=None): """Aligns two alignments. Individual sequences are not realigned - + aln1: string, name of file containing the first alignment aln2: string, name of file containing the second alignment - outfile: you're forced to specify an outfile name, because if you don't - aln1 will be overwritten. So, if you want aln1 to be overwritten, you + outfile: you're forced to specify an outfile name, because if you don't + aln1 will be overwritten. So, if you want aln1 to be overwritten, you should specify the same filename. - WARNING: a .dnd file is created with the same prefix as aln1. So an + WARNING: a .dnd file is created with the same prefix as aln1. So an existing dendrogram might get overwritten. """ app = Clustalw({'-profile':None,'-profile1':aln1,\ @@ -390,7 +390,7 @@ def alignTwoAlignments(aln1,aln2,outfile,WorkingDir=None,SuppressStderr=None,\ def addSeqsToAlignment(aln1,seqs,outfile,WorkingDir=None,SuppressStderr=None,\ SuppressStdout=None): """Aligns sequences from second profile against first profile - + aln1: string, name of file containing the alignment seqs: string, name of file containing the sequences that should be added to the alignment. @@ -399,13 +399,13 @@ def addSeqsToAlignment(aln1,seqs,outfile,WorkingDir=None,SuppressStderr=None,\ app = Clustalw({'-sequences':None,'-profile1':aln1,\ '-profile2':seqs,'-outfile':outfile},SuppressStderr=\ SuppressStderr,WorkingDir=WorkingDir, SuppressStdout=SuppressStdout) - + app.Parameters['-align'].off() return app() def buildTreeFromAlignment(filename,WorkingDir=None,SuppressStderr=None): """Builds a new tree from an existing alignment - + filename: string, name of file containing the seqs or alignment """ app = Clustalw({'-tree':None,'-infile':filename},SuppressStderr=\ @@ -415,10 +415,10 @@ def buildTreeFromAlignment(filename,WorkingDir=None,SuppressStderr=None): def align_and_build_tree(seqs, moltype, best_tree=False, params=None): """Returns an alignment and a tree from Sequences object seqs. - + seqs: an cogent.core.alignment.SequenceCollection object, or data that can be used to build one. - + moltype: cogent.core.moltype.MolType object best_tree: if True (default:False), uses a slower but more accurate @@ -433,8 +433,8 @@ def align_and_build_tree(seqs, moltype, best_tree=False, params=None): aln = align_unaligned_seqs(seqs, moltype=moltype, params=params) tree = build_tree_from_alignment(aln, moltype, best_tree, params) return {'Align':aln,'Tree':tree} - -def build_tree_from_alignment(aln, moltype, best_tree=False, params=None): + +def build_tree_from_alignment(aln, moltype=DNA, best_tree=False, params=None): """Returns a tree from Alignment object aln. aln: an cogent.core.alignment.Alignment object, or data that can be used @@ -454,7 +454,7 @@ def build_tree_from_alignment(aln, moltype, best_tree=False, params=None): app = Clustalw(InputHandler='_input_as_multiline_string', params=params, \ WorkingDir='/tmp') app.Parameters['-align'].off() - + #Set params to empty dict if None. if params is None: params={} @@ -481,7 +481,7 @@ def build_tree_from_alignment(aln, moltype, best_tree=False, params=None): seq_collection = SequenceCollection(aln) int_map, int_keys = seq_collection.getIntMap() int_map = SequenceCollection(int_map) - + # Collect result result = app(int_map.toFasta()) @@ -495,7 +495,7 @@ def build_tree_from_alignment(aln, moltype, best_tree=False, params=None): del(seq_collection, app, result, int_map, int_keys) return tree - + def bootstrap_tree_from_alignment(aln, seed=None, num_trees=None, params=None): """Returns a tree from Alignment object aln with bootstrap support values. @@ -503,7 +503,7 @@ def bootstrap_tree_from_alignment(aln, seed=None, num_trees=None, params=None): to build one. seed: an interger, seed value to use - + num_trees: an integer, number of trees to bootstrap against params: dict of parameters to pass in to the Clustal app controller. @@ -538,7 +538,7 @@ def bootstrap_tree_from_alignment(aln, seed=None, num_trees=None, params=None): seq_collection = SequenceCollection(aln) int_map, int_keys = seq_collection.getIntMap() int_map = SequenceCollection(int_map) - + # Collect result result = app(int_map.toFasta()) @@ -553,16 +553,16 @@ def bootstrap_tree_from_alignment(aln, seed=None, num_trees=None, params=None): return tree -def align_unaligned_seqs(seqs, moltype, params=None): +def align_unaligned_seqs(seqs, moltype=DNA, params=None): """Returns an Alignment object from seqs. seqs: cogent.core.alignment.SequenceCollection object, or data that can be used to build one. - + moltype: a MolType object. DNA, RNA, or PROTEIN. params: dict of parameters to pass in to the Clustal app controller. - + Result will be a cogent.core.alignment.Alignment object. """ #create SequenceCollection object from seqs @@ -606,17 +606,17 @@ def add_seqs_to_alignment(seqs, aln, moltype, params=None): seq_int_map, seq_int_keys = seq_collection.getIntMap() #Create SequenceCollection from int_map. seq_int_map = SequenceCollection(seq_int_map,MolType=moltype) - + #create Alignment object from aln aln = Alignment(aln,MolType=moltype) #Create mapping between abbreviated IDs and full IDs aln_int_map, aln_int_keys = aln.getIntMap(prefix='seqn_') #Create SequenceCollection from int_map. aln_int_map = Alignment(aln_int_map,MolType=moltype) - + #Update seq_int_keys with aln_int_keys seq_int_keys.update(aln_int_keys) - + #Create Mafft app. app = Clustalw(InputHandler='_input_as_multiline_string',\ params=params, @@ -624,20 +624,20 @@ def add_seqs_to_alignment(seqs, aln, moltype, params=None): app.Parameters['-align'].off() app.Parameters['-infile'].off() app.Parameters['-sequences'].on() - + #Add aln_int_map as profile1 app.Parameters['-profile1'].on(\ app._tempfile_as_multiline_string(aln_int_map.toFasta())) - + #Add seq_int_map as profile2 app.Parameters['-profile2'].on(\ app._tempfile_as_multiline_string(seq_int_map.toFasta())) #Get results using int_map as input to app res = app() - + #Get alignment as dict out of results alignment = dict(ClustalParser(res['Align'].readlines())) - + #Make new dict mapping original IDs new_alignment = {} for k,v in alignment.items(): @@ -667,17 +667,17 @@ def align_two_alignments(aln1, aln2, moltype, params=None): aln1_int_map, aln1_int_keys = aln1.getIntMap() #Create SequenceCollection from int_map. aln1_int_map = Alignment(aln1_int_map,MolType=moltype) - + #create Alignment object from aln aln2 = Alignment(aln2,MolType=moltype) #Create mapping between abbreviated IDs and full IDs aln2_int_map, aln2_int_keys = aln2.getIntMap(prefix='seqn_') #Create SequenceCollection from int_map. aln2_int_map = Alignment(aln2_int_map,MolType=moltype) - + #Update aln1_int_keys with aln2_int_keys aln1_int_keys.update(aln2_int_keys) - + #Create Mafft app. app = Clustalw(InputHandler='_input_as_multiline_string',\ params=params, @@ -685,20 +685,20 @@ def align_two_alignments(aln1, aln2, moltype, params=None): app.Parameters['-align'].off() app.Parameters['-infile'].off() app.Parameters['-profile'].on() - + #Add aln_int_map as profile1 app.Parameters['-profile1'].on(\ app._tempfile_as_multiline_string(aln1_int_map.toFasta())) - + #Add seq_int_map as profile2 app.Parameters['-profile2'].on(\ app._tempfile_as_multiline_string(aln2_int_map.toFasta())) #Get results using int_map as input to app res = app() - + #Get alignment as dict out of results alignment = dict(ClustalParser(res['Align'].readlines())) - + #Make new dict mapping original IDs new_alignment = {} for k,v in alignment.items(): @@ -713,4 +713,3 @@ def align_two_alignments(aln1, aln2, moltype, params=None): aln2,aln2_int_map,aln2_int_keys,app,res,alignment) return new_alignment - diff --git a/brokit/fasttree.py b/brokit/fasttree.py index 0abab9c..fc47c21 100644 --- a/brokit/fasttree.py +++ b/brokit/fasttree.py @@ -50,10 +50,10 @@ class FastTree(CommandLineApplication): def __call__(self,data=None, remove_tmp=True): """Run the application with the specified kwargs on data - + data: anything that can be cast into a string or written out to - a file. Usually either a list of things or a single string or - number. input_handler will be called on this data before it + a file. Usually either a list of things or a single string or + number. input_handler will be called on this data before it is passed as part of the command-line argument, so by creating your own input handlers you can customize what kind of data you want your application to accept @@ -84,13 +84,13 @@ def __call__(self,data=None, remove_tmp=True): str(errfile)])) if self.HaltExec: raise AssertionError, "Halted exec with command:\n" + command - # The return value of system is a 16-bit number containing the signal - # number that killed the process, and then the exit status. - # We only want to keep the exit status so do a right bitwise shift to + # The return value of system is a 16-bit number containing the signal + # number that killed the process, and then the exit status. + # We only want to keep the exit status so do a right bitwise shift to # get rid of the signal number byte exit_status = system(command) >> 8 - # Determine if error should be raised due to exit status of + # Determine if error should be raised due to exit status of # appliciation if not self._accept_exit_status(exit_status): raise ApplicationError, \ @@ -119,9 +119,9 @@ def _get_result_paths(self, data): result['Tree'] = ResultPath(Path=self._outfile) return result -def build_tree_from_alignment(aln, moltype, best_tree=False, params=None): +def build_tree_from_alignment(aln, moltype=DNA, best_tree=False, params=None): """Returns a tree from alignment - + Will check MolType of aln object """ if params is None: @@ -144,7 +144,7 @@ def build_tree_from_alignment(aln, moltype, best_tree=False, params=None): int_map = SequenceCollection(int_map,MolType=moltype) app = FastTree(params=params) - + result = app(int_map.toFasta()) tree = DndParser(result['Tree'].read(), constructor=PhyloNode) #remap tip names @@ -152,4 +152,3 @@ def build_tree_from_alignment(aln, moltype, best_tree=False, params=None): tip.Name = int_keys[tip.Name] return tree - diff --git a/brokit/fasttree_v1.py b/brokit/fasttree_v1.py index b7a378c..3501b02 100644 --- a/brokit/fasttree_v1.py +++ b/brokit/fasttree_v1.py @@ -42,10 +42,10 @@ class FastTree(CommandLineApplication): def __call__(self,data=None, remove_tmp=True): """Run the application with the specified kwargs on data - + data: anything that can be cast into a string or written out to - a file. Usually either a list of things or a single string or - number. input_handler will be called on this data before it + a file. Usually either a list of things or a single string or + number. input_handler will be called on this data before it is passed as part of the command-line argument, so by creating your own input handlers you can customize what kind of data you want your application to accept @@ -76,13 +76,13 @@ def __call__(self,data=None, remove_tmp=True): str(errfile)])) if self.HaltExec: raise AssertionError, "Halted exec with command:\n" + command - # The return value of system is a 16-bit number containing the signal - # number that killed the process, and then the exit status. - # We only want to keep the exit status so do a right bitwise shift to + # The return value of system is a 16-bit number containing the signal + # number that killed the process, and then the exit status. + # We only want to keep the exit status so do a right bitwise shift to # get rid of the signal number byte exit_status = system(command) >> 8 - # Determine if error should be raised due to exit status of + # Determine if error should be raised due to exit status of # appliciation if not self._accept_exit_status(exit_status): raise ApplicationError, \ @@ -110,10 +110,10 @@ def _get_result_paths(self, data): result = {} result['Tree'] = ResultPath(Path=self._outfile) return result - -def build_tree_from_alignment(aln, moltype, best_tree=False, params=None): + +def build_tree_from_alignment(aln, moltype=DNA, best_tree=False, params=None): """Returns a tree from alignment - + Will check MolType of aln object """ if params is None: @@ -128,10 +128,9 @@ def build_tree_from_alignment(aln, moltype, best_tree=False, params=None): "FastTree does not support moltype: %s" % moltype.label app = FastTree(params=params) - + if best_tree: raise NotImplementedError, "best_tree not implemented yet" result = app(aln.toFasta()) tree = DndParser(result['Tree'].read(), constructor=PhyloNode) return tree - diff --git a/brokit/mafft.py b/brokit/mafft.py index 354e679..9cd5e4c 100644 --- a/brokit/mafft.py +++ b/brokit/mafft.py @@ -23,11 +23,11 @@ class Mafft(CommandLineApplication): """Mafft application controller""" - - + + _options ={ # Algorithm - + # Automatically selects an appropriate strategy from L-INS-i, FFT-NS-i # and FFT-NS-2, according to data size. Default: off (always FFT-NS-2) '--auto':FlagParameter(Prefix='--',Name='auto'),\ @@ -38,7 +38,7 @@ class Mafft(CommandLineApplication): # All pairwise alignments are computed with the Needleman-Wunsch algorithm. # More accurate but slower than --6merpair. Suitable for a set of globally # alignable sequences. Applicable to up to ~200 sequences. A combination - # with --maxiterate 1000 is recommended (G-INS-i). Default: off + # with --maxiterate 1000 is recommended (G-INS-i). Default: off # (6mer distance is used) '--globalpair':FlagParameter(Prefix='--',Name='globalpair'),\ @@ -50,13 +50,13 @@ class Mafft(CommandLineApplication): '--localpair':FlagParameter(Prefix='--',Name='localpair'),\ # All pairwise alignments are computed with a local algorithm with the - # generalized affine gap cost (Altschul 1998). More accurate but slower than + # generalized affine gap cost (Altschul 1998). More accurate but slower than # --6merpair. Suitable when large internal gaps are expected. Applicable to # up to ~200 sequences. A combination with --maxiterate 1000 is recommended # (E-INS-i). Default: off (6mer distance is used) '--genafpair':FlagParameter(Prefix='--',Name='genafpair'),\ - # All pairwise alignments are computed with FASTA (Pearson and Lipman 1988). + # All pairwise alignments are computed with FASTA (Pearson and Lipman 1988). # FASTA is required. Default: off (6mer distance is used) '--fastapair':FlagParameter(Prefix='--',Name='fastapair'),\ @@ -65,17 +65,17 @@ class Mafft(CommandLineApplication): # --fastapair or --blastpair is selected. Default: 2.7 '--weighti':ValuedParameter(Prefix='--',Name='weighti',Delimiter=' '),\ - # Guide tree is built number times in the progressive stage. Valid with 6mer + # Guide tree is built number times in the progressive stage. Valid with 6mer # distance. Default: 2 '--retree':ValuedParameter(Prefix='--',Name='retree',Delimiter=' '),\ # number cycles of iterative refinement are performed. Default: 0 '--maxiterate':ValuedParameter(Prefix='--',Name='maxiterate',\ Delimiter=' '),\ - + # Use FFT approximation in group-to-group alignment. Default: on '--fft':FlagParameter(Prefix='--',Name='fft'),\ - + # Do not use FFT approximation in group-to-group alignment. Default: off '--nofft':FlagParameter(Prefix='--',Name='nofft'),\ @@ -83,12 +83,12 @@ class Mafft(CommandLineApplication): # off (score is checked) '--noscore':FlagParameter(Prefix='--',Name='noscore'),\ - # Use the Myers-Miller (1988) algorithm. Default: automatically turned on + # Use the Myers-Miller (1988) algorithm. Default: automatically turned on # when the alignment length exceeds 10,000 (aa/nt). '--memsave':FlagParameter(Prefix='--',Name='memsave'),\ # Use a fast tree-building method (PartTree, Katoh and Toh 2007) with the - # 6mer distance. Recommended for a large number (> ~10,000) of sequences are + # 6mer distance. Recommended for a large number (> ~10,000) of sequences are # input. Default: off '--parttree':FlagParameter(Prefix='--',Name='parttree'),\ @@ -108,7 +108,7 @@ class Mafft(CommandLineApplication): # Do not make alignment larger than number sequences. Valid only with the # --*parttree options. Default: the number of input sequences '--groupsize':ValuedParameter(Prefix='--',Name='groupsize',Delimiter=' '),\ - + # Parameter # Gap opening penalty at group-to-group alignment. Default: 1.53 @@ -122,7 +122,7 @@ class Mafft(CommandLineApplication): # --localpair or --genafpair option is selected. Default: -2.00 '--lop':ValuedParameter(Prefix='--',Name='lop',Delimiter=' '),\ - # Offset value at local pairwise alignment. Valid when the --localpair or + # Offset value at local pairwise alignment. Valid when the --localpair or # --genafpair option is selected. Default: 0.1 '--lep':ValuedParameter(Prefix='--',Name='lep',Delimiter=' '),\ @@ -150,7 +150,7 @@ class Mafft(CommandLineApplication): # Default: BLOSUM62 '--tm':ValuedParameter(Prefix='--',Name='tm',Delimiter=' '),\ - # Use a user-defined AA scoring matrix. The format of matrixfile is the same + # Use a user-defined AA scoring matrix. The format of matrixfile is the same # to that of BLAST. Ignored when nucleotide sequences are input. # Default: BLOSUM62 '--aamatrix':ValuedParameter(Prefix='--',Name='aamatrix',Delimiter=' '),\ @@ -158,7 +158,7 @@ class Mafft(CommandLineApplication): # Incorporate the AA/nuc composition information into the scoring matrix. # Deafult: off '--fmodel':FlagParameter(Prefix='--',Name='fmodel'),\ - + # Output # Output format: clustal format. Default: off (fasta format) @@ -188,12 +188,12 @@ class Mafft(CommandLineApplication): # sequences in input. The alignment within every seed is preserved. '--seed':ValuedParameter(Prefix='--',Name='seed',Delimiter=' '),\ } - + _parameters = {} _parameters.update(_options) _command = "mafft" _suppress_stderr=True - + def _input_as_seqs(self,data): lines = [] for i,s in enumerate(data): @@ -201,20 +201,20 @@ def _input_as_seqs(self,data): lines.append(''.join(['>',str(i+1)])) lines.append(s) return self._input_as_lines(lines) - + def _tree_out_filename(self): if self.Parameters['--treeout'].isOn(): tree_filename = self._absolute(str(self._input_filename))+'.tree' else: raise ValueError, "No tree output file specified." return tree_filename - + def _tempfile_as_multiline_string(self, data): """Write a multiline string to a temp file and return the filename. data: a multiline string to be written to a file. - * Note: the result will be the filename as a FilePath object + * Note: the result will be the filename as a FilePath object (which is a string subclass). """ @@ -223,17 +223,17 @@ def _tempfile_as_multiline_string(self, data): data_file.write(data) data_file.close() return filename - + def getHelp(self): """Method that points to the Mafft documentation.""" - + help_str = \ """ See Mafft documentation at: http://align.bmr.kyushu-u.ac.jp/mafft/software/manual/manual.html """ return help_str - + def _get_result_paths(self,data): result = {} if self.Parameters['--treeout'].isOn(): @@ -241,7 +241,7 @@ def _get_result_paths(self,data): result['Tree'] = ResultPath(Path=out_name,IsWritten=True) return result -def align_unaligned_seqs(seqs,moltype,params=None,accurate=False): +def align_unaligned_seqs(seqs,moltype=DNA,params=None,accurate=False): """Aligns unaligned sequences seqs: either list of sequence objects or list of strings @@ -257,19 +257,19 @@ def align_unaligned_seqs(seqs,moltype,params=None,accurate=False): int_map = SequenceCollection(int_map,MolType=moltype) #Create Mafft app. app = Mafft(InputHandler='_input_as_multiline_string',params=params) - + #Turn on correct moltype moltype_string = moltype.label.upper() app.Parameters[MOLTYPE_MAP[moltype_string]].on() - + #Do not report progress app.Parameters['--quiet'].on() - + #More accurate alignment, sacrificing performance. if accurate: app.Parameters['--globalpair'].on() app.Parameters['--maxiterate'].Value=1000 - + #Get results using int_map as input to app res = app(int_map.toFasta()) #Get alignment as dict out of results @@ -289,9 +289,9 @@ def align_unaligned_seqs(seqs,moltype,params=None,accurate=False): def align_and_build_tree(seqs, moltype, best_tree=False, params={}): """Returns an alignment and a tree from Sequences object seqs. - + seqs: SequenceCollection object, or data that can be used to build one. - + best_tree: if True (default:False), uses a slower but more accurate algorithm to build the tree. @@ -303,8 +303,8 @@ def align_and_build_tree(seqs, moltype, best_tree=False, params={}): """ #Current version of Mafft does not support tree building. raise NotImplementedError, """Current version of Mafft does not support tree building.""" - -def build_tree_from_alignment(aln, moltype, best_tree=False, params={},\ + +def build_tree_from_alignment(aln, moltype=DNA, best_tree=False, params={},\ working_dir='/tmp'): """Returns a tree from Alignment object aln. @@ -314,10 +314,10 @@ def build_tree_from_alignment(aln, moltype, best_tree=False, params={},\ best_tree: if True (default:False), uses a slower but more accurate algorithm to build the tree. NOTE: Mafft does not necessarily support best_tree option. - Will only return guide tree used to align sequences. Passing + Will only return guide tree used to align sequences. Passing best_tree = True will construct the guide tree 100 times instead of default 2 times. - + ***Mafft does allow you to get the guide tree back, but the IDs in the output guide tree do not match the original IDs in the fasta file and are impossible to map. Sent bug report to Mafft authors; possibly @@ -330,7 +330,7 @@ def build_tree_from_alignment(aln, moltype, best_tree=False, params={},\ """ #Current version of Mafft does not support tree building. raise NotImplementedError, """Current version of Mafft does not support tree building.""" - + def add_seqs_to_alignment(seqs, aln, moltype, params=None, accurate=False): """Returns an Alignment object from seqs and existing Alignment. @@ -348,43 +348,43 @@ def add_seqs_to_alignment(seqs, aln, moltype, params=None, accurate=False): seq_int_map, seq_int_keys = seq_collection.getIntMap() #Create SequenceCollection from int_map. seq_int_map = SequenceCollection(seq_int_map,MolType=moltype) - + #create Alignment object from aln aln = Alignment(aln,MolType=moltype) #Create mapping between abbreviated IDs and full IDs aln_int_map, aln_int_keys = aln.getIntMap(prefix='seqn_') #Create SequenceCollection from int_map. aln_int_map = Alignment(aln_int_map,MolType=moltype) - + #Update seq_int_keys with aln_int_keys seq_int_keys.update(aln_int_keys) - + #Create Mafft app. app = Mafft(InputHandler='_input_as_multiline_string',\ params=params, SuppressStderr=True) - + #Turn on correct moltype moltype_string = moltype.label.upper() app.Parameters[MOLTYPE_MAP[moltype_string]].on() - + #Do not report progress app.Parameters['--quiet'].on() - + #Add aln_int_map as seed alignment app.Parameters['--seed'].on(\ app._tempfile_as_multiline_string(aln_int_map.toFasta())) - + #More accurate alignment, sacrificing performance. if accurate: app.Parameters['--globalpair'].on() app.Parameters['--maxiterate'].Value=1000 - + #Get results using int_map as input to app res = app(seq_int_map.toFasta()) #Get alignment as dict out of results alignment = dict(parse_fasta(res['StdOut'])) - + #Make new dict mapping original IDs new_alignment = {} for k,v in alignment.items(): @@ -416,33 +416,33 @@ def align_two_alignments(aln1, aln2, moltype, params=None): aln1_int_map, aln1_int_keys = aln1.getIntMap() #Create SequenceCollection from int_map. aln1_int_map = Alignment(aln1_int_map,MolType=moltype) - + #create Alignment object from aln aln2 = Alignment(aln2,MolType=moltype) #Create mapping between abbreviated IDs and full IDs aln2_int_map, aln2_int_keys = aln2.getIntMap(prefix='seqn_') #Create SequenceCollection from int_map. aln2_int_map = Alignment(aln2_int_map,MolType=moltype) - + #Update aln1_int_keys with aln2_int_keys aln1_int_keys.update(aln2_int_keys) - + #Create Mafft app. app = Mafft(InputHandler='_input_as_paths',\ params=params, SuppressStderr=False) app._command = 'mafft-profile' - + aln1_path = app._tempfile_as_multiline_string(aln1_int_map.toFasta()) aln2_path = app._tempfile_as_multiline_string(aln2_int_map.toFasta()) filepaths = [aln1_path,aln2_path] - + #Get results using int_map as input to app res = app(filepaths) #Get alignment as dict out of results alignment = dict(parse_fasta(res['StdOut'])) - + #Make new dict mapping original IDs new_alignment = {} for k,v in alignment.items(): @@ -460,5 +460,3 @@ def align_two_alignments(aln1, aln2, moltype, params=None): aln2,aln2_int_map,aln2_int_keys,app,res,alignment) return new_alignment - - diff --git a/brokit/muscle_v38.py b/brokit/muscle_v38.py index dd834b5..c093132 100644 --- a/brokit/muscle_v38.py +++ b/brokit/muscle_v38.py @@ -12,78 +12,79 @@ from cogent.core.alignment import SequenceCollection, Alignment from cogent.parse.tree import DndParser from cogent.core.tree import PhyloNode +from cogent import DNA class Muscle(CommandLineApplication): """Muscle application controller""" - + _options ={ # Minimum spacing between anchor columns. [Integer] '-anchorspacing':ValuedParameter('-',Name='anchorspacing',Delimiter=' '), # Center parameter. Should be negative [Float] '-center':ValuedParameter('-',Name='center',Delimiter=' '), - + # Clustering method. cluster1 is used in iteration 1 # and 2, cluster2 in later iterations '-cluster1':ValuedParameter('-',Name='cluster1',Delimiter=' '), '-cluster2':ValuedParameter('-',Name='cluster2',Delimiter=' '), - + # Minimum length of diagonal. '-diaglength':ValuedParameter('-',Name='diaglength',Delimiter=' '), - + # Discard this many positions at ends of diagonal. '-diagmargin':ValuedParameter('-',Name='diagmargin',Delimiter=' '), - + # Distance measure for iteration 1. '-distance1':ValuedParameter('-',Name='distance1',Delimiter=' '), - + # Distance measure for iterations 2, 3 ... '-distance2':ValuedParameter('-',Name='distance2',Delimiter=' '), - + # The gap open score. Must be negative. '-gapopen':ValuedParameter('-',Name='gapopen',Delimiter=' '), - + # Window size for determining whether a region is hydrophobic. '-hydro':ValuedParameter('-',Name='hydro',Delimiter=' '), - + # Multiplier for gap open/close penalties in hydrophobic regions. '-hydrofactor':ValuedParameter('-',Name='hydrofactor',Delimiter=' '), - + # Where to find the input sequences. '-in':ValuedParameter('-',Name='in',Delimiter=' ', Quote="\""), '-in1':ValuedParameter('-',Name='in1',Delimiter=' ', Quote="\""), '-in2':ValuedParameter('-',Name='in2',Delimiter=' ', Quote="\""), - + # Log file name (delete existing file). '-log':ValuedParameter('-',Name='log',Delimiter=' '), - + # Log file name (append to existing file). '-loga':ValuedParameter('-',Name='loga',Delimiter=' '), - + # Maximum distance between two diagonals that allows them to merge # into one diagonal. '-maxdiagbreak':ValuedParameter('-',Name='maxdiagbreak',Delimiter=' '), - + # Maximum time to run in hours. The actual time may exceed the # requested limit by a few minutes. Decimals are allowed, so 1.5 # means one hour and 30 minutes. '-maxhours':ValuedParameter('-',Name='maxhours',Delimiter=' '), - + # Maximum number of iterations. '-maxiters':ValuedParameter('-',Name='maxiters',Delimiter=' '), # Maximum memory in Mb '-maxmb': ValuedParameter('-', Name='maxmb', Delimiter=' '), - + # Maximum number of new trees to build in iteration 2. '-maxtrees':ValuedParameter('-',Name='maxtrees',Delimiter=' '), - + # Minimum score a column must have to be an anchor. '-minbestcolscore':ValuedParameter('-',Name='minbestcolscore',Delimiter=' '), - + # Minimum smoothed score a column must have to be an anchor. '-minsmoothscore':ValuedParameter('-',Name='minsmoothscore',Delimiter=' '), - + # Objective score used by tree dependent refinement. # sp=sum-of-pairs score. # spf=sum-of-pairs score (dimer approximation) @@ -92,17 +93,17 @@ class Muscle(CommandLineApplication): # ps=average profile-sequence score. # xp=cross profile score. '-objscore':ValuedParameter('-',Name='objscore',Delimiter=' '), - + # Where to write the alignment. '-out':ValuedParameter('-',Name='out',Delimiter=' ', Quote="\""), - + # Where to write the file in phylip sequenctial format (v3.6 only). '-physout':ValuedParameter('-',Name='physout',Delimiter=' '), - + # Where to write the file in phylip interleaved format (v3.6 only). '-phyiout':ValuedParameter('-',Name='phyiout',Delimiter=' '), - # Set to profile for aligning two alignments and adding seqs to an + # Set to profile for aligning two alignments and adding seqs to an # existing alignment '-profile':FlagParameter(Prefix='-',Name='profile'), @@ -110,22 +111,22 @@ class Muscle(CommandLineApplication): # in later iterations. '-root1':ValuedParameter('-',Name='root1',Delimiter=' '), '-root2':ValuedParameter('-',Name='root2',Delimiter=' '), - + # Sequence type. '-seqtype':ValuedParameter('-',Name='seqtype',Delimiter=' '), - + # Maximum value of column score for smoothing purposes. '-smoothscoreceil':ValuedParameter('-',Name='smoothscoreceil',Delimiter=' '), - + # Constant used in UPGMB clustering. Determines the relative fraction # of average linkage (SUEFF) vs. nearest-neighbor linkage (1 . SUEFF). '-SUEFF':ValuedParameter('-',Name='SUEFF',Delimiter=' '), - + # Save tree produced in first or second iteration to given file in # Newick (Phylip-compatible) format. '-tree1':ValuedParameter('-',Name='tree1',Delimiter=' ', Quote="\""), '-tree2':ValuedParameter('-',Name='tree2',Delimiter=' ', Quote="\""), - + # Sequence weighting scheme. # weight1 is used in iterations 1 and 2. # weight2 is used for tree-dependent refinement. @@ -136,102 +137,102 @@ class Muscle(CommandLineApplication): # threeway=Gotoh three-way method. '-weight1':ValuedParameter('-',Name='weight1',Delimiter=' '), '-weight2':ValuedParameter('-',Name='weight2',Delimiter=' '), - + # Use anchor optimization in tree dependent refinement iterations '-anchors':FlagParameter(Prefix='-',Name='anchors'), - + # Write output in CLUSTALW format (default is FASTA). '-clw':FlagParameter(Prefix='-',Name='clw'), - + # Cluster sequences '-clusteronly':FlagParameter(Prefix='-',Name='clusteronly'), # neighborjoining is "unrecognized" #'-neighborjoining':FlagParameter(Prefix='-',Name='neighborjoining'), - + # Write output in CLUSTALW format with the "CLUSTAL W (1.81)" header # rather than the MUSCLE version. This is useful when a post-processing # step is picky about the file header. '-clwstrict':FlagParameter(Prefix='-',Name='clwstrict'), - + # Do not catch exceptions. '-core':FlagParameter(Prefix='-',Name='core'), - + # Write output in FASTA format. Alternatives include .clw, # .clwstrict, .msf and .html. '-fasta':FlagParameter(Prefix='-',Name='fasta'), - + # Group similar sequences together in the output. This is the default. # See also .stable. '-group':FlagParameter(Prefix='-',Name='group'), - + # Write output in HTML format (default is FASTA). '-html':FlagParameter(Prefix='-',Name='html'), - + # Use log-expectation profile score (VTML240). Alternatives are to use # -sp or -sv. This is the default for amino acid sequences. '-le':FlagParameter(Prefix='-',Name='le'), - + # Write output in MSF format (default is FASTA). '-msf':FlagParameter(Prefix='-',Name='msf'), - + # Disable anchor optimization. Default is -anchors. '-noanchors':FlagParameter(Prefix='-',Name='noanchors'), - + # Catch exceptions and give an error message if possible. '-nocore':FlagParameter(Prefix='-',Name='nocore'), - + # Do not display progress messages. '-quiet':FlagParameter(Prefix='-',Name='quiet'), - + # Input file is already aligned, skip first two iterations and begin # tree dependent refinement. '-refine':FlagParameter(Prefix='-',Name='refine'), - + # Use sum-of-pairs protein profile score (PAM200). Default is -le. '-sp':FlagParameter(Prefix='-',Name='sp'), - + # Use sum-of-pairs nucleotide profile score (BLASTZ parameters). This # is the only option for nucleotides, and is therefore the default. '-spn':FlagParameter(Prefix='-',Name='spn'), - + # Preserve input order of sequences in output file. Default is to group # sequences by similarity (-group). '-stable':FlagParameter(Prefix='-',Name='stable'), - + # Use sum-of-pairs profile score (VTML240). Default is -le. '-sv':FlagParameter(Prefix='-',Name='sv'), - + # Diagonal optimization '-diags':FlagParameter(Prefix='-',Name='diags'), '-diags1':FlagParameter(Prefix='-',Name='diags1'), '-diags2':FlagParameter(Prefix='-',Name='diags2'), - + # Terminal gaps penalized with full penalty. # [1] Not fully supported in this version. '-termgapsfull':FlagParameter(Prefix='-',Name='termgapsfull'), - + # Terminal gaps penalized with half penalty. # [1] Not fully supported in this version. '-termgapshalf':FlagParameter(Prefix='-',Name='termgapshalf'), - + # Terminal gaps penalized with half penalty if gap relative to # longer sequence, otherwise with full penalty. # [1] Not fully supported in this version. '-termgapshalflonger':FlagParameter(Prefix='-',Name='termgapshalflonger'), - + # Write parameter settings and progress messages to log file. '-verbose':FlagParameter(Prefix='-',Name='verbose'), - + # Write version string to stdout and exit. '-version':FlagParameter(Prefix='-',Name='version'), } - + _parameters = {} _parameters.update(_options) _command = "muscle" - + def _input_as_seqs(self,data): lines = [] for i,s in enumerate(data): @@ -239,24 +240,24 @@ def _input_as_seqs(self,data): lines.append(''.join(['>',str(i+1)])) lines.append(s) return self._input_as_lines(lines) - + def _input_as_lines(self,data): if data: self.Parameters['-in']\ .on(super(Muscle,self)._input_as_lines(data)) - + return '' - + def _input_as_string(self,data): """Makes data the value of a specific parameter - + This method returns the empty string. The parameter will be printed automatically once set. """ if data: self.Parameters['-in'].on(str(data)) return '' - + def _input_as_multiline_string(self, data): if data: self.Parameters['-in']\ @@ -281,31 +282,31 @@ def _input_as_multifile(self, data): return '' def _align_out_filename(self): - + if self.Parameters['-out'].isOn(): aln_filename = self._absolute(str(self.Parameters['-out'].Value)) else: raise ValueError, "No output file specified." return aln_filename - + def _tree1_out_filename(self): - + if self.Parameters['-tree1'].isOn(): aln_filename = self._absolute(str(self.Parameters['-tree1'].Value)) else: raise ValueError, "No tree output file specified." return aln_filename - + def _tree2_out_filename(self): - + if self.Parameters['-tree2'].isOn(): tree_filename = self._absolute(str(self.Parameters['-tree2'].Value)) else: raise ValueError, "No tree output file specified." return tree_filename - + def _get_result_paths(self,data): - + result = {} if self.Parameters['-out'].isOn(): out_name = self._align_out_filename() @@ -318,10 +319,10 @@ def _get_result_paths(self,data): result['Tree2Out'] = ResultPath(Path=out_name,IsWritten=True) return result - + def getHelp(self): """Muscle help""" - + help_str = """ """ return help_str @@ -336,12 +337,12 @@ def muscle_seqs(seqs, SuppressStderr=None, SuppressStdout=None): """Muscle align list of sequences. - + seqs: a list of sequences as strings or objects, you must set add_seq_names=True or sequences in a multiline string, as read() from a fasta file or sequences in a list of lines, as readlines() from a fasta file or a fasta seq filename. - + == for eg, testcode for guessing #guess_input_handler should correctly identify input gih = guess_input_handler @@ -349,33 +350,33 @@ def muscle_seqs(seqs, self.assertEqual(gih('>ab\nTCAG'), '_input_as_multiline_string') self.assertEqual(gih(['ACC','TGA'], True), '_input_as_seqs') self.assertEqual(gih(['>a','ACC','>b','TGA']), '_input_as_lines') - + == docstring for blast_seqs, apply to muscle_seqs == seqs: either file name or list of sequence objects or list of strings or single multiline string containing sequences. - + WARNING: DECISION RULES FOR INPUT HANDLING HAVE CHANGED. Decision rules for data are as follows. If it's s list, treat as lines, unless add_seq_names is true (in which case treat as list of seqs). If it's a string, test whether it has newlines. If it doesn't have newlines, assume it's a filename. If it does have newlines, it can't be a filename, so assume it's a multiline string containing sequences. - + If you want to skip the detection and force a specific type of input handler, use input_handler='your_favorite_handler'. - + add_seq_names: boolean. if True, sequence names are inserted in the list of sequences. if False, it assumes seqs is a list of lines of some proper format that the program can handle - + Addl docs coming soon """ - + if out_filename: params["-out"] = out_filename #else: # params["-out"] = get_tmp_filename(WorkingDir) - + ih = input_handler or guess_input_handler(seqs, add_seq_names) muscle_app = Muscle( params=params, @@ -399,17 +400,17 @@ def cluster_seqs(seqs, clean_up=True ): """Muscle cluster list of sequences. - + seqs: either file name or list of sequence objects or list of strings or single multiline string containing sequences. - + Addl docs coming soon """ num_seqs = len(seqs) if num_seqs < 2: raise ValueError, "Muscle requres 2 or more sequences to cluster." - + num_chars = sum(map(len, seqs)) if num_chars > max_chars: params["-maxiters"] = 2 @@ -420,26 +421,26 @@ def cluster_seqs(seqs, #params["-distance1"] = "kbit20_3" print "lots of chars, using fast align", num_chars - + params["-maxhours"] = max_hours #params["-maxiters"] = 10 - + #cluster_type = "upgmb" #if neighbor_join: # cluster_type = "neighborjoining" - + params["-clusteronly"] = True params["-tree1"] = get_tmp_filename(WorkingDir) - + muscle_res = muscle_seqs(seqs, params=params, add_seq_names=add_seq_names, WorkingDir=WorkingDir, SuppressStderr=SuppressStderr, SuppressStdout=SuppressStdout) - + tree = DndParser(muscle_res["Tree1Out"], constructor=constructor) - + if clean_up: muscle_res.cleanUp() return tree @@ -457,23 +458,23 @@ def aln_tree_seqs(seqs, clean_up=True ): """Muscle align sequences and report tree from iteration2. - + Unlike cluster_seqs, returns tree2 which is the tree made during the second muscle iteration (it should be more accurate that the cluster from the first iteration which is made fast based on k-mer words) - + seqs: either file name or list of sequence objects or list of strings or single multiline string containing sequences. tree_type: can be either neighborjoining (default) or upgmb for UPGMA clean_up: When true, will clean up output files """ - + params["-maxhours"] = max_hours if tree_type: params["-cluster2"] = tree_type params["-tree2"] = get_tmp_filename(WorkingDir) params["-out"] = get_tmp_filename(WorkingDir) - + muscle_res = muscle_seqs(seqs, input_handler=input_handler, params=params, @@ -483,7 +484,7 @@ def aln_tree_seqs(seqs, SuppressStdout=SuppressStdout) tree = DndParser(muscle_res["Tree2Out"], constructor=constructor) aln = [line for line in muscle_res["MuscleOut"]] - + if clean_up: muscle_res.cleanUp() return tree, aln @@ -497,18 +498,18 @@ def fastest_aln_seqs(seqs, SuppressStdout=None ): """Fastest (and least accurate) version of muscle - + seqs: either file name or list of sequence objects or list of strings or single multiline string containing sequences. - + Addl docs coming soon """ - + params["-maxiters"] = 1 params["-diags1"] = True params["-sv"] = True params["-distance1"] = "kbit20_3" - + muscle_res = muscle_seqs(seqs, params=params, add_seq_names=add_seq_names, @@ -518,15 +519,15 @@ def fastest_aln_seqs(seqs, SuppressStdout=SuppressStdout) return muscle_res -def align_unaligned_seqs(seqs, moltype, params=None): +def align_unaligned_seqs(seqs, moltype=DNA, params=None): """Returns an Alignment object from seqs. seqs: SequenceCollection object, or data that can be used to build one. - + moltype: a MolType object. DNA, RNA, or PROTEIN. params: dict of parameters to pass in to the Muscle app controller. - + Result will be an Alignment object. """ if not params: @@ -561,38 +562,38 @@ def align_unaligned_seqs(seqs, moltype, params=None): def align_and_build_tree(seqs, moltype, best_tree=False, params=None): """Returns an alignment and a tree from Sequences object seqs. - - seqs: a cogent.core.alignment.SequenceCollection object, or data that can + + seqs: a cogent.core.alignment.SequenceCollection object, or data that can be used to build one. - + moltype: cogent.core.moltype.MolType object best_tree: if True (default:False), uses a slower but more accurate algorithm to build the tree. - + params: dict of parameters to pass in to the Muscle app controller. - - The result will be a tuple containing a cogent.core.alignment.Alignment - and a cogent.core.tree.PhyloNode object (or None for the alignment + + The result will be a tuple containing a cogent.core.alignment.Alignment + and a cogent.core.tree.PhyloNode object (or None for the alignment and/or tree if either fails). """ aln = align_unaligned_seqs(seqs, moltype=moltype, params=params) tree = build_tree_from_alignment(aln, moltype, best_tree, params) return {'Align':aln, 'Tree':tree} -def build_tree_from_alignment(aln, moltype, best_tree=False, params=None): +def build_tree_from_alignment(aln, moltype=DNA, best_tree=False, params=None): """Returns a tree from Alignment object aln. - - aln: a cogent.core.alignment.Alignment object, or data that can be used + + aln: a cogent.core.alignment.Alignment object, or data that can be used to build one. - + moltype: cogent.core.moltype.MolType object best_tree: unsupported - + params: dict of parameters to pass in to the Muscle app controller. - - The result will be an cogent.core.tree.PhyloNode object, or None if tree + + The result will be an cogent.core.tree.PhyloNode object, or None if tree fails. """ # Create instance of app controller, enable tree, disable alignment @@ -616,7 +617,7 @@ def build_tree_from_alignment(aln, moltype, best_tree=False, params=None): # Build tree tree = DndParser(result['Tree1Out'].read(), constructor=PhyloNode) - + for tip in tree.tips(): tip.Name = int_keys[tip.Name] @@ -628,13 +629,13 @@ def build_tree_from_alignment(aln, moltype, best_tree=False, params=None): def add_seqs_to_alignment(seqs, aln, params=None): """Returns an Alignment object from seqs and existing Alignment. - - seqs: a cogent.core.alignment.SequenceCollection object, or data that can + + seqs: a cogent.core.alignment.SequenceCollection object, or data that can be used to build one. - - aln: a cogent.core.alignment.Alignment object, or data that can be used + + aln: a cogent.core.alignment.Alignment object, or data that can be used to build one - + params: dict of parameters to pass in to the Muscle app controller. """ if not params: @@ -698,10 +699,10 @@ def add_seqs_to_alignment(seqs, aln, params=None): def align_two_alignments(aln1, aln2, params=None): """Returns an Alignment object from two existing Alignments. - - aln1, aln2: cogent.core.alignment.Alignment objects, or data that can be + + aln1, aln2: cogent.core.alignment.Alignment objects, or data that can be used to build them. - + params: dict of parameters to pass in to the Muscle app controller. """ if not params: diff --git a/brokit/raxml_v730.py b/brokit/raxml_v730.py index 480f1f2..aa2e424 100644 --- a/brokit/raxml_v730.py +++ b/brokit/raxml_v730.py @@ -25,118 +25,118 @@ class Raxml(CommandLineApplication): _options ={ - # Specify a column weight file name to assign individual wieghts to - # each column of the alignment. Those weights must be integers - # separated by any number and type of whitespaces whithin a separate + # Specify a column weight file name to assign individual wieghts to + # each column of the alignment. Those weights must be integers + # separated by any number and type of whitespaces whithin a separate # file, see file "example_weights" for an example. '-a':ValuedParameter('-',Name='a',Delimiter=' '), # Specify one of the secondary structure substitution models implemented - # in RAxML. The same nomenclature as in the PHASE manual is used, - # available models: S6A, S6B, S6C, S6D, S6E, S7A, S7B, S7C, S7D, S7E, + # in RAxML. The same nomenclature as in the PHASE manual is used, + # available models: S6A, S6B, S6C, S6D, S6E, S7A, S7B, S7C, S7D, S7E, # S7F, S16, S16A, S16B # DEFAULT: 16-state GTR model (S16) '-A':ValuedParameter('-',Name='A',Delimiter=' '), - + # Specify an integer number (random seed) for bootstrapping '-b':ValuedParameter('-',Name='b',Delimiter=' '), - - # specify a floating point number between 0.0 and 1.0 that will be used - # as cutoff threshold for the MR-based bootstopping criteria. The + + # specify a floating point number between 0.0 and 1.0 that will be used + # as cutoff threshold for the MR-based bootstopping criteria. The # recommended setting is 0.03. '-B':ValuedParameter('-',Name='B',Delimiter=' '), - - # Specify number of distinct rate catgories for raxml when + + # Specify number of distinct rate catgories for raxml when # ModelOfEvolution is set to GTRCAT or HKY85CAT. - # Individual per-site rates are categorized into numberOfCategories + # Individual per-site rates are categorized into numberOfCategories # rate categories to accelerate computations. (Default = 50) '-c':ValuedParameter('-',Name='c',Delimiter=' '), - # Conduct model parameter optimization on gappy, partitioned multi-gene - # alignments with per-partition branch length estimates (-M enabled) + # Conduct model parameter optimization on gappy, partitioned multi-gene + # alignments with per-partition branch length estimates (-M enabled) # using the fast method with pointer meshes described in: - # Stamatakis and Ott: "Efficient computation of the phylogenetic - # likelihood function on multi-gene alignments and multi-core + # Stamatakis and Ott: "Efficient computation of the phylogenetic + # likelihood function on multi-gene alignments and multi-core # processors" - # WARNING: We can not conduct useful tree searches using this method + # WARNING: We can not conduct useful tree searches using this method # yet! Does not work with Pthreads version. - '-C':ValuedParameter('-',Name='C',Delimiter=' '), - - # This option allows you to start the RAxML search with a complete - # random starting tree instead of the default Maximum Parsimony - # Starting tree. On smaller datasets (around 100-200 taxa) it has - # been observed that this might sometimes yield topologies of distinct - # local likelihood maxima which better correspond to empirical - # expectations. + '-C':ValuedParameter('-',Name='C',Delimiter=' '), + + # This option allows you to start the RAxML search with a complete + # random starting tree instead of the default Maximum Parsimony + # Starting tree. On smaller datasets (around 100-200 taxa) it has + # been observed that this might sometimes yield topologies of distinct + # local likelihood maxima which better correspond to empirical + # expectations. '-d':FlagParameter('-',Name='d'), - # ML search convergence criterion. This will break off ML searches if - # the relative Robinson-Foulds distance between the trees obtained from - # two consecutive lazy SPR cycles is smaller or equal to 1%. Usage - # recommended for very large datasets in terms of taxa. On trees with - # more than 500 taxa this will yield execution time improvements of + # ML search convergence criterion. This will break off ML searches if + # the relative Robinson-Foulds distance between the trees obtained from + # two consecutive lazy SPR cycles is smaller or equal to 1%. Usage + # recommended for very large datasets in terms of taxa. On trees with + # more than 500 taxa this will yield execution time improvements of # approximately 50% While yielding only slightly worse trees. # DEFAULT: OFF - '-D':ValuedParameter('-',Name='D'), + '-D':ValuedParameter('-',Name='D'), # This allows you to specify up to which likelihood difference. # Default is 0.1 log likelihood units, author recommends 1 or 2 to # rapidly evaluate different trees. '-e':ValuedParameter('-',Name='e',Delimiter=' '), - - # specify an exclude file name, that contains a specification of - # alignment positions you wish to exclude. Format is similar to Nexus, - # the file shall contain entries like "100-200 300-400", to exclude a - # single column write, e.g., "100-100", if you use a mixed model, an + + # specify an exclude file name, that contains a specification of + # alignment positions you wish to exclude. Format is similar to Nexus, + # the file shall contain entries like "100-200 300-400", to exclude a + # single column write, e.g., "100-100", if you use a mixed model, an # appropriatly adapted model file will be written. '-E':ValuedParameter('-',Name='E',Delimiter=' '), - # select search algorithm: - # a rapid Bootstrap analysis and search for best-scoring ML tree in + # select search algorithm: + # a rapid Bootstrap analysis and search for best-scoring ML tree in # one program run # A compute marginal ancestral states on a ROOTED reference tree # provided with "t" - ONLY IN 7.3.0 - # b draw bipartition information on a tree provided with "-t" based on - # multiple trees (e.g., from a bootstrap) in a file specifed by + # b draw bipartition information on a tree provided with "-t" based on + # multiple trees (e.g., from a bootstrap) in a file specifed by # "-z" # c check if the alignment can be properly read by RAxML # d for normal hill-climbing search (Default) # when -f option is omitted this algorithm will be used - # e optimize model+branch lengths for given input tree under + # e optimize model+branch lengths for given input tree under # GAMMA/GAMMAI only - # E execute very fast experimental tree search, at present only for + # E execute very fast experimental tree search, at present only for # testing # F execute fast experimental tree search, at present only for testing # g compute per site log Likelihoods for one ore more trees passed via # "-z" and write them to a file that can be read by CONSEL # WARNING: does not print likelihoods in the original column order - # h compute log likelihood test (SH-test) between best tree passed via - # "-t" and a bunch of other trees passed via "-z" - # i EXPERIMENTAL do not use for real tree inferences: conducts a - # single cycle of fast lazy SPR moves on a given input tree, to be - # used in combination with -C and -M - # I EXPERIMENTAL do not use for real tree inferences: conducts a - # single cycle of thorough lazy SPR moves on a given input tree, - # to be used in combination with -C and -M - # j generate a bunch of bootstrapped alignment files from an original - # alignemnt file. You need to specify a seed with "-b" and the - # number of replicates with "-#" + # h compute log likelihood test (SH-test) between best tree passed via + # "-t" and a bunch of other trees passed via "-z" + # i EXPERIMENTAL do not use for real tree inferences: conducts a + # single cycle of fast lazy SPR moves on a given input tree, to be + # used in combination with -C and -M + # I EXPERIMENTAL do not use for real tree inferences: conducts a + # single cycle of thorough lazy SPR moves on a given input tree, + # to be used in combination with -C and -M + # j generate a bunch of bootstrapped alignment files from an original + # alignemnt file. You need to specify a seed with "-b" and the + # number of replicates with "-#" # following "J" is for version 7.2.8 # J Compute SH-like support values on a given tree passed via "-t". - # m compare bipartitions between two bunches of trees passed via "-t" - # and "-z" respectively. This will return the Pearson correlation - # between all bipartitions found in the two tree files. A file - # called RAxML_bipartitionFrequencies.outpuFileName will be - # printed that contains the pair-wise bipartition frequencies of + # m compare bipartitions between two bunches of trees passed via "-t" + # and "-z" respectively. This will return the Pearson correlation + # between all bipartitions found in the two tree files. A file + # called RAxML_bipartitionFrequencies.outpuFileName will be + # printed that contains the pair-wise bipartition frequencies of # the two sets - # n compute the log likelihood score of all trees contained in a tree + # n compute the log likelihood score of all trees contained in a tree # file provided by "-z" under GAMMA or GAMMA+P-Invar # o old (slower) algorithm from v. 2.1.3 - # p perform pure stepwise MP addition of new sequences to an + # p perform pure stepwise MP addition of new sequences to an # incomplete starting tree and exit - # r compute pairwise Robinson-Foulds (RF) distances between all pairs - # of trees in a tree file passed via "-z" if the trees have node - # labales represented as integer support values the program will + # r compute pairwise Robinson-Foulds (RF) distances between all pairs + # of trees in a tree file passed via "-z" if the trees have node + # labales represented as integer support values the program will # also compute two flavors of the weighted Robinson-Foulds (WRF) # distance # following "R" is for version 7.2.8 @@ -148,41 +148,41 @@ class Raxml(CommandLineApplication): # S compute site-specific placement bias using a leave one out test # inspired by the evolutionary placement algorithm # t do randomized tree searches on one fixed starting tree - # u execute morphological weight calibration using maximum likelihood, - # this will return a weight vector. you need to provide a - # morphological alignment and a reference tree via "-t" - # U execute morphological wieght calibration using parsimony, this - # will return a weight vector. you need to provide a morphological + # u execute morphological weight calibration using maximum likelihood, + # this will return a weight vector. you need to provide a + # morphological alignment and a reference tree via "-t" + # U execute morphological wieght calibration using parsimony, this + # will return a weight vector. you need to provide a morphological # alignment and a reference tree via "-t" - DEPRECATED IN 7.3.0 - # v classify a bunch of environmental sequences into a reference tree - # using the slow heuristics without dynamic alignment you will - # need to start RAxML with a non-comprehensive reference tree and + # v classify a bunch of environmental sequences into a reference tree + # using the slow heuristics without dynamic alignment you will + # need to start RAxML with a non-comprehensive reference tree and # an alignment containing all sequences (reference + query) - # w compute ELW test on a bunch of trees passed via "-z" - # x compute pair-wise ML distances, ML model parameters will be - # estimated on an MP starting tree or a user-defined tree passed - # via "-t", only allowed for GAMMA-based models of rate + # w compute ELW test on a bunch of trees passed via "-z" + # x compute pair-wise ML distances, ML model parameters will be + # estimated on an MP starting tree or a user-defined tree passed + # via "-t", only allowed for GAMMA-based models of rate # heterogeneity - # y classify a bunch of environmental sequences into a reference tree - # using the fast heuristics without dynamic alignment you will - # need to start RAxML with a non-comprehensive reference tree and + # y classify a bunch of environmental sequences into a reference tree + # using the fast heuristics without dynamic alignment you will + # need to start RAxML with a non-comprehensive reference tree and # an alignment containing all sequences (reference + query) '-f':ValuedParameter('-',Name='f',Delimiter=' ', Value="d"), - # enable ML tree searches under CAT model for very large trees without - # switching to GAMMA in the end (saves memory). This option can also be - # used with the GAMMA models in order to avoid the thorough optimization + # enable ML tree searches under CAT model for very large trees without + # switching to GAMMA in the end (saves memory). This option can also be + # used with the GAMMA models in order to avoid the thorough optimization # of the best-scoring ML tree in the end. # DEFAULT: OFF '-F':FlagParameter('-',Name='F'), - + # select grouping file name: allows incomplete multifurcating constraint # tree in newick format -- resolves multifurcations randomly, adds # other taxa using parsimony insertion '-g':ValuedParameter('-', Name='g',Delimiter=' '), - # enable the ML-based evolutionary placement algorithm heuristics by - # specifiyng a threshold value (fraction of insertion branches to be + # enable the ML-based evolutionary placement algorithm heuristics by + # specifiyng a threshold value (fraction of insertion branches to be # evaluated using slow insertions under ML). '-G':FlagParameter('-', Name='G'), @@ -190,10 +190,10 @@ class Raxml(CommandLineApplication): '-h':FlagParameter('-', Name='h'), # enable the MP-based evolutionary placement algorithm heuristics - # by specifiyng a threshold value (fraction of insertion branches to be + # by specifiyng a threshold value (fraction of insertion branches to be # evaluated using slow insertions under ML) - DEPRECATED IN 7.3.0 #'-H':ValuedParameter('-', Name='H',Delimiter=' '), - + # allows initial rearrangement to be constrained, e.g. 10 means # insertion will not be more than 10 nodes away from original. # default is to pick a "good" setting. @@ -203,133 +203,133 @@ class Raxml(CommandLineApplication): # "-I autoFC" for the frequency-based criterion # "-I autoMR" for the majority-rule consensus tree criterion # "-I autoMRE" for the extended majority-rule consensus tree criterion - # "-I autoMRE_IGN" for metrics similar to MRE, but include + # "-I autoMRE_IGN" for metrics similar to MRE, but include # bipartitions under the threshold whether they are compatible # or not. This emulates MRE but is faster to compute. - # You also need to pass a tree file containg several bootstrap + # You also need to pass a tree file containg several bootstrap # replicates via "-z" '-I':ValuedParameter('-', Name='I', Delimiter=' '), - + # writes checkpoints (off by default) '-j':FlagParameter('-', Name='j'), - # Compute majority rule consensus tree with "-J MR" or extended majority - # rule consensus tree with "-J MRE" or strict consensus tree with "-J - # STRICT" You will need to provide a tree file containing several + # Compute majority rule consensus tree with "-J MR" or extended majority + # rule consensus tree with "-J MRE" or strict consensus tree with "-J + # STRICT" You will need to provide a tree file containing several # UNROOTED trees via "-z" '-J':ValuedParameter('-', Name='J', Delimiter=' '), - + #specifies that RAxML will optimize model parameters (for GTRMIX and # GTRGAMMA) as well as calculating likelihoods for bootstrapped trees. '-k':FlagParameter('-', Name='k'), - # Specify one of the multi-state substitution models (max 32 states) + # Specify one of the multi-state substitution models (max 32 states) # implemented in RAxML. Available models are: ORDERED, MK, GTR '-K':ValuedParameter('-', Name='K', Delimiter=' '), - - # Model of Binary (Morphological), Nucleotide, Multi-State, or Amino + + # Model of Binary (Morphological), Nucleotide, Multi-State, or Amino # Acid Substitution:: # BINARY: - # -m BINCAT : Optimization of site-specific evolutionary rates which - # are categorized into numberOfCategories distinct rate categories - # for greater computational efficiency. Final tree might be - # evaluated automatically under BINGAMMA, depending on the tree + # -m BINCAT : Optimization of site-specific evolutionary rates which + # are categorized into numberOfCategories distinct rate categories + # for greater computational efficiency. Final tree might be + # evaluated automatically under BINGAMMA, depending on the tree # search option - # -m BINCATI : Optimization of site-specific evolutionary rates which - # are categorized into numberOfCategories distinct rate categories - # for greater computational efficiency. Final tree might be - # evaluated automatically under BINGAMMAI, depending on the tree - # search option - # -m BINGAMMA : GAMMA model of rate heterogeneity (alpha parameter + # -m BINCATI : Optimization of site-specific evolutionary rates which + # are categorized into numberOfCategories distinct rate categories + # for greater computational efficiency. Final tree might be + # evaluated automatically under BINGAMMAI, depending on the tree + # search option + # -m BINGAMMA : GAMMA model of rate heterogeneity (alpha parameter # will be estimated) - # -m BINGAMMAI : Same as BINGAMMA, but with estimate of proportion of + # -m BINGAMMAI : Same as BINGAMMA, but with estimate of proportion of # invariable sites # NUCLEOTIDES - # -m GTRCAT: GTR + Optimization of substitution rates + Optimization - # of site-specific evolutionary rates which are categorized into - # numberOfCategories distinct rate categories for greater + # -m GTRCAT: GTR + Optimization of substitution rates + Optimization + # of site-specific evolutionary rates which are categorized into + # numberOfCategories distinct rate categories for greater # computational efficiency - # -m GTRCAT_FLOAT : Same as above but uses single-precision floating - # point arithemtics instead of double-precision Usage only - # recommened for testing, the code will run slower, but can save - # almost 50% of memory. If you have problems with phylogenomic - # datasets and large memory requirements you may give it a shot. - # Keep in mind that numerical stability seems to be okay but needs + # -m GTRCAT_FLOAT : Same as above but uses single-precision floating + # point arithemtics instead of double-precision Usage only + # recommened for testing, the code will run slower, but can save + # almost 50% of memory. If you have problems with phylogenomic + # datasets and large memory requirements you may give it a shot. + # Keep in mind that numerical stability seems to be okay but needs # further testing. - DEPRECATED IN 7.3.0 - # -m GTRCATI : GTR + Optimization of substitution rates + Optimization - # of site-specific evolutionary rates which are categorized into - # numberOfCategories distinct rate categories for greater - # computational efficiency. Final tree might be evaluated under + # -m GTRCATI : GTR + Optimization of substitution rates + Optimization + # of site-specific evolutionary rates which are categorized into + # numberOfCategories distinct rate categories for greater + # computational efficiency. Final tree might be evaluated under # GTRGAMMAI, depending on the tree search option # -m GTRGAMMA: GTR + Optimization of substitution rates + Gamma - # -m GTRGAMMA_FLOAT : Same as GTRGAMMA, but also with - # single-precision arithmetics, same cautionary notes as for + # -m GTRGAMMA_FLOAT : Same as GTRGAMMA, but also with + # single-precision arithmetics, same cautionary notes as for # GTRCAT_FLOAT apply. - DEPRECATED IN 7.3.0 - # -m GTRGAMMAI : Same as GTRGAMMA, but with estimate of proportion of - # invariable sites + # -m GTRGAMMAI : Same as GTRGAMMA, but with estimate of proportion of + # invariable sites # MULTI-STATE: - # -m MULTICAT : Optimization of site-specific evolutionary rates which - # are categorized into numberOfCategories distinct rate categories - # for greater computational efficiency. Final tree might be - # evaluated automatically under MULTIGAMMA, depending on the tree + # -m MULTICAT : Optimization of site-specific evolutionary rates which + # are categorized into numberOfCategories distinct rate categories + # for greater computational efficiency. Final tree might be + # evaluated automatically under MULTIGAMMA, depending on the tree # search option - # -m MULTICATI : Optimization of site-specific evolutionary rates - # which are categorized into numberOfCategories distinct rate - # categories for greater computational efficiency. Final tree - # might be evaluated automatically under MULTIGAMMAI, depending on - # the tree search option - # -m MULTIGAMMA : GAMMA model of rate heterogeneity (alpha parameter + # -m MULTICATI : Optimization of site-specific evolutionary rates + # which are categorized into numberOfCategories distinct rate + # categories for greater computational efficiency. Final tree + # might be evaluated automatically under MULTIGAMMAI, depending on + # the tree search option + # -m MULTIGAMMA : GAMMA model of rate heterogeneity (alpha parameter # will be estimated) - # -m MULTIGAMMAI : Same as MULTIGAMMA, but with estimate of proportion + # -m MULTIGAMMAI : Same as MULTIGAMMA, but with estimate of proportion # of invariable sites # You can use up to 32 distinct character states to encode multi-state - # regions, they must be used in the following order: 0, 1, 2, 3, 4, 5, - # 6, 7, 8, 9, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, - # T, U, V i.e., if you have 6 distinct character states you would use 0, + # regions, they must be used in the following order: 0, 1, 2, 3, 4, 5, + # 6, 7, 8, 9, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, + # T, U, V i.e., if you have 6 distinct character states you would use 0, # 1, 2, 3, 4, 5 to encode these. The substitution model for the # multi-state regions can be selected via the "-K" option # Amino Acid Models: - # -m PROTCATmatrixName[F] : specified AA matrix + Optimization of - # substitution rates + Optimization of site-specific evolutionary - # rates which are categorized into numberOfCategories distinct + # -m PROTCATmatrixName[F] : specified AA matrix + Optimization of + # substitution rates + Optimization of site-specific evolutionary + # rates which are categorized into numberOfCategories distinct # rate categories for greater computational efficiency. Final - # tree might be evaluated automatically under + # tree might be evaluated automatically under # PROTGAMMAmatrixName[f], depending on the tree search option - # -m PROTCATmatrixName[F]_FLOAT : PROTCAT with single precision + # -m PROTCATmatrixName[F]_FLOAT : PROTCAT with single precision # arithmetics, same cautionary notes as for GTRCAT_FLOAT apply # - DEPRECATED IN 7.3.0 - # -m PROTCATImatrixName[F] : specified AA matrix + Optimization of + # -m PROTCATImatrixName[F] : specified AA matrix + Optimization of # substitution rates + Optimization of site-specific - # evolutionary rates which are categorized into numberOfCategories - # distinct rate categories for greater computational efficiency. - # Final tree might be evaluated automatically under + # evolutionary rates which are categorized into numberOfCategories + # distinct rate categories for greater computational efficiency. + # Final tree might be evaluated automatically under # PROTGAMMAImatrixName[f], depending on the tree search option - # -m PROTGAMMAmatrixName[F] : specified AA matrix + Optimization of - # substitution rates + GAMMA model of rate heterogeneity (alpha + # -m PROTGAMMAmatrixName[F] : specified AA matrix + Optimization of + # substitution rates + GAMMA model of rate heterogeneity (alpha # parameter will be estimated) - # -m PROTGAMMAmatrixName[F]_FLOAT : PROTGAMMA with single precision + # -m PROTGAMMAmatrixName[F]_FLOAT : PROTGAMMA with single precision # arithmetics, same cautionary notes as for GTRCAT_FLOAT apply # - DEPRECATED IN 7.3.0 - # -m PROTGAMMAImatrixName[F] : Same as PROTGAMMAmatrixName[F], but - # with estimate of proportion of invariable sites - # Available AA substitution models: DAYHOFF, DCMUT, JTT, MTREV, WAG, - # RTREV, CPREV, VT, BLOSUM62, MTMAM, LG, GTR. With the optional "F" + # -m PROTGAMMAImatrixName[F] : Same as PROTGAMMAmatrixName[F], but + # with estimate of proportion of invariable sites + # Available AA substitution models: DAYHOFF, DCMUT, JTT, MTREV, WAG, + # RTREV, CPREV, VT, BLOSUM62, MTMAM, LG, GTR. With the optional "F" # appendix you can specify if you want to use empirical base frequencies - # Please note that for mixed models you can in addition specify the - # per-gene AA model in the mixed model file (see manual for details). + # Please note that for mixed models you can in addition specify the + # per-gene AA model in the mixed model file (see manual for details). # Also note that if you estimate AA GTR parameters on a partitioned - # dataset, they will be linked (estimated jointly) across all partitions + # dataset, they will be linked (estimated jointly) across all partitions # to avoid over-parametrization '-m':ValuedParameter('-',Name='m',Delimiter=' '), - # Switch on estimation of individual per-partition branch lengths. Only - # has effect when used in combination with "-q". Branch lengths for - # individual partitions will be printed to separate files. A weighted - # average of the branch lengths is computed by using the respective - # partition lengths. + # Switch on estimation of individual per-partition branch lengths. Only + # has effect when used in combination with "-q". Branch lengths for + # individual partitions will be printed to separate files. A weighted + # average of the branch lengths is computed by using the respective + # partition lengths. # DEFAULT: OFF '-M':FlagParameter('-',Name='M'), - + # Specifies the name of the output file. '-n':ValuedParameter('-',Name='n',Delimiter=' '), @@ -337,21 +337,21 @@ class Raxml(CommandLineApplication): # no spaces, should be monophyletic). '-o':ValuedParameter('-',Name='o',Delimiter=' '), - # Enable checkpointing using the dmtcp library available at - # http://dmtcp.sourceforge.net/. This only works if you call the program - # by preceded by the command "dmtcp_checkpoint" and if you compile a - # dedicated binary using the appropriate Makefile. With "-O" you can + # Enable checkpointing using the dmtcp library available at + # http://dmtcp.sourceforge.net/. This only works if you call the program + # by preceded by the command "dmtcp_checkpoint" and if you compile a + # dedicated binary using the appropriate Makefile. With "-O" you can # specify the interval between checkpoints in seconds. # DEFAULT: 3600.0 seconds - DEPRECATED IN 7.3.0 #'-O':ValuedParameter('-',Name='O',Delimiter=' ',Value=3600.0), - # Specify a random number seed for the parsimony inferences. This allows + # Specify a random number seed for the parsimony inferences. This allows # you to reproduce your results and will help me debug the program. '-p':ValuedParameter('-',Name='p',Delimiter=' '), - - # Specify the file name of a user-defined AA (Protein) substitution - # model. This file must contain 420 entries, the first 400 being the AA - # substitution rates (this must be a symmetric matrix) and the last 20 + + # Specify the file name of a user-defined AA (Protein) substitution + # model. This file must contain 420 entries, the first 400 being the AA + # substitution rates (this must be a symmetric matrix) and the last 20 # are the empirical base frequencies '-P':ValuedParameter('-',Name='P',Delimiter=' '), @@ -366,60 +366,60 @@ class Raxml(CommandLineApplication): # Turn on computation of SH-like support values on tree. # DEFAULT: OFF '-Q':FlagParameter('-', Name='Q'), - + # Constraint file name: allows a bifurcating Newick tree to be passed # in as a constraint file, other taxa will be added by parsimony. '-r':ValuedParameter('-',Name='r',Delimiter=' '), - - # THE FOLLOWING "R" is IN 7.2.8 + + # THE FOLLOWING "R" is IN 7.2.8 # Specify the file name of a binary model parameter file that has # previously been generated with RAxML using the -f e tree evaluation # option. The file name should be: RAxML_binaryModelParameters.runID '-R':ValuedParameter('-',Name='R',Delimiter=' '), - + # specify the name of the alignment data file, in relaxed PHYLIP # format. '-s':ValuedParameter('-',Name='s',Delimiter=' '), - # Specify the name of a secondary structure file. The file can contain - # "." for alignment columns that do not form part of a stem and + # Specify the name of a secondary structure file. The file can contain + # "." for alignment columns that do not form part of a stem and # characters "()<>[]{}" to define stem regions and pseudoknots '-S':ValuedParameter('-',Name='S',Delimiter=' '), - + # Specify a user starting tree file name in Newick format '-t':ValuedParameter('-',Name='t',Delimiter=' '), # PTHREADS VERSION ONLY! Specify the number of threads you want to run. - # Make sure to set "-T" to at most the number of CPUs you have on your - # machine, otherwise, there will be a huge performance decrease! + # Make sure to set "-T" to at most the number of CPUs you have on your + # machine, otherwise, there will be a huge performance decrease! '-T':ValuedParameter('-',Name='T',Delimiter=' '), - - # THE FOLLOWING "U" is IN 7.2.8 + + # THE FOLLOWING "U" is IN 7.2.8 # Try to save memory by using SEV-based implementation for gap columns # on large gappy alignments # WARNING: this will only work for DNA under GTRGAMMA and is still in an # experimental state. '-U':ValuedParameter('-',Name='U',Delimiter=' '), - + # Print the version '-v':FlagParameter('-',Name='v'), - # Name of the working directory where RAxML-V will write its output + # Name of the working directory where RAxML-V will write its output # files. '-w':ValuedParameter('-',Name='w',Delimiter=' '), # THE FOLLOWING "W" is IN 7.2.8 # Sliding window size for leave-one-out site-specific placement bias - # algorithm only effective when used in combination with "-f S" + # algorithm only effective when used in combination with "-f S" # DEFAULT: 100 sites '-W':ValuedParameter('-',Name='W',Delimiter=' '), - - # Specify an integer number (random seed) and turn on rapid - # bootstrapping. CAUTION: unlike in version 7.0.4 RAxML will conduct - # rapid BS replicates under the model of rate heterogeneity you + + # Specify an integer number (random seed) and turn on rapid + # bootstrapping. CAUTION: unlike in version 7.0.4 RAxML will conduct + # rapid BS replicates under the model of rate heterogeneity you # specified via "-m" and not by default under CAT '-x':ValuedParameter('-',Name='x',Delimiter=' '), - + # EXPERIMENTAL OPTION: This option will do a per-site estimate of # protein substitution models by looping over all given, fixed models # LG, WAG, JTT, etc and using their respective base frequencies to @@ -433,9 +433,9 @@ class Raxml(CommandLineApplication): # optimize an ML analysis of the tree '-y':FlagParameter('-', Name='y'), - # Do a more thorough parsimony tree search using a parsimony ratchet and - # exit. Specify the number of ratchet searches via "-#" or "-N". This - # has just been implemented for completeness, if you want a fast MP + # Do a more thorough parsimony tree search using a parsimony ratchet and + # exit. Specify the number of ratchet searches via "-#" or "-N". This + # has just been implemented for completeness, if you want a fast MP # implementation use TNT # DEFAULT: OFF - DEPRECATED IN 7.3.0 #'-Y':FlagParameter('-', Name='Y'), @@ -461,7 +461,7 @@ def _format_output(self, outfile_name, out_type): """ Prepend proper output prefix to output filename """ outfile_name = self._absolute(outfile_name) - outparts = outfile_name.split("/") + outparts = outfile_name.split("/") outparts[-1] = self._out_format % (out_type, outparts[-1] ) return '/'.join(outparts) @@ -482,7 +482,7 @@ def _input_as_lines(self,data): def _input_as_string(self,data): """Makes data the value of a specific parameter - + This method returns the empty string. The parameter will be printed automatically once set. """ @@ -495,7 +495,7 @@ def _input_as_multiline_string(self, data): self.Parameters['-s']\ .on(super(Raxml,self)._input_as_multiline_string(data)) return '' - + def _absolute(self,path): path = FilePath(path) if isabs(path): @@ -509,21 +509,21 @@ def _log_out_filename(self): if self.Parameters['-n'].isOn(): return self._format_output(str(self.Parameters['-n'].Value), "log") else: - raise ValueError, "No output file specified." + raise ValueError, "No output file specified." def _info_out_filename(self): if self.Parameters['-n'].isOn(): return self._format_output(str(self.Parameters['-n'].Value), "info") else: - raise ValueError, "No output file specified." + raise ValueError, "No output file specified." def _parsimony_tree_out_filename(self): if self.Parameters['-n'].isOn(): return self._format_output(str(self.Parameters['-n'].Value), \ "parsimonyTree") else: - raise ValueError, "No output file specified." - + raise ValueError, "No output file specified." + # added for tree-insertion def _originallabelled_tree_out_filename(self): if self.Parameters['-n'].isOn(): @@ -531,7 +531,7 @@ def _originallabelled_tree_out_filename(self): "originalLabelledTree") else: raise ValueError, "No output file specified." - + # added for tree-insertion def _labelled_tree_out_filename(self): if self.Parameters['-n'].isOn(): @@ -547,7 +547,7 @@ def _classification_out_filename(self): "classification") else: raise ValueError, "No output file specified." - + # added for tree-insertion def _classificationlikelihoodweights_out_filename(self): if self.Parameters['-n'].isOn(): @@ -555,7 +555,7 @@ def _classificationlikelihoodweights_out_filename(self): "classificationLikelihoodWeights") else: raise ValueError, "No output file specified." - + # added for tree-insertion def _best_tree_out_filename(self): if self.Parameters['-n'].isOn(): @@ -579,7 +579,7 @@ def _json_out_filename(self): "portableTree") else: raise ValueError, "No output file specified." - + # added for tree-insertion def _parsimony_out_filename(self): if self.Parameters['-n'].isOn(): @@ -587,13 +587,13 @@ def _parsimony_out_filename(self): "equallyParsimoniousPlacements") else: raise ValueError, "No output file specified." - + def _result_tree_out_filename(self): if self.Parameters['-n'].isOn(): return self._format_output(str(self.Parameters['-n'].Value), \ "result") else: - raise ValueError, "No output file specified." + raise ValueError, "No output file specified." def _result_bootstrap_out_filename(self): if self.Parameters['-n'].isOn(): @@ -611,7 +611,7 @@ def _checkpoint_out_filenames(self): if self.Parameters['-n'].isOn(): out_name = str(self.Parameters['-n'].Value) walk_root = self.WorkingDir - if self.Parameters['-w'].isOn(): + if self.Parameters['-w'].isOn(): walk_root = str(self.Parameters['-w'].Value) for tup in walk(walk_root): dpath, dnames, dfiles = tup @@ -622,7 +622,7 @@ def _checkpoint_out_filenames(self): break else: - raise ValueError, "No output file specified." + raise ValueError, "No output file specified." return out_filenames def _handle_app_result_build_failure(self,out,err,exit_status,result_paths): @@ -650,10 +650,10 @@ def _get_result_paths(self,data): result['Classification'] = ResultPath( Path=self._classification_out_filename(), IsWritten=True) - result['ClassificationLikelihoodWeights'] = ResultPath( + result['ClassificationLikelihoodWeights'] = ResultPath( Path=self._classificationlikelihoodweights_out_filename(), IsWritten=True) - result['OriginalLabelledTree'] = ResultPath( + result['OriginalLabelledTree'] = ResultPath( Path=self._originallabelled_tree_out_filename(), IsWritten=True) result['Result'] = ResultPath( @@ -664,11 +664,11 @@ def _get_result_paths(self,data): Path=self._json_out_filename()+'.jplace',IsWritten=True) elif self.Parameters["-f"].Value == 'y': #these were added to handle the results from tree-insertion - - result['Parsimony'] = ResultPath( + + result['Parsimony'] = ResultPath( Path=self._parsimony_out_filename(), IsWritten=True) - result['OriginalLabelledTree'] = ResultPath( + result['OriginalLabelledTree'] = ResultPath( Path=self._originallabelled_tree_out_filename(), IsWritten=True) result['json'] = ResultPath( @@ -686,7 +686,7 @@ def _get_result_paths(self,data): result['besttree'] = ResultPath( Path=self._best_tree_out_filename(), IsWritten=True) - + for checkpoint_file in self._checkpoint_out_filenames(): checkpoint_num = checkpoint_file.split(".")[-1] try: @@ -696,7 +696,7 @@ def _get_result_paths(self,data): result['Checkpoint%d' % checkpoint_num] = ResultPath( Path=checkpoint_file, IsWritten=True) - + return result @@ -706,14 +706,14 @@ def raxml_alignment(align_obj, params={}, SuppressStderr=True, SuppressStdout=True): - """Run raxml on alignment object + """Run raxml on alignment object align_obj: Alignment object params: you can set any params except -w and -n - returns: tuple (phylonode, - parsimonyphylonode, - log likelihood, + returns: tuple (phylonode, + parsimonyphylonode, + log likelihood, total exec time) """ @@ -724,7 +724,7 @@ def raxml_alignment(align_obj, params["-p"] = randint(1,100000) ih = '_input_as_multiline_string' seqs, align_map = align_obj.toPhylip() - + #print params["-n"] # set up command @@ -756,17 +756,17 @@ def raxml_alignment(align_obj, return tree_node, parsimony_tree_node, log_likelihood, total_exec_time -def build_tree_from_alignment(aln, moltype, best_tree=False, params={}): +def build_tree_from_alignment(aln, moltype=DNA, best_tree=False, params={}): """Returns a tree from Alignment object aln. - + aln: an xxx.Alignment object, or data that can be used to build one. - + moltype: cogent.core.moltype.MolType object best_tree: best_tree suppport is currently not implemented - + params: dict of parameters to pass in to the RAxML app controller. - + The result will be an xxx.Alignment object, or None if tree fails. """ if best_tree: @@ -787,47 +787,47 @@ def build_tree_from_alignment(aln, moltype, best_tree=False, params={}): aln = Alignment(aln) seqs, align_map = aln.toPhylip() - # generate temp filename for output - params["-w"] = "/tmp/" + # generate temp filename for output + params["-w"] = "/tmp/" params["-n"] = get_tmp_filename().split("/")[-1] params["-k"] = True params["-p"] = randint(1,100000) params["-x"] = randint(1,100000) - - ih = '_input_as_multiline_string' + + ih = '_input_as_multiline_string' raxml_app = Raxml(params=params, InputHandler=ih, WorkingDir=None, SuppressStderr=True, SuppressStdout=True) - + raxml_result = raxml_app(seqs) - + tree = DndParser(raxml_result['Bootstrap'], constructor=PhyloNode) - + for node in tree.tips(): node.Name = align_map[node.Name] raxml_result.cleanUp() return tree - - + + def insert_sequences_into_tree(seqs, moltype, params={}, write_log=True): """Insert sequences into Tree. - + aln: an xxx.Alignment object, or data that can be used to build one. - + moltype: cogent.core.moltype.MolType object - + params: dict of parameters to pass in to the RAxML app controller. - + The result will be a tree. """ - - ih = '_input_as_multiline_string' + + ih = '_input_as_multiline_string' raxml_app = Raxml(params=params, InputHandler=ih, @@ -835,20 +835,20 @@ def insert_sequences_into_tree(seqs, moltype, params={}, SuppressStderr=False, SuppressStdout=False, HALT_EXEC=False) - + raxml_result = raxml_app(seqs) - + # write a log file if write_log: log_fp = join(params["-w"],'log_raxml_'+split(get_tmp_filename())[-1]) log_file=open(log_fp,'w') log_file.write(raxml_result['StdOut'].read()) log_file.close() - - ''' + + ''' # getting setup since parsimony doesn't output tree..only jplace, however # it is currently corrupt - + # use guppy to convert json file into a placement tree guppy_params={'tog':None} @@ -856,7 +856,7 @@ def insert_sequences_into_tree(seqs, moltype, params={}, output_dir=params["-w"], \ params=guppy_params) ''' - + # get tree from 'Result Names' new_tree=raxml_result['Result'].readlines() filtered_tree=re.sub('\[I\d+\]','',str(new_tree)) @@ -865,6 +865,3 @@ def insert_sequences_into_tree(seqs, moltype, params={}, raxml_result.cleanUp() return tree - - -