From 31b6aa851cb5daa7f8b0ba800bbf2912d1472c44 Mon Sep 17 00:00:00 2001 From: Greg Caporaso Date: Tue, 13 May 2014 21:28:31 -0700 Subject: [PATCH] added default moltype - this should have been part of #19 --- brokit/infernal.py | 639 ++++++++++++++++++++++----------------------- 1 file changed, 319 insertions(+), 320 deletions(-) diff --git a/brokit/infernal.py b/brokit/infernal.py index 745ca99..ec8c8ce 100644 --- a/brokit/infernal.py +++ b/brokit/infernal.py @@ -22,7 +22,7 @@ 'RNA':'--rna',\ RNA:'--rna',\ } - + SEQ_CONSTRUCTOR_MAP = {'DNA':ChangedDnaSequence,\ DNA:ChangedDnaSequence,\ 'RNA':ChangedRnaSequence,\ @@ -32,128 +32,128 @@ class Cmalign(CommandLineApplication): """cmalign application controller.""" _options = { - + # -o Save the alignment in Stockholm format to a file . The default # is to write it to standard output. '-o':ValuedParameter(Prefix='-',Name='o',Delimiter=' '),\ - + # -l Turn on the local alignment algorithm. Default is global. '-l':FlagParameter(Prefix='-',Name='l'),\ - + # -p Annotate the alignment with posterior probabilities calculated using # the Inside and Outside algorithms. '-p':FlagParameter(Prefix='-',Name='p'),\ - + # -q Quiet; suppress the verbose banner, and only print the resulting # alignment to stdout. '-q':FlagParameter(Prefix='-',Name='q'),\ - + # --informat Assert that the input seqfile is in format . Do not run # Babelfish format autodection. Acceptable formats are: FASTA, EMBL, # UNIPROT, GENBANK, and DDBJ. is case-insensitive. '--informat':ValuedParameter(Prefix='--',Name='informat',Delimiter=' '),\ - + # --mpi Run as an MPI parallel program. (see User's Guide for details). '--mpi':FlagParameter(Prefix='--',Name='mpi'),\ - - # Expert Options - - # --optacc Align sequences using the Durbin/Holmes optimal accuracy - # algorithm. This is default behavior, so this option is probably useless. + + # Expert Options + + # --optacc Align sequences using the Durbin/Holmes optimal accuracy + # algorithm. This is default behavior, so this option is probably useless. '--optacc':FlagParameter(Prefix='--',Name='optacc'),\ - - # --cyk Do not use the Durbin/Holmes optimal accuracy alignment to align the + + # --cyk Do not use the Durbin/Holmes optimal accuracy alignment to align the # sequences, instead use the CYK algorithm which determines the optimally - # scoring alignment of the sequence to the model. + # scoring alignment of the sequence to the model. '--cyk':FlagParameter(Prefix='--',Name='cyk'),\ - + # --sample Sample an alignment from the posterior distribution of # alignments. '--sample':FlagParameter(Prefix='--',Name='sample'),\ - - # -s Set the random number generator seed to , where is a - # positive integer. This option can only be used in combination with + + # -s Set the random number generator seed to , where is a + # positive integer. This option can only be used in combination with # --sample. The default is to use time() to generate a different seed for # each run, which means that two different runs of cmalign --sample on the # same alignment will give slightly different results. You can use this # option to generate reproducible results. '-s':ValuedParameter(Prefix='-',Name='s',Delimiter=' '),\ - + # --viterbi Do not use the CM to align the sequences, instead use the HMM # Viterbi algorithm to align with a CM Plan 9 HMM. '--viterbi':FlagParameter(Prefix='--',Name='viterbi'),\ - + # --sub Turn on the sub model construction and alignment procedure. '--sub':FlagParameter(Prefix='--',Name='sub'),\ - - # --small Use the divide and conquer CYK alignment algorithm described in + + # --small Use the divide and conquer CYK alignment algorithm described in # SR Eddy, BMC Bioinformatics 3:18, 2002. '--small':FlagParameter(Prefix='--',Name='small'),\ - + # --hbanded This option is turned on by default. Accelerate alignment by # pruning away regions of the CM DP matrix that are deemed negligible by # an HMM. '--hbanded':FlagParameter(Prefix='--',Name='hbanded'),\ - + # --nonbanded Turns off HMM banding. '--nonbanded':FlagParameter(Prefix='--',Name='nonbanded'),\ - + # --tau Set the tail loss probability used during HMM band calculation # to . '--tau':ValuedParameter(Prefix='--',Name='tau',Delimiter=' '),\ - + # --mxsize Set the maximum allowable DP matrix size to megabytes. '--mxsize':ValuedParameter(Prefix='--',Name='mxsize',Delimiter=' '),\ # --rna Output the alignments as RNA sequence alignments. This is true by # default. '--rna':FlagParameter(Prefix='--',Name='rna'),\ - + # --dna Output the alignments as DNA sequence alignments. '--dna':FlagParameter(Prefix='--',Name='dna'),\ - + # --matchonly Only include match columns in the output alignment, do not # include any insertions relative to the consensus model. '--matchonly':FlagParameter(Prefix='--',Name='matchonly'),\ - + # --resonly Only include match columns in the output alignment that have at # least 1 residue (non-gap character) in them. '--resonly':FlagParameter(Prefix='--',Name='resonly'),\ - - # --fins Change the behavior of how insert emissions are placed in the + + # --fins Change the behavior of how insert emissions are placed in the # alignment. '--fins':FlagParameter(Prefix='--',Name='fins'),\ - + # --onepost Modifies behavior of the -p option. Use only one character # instead of two to annotate the posterior probability of each aligned # residue. '--onepost':FlagParameter(Prefix='--',Name='onepost'),\ - + # --withali Reads an alignment from file and aligns it as a single # object to the CM; e.g. the alignment in is held fixed. '--withali':ValuedParameter(Prefix='--',Name='withali',Delimiter=' '),\ - + # --withpknots Must be used in combination with --withali . Propogate # structural information for any pseudoknots that exist in to the # output alignment. '--withpknots':FlagParameter(Prefix='--',Name='withpknots'),\ - + # --rf Must be used in combination with --withali . Specify that the # alignment in has the same "#=GC RF" annotation as the alignment file - # the CM was built from using cmbuild and further that the --rf option was + # the CM was built from using cmbuild and further that the --rf option was # supplied to cmbuild when the CM was constructed. '--rf':FlagParameter(Prefix='--',Name='rf'),\ - + # --gapthresh Must be used in combination with --withali . Specify # that the --gapthresh option was supplied to cmbuild when the CM was # constructed from the alignment file . '--gapthresh':ValuedParameter(Prefix='--',Name='gapthresh',Delimiter=' '),\ - + # --tfile Dump tabular sequence tracebacks for each individual sequence # to a file . Primarily useful for debugging. '--tfile':ValuedParameter(Prefix='--',Name='tfile',Delimiter=' '),\ - - + + } _parameters = {} _parameters.update(_options) @@ -162,20 +162,20 @@ class Cmalign(CommandLineApplication): def getHelp(self): """Method that points to the Infernal documentation.""" - + help_str = \ """ See Infernal documentation at: http://infernal.janelia.org/ """ return help_str - + def _tempfile_as_multiline_string(self, data): """Write a multiline string to a temp file and return the filename. data: a multiline string to be written to a file. - * Note: the result will be the filename as a FilePath object + * Note: the result will be the filename as a FilePath object (which is a string subclass). """ @@ -186,7 +186,7 @@ def _tempfile_as_multiline_string(self, data): return filename def _alignment_out_filename(self): - + if self.Parameters['-o'].isOn(): refined_filename = self._absolute(str(\ self.Parameters['-o'].Value)) @@ -199,191 +199,191 @@ def _get_result_paths(self,data): if self.Parameters['-o'].isOn(): out_name = self._alignment_out_filename() result['Alignment'] = ResultPath(Path=out_name,IsWritten=True) - + return result class Cmbuild(CommandLineApplication): """cmbuild application controller.""" _options = { - + # -n Name the covariance model . (Does not work if alifile contains # more than one alignment). '-n':ValuedParameter(Prefix='-',Name='n',Delimiter=' '),\ - + # -A Append the CM to cmfile, if cmfile already exists. '-A':FlagParameter(Prefix='-',Name='A'),\ - + # -F Allow cmfile to be overwritten. Normally, if cmfile already exists, # cmbuild exits with an error unless the -A or -F option is set. '-F':FlagParameter(Prefix='-',Name='F'),\ - + # -v Run in verbose output mode instead of using the default single line # tabular format. This output format is similar to that used by older # versions of Infernal. '-v':FlagParameter(Prefix='-',Name='v'),\ - + # --iins Allow informative insert emissions for the CM. By default, all CM # insert emission scores are set to 0.0 bits. '--iins':FlagParameter(Prefix='--',Name='iins'),\ - + # --Wbeta Set the beta tail loss probability for query-dependent banding # (QDB) to The QDB algorithm is used to determine the maximium length # of a hit to the model. For more information on QDB see (Nawrocki and # Eddy, PLoS Computational Biology 3(3): e56). '--Wbeta':ValuedParameter(Prefix='--',Name='Wbeta',Delimiter=' '),\ - + # Expert Options - + # --rsearch Parameterize emission scores a la RSEARCH, using the - # RIBOSUM matrix in file . For more information see the RSEARCH + # RIBOSUM matrix in file . For more information see the RSEARCH # publication (Klein and Eddy, BMC Bioinformatics 4:44, 2003). Actually, # the emission scores will not exactly With --rsearch enabled, all # alignments in alifile must contain exactly one sequence or the --call # option must also be enabled. '--rsearch':ValuedParameter(Prefix='--',Name='rsearch',Delimiter=' '),\ - + # --binary Save the model in a compact binary format. The default is a more # readable ASCII text format. '--binary':FlagParameter(Prefix='--',Name='binary'),\ - + # --rf Use reference coordinate annotation (#=GC RF line, in Stockholm) to # determine which columns are consensus, and which are inserts. '--rf':FlagParameter(Prefix='--',Name='rf'),\ - + # --gapthresh Set the gap threshold (used for determining which columns # are insertions versus consensus; see --rf above) to . The default is # 0.5. '--gapthresh':ValuedParameter(Prefix='--',Name='gapthresh',Delimiter=' '),\ - + # --ignorant Strip all base pair secondary structure information from all # input alignments in alifile before building the CM(s). '--ignorant':FlagParameter(Prefix='--',Name='ignorant'),\ - + # --wgsc Use the Gerstein/Sonnhammer/Chothia (GSC) weighting algorithm. # This is the default unless the number of sequences in the alignment # exceeds a cutoff (see --pbswitch), in which case the default becomes # the faster Henikoff position-based weighting scheme. '--wgsc':FlagParameter(Prefix='--',Name='wgsc'),\ - + # --wblosum Use the BLOSUM filtering algorithm to weight the sequences, # instead of the default GSC weighting. '--wblosum':FlagParameter(Prefix='--',Name='wblosum'),\ - + # --wpb Use the Henikoff position-based weighting scheme. This weighting # scheme is automatically used (overriding --wgsc and --wblosum) if the # number of sequences in the alignment exceeds a cutoff (see --pbswitch). '--wpb':FlagParameter(Prefix='--',Name='wpb'),\ - + # --wnone Turn sequence weighting off; e.g. explicitly set all sequence # weights to 1.0. '--wnone':FlagParameter(Prefix='--',Name='wnone'),\ - + # --wgiven Use sequence weights as given in annotation in the input # alignment file. If no weights were given, assume they are all 1.0. # The default is to determine new sequence weights by the Gerstein/ # Sonnhammer/Chothia algorithm, ignoring any annotated weights. '--wgiven':FlagParameter(Prefix='--',Name='wgiven'),\ - + # --pbswitch Set the cutoff for automatically switching the weighting # method to the Henikoff position-based weighting scheme to . If the # number of sequences in the alignment exceeds Henikoff weighting is # used. By default is 5000. '--pbswitch':ValuedParameter(Prefix='--',Name='pbswitch',Delimiter=' '),\ - + # --wid Controls the behavior of the --wblosum weighting option by # setting the percent identity for clustering the alignment to . '--wid':ValuedParameter(Prefix='--',Name='wid',Delimiter=' '),\ - + # --eent Use the entropy weighting strategy to determine the effective # sequence number that gives a target mean match state relative entropy. '--wgiven':FlagParameter(Prefix='--',Name='wgiven'),\ - + # --enone Turn off the entropy weighting strategy. The effective sequence # number is just the number of sequences in the alignment. '--wgiven':FlagParameter(Prefix='--',Name='wgiven'),\ - + # --ere Set the target mean match state entropy as . By default the # target entropy 1.46 bits. '--ere':ValuedParameter(Prefix='--',Name='ere',Delimiter=' '),\ - + # --null Read a null model from . The null model defines the # probability of each RNA nucleotide in background sequence, the default # is to use 0.25 for each nucleotide. '--null':ValuedParameter(Prefix='--',Name='null',Delimiter=' '),\ - - # --prior Read a Dirichlet prior from , replacing the default mixture + + # --prior Read a Dirichlet prior from , replacing the default mixture # Dirichlet. '--prior':ValuedParameter(Prefix='--',Name='prior',Delimiter=' '),\ - + # --ctarget Cluster each alignment in alifile by percent identity. # find a cutoff percent id threshold that gives exactly clusters and - # build a separate CM from each cluster. If is greater than the number + # build a separate CM from each cluster. If is greater than the number # of sequences in the alignment the program will not complain, and each # sequence in the alignment will be its own cluster. Each CM will have a # positive integer appended to its name indicating the order in which it # was built. '--ctarget':ValuedParameter(Prefix='--',Name='ctarget',Delimiter=' '),\ - + # --cmaxid Cluster each sequence alignment in alifile by percent # identity. Define clusters at the cutoff fractional id similarity of # and build a separate CM from each cluster. '--cmaxid':ValuedParameter(Prefix='--',Name='cmaxid',Delimiter=' '),\ - + # --call Build a separate CM from each sequence in each alignment in # alifile. Naming of CMs takes place as described above for --ctarget. '--call':FlagParameter(Prefix='--',Name='call'),\ - + # --corig After building multiple CMs using --ctarget, --cmindiff or --call # as described above, build a final CM using the complete original # alignment from alifile. '--corig':FlagParameter(Prefix='--',Name='corig'),\ - + # --cdump Dump the multiple alignments of each cluster to in # Stockholm format. This option only works in combination with --ctarget, # --cmindiff or --call. '--cdump':ValuedParameter(Prefix='--',Name='cdump',Delimiter=' '),\ - + # --refine Attempt to refine the alignment before building the CM using # expectation-maximization (EM). The final alignment (the alignment used # to build the CM that gets written to cmfile) is written to . '--refine':ValuedParameter(Prefix='--',Name='refine',Delimiter=' '),\ - + # --gibbs Modifies the behavior of --refine so Gibbs sampling is used # instead of EM. '--gibbs':FlagParameter(Prefix='--',Name='gibbs'),\ - + # -s Set the random seed to , where is a positive integer. - # This option can only be used in combination with --gibbs. The default is + # This option can only be used in combination with --gibbs. The default is # to use time() to generate a different seed for each run, which means # that two different runs of cmbuild --refine --gibbs on the same # alignment will give slightly different results. You can use this option # to generate reproducible results. '-s':ValuedParameter(Prefix='-',Name='s',Delimiter=' '),\ - + # -l With --refine, turn on the local alignment algorithm, which allows the # alignment to span two or more subsequences if necessary (e.g. if the # structures of the query model and target sequence are only partially # shared), allowing certain large insertions and deletions in the - # structure to be penalized differently than normal indels. The default is + # structure to be penalized differently than normal indels. The default is # to globally align the query model to the target sequences. '-l':ValuedParameter(Prefix='-',Name='l',Delimiter=' '),\ - + # -a With --refine, print the scores of each individual sequence alignment. '-a':ValuedParameter(Prefix='-',Name='a',Delimiter=' '),\ - + # --cyk With --refine, align with the CYK algorithm. '--cyk':FlagParameter(Prefix='--',Name='cyk'),\ - + # --sub With --refine, turn on the sub model construction and alignment # procedure. '--sub':FlagParameter(Prefix='--',Name='sub'),\ - + # --nonbanded With --refine, do not use HMM bands to accelerate alignment. # Use the full CYK algorithm which is guaranteed to give the optimal # alignment. This will slow down the run significantly, especially for # large models. '--nonbanded':FlagParameter(Prefix='--',Name='nonbanded'),\ - + # --tau With --refine, set the tail loss probability used during HMM # band calculation to . This is the amount of probability mass within # the HMM posterior probabilities that is considered negligible. The @@ -391,60 +391,60 @@ class Cmbuild(CommandLineApplication): # acceleration, but increase the chance of missing the optimal alignment # due to the HMM bands. '--tau':ValuedParameter(Prefix='--',Name='tau',Delimiter=' '),\ - + # --fins With --refine, change the behavior of how insert emissions are # placed in the alignment. '--fins':FlagParameter(Prefix='--',Name='fins'),\ - + # --mxsize With --refine, set the maximum allowable matrix size for # alignment to megabytes. '--mxsize':ValuedParameter(Prefix='--',Name='mxsize',Delimiter=' '),\ - + # --rdump With --refine, output the intermediate alignments at each # iteration of the refinement procedure (as described above for --refine ) # to file . '--rdump':ValuedParameter(Prefix='--',Name='rdump',Delimiter=' '),\ - + } _parameters = {} _parameters.update(_options) _command = "cmbuild" _suppress_stderr=True - + def getHelp(self): """Method that points to the Infernal documentation.""" - + help_str = \ """ See Infernal documentation at: http://infernal.janelia.org/ """ return help_str - + def _refine_out_filename(self): - + if self.Parameters['--refine'].isOn(): refined_filename = self._absolute(str(\ self.Parameters['--refine'].Value)) else: raise ValueError, 'No refine output file specified.' return refined_filename - + def _cm_out_filename(self): - + if self.Parameters['-n'].isOn(): refined_filename = self._absolute(str(\ self.Parameters['-n'].Value)) else: raise ValueError, 'No cm output file specified.' return refined_filename - + def _tempfile_as_multiline_string(self, data): """Write a multiline string to a temp file and return the filename. data: a multiline string to be written to a file. - * Note: the result will be the filename as a FilePath object + * Note: the result will be the filename as a FilePath object (which is a string subclass). """ @@ -453,7 +453,7 @@ def _tempfile_as_multiline_string(self, data): data_file.write(data) data_file.close() return filename - + def _get_result_paths(self,data): result = {} if self.Parameters['--refine'].isOn(): @@ -462,161 +462,161 @@ def _get_result_paths(self,data): if self.Parameters['-n'].isOn(): cm_name = self._cm_out_filename() result['CmFile'] = ResultPath(Path=cm_name,IsWritten=True) - + return result - + class Cmcalibrate(CommandLineApplication): """cmcalibrate application controller.""" _options = { - + # -s Set the random number generator seed to , where is a # positive integer. The default is to use time() to generate a different - # seed for each run, which means that two different runs of cmcalibrate on + # seed for each run, which means that two different runs of cmcalibrate on # the same CM will give slightly different E-value and HMM filter # threshold parameters. You can use this option to generate reproducible # results. '-s':ValuedParameter(Prefix='-',Name='s',Delimiter=' '),\ - + # --forecast Predict the running time of the calibration for cmfile and # provided options and exit, DO NOT perform the calibration. '--forecast':ValuedParameter(Prefix='--',Name='forecast',Delimiter=' '),\ - + # --mpi Run as an MPI parallel program. '--mpi':FlagParameter(Prefix='--',Name='mpi'),\ - + # Expert Options - + # --exp-cmL-glc Set the length of random sequence to search for the CM # glocal exponential tail fits to megabases (Mb). '--exp-cmL-glc':ValuedParameter(Prefix='--',Name='exp-cmL-glc',\ Delimiter=' '),\ - + # --exp-cmL-loc Set the length of random sequence to search for the CM # local exponential tail fits to megabases (Mb). '--exp-cmL-loc':ValuedParameter(Prefix='--',Name='exp-cmL-loc',\ Delimiter=' '),\ - + # --exp-hmmLn-glc Set the minimum random sequence length to search for # the HMM glocal exponential tail fits to megabases (Mb). '--exp-hmmLn-glc':ValuedParameter(Prefix='--',Name='exp-hmmLn-glc',\ Delimiter=' '),\ - + # --exp-hmmLn-loc Set the minimum random sequence length to search for # the HMM local exponential tail fits to megabases (Mb). '--exp-hmmLn-loc':ValuedParameter(Prefix='--',Name='exp-hmmLn-loc',\ Delimiter=' '),\ - + # --exp-hmmLx Set the maximum random sequence length to search when # determining HMM E-values to megabases (Mb). '--exp-hmmLx':ValuedParameter(Prefix='--',Name='exp-hmmLx',Delimiter=' '),\ - + # --exp-fract Set the HMM/CM fraction of dynamic programming # calculations to . '--exp-fract':ValuedParameter(Prefix='--',Name='exp-fract',Delimiter=' '),\ - + # --exp-tailn-cglc During E-value calibration of glocal CM search modes # fit the exponential tail to the high scores in the histogram tail that # includes hits per Mb searched. '--exp-tailn-cglc':ValuedParameter(Prefix='--',Name='exp-tailn-cglc',\ Delimiter=' '),\ - + # --exp-tailn-cloc During E-value calibration of local CM search modes # fit the exponential tail to the high scores in the histogram tail that # includes hits per Mb searched. '--exp-tailn-cloc':ValuedParameter(Prefix='--',Name='exp-tailn-cloc',\ Delimiter=' '),\ - + # --exp-tailn-hglc During E-value calibration of glocal HMM search modes # fit the exponential tail to the high scores in the histogram tail that # includes hits per Mb searched. '--exp-tailn-hglc':ValuedParameter(Prefix='--',Name='exp-tailn-hglc',\ Delimiter=' '),\ - + # --exp-tailn-hloc During E-value calibration of local HMM search modes # fit the exponential tail to the high scores in the histogram tail that # includes hits per Mb searched. '--exp-tailn-hloc':ValuedParameter(Prefix='--',Name='exp-tailn-hloc',\ Delimiter=' '),\ - + # --exp-tailp Ignore the --exp-tailn prefixed options and fit the # fraction right tail of the histogram to exponential tails, for all # search modes. '--exp-tailp':ValuedParameter(Prefix='--',Name='exp-tailp',Delimiter=' '),\ - + # --exp-tailxn With --exp-tailp enforce that the maximum number of hits # in the tail that is fit is . '--exp-tailxn':ValuedParameter(Prefix='--',Name='exp-tailxn',\ Delimiter=' '),\ - + # --exp-beta During E-value calibration, by default query-dependent # banding (QDB) is used to accelerate the CM search algorithms with a beta # tail loss probability of 1E-15. '--exp-beta':ValuedParameter(Prefix='--',Name='exp-beta',Delimiter=' '),\ - + # --exp-no-qdb Turn of QDB during E-value calibration. This will slow down # calibration, and is not recommended unless you plan on using --no-qdb in # cmsearch. '--exp-no-qdb':FlagParameter(Prefix='--',Name='exp-no-qdb'),\ - + # --exp-hfile Save the histograms fit for the E-value calibration to # file . The format of this file is two tab delimited columns. '--exp-hfile':ValuedParameter(Prefix='--',Name='exp-hfile',Delimiter=' '),\ - + # --exp-sfile Save a survival plot for the E-value calibration to file # . The format of this file is two tab delimited columns. '--exp-sfile':ValuedParameter(Prefix='--',Name='exp-sfile',Delimiter=' '),\ - + # --exp-qqfile Save a quantile-quantile plot for the E-value calibration # to file . The format of this file is two tab delimited columns. '--exp-qqfile':ValuedParameter(Prefix='--',Name='exp-qqfile',\ Delimiter=' '),\ - + # --exp-ffile Save statistics on the exponential tail statistics to file # . The file will contain the lambda and mu values for exponential # tails fit to tails of different sizes. '--exp-ffile':ValuedParameter(Prefix='--',Name='exp-ffile',Delimiter=' '),\ - + # --fil-N Set the number of sequences sampled and searched for the HMM # filter threshold calibration to . By default, is 10,000. '--fil-N':ValuedParameter(Prefix='--',Name='fil-N',Delimiter=' '),\ - + # --fil-F Set the fraction of sample sequences the HMM filter must be # able to recognize, and allow to survive, to , where is a positive # real number less than or equal to 1.0. By default, is 0.995. '--fil-F':ValuedParameter(Prefix='--',Name='fil-F',Delimiter=' '),\ - + # --fil-xhmm Set the target number of dynamic programming calculations # for a HMM filtered CM QDB search with beta = 1E-7 to times the # number of calculations required to do an HMM search. By default, is # 2.0. '--fil-xhmm':ValuedParameter(Prefix='--',Name='fil-xhmm',Delimiter=' '),\ - + # --fil-tau Set the tail loss probability during HMM band calculation # for HMM filter threshold calibration to . '--fil-tau':ValuedParameter(Prefix='--',Name='fil-tau',Delimiter=' '),\ - + # --fil-gemit During HMM filter calibration, always sample sequences from a # globally configured CM, even when calibrating local modes. '--fil-gemit':FlagParameter(Prefix='--',Name='fil-gemit'),\ - + # --fil-dfile Save statistics on filter threshold calibration, including # HMM and CM scores for all sampled sequences, to file . '--fil-dfile':ValuedParameter(Prefix='--',Name='fil-dfile',Delimiter=' '),\ - + # --mxsize Set the maximum allowable DP matrix size to megabytes. '--mxsize':ValuedParameter(Prefix='--',Name='mxsize',Delimiter=' '),\ } - + _parameters = {} _parameters.update(_options) _command = "cmcalibrate" _suppress_stderr=True - + def getHelp(self): """Method that points to the Infernal documentation.""" - + help_str = \ """ See Infernal documentation at: @@ -627,90 +627,90 @@ def getHelp(self): class Cmemit(CommandLineApplication): """cmemit application controller.""" _options = { - + # -o Save the synthetic sequences to file rather than writing them # to stdout. '-o':ValuedParameter(Prefix='-',Name='o',Delimiter=' '),\ - + # -n Generate sequences. Default is 10. '-n':ValuedParameter(Prefix='-',Name='n',Delimiter=' '),\ - + # -u Write the generated sequences in unaligned format (FASTA). This is the # default, so this option is probably useless. '-u':FlagParameter(Prefix='-',Name='u'),\ - + # -a Write the generated sequences in an aligned format (STOCKHOLM) with # consensus structure annotation rather than FASTA. '-a':FlagParameter(Prefix='-',Name='a'),\ - + # -c Predict a single majority-rule consensus sequence instead of sampling # sequences from the CM's probability distribution. '-c':FlagParameter(Prefix='-',Name='c'),\ - + # -l Configure the CMs into local mode before emitting sequences. See the # User's Guide for more information on locally configured CMs. '-l':FlagParameter(Prefix='-',Name='l'),\ - + # -s Set the random seed to , where is a positive integer. The # default is to use time() to generate a different seed for each run, # which means that two different runs of cmemit on the same CM will give # different results. You can use this option to generate reproducible # results. '-s':ValuedParameter(Prefix='-',Name='s',Delimiter=' '),\ - + # --rna Specify that the emitted sequences be output as RNA sequences. This # is true by default. '--rna':FlagParameter(Prefix='--',Name='rna'),\ - + # --dna Specify that the emitted sequences be output as DNA sequences. By # default, the output alphabet is RNA. '--dna':FlagParameter(Prefix='--',Name='dna'),\ - + # --tfile Dump tabular sequence parsetrees (tracebacks) for each emitted # sequence to file . Primarily useful for debugging. '--tfile':ValuedParameter(Prefix='--',Name='tfile',Delimiter=' '),\ - + # --exp Exponentiate the emission and transition probabilities of the CM # by and then renormalize those distributions before emitting # sequences. '--exp':ValuedParameter(Prefix='--',Name='exp',Delimiter=' '),\ - + # --begin Truncate the resulting alignment by removing all residues # before consensus column , where is a positive integer no greater # than the consensus length of the CM. Must be used in combination with # --end and either -a or --shmm (a developer option). '--begin':ValuedParameter(Prefix='--',Name='begin',Delimiter=' '),\ - + # --end Truncate the resulting alignment by removing all residues after # consensus column , where is a positive integer no greater than # the consensus length of the CM. Must be used in combination with --begin # and either -a or --shmm (a developer option). '--end':ValuedParameter(Prefix='--',Name='end',Delimiter=' '),\ - + } _parameters = {} _parameters.update(_options) _command = "cmemit" _suppress_stderr=True - + def getHelp(self): """Method that points to the Infernal documentation.""" - + help_str = \ """ See Infernal documentation at: http://infernal.janelia.org/ """ return help_str - + class Cmscore(CommandLineApplication): """cmscore application controller.""" _options = { - + # -n Set the number of sequences to generate and align to . This # option is incompatible with the --infile option. '-n':ValuedParameter(Prefix='-',Name='n',Delimiter=' '),\ - + # -l Turn on the local alignment algorithm, which allows the alignment to # span two or more subsequences if necessary (e.g. if the structures of # the query model and target sequence are only partially shared), allowing @@ -718,7 +718,7 @@ class Cmscore(CommandLineApplication): # differently than normal indels. The default is to globally align the # query model to the target sequences. '-l':FlagParameter(Prefix='-',Name='l'),\ - + # -s Set the random seed to , where is a positive integer. The # default is to use time() to generate a different seed for each run, # which means that two different runs of cmscore on the same CM will give @@ -727,63 +727,63 @@ class Cmscore(CommandLineApplication): # score, so -s is incompatible with the --infile option which supplies # the sequences to score in an input file. '-s':ValuedParameter(Prefix='-',Name='s',Delimiter=' '),\ - + # -a Print individual timings and score comparisons for each sequence in # seqfile. By default only summary statistics are printed. '-a':FlagParameter(Prefix='-',Name='a'),\ - + # --sub Turn on the sub model construction and alignment procedure. '--sub':FlagParameter(Prefix='--',Name='sub'),\ - + # --mxsize Set the maximum allowable DP matrix size to megabytes. '--mxsize':ValuedParameter(Prefix='--',Name='mxsize',Delimiter=' '),\ - + # --mpi Run as an MPI parallel program. '--mpi':FlagParameter(Prefix='--',Name='mpi'),\ - + # Expert Options - + # --emit Generate sequences to score by sampling from the CM. '--emit':FlagParameter(Prefix='--',Name='emit'),\ - + # --random Generate sequences to score by sampling from the CMs null # distribution. This option turns the --emit option off. '--random':FlagParameter(Prefix='--',Name='random'),\ - + # --infile Sequences to score are read from the file . All the # sequences from are read and scored, the -n and -s options are # incompatible with --infile. '--infile':ValuedParameter(Prefix='--',Name='infile',Delimiter=' '),\ - + # --outfile Save generated sequences that are scored to the file in # FASTA format. This option is incompatible with the --infile option. '--outfile':ValuedParameter(Prefix='--',Name='outfile',Delimiter=' '),\ - + # --Lmin Must be used in combination with --random and --Lmax . '--Lmin':ValuedParameter(Prefix='--',Name='Lmin',Delimiter=' '),\ - + # --pad Must be used in combination with --emit and --search. Add cm->W # (max hit length) minus L (sequence length) residues to the 5' and 3' # end of each emitted sequence . '--pad':FlagParameter(Prefix='--',Name='pad'),\ - + # --hbanded Specify that the second stage alignment algorithm be HMM banded # CYK. This option is on by default. '--hbanded':FlagParameter(Prefix='--',Name='hbanded'),\ - + # --tau For stage 2 alignment, set the tail loss probability used during # HMM band calculation to . '--tau':ValuedParameter(Prefix='--',Name='tau',Delimiter=' '),\ - + # --aln2bands With --search, when calculating HMM bands, use an HMM # alignment algorithm instead of an HMM search algorithm. '--aln2bands':FlagParameter(Prefix='--',Name='aln2bands'),\ - + # --hsafe For stage 2 HMM banded alignment, realign any sequences with a # negative alignment score using non-banded CYK to guarantee finding the # optimal alignment. '--hsafe':FlagParameter(Prefix='--',Name='hsafe'),\ - + # --nonbanded Specify that the second stage alignment algorithm be standard, # non-banded, non-D&C CYK. When --nonbanded is enabled, the program fails # with a non-zero exit code and prints an error message if the parsetree @@ -793,39 +793,39 @@ class Cmscore(CommandLineApplication): # larger RNAs (more than 300 residues) if memory is limiting, --nonbanded # should be used in combination with --scoreonly. '--nonbanded':FlagParameter(Prefix='--',Name='nonbanded'),\ - + # --scoreonly With --nonbanded during the second stage standard non-banded # CYK alignment, use the "score only" variant of the algorithm to save # memory, and don't recover a parse tree. '--scoreonly':FlagParameter(Prefix='--',Name='scoreonly'),\ - + # --viterbi Specify that the second stage alignment algorithm be Viterbi to # a CM Plan 9 HMM. '--viterbi':FlagParameter(Prefix='--',Name='viterbi'),\ - + # --search Run all algorithms in scanning mode, not alignment mode. '--search':FlagParameter(Prefix='--',Name='search'),\ - + # --inside With --search Compare the non-banded scanning Inside algorithm to # the HMM banded scanning Inside algorith, instead of using CYK versions. '--inside':FlagParameter(Prefix='--',Name='inside'),\ - + # --forward With --search Compare the scanning Forward scoring algorithm # against CYK. '--forward':FlagParameter(Prefix='--',Name='forward'),\ - + # --taus Specify the first alignment algorithm as non-banded D&C CYK, # and multiple stages of HMM banded CYK alignment. The first HMM banded # alignment will use tau=1E-, which will be the highest value of tau # used. Must be used in combination with --taue. '--taus':ValuedParameter(Prefix='--',Name='taus',Delimiter=' '),\ - + # --taue Specify the first alignment algorithm as non-banded D&C CYK, # and multiple stages of HMM banded CYK alignment. The final HMM banded # alignment will use tau=1E-, which will be the lowest value of tau # used. Must be used in combination with --taus. '--taue':ValuedParameter(Prefix='--',Name='taue',Delimiter=' '),\ - + # --tfile Print the parsetrees for each alignment of each sequence to # file . '--tfile':ValuedParameter(Prefix='--',Name='tfile',Delimiter=' '),\ @@ -835,10 +835,10 @@ class Cmscore(CommandLineApplication): _parameters.update(_options) _command = "cmscore" _suppress_stderr=True - + def getHelp(self): """Method that points to the Infernal documentation.""" - + help_str = \ """ See Infernal documentation at: @@ -849,45 +849,45 @@ def getHelp(self): class Cmsearch(CommandLineApplication): """cmsearch application controller.""" _options = { - + # -o Save the high-scoring alignments of hits to a file . The default # is to write them to standard output. '-o':ValuedParameter(Prefix='-',Name='o',Delimiter=' '),\ - + # -g Turn on the 'glocal' alignment algorithm, local with respect to the # target database, and global with respect to the model. By default, the # local alignment algorithm is used which is local with respect to both # the target sequence and the model. '-g':ValuedParameter(Prefix='-',Name='g',Delimiter=' '),\ - + # -p Append posterior probabilities to alignments of hits. '-p':FlagParameter(Prefix='-',Name='p'),\ - + # -x Annotate non-compensatory basepairs and basepairs that include a gap in # the left and/or right half of the pair with x's in the alignments of # hits. '-x':FlagParameter(Prefix='-',Name='x'),\ - + # -Z Calculate E-values as if the target database size was megabases # (Mb). Ignore the actual size of the database. This option is only valid # if the CM file has been calibrated. Warning: the predictions for timings # and survival fractions will be calculated as if the database was of size # Mb, which means they will be inaccurate. '-Z':ValuedParameter(Prefix='-',Name='Z',Delimiter=' '),\ - + # --toponly Only search the top (Watson) strand of the sequences in seqfile. # By default, both strands are searched. '--toponly':FlagParameter(Prefix='--',Name='toponly'),\ - + # --bottomonly Only search the bottom (Crick) strand of the sequences in # seqfile. By default, both strands are searched. '--bottomonly':FlagParameter(Prefix='--',Name='bottomonly'),\ - + # --forecast Predict the running time of the search with provided files # and options and exit, DO NOT perform the search. This option is only # available with calibrated CM files. '--forecast':ValuedParameter(Prefix='--',Name='forecast',Delimiter=' '),\ - + # --informat Assert that the input seqfile is in format . Do not run # Babelfish format autodection. This increases the reliability of the # program somewhat, because the Babelfish can make mistakes; particularly @@ -895,94 +895,94 @@ class Cmsearch(CommandLineApplication): # case-insensitive. Acceptable formats are: FASTA, EMBL, UNIPROT, GENBANK, # and DDBJ. is case-insensitive. '--informat':ValuedParameter(Prefix='--',Name='informat',Delimiter=' '),\ - + # --mxsize Set the maximum allowable DP matrix size to megabytes. '--mxsize':ValuedParameter(Prefix='--',Name='mxsize',Delimiter=' '),\ - + # --mpi Run as an MPI parallel program. '--mpi':FlagParameter(Prefix='--',Name='mpi'),\ - + # Expert Options - + # --inside Use the Inside algorithm for the final round of searching. This # is true by default. '--inside':FlagParameter(Prefix='--',Name='inside'),\ - + # --cyk Use the CYK algorithm for the final round of searching. '--cyk':FlagParameter(Prefix='--',Name='cyk'),\ - + # --viterbi Search only with an HMM. This is much faster but less sensitive # than a CM search. Use the Viterbi algorithm for the HMM search. '--viterbi':FlagParameter(Prefix='--',Name='viterbi'),\ - + # --forward Search only with an HMM. This is much faster but less sensitive # than a CM search. Use the Forward algorithm for the HMM search. '--forward':FlagParameter(Prefix='--',Name='forward'),\ - + # -E Set the E-value cutoff for the per-sequence/strand ranked hit list # to , where is a positive real number. '-E':ValuedParameter(Prefix='-',Name='E',Delimiter=' '),\ - + # -T Set the bit score cutoff for the per-sequence ranked hit list to # , where is a positive real number. '-T':ValuedParameter(Prefix='-',Name='T',Delimiter=' '),\ - + # --nc Set the bit score cutoff as the NC cutoff value used by Rfam curators # as the noise cutoff score. '--nc':FlagParameter(Prefix='--',Name='nc'),\ - + # --ga Set the bit score cutoff as the GA cutoff value used by Rfam curators # as the gathering threshold. '--ga':FlagParameter(Prefix='--',Name='ga'),\ - + # --tc Set the bit score cutoff as the TC cutoff value used by Rfam curators # as the trusted cutoff. '--tc':FlagParameter(Prefix='--',Name='tc'),\ - + # --no-qdb Do not use query-dependent banding (QDB) for the final round of # search. '--no-qdb':FlagParameter(Prefix='--',Name='no-qdb'),\ - + # --beta " " For query-dependent banding (QDB) during the final round of # search, set the beta parameter to where is any positive real # number less than 1.0. '--beta':ValuedParameter(Prefix='--',Name='beta',Delimiter=' '),\ - + # --hbanded Use HMM bands to accelerate the final round of search. # Constraints for the CM search are derived from posterior probabilities # from an HMM. This is an experimental option and it is not recommended # for use unless you know exactly what you're doing. '--hbanded':FlagParameter(Prefix='--',Name='hbanded'),\ - + # --tau Set the tail loss probability during HMM band calculation to # . '--tau':ValuedParameter(Prefix='--',Name='tau',Delimiter=' '),\ - + # --fil-no-hmm Turn the HMM filter off. '--fil-no-hmm':FlagParameter(Prefix='--',Name='fil-no-hmm'),\ - + # --fil-no-qdb Turn the QDB filter off. '--fil-no-qdb':FlagParameter(Prefix='--',Name='fil-no-qdb'),\ - + # --fil-beta For the QDB filter, set the beta parameter to where is # any positive real number less than 1.0. '--fil-beta':FlagParameter(Prefix='--',Name='fil-beta'),\ - + # --fil-T-qdb Set the bit score cutoff for the QDB filter round to , # where is a positive real number. '--fil-T-qdb':ValuedParameter(Prefix='--',Name='fil-T-qdb',Delimiter=' '),\ - + # --fil-T-hmm Set the bit score cutoff for the HMM filter round to , # where is a positive real number. '--fil-T-hmm':ValuedParameter(Prefix='--',Name='fil-T-hmm',Delimiter=' '),\ - + # --fil-E-qdb Set the E-value cutoff for the QDB filter round. , # where is a positive real number. Hits with E-values better than # (less than) or equal to this threshold will survive and be passed to the # final round. This option is only available if the CM file has been # calibrated. '--fil-E-qdb':ValuedParameter(Prefix='--',Name='fil-E-qdb',Delimiter=' '),\ - + # --fil-E-hmm Set the E-value cutoff for the HMM filter round. , # where is a positive real number. Hits with E-values better than # (less than) or equal to this threshold will survive and be passed to the @@ -990,38 +990,38 @@ class Cmsearch(CommandLineApplication): # to the final round of search. This option is only available if the CM # file has been calibrated. '--fil-E-hmm':ValuedParameter(Prefix='--',Name='fil-E-hmm',Delimiter=' '),\ - + # --fil-Smax-hmm Set the maximum predicted survival fraction for an HMM # filter as , where is a positive real number less than 1.0. '--fil-Smax-hmm':ValuedParameter(Prefix='--',Name='fil-Smax-hmm',\ Delimiter=' '),\ - + # --noalign Do not calculate and print alignments of each hit, only print # locations and scores. '--noalign':FlagParameter(Prefix='--',Name='noalign'),\ - + # --aln-hbanded Use HMM bands to accelerate alignment during the hit # alignment stage. '--aln-hbanded':FlagParameter(Prefix='--',Name='aln-hbanded'),\ - + # --aln-optacc Calculate alignments of hits from final round of search using # the optimal accuracy algorithm which computes the alignment that # maximizes the summed posterior probability of all aligned residues given # the model, which can be different from the highest scoring one. '--aln-optacc':FlagParameter(Prefix='--',Name='aln-optacc'),\ - + # --tabfile Create a new output file and print tabular results to # it. '--tabfile':ValuedParameter(Prefix='--',Name='tabfile',Delimiter=' '),\ - + # --gcfile Create a new output file and print statistics of the GC # content of the sequences in seqfile to it. '--gcfile':ValuedParameter(Prefix='--',Name='gcfile',Delimiter=' '),\ - + # --rna Output the hit alignments as RNA sequences alignments. This is true # by default. '--rna':FlagParameter(Prefix='--',Name='rna'),\ - + # --dna Output the hit alignments as DNA sequence alignments. '--dna':FlagParameter(Prefix='--',Name='dna'),\ @@ -1030,32 +1030,32 @@ class Cmsearch(CommandLineApplication): _parameters.update(_options) _command = "cmsearch" _suppress_stderr=True - + def getHelp(self): """Method that points to the Infernal documentation.""" - + help_str = \ """ See Infernal documentation at: http://infernal.janelia.org/ """ return help_str - + def _tabfile_out_filename(self): - + if self.Parameters['--tabfile'].isOn(): tabfile_filename = self._absolute(str(\ self.Parameters['--tabfile'].Value)) else: raise ValueError, 'No tabfile output file specified.' return tabfile_filename - + def _tempfile_as_multiline_string(self, data): """Write a multiline string to a temp file and return the filename. data: a multiline string to be written to a file. - * Note: the result will be the filename as a FilePath object + * Note: the result will be the filename as a FilePath object (which is a string subclass). """ @@ -1064,132 +1064,132 @@ def _tempfile_as_multiline_string(self, data): data_file.write(data) data_file.close() return filename - + def _get_result_paths(self,data): result = {} if self.Parameters['--tabfile'].isOn(): out_name = self._tabfile_out_filename() result['SearchResults'] = ResultPath(Path=out_name,IsWritten=True) - + return result class Cmstat(CommandLineApplication): """cmstat application controller.""" _options = { - + # -g Turn on the 'glocal' alignment algorithm, local with respect to the # target database, and global with respect to the model. By default, the # model is configured for local alignment which is local with respect to # both the target sequence and the model. '-g':FlagParameter(Prefix='-',Name='g'),\ - + # -m print general statistics on the models in cmfile and the alignment it # was built from. '-m':FlagParameter(Prefix='-',Name='m'),\ - + # -Z Calculate E-values as if the target database size was megabases # (Mb). Ignore the actual size of the database. This option is only valid # if the CM file has been calibrated. '-Z':ValuedParameter(Prefix='-',Name='Z',Delimiter=' '),\ - + # --all print all available statistics '--all':FlagParameter(Prefix='--',Name='all'),\ - + # --le print local E-value statistics. This option only works if cmfile has # been calibrated with cmcalibrate. '--le':FlagParameter(Prefix='--',Name='le'),\ - + # --ge print glocal E-value statistics. This option only works if cmfile has # been calibrated with cmcalibrate. '--ge':FlagParameter(Prefix='--',Name='ge'),\ - + # --beta With the --search option set the beta parameter for the query- # dependent banding algorithm stages to Beta is the probability mass # considered negligible during band calculation. The default is 1E-7. '--beta':ValuedParameter(Prefix='--',Name='beta',Delimiter=' '),\ - + # --qdbfile Save the query-dependent bands (QDBs) for each state to file # '--qdbfile':ValuedParameter(Prefix='--',Name='qdbfile',Delimiter=' '),\ - + # Expert Options - + # --lfi Print the HMM filter thresholds for the range of relevant CM bit # score cutoffs for searches with locally configured models using the # Inside algorithm. '--lfi':FlagParameter(Prefix='--',Name='lfi'),\ - + # --gfi Print the HMM filter thresholds for the range of relevant CM bit # score cutoffs for searches with globally configured models using the # Inside algorithm. '--gfi':FlagParameter(Prefix='--',Name='gfi'),\ - + # --lfc Print the HMM filter thresholds for the range of relevant CM bit # score cutoffs for searches with locally configured models using the CYK # algorithm. '--lfc':FlagParameter(Prefix='--',Name='lfc'),\ - + # --gfc Print the HMM filter thresholds for the range of relevant CM bit # score cutoffs for searches with globally configured models using the CYK # algorithm. '--gfc':FlagParameter(Prefix='--',Name='gfc'),\ - + # -E Print filter threshold statistics for an HMM filter if a final CM # E-value cutoff of were to be used for a run of cmsearch on 1 MB of # sequence. '-E':ValuedParameter(Prefix='-',Name='E',Delimiter=' '),\ - + # -T Print filter threshold statistics for an HMM filter if a final CM # bit score cutoff of were to be used for a run of cmsearch. '-T':ValuedParameter(Prefix='-',Name='T',Delimiter=' '),\ - + # --nc Print filter threshold statistics for an HMM filter if a CM bit score # cutoff equal to the Rfam NC cutoff were to be used for a run of # cmsearch. '--nc':FlagParameter(Prefix='--',Name='nc'),\ - + # --ga Print filter threshold statistics for an HMM filter if a CM bit score # cutoff of Rfam GA cutoff value were to be used for a run of cmsearch. '--ga':FlagParameter(Prefix='--',Name='ga'),\ - + # --tc Print filter threshold statistics for an HMM filter if a CM bit score # cutoff equal to the Rfam TC cutoff value were to be used for a run of # cmsearch. '--tc':FlagParameter(Prefix='--',Name='tc'),\ - + # --seqfile With the -E option, use the database size of the database in # instead of the default database size of 1 MB. '--seqfile':ValuedParameter(Prefix='--',Name='seqfile',Delimiter=' '),\ - + # --toponly In combination with --seqfile option, only consider the top # strand of the database in instead of both strands. --search perform # an experiment to determine how fast the CM(s) can search with different # search algorithms. '--toponly':FlagParameter(Prefix='--',Name='toponly'),\ - + # --cmL With the --search option set the length of sequence to search # with CM algorithms as residues. By default, is 1000. '--cmL':ValuedParameter(Prefix='--',Name='cmL',Delimiter=' '),\ - + # --hmmL With the --search option set the length of sequence to search # with HMM algorithms as residues. By default, is 100,000. '--hmmL':ValuedParameter(Prefix='--',Name='hmmL',Delimiter=' '),\ - + # --efile Save a plot of cmsearch HMM filter E value cutoffs versus CM # E-value cutoffs in xmgrace format to file . This option must be used # in combination with --lfi, --gfi, --lfc or --gfc. '--efile':ValuedParameter(Prefix='--',Name='efile',Delimiter=' '),\ - + # --bfile Save a plot of cmsearch HMM bit score cutoffs versus CM bit # score cutoffs in xmgrace format to file . This option must be used in # combination with --lfi, --gfi, --lfc or --gfc. '--bfile':ValuedParameter(Prefix='--',Name='bfile',Delimiter=' '),\ - + # --sfile Save a plot of cmsearch predicted survival fraction from the # HMM filter versus CM E value cutoff in xmgrace format to file . This # option must be used in combination with --lfi, --gfi, --lfc or --gfc. '--sfile':ValuedParameter(Prefix='--',Name='sfile',Delimiter=' '),\ - + # --xfile Save a plot of 'xhmm' versus CM E value cutoff in xmgrace # format to file 'xhmm' is the ratio of the number of dynamic # programming calculations predicted to be required for the HMM filter and @@ -1197,12 +1197,12 @@ class Cmstat(CommandLineApplication): # programming calculations for the filter alone. This option must be # used in combination with --lfi, --gfi, --lfc or --gfc. '--xfile':ValuedParameter(Prefix='--',Name='xfile',Delimiter=' '),\ - + # --afile Save a plot of the predicted acceleration for an HMM filtered # search versus CM E value cutoff in xmgrace format to file . This # option must be used in combination with --lfi, --gfi, --lfc or --gfc. '--afile':ValuedParameter(Prefix='--',Name='afile',Delimiter=' '),\ - + # --bits With --efile, --sfile, --xfile, and --afile use CM bit score # cutoffs instead of CM E value cutoffs for the x-axis values of the plot. '--bits':FlagParameter(Prefix='--',Name='bits'),\ @@ -1212,10 +1212,10 @@ class Cmstat(CommandLineApplication): _parameters.update(_options) _command = "cmstat" _suppress_stderr=True - + def getHelp(self): """Method that points to the Infernal documentation.""" - + help_str = \ """ See Infernal documentation at: @@ -1226,7 +1226,7 @@ def getHelp(self): def cmbuild_from_alignment(aln, structure_string, refine=False, \ return_alignment=False,params=None): """Uses cmbuild to build a CM file given an alignment and structure string. - + - aln: an Alignment object or something that can be used to construct one. All sequences must be the same length. - structure_string: vienna structure string representing the consensus @@ -1236,7 +1236,7 @@ def cmbuild_from_alignment(aln, structure_string, refine=False, \ (Default=False) - return_alignment: Return (in Stockholm format) alignment file used to construct the CM file. This will either be the original alignment - and structure string passed in, or the refined alignment if --refine + and structure string passed in, or the refined alignment if --refine was used. (Default=False) - Note. This will be a string that can either be written to a file or parsed. @@ -1250,26 +1250,26 @@ def cmbuild_from_alignment(aln, structure_string, refine=False, \ #Make new Cmbuild app instance. app = Cmbuild(InputHandler='_input_as_paths',WorkingDir='/tmp',\ params=params) - + #turn on refine flag if True. if refine: _, tmp_file = mkstemp(dir=app.WorkingDir) app.Parameters['--refine'].on(tmp_file) - + #Get alignment in Stockholm format aln_file_string = stockholm_from_alignment(aln,GC_annotation=struct_dict) - + #get path to alignment filename aln_path = app._input_as_multiline_string(aln_file_string) cm_path = aln_path.split('.txt')[0]+'.cm' app.Parameters['-n'].on(cm_path) - + filepaths = [cm_path,aln_path] - + res = app(filepaths) - + cm_file = res['CmFile'].read() - + if return_alignment: #If alignment was refined, return refined alignment and structure, # otherwise return original alignment and structure. @@ -1286,9 +1286,9 @@ def cmbuild_from_alignment(aln, structure_string, refine=False, \ def cmbuild_from_file(stockholm_file_path, refine=False,return_alignment=False,\ params=None): """Uses cmbuild to build a CM file given a stockholm file. - + - stockholm_file_path: a path to a stockholm file. This file should - contain a multiple sequence alignment formated in Stockholm format. + contain a multiple sequence alignment formated in Stockholm format. This must contain a sequence structure line: #=GC SS_cons - refine: refine the alignment and realign before building the cm. @@ -1302,24 +1302,24 @@ def cmbuild_from_file(stockholm_file_path, refine=False,return_alignment=False,\ info, aln, structure_string = \ list(MinimalRfamParser(open(stockholm_file_path,'U'),\ seq_constructor=ChangedSequence))[0] - + #call cmbuild_from_alignment. res = cmbuild_from_alignment(aln, structure_string, refine=refine, \ return_alignment=return_alignment,params=params) return res -def cmalign_from_alignment(aln, structure_string, seqs, moltype,\ +def cmalign_from_alignment(aln, structure_string, seqs, moltype=DNA,\ include_aln=True,refine=False, return_stdout=False,params=None,\ cmbuild_params=None): """Uses cmbuild to build a CM file, then cmalign to build an alignment. - + - aln: an Alignment object or something that can be used to construct one. All sequences must be the same length. - structure_string: vienna structure string representing the consensus stucture for the sequences in aln. Must be the same length as the alignment. - seqs: SequenceCollection object or something that can be used to - construct one, containing unaligned sequences that are to be aligned + construct one, containing unaligned sequences that are to be aligned to the aligned sequences in aln. - moltype: Cogent moltype object. Must be RNA or DNA. - include_aln: Boolean to include sequences in aln in final alignment. @@ -1336,65 +1336,65 @@ def cmalign_from_alignment(aln, structure_string, seqs, moltype,\ int_map, int_keys = seqs.getIntMap() #Create SequenceCollection from int_map. int_map = SequenceCollection(int_map,MolType=moltype) - + cm_file, aln_file_string = cmbuild_from_alignment(aln, structure_string,\ refine=refine,return_alignment=True,params=cmbuild_params) - + if params is None: - params = {} + params = {} params.update({MOLTYPE_MAP[moltype]:True}) - + app = Cmalign(InputHandler='_input_as_paths',WorkingDir='/tmp',\ params=params) app.Parameters['--informat'].on('FASTA') - + #files to remove that aren't cleaned up by ResultPath object - to_remove = [] + to_remove = [] #turn on --withali flag if True. if include_aln: app.Parameters['--withali'].on(\ app._tempfile_as_multiline_string(aln_file_string)) #remove this file at end to_remove.append(app.Parameters['--withali'].Value) - + seqs_path = app._input_as_multiline_string(int_map.toFasta()) cm_path = app._tempfile_as_multiline_string(cm_file) - + #add cm_path to to_remove to_remove.append(cm_path) paths = [cm_path,seqs_path] _, tmp_file = mkstemp(dir=app.WorkingDir) app.Parameters['-o'].on(tmp_file) - + res = app(paths) - + info, aligned, struct_string = \ list(MinimalRfamParser(res['Alignment'].readlines(),\ seq_constructor=SEQ_CONSTRUCTOR_MAP[moltype]))[0] - + #Make new dict mapping original IDs new_alignment={} for k,v in aligned.NamedSeqs.items(): new_alignment[int_keys.get(k,k)]=v #Create an Alignment object from alignment dict new_alignment = Alignment(new_alignment,MolType=moltype) - + std_out = res['StdOut'].read() #clean up files res.cleanUp() for f in to_remove: remove(f) - + if return_stdout: return new_alignment, struct_string, std_out else: return new_alignment, struct_string - -def cmalign_from_file(cm_file_path, seqs, moltype, alignment_file_path=None,\ + +def cmalign_from_file(cm_file_path, seqs, moltype=DNA, alignment_file_path=None,\ include_aln=False,return_stdout=False,params=None): """Uses cmalign to align seqs to alignment in cm_file_path. - + - cm_file_path: path to the file created by cmbuild, containing aligned sequences. This will be used to align sequences in seqs. - seqs: unaligned sequendes that are to be aligned to the sequences in @@ -1414,39 +1414,39 @@ def cmalign_from_file(cm_file_path, seqs, moltype, alignment_file_path=None,\ """ #NOTE: Must degap seqs or Infernal well seg fault! seqs = SequenceCollection(seqs,MolType=moltype).degap() - + #Create mapping between abbreviated IDs and full IDs int_map, int_keys = seqs.getIntMap() #Create SequenceCollection from int_map. int_map = SequenceCollection(int_map,MolType=moltype) - + if params is None: params = {} params.update({MOLTYPE_MAP[moltype]:True}) - + app = Cmalign(InputHandler='_input_as_paths',WorkingDir='/tmp',\ params=params) app.Parameters['--informat'].on('FASTA') - + #turn on --withali flag if True. if include_aln: if alignment_file_path is None: raise DataError, """Must have path to alignment file used to build CM if include_aln=True.""" else: app.Parameters['--withali'].on(alignment_file_path) - + seqs_path = app._input_as_multiline_string(int_map.toFasta()) paths = [cm_file_path,seqs_path] - + _, tmp_file = mkstemp(dir=app.WorkingDir) app.Parameters['-o'].on(tmp_file) res = app(paths) - + info, aligned, struct_string = \ list(MinimalRfamParser(res['Alignment'].readlines(),\ seq_constructor=SEQ_CONSTRUCTOR_MAP[moltype]))[0] - - + + #Make new dict mapping original IDs new_alignment={} for k,v in aligned.items(): @@ -1459,11 +1459,11 @@ def cmalign_from_file(cm_file_path, seqs, moltype, alignment_file_path=None,\ return new_alignment, struct_string, std_out else: return new_alignment, struct_string - + def cmsearch_from_alignment(aln, structure_string, seqs, moltype, cutoff=0.0,\ refine=False,params=None): """Uses cmbuild to build a CM file, then cmsearch to find homologs. - + - aln: an Alignment object or something that can be used to construct one. All sequences must be the same length. - structure_string: vienna structure string representing the consensus @@ -1486,40 +1486,40 @@ def cmsearch_from_alignment(aln, structure_string, seqs, moltype, cutoff=0.0,\ int_map, int_keys = seqs.getIntMap() #Create SequenceCollection from int_map. int_map = SequenceCollection(int_map,MolType=moltype) - + cm_file, aln_file_string = cmbuild_from_alignment(aln, structure_string,\ refine=refine,return_alignment=True) - + app = Cmsearch(InputHandler='_input_as_paths',WorkingDir='/tmp',\ params=params) app.Parameters['--informat'].on('FASTA') app.Parameters['-T'].on(cutoff) - + to_remove = [] - + seqs_path = app._input_as_multiline_string(int_map.toFasta()) cm_path = app._tempfile_as_multiline_string(cm_file) paths = [cm_path,seqs_path] to_remove.append(cm_path) - + _, tmp_file = mkstemp(dir=app.WorkingDir) app.Parameters['--tabfile'].on(tmp_file) res = app(paths) - + search_results = list(CmsearchParser(res['SearchResults'].readlines())) if search_results: for i,line in enumerate(search_results): label = line[1] search_results[i][1]=int_keys.get(label,label) - + res.cleanUp() for f in to_remove:remove(f) - + return search_results def cmsearch_from_file(cm_file_path, seqs, moltype, cutoff=0.0, params=None): """Uses cmbuild to build a CM file, then cmsearch to find homologs. - + - cm_file_path: path to the file created by cmbuild, containing aligned sequences. This will be used to search sequences in seqs. - seqs: SequenceCollection object or something that can be used to @@ -1537,28 +1537,27 @@ def cmsearch_from_file(cm_file_path, seqs, moltype, cutoff=0.0, params=None): int_map, int_keys = seqs.getIntMap() #Create SequenceCollection from int_map. int_map = SequenceCollection(int_map,MolType=moltype) - + app = Cmsearch(InputHandler='_input_as_paths',WorkingDir='/tmp',\ params=params) app.Parameters['--informat'].on('FASTA') app.Parameters['-T'].on(cutoff) - + seqs_path = app._input_as_multiline_string(int_map.toFasta()) paths = [cm_file_path,seqs_path] - + _, tmp_file = mkstemp(dir=app.WorkingDir) app.Parameters['--tabfile'].on(tmp_file) res = app(paths) - + search_results = list(CmsearchParser(res['SearchResults'].readlines())) - - if search_results: + + if search_results: for i,line in enumerate(search_results): label = line[1] search_results[i][1]=int_keys.get(label,label) - + res.cleanUp() return search_results -