From 56e1ad1054f6c8fb05cee80cde0be241bca906ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Allio?= Date: Wed, 22 Jan 2020 14:51:43 +0100 Subject: [PATCH] Update --- FirstBuildChecker.pyc | Bin 11615 -> 11594 bytes README | 154 --------------- README.md | 168 ++++++++-------- circularizationCheck.pyc | Bin 4015 -> 4001 bytes genbankOutput.py | 86 ++++---- genbankOutput.pyc | Bin 6254 -> 6488 bytes geneChecker.pyc | Bin 16159 -> 16103 bytes geneChecker_fasta.py | 19 +- mitofinder | 160 ++++++++------- rename_fasta_seqID.py | 23 +++ runIDBA.pyc | Bin 5303 -> 5149 bytes runMegahit.pyc | Bin 4756 -> 4635 bytes runMetaspades.pyc | Bin 4614 -> 4501 bytes sort_gff.py | 409 ++++++++++++++++++++++++++++++++++++++- tRNAscanChecker.pyc | Bin 9776 -> 9664 bytes 15 files changed, 662 insertions(+), 357 deletions(-) delete mode 100644 README create mode 100755 rename_fasta_seqID.py diff --git a/FirstBuildChecker.pyc b/FirstBuildChecker.pyc index f5f2a2036198f3ccaf3689394f9f33536d37ab88..4cb5808a10d40bbfbb832205a46f49a7f9927627 100755 GIT binary patch delta 49 wcmcZ~bt;O3`7;uD6>T?feFH#ETZcT0O73>p#T5? delta 81 zcmX>Vbw7%O`7H8#&esF*;86)6!s%FEi9L&@ linux - -$ cd mitofinder_vX/ -$ p=$(pwd) -$ echo -e "\n#Path to mitofinder \nexport PATH=$PATH:$p" >> ~/.bashrc -$ source ~/.bashrc - -# How to use MitoFinder -TIP: use mitofinder --example to print usual examples of use - -First, you can choose the assembler using the following options: --- megahit (default: faster) --- metaspades (recommended: a bit slower but more efficient (see associated paper). WARNING: Not compatible with single-end reads) --- idba - -### For mitochondrial genome assembly - -## Trimmed paired-end reads -$ mitofinder -j [jobname] -1 [left_reads.fastq.gz] -2 [right_reads.fastq.gz] -r [genbank_reference.gb] -o [genetic_code] -p [threads] -m [memory] - -## Trimmed single-end reads -$ mitofinder -j [jobname] -s [SE_reads.fastq.gz] -r [genbank_reference.gb] -o [genetic_code] -p [threads] -m [memory] - -## MitoFinder can be used with your own assembly (one or several contig.s in fasta format) -$ mitofinder -j [jobname] -a [assembly.fasta] -r [genbank_reference.gb] -o [genetic_code] -p [threads] -m [memory] - -### Restart -Use the same command line. -WARNING: If you want to make the assembly again (for example because it failed) you have to remove the result assembly directory. If not, MitoFinder will skip the assembly step. - -# OUTPUT - -## Result folder - -Mitofinder returns several files for each mitochondrial contig found: -- [job_name]_partial_mito_1.fasta containing a mitochondrial contig -- [job_name]_partial_mito_1.gff containing the final annotation for a given contig -- [job_name]_partial_mito_1.arwen containing the result of the tRNA annotation returned by the Arwen software. -- [job_name]_partial_mito_1.gb containing the final annotation for a given contig (option --out_gb) -- [job_name]_final_genes.fasta containing the final genes selected from all contigs by MitoFinder - - -## UCE annotation -MitoFinder starts by assembling both mitochondrial and nuclear reads. It is only in a second step that mitochondrial contigs are identified and extracted. -MitoFinder thus provides UCE contigs already assembled and the annotation can be done from the following file: -- [job_name]link.scafSeq containing all assembled contigs from raw reads. - -To do so, we recommend the PHYLUCE pipeline, which is specifically designed to annotate ultraconserved elements (Faircloth 2015; Tutorial: https://phyluce.readthedocs.io/en/latest/tutorial-one.html#finding-uce-loci). -You can thus use the file [job_name]link.scafSeq and start the pipeline at the "Finding UCE" step. - -# Help -usage: mitofinder [-h] [--megahit] [--idba] [--metaspades] [-j PROCESSNAME] - [-1 PE1] [-2 PE2] [-s SE] [-a ASSEMBLY] [-m MEM] - [-l SHORTESTCONTIG] [-p PROCESSORSTOUSE] [-r REFSEQFILE] - [-e BLASTEVAL] [--out_gb] - [--blastidentitynucl BLASTIDENTITYNUCL] - [--blastidentityprot BLASTIDENTITYPROT] - [--blastsize ALIGNCUTOFF] [--circularsize CIRCULARSIZE] - [--circularoffset CIRCULAROFFSET] [-cove COVECUTOFF] - [-o ORGANISMTYPE] [-v] [--example] - -Mitofinder is a pipeline to assemble and annotate mitochondrial DNA from -trimmed sequencing reads. - -optional arguments: - -h, --help show this help message and exit - --megahit Use Megahit for assembly. (Default) - --idba Use IDBA-UD for assembly. - --metaspades Use MetaSPAdes for assembly. - -j PROCESSNAME, --jobname PROCESSNAME - Job name to be used throughout the process - -1 PE1, --Paired-end1 PE1 - File with forward paired-end reads - -2 PE2, --Paired-end2 PE2 - File with reverse paired-end reads - -s SE, --Single-end SE - File with single-end reads - -a ASSEMBLY, --assembly ASSEMBLY - File with your own assembly - -m MEM, --max-memory MEM - max memory to use in Go (Megahit or MetaSPAdes) - -l SHORTESTCONTIG, --length SHORTESTCONTIG - Shortest contig length to be used in scaffolding. - Default = 100 - -p PROCESSORSTOUSE, --processors PROCESSORSTOUSE - Number of threads Mitofinder will use at most. - -r REFSEQFILE, --refseq REFSEQFILE - Reference mitochondrial genome in GenBank format - (.gb). - -e BLASTEVAL, --blaste BLASTEVAL - e-value of blast program used for contig - identification and annotation. Default = 0.000001 - --out_gb Create annotation output file in GenBank format. - --blastidentitynucl BLASTIDENTITYNUCL - Nucleotide identity percentage for a hit to be - retained. Default = 50 - --blastidentityprot BLASTIDENTITYPROT - Amino acid identity percentage for a hit to be - retained. Default = 40 - --blastsize ALIGNCUTOFF - Percentage of overlap in blast best hit to be - retained. Default = 30 - --circularsize CIRCULARSIZE - Size to consider when checking for circularization. - Default = 45 - --circularoffset CIRCULAROFFSET - Offset from start and finish to consider when looking - for circularization. Default = 200 - -cove COVECUTOFF, --covecutoff COVECUTOFF - Cove cutoff for tRNAscan-SE. Default = 7 - -o ORGANISMTYPE, --organism ORGANISMTYPE - Organism genetic code following NCBI table (integer): - 1. The Standard Code 2. The Vertebrate Mitochondrial - Code 3. The Yeast Mitochondrial Code 4. The Mold, - Protozoan, and Coelenterate Mitochondrial Code and the - Mycoplasma/Spiroplasma Code 5. The Invertebrate - Mitochondrial Code 6. The Ciliate, Dasycladacean and - Hexamita Nuclear Code 9. The Echinoderm and Flatworm - Mitochondrial Code 10. The Euplotid Nuclear Code 11. - The Bacterial, Archaeal and Plant Plastid Code 12. The - Alternative Yeast Nuclear Code 13. The Ascidian - Mitochondrial Code 14. The Alternative Flatworm - Mitochondrial Code 16. Chlorophycean Mitochondrial - Code 21. Trematode Mitochondrial Code 22. Scenedesmus - obliquus Mitochondrial Code 23. Thraustochytrium - Mitochondrial Code 24. Pterobranchia Mitochondrial - Code 25. Candidate Division SR1 and Gracilibacteria - Code - -v, --version Version 1.0.2 - --example Print getting started examples - -# To cite Mitofinder - -Allio R., Schomaker-Bastos A., Romiguier J., Prosdocimi F., Nabholz B. & Delsuc F. (2019). Efficient automated extraction of mitogenomic data in target enrichment phylogenomics with MitoFinder. Molecular Ecology Resources (submitted). diff --git a/README.md b/README.md index e8a50e6..c645828 100755 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# MitoFinder version 1.0.2 06/01/2019 +# MitoFinder version 1.1 Allio, R., Schomaker-Bastos, A., Romiguier, J., Prosdocimi, F., Nabholz, B., & Delsuc, F.

@@ -16,14 +16,12 @@ This software is suitable for all linux-like systems with gcc installed (Unfortu 1. [Installation guide for MitoFinder](#installation-guide-for-mitofinder) 2. [How to use MitoFinder](#how-to-use-mitofinder) -3. [INPUTS](#inputs) -4. [OUTPUTS](#outputs) -5. [Test case](#test-case) -6. [Restart](#restart) -7. [UCE annotation](#uce-annotation) -8. [Associated publication](#associated-publication) -9. [How to get reference mitochondrial genomes from ncbi](#how-to-get-reference-mitochondrial-genomes-from-ncbi) -10. [Detailed options](#detailed-options) +3. [Detailed options](#detailed-options) +4. [INPUTS](#inputs) +5. [OUTPUTS](#outputs) +6. [UCE annotation](#uce-annotation) +7. [Associated publication](#associated-publication) +8. [How to get reference mitochondrial genomes from ncbi](#how-to-get-reference-mitochondrial-genomes-from-ncbi) # Installation guide for MitoFinder @@ -71,7 +69,7 @@ First, you can choose the assembler using the following options: ## Mitochondrial genome assembly -**TIP**: use mitofinder --example to print usual examples of use +TIP: use mitofinder --example to print usual examples of use ### Trimmed paired-end reads ```shell @@ -88,74 +86,17 @@ mitofinder -j [jobname] -s [SE_reads.fastq.gz] -r [genbank_reference.gb] -o [gen mitofinder -j [jobname] -a [assembly.fasta] -r [genbank_reference.gb] -o [genetic_code] -p [threads] -m [memory] ``` -# INPUTS - -Mitofinder needs several files to run depending on the method you have choosen (see above): -- [x] **Reference_file.gb** containing at least one mitochondrial genome of reference extracted from [NCBI](https://www.ncbi.nlm.nih.gov/) -- [ ] **left_reads.fastq.gz** containing the left reads of paired-end sequencing -- [ ] **right_reads.fastq.gz** containing the right reads of paired-end sequencing -- [ ] **SE_reads.fastq.gz** containing the reads of single-end sequencing -- [ ] **assembly.fasta** containing the assembly on which MitoFinder have to find and annotate mitochondrial contig.s - -**TIP 1**: If you hesitate between two or more mitogenomes to use as references, you can put them together in the same reference file. MitoFinder will use all of them to find mitochondrial signal and will choose the best matching one for each gene during the annotation step. -**TIP 2**: If you want to assemble several mitogenomes for which you have different references, you can group them together in the same reference file. This allows to use MitoFinder in a loop and does not affect the result because MitoFinder will select the best matching reference for the annotation step. -**TIP 3**: If you have a large number of reads, from a whole genome sequencing for example, you can pre-filter mitochondrial reads using [mirabait](http://manpages.ubuntu.com/manpages/xenial/man1/mirabait.1.html). This will considerably reduce the computation time. However, this can be done only if you have a reference mitogenome for a closely related species because MIRA uses mapping to select reads and this method is rather stringent. If you don't have the mitogenome of a closely related species you can just use a random subset of your read data (~ 15/20 Millions of reads) to reduce computation time and assemble the mitogenome. Of course, in doing so, you could lose some of the information if the mitogenome coverage is unsufficient. - -# OUTPUTS -### Result folder - -Mitofinder returns several files for each mitochondrial contig found: -- [x] **[job_name]_partial_mito_1.fasta** containing a mitochondrial contig -- [x] **[job_name]_partial_mito_1_genes.fasta** containing the annoted genes for a given contig -- [x] **[job_name]_partial_mito_1.gff** containing the final annotation for a given contig -- [x] **[job_name]_partial_mito_1.arwen** containing the result of the tRNA annotation returned by the Arwen software -- [x] **[job_name]_partial_mito_1.gb** containing the final annotation for a given contig (option --out_gb) -- [x] **[job_name]_final_genes.fasta** containing the final genes selected from all contigs by MitoFinder - -# Test case -A test case is available in the testcase directory. - -To run it: -```shell -cd /PATH/TO/MITOFINDER/testcase/ - -mitofinder -j test -s test.fastq -r test_reference.gb -o 5 -p 5 -m 5 -``` - -# Restart +### Restart Use the same command line. WARNING: If you want to make the assembly again (for example because it failed) you have to remove the result assembly directory. If not, MitoFinder will skip the assembly step. -# UCE annotation -MitoFinder starts by assembling both mitochondrial and nuclear reads. It is only in a second step that mitochondrial contigs are identified and extracted. -MitoFinder thus provides UCE contigs already assembled and the annotation can be done from the following file: -- **[job_name]link.scafSeq** containing all assembled contigs from raw reads. - -To do so, we recommend the PHYLUCE pipeline, which is specifically designed to annotate ultraconserved elements (Faircloth 2015; Tutorial: https://phyluce.readthedocs.io/en/latest/tutorial-one.html#finding-uce-loci). -You can thus use the file **[job_name]link.scafSeq** and start the pipeline at the **"Finding UCE"** step. - -# Associated publication - -Allio, R., Schomaker-Bastos, A., Romiguier, J., Prosdocimi, F., Nabholz, B., & Delsuc, F. (2019). MitoFinder: efficient automated large-scale extraction of mitogenomic data in target enrichment phylogenomics. BioRxiv, 685412. https://doi.org/10.1101/685412 - -# HOW TO GET REFERENCE MITOCHONDRIAL GENOMES FROM NCBI - -1. Go to [NCBI](https://www.ncbi.nlm.nih.gov/) -2. Select "Nucleotide" in the search bar -3. Search for mitochondrion genomes: -- [x] RefSeq (if available) -- [x] Sequence length from 12000 to 20000 -4. Download complete record in GenBank format - -![](/image/NCBI.png) - # Detailed options ```shell usage: mitofinder [-h] [--megahit] [--idba] [--metaspades] [-j PROCESSNAME] [-1 PE1] [-2 PE2] [-s SE] [-a ASSEMBLY] [-m MEM] [-l SHORTESTCONTIG] [-p PROCESSORSTOUSE] [-r REFSEQFILE] - [-e BLASTEVAL] [--out_gb] + [-e BLASTEVAL] [-n NWALK] [--out_gb] [--blastidentitynucl BLASTIDENTITYNUCL] [--blastidentityprot BLASTIDENTITYPROT] [--blastsize ALIGNCUTOFF] [--circularsize CIRCULARSIZE] @@ -163,7 +104,7 @@ usage: mitofinder [-h] [--megahit] [--idba] [--metaspades] [-j PROCESSNAME] [-o ORGANISMTYPE] [-v] [--example] Mitofinder is a pipeline to assemble and annotate mitochondrial DNA from -cleaned sequencing datas. +trimmed sequencing reads. optional arguments: -h, --help show this help message and exit @@ -171,7 +112,7 @@ optional arguments: --idba Use IDBA-UD for assembly. --metaspades Use MetaSPAdes for assembly. -j PROCESSNAME, --jobname PROCESSNAME - Job name to be used throughout the project + Sequence ID to be used throughout the process -1 PE1, --Paired-end1 PE1 File with forward paired-end reads -2 PE2, --Paired-end2 PE2 @@ -181,29 +122,33 @@ optional arguments: -a ASSEMBLY, --assembly ASSEMBLY File with your own assembly -m MEM, --max-memory MEM - max memory to use in Go (megahit or metaspades) + max memory to use in Go (Megahit or MetaSPAdes) -l SHORTESTCONTIG, --length SHORTESTCONTIG Shortest contig length to be used in scaffolding. Default = 100 -p PROCESSORSTOUSE, --processors PROCESSORSTOUSE Number of threads Mitofinder will use at most. -r REFSEQFILE, --refseq REFSEQFILE - Reference mitochondrial genome.s in GenBank format - (.gb). You can put several reference mitgenomes - together. + Reference mitochondrial genome in GenBank format + (.gb). -e BLASTEVAL, --blaste BLASTEVAL - e-Value for blast program for contig identification - and annotation. Default = 0.000001 - --out_gb Create annotation output with genbank format. + e-value of blast program used for contig + identification and annotation. Default = 0.000001 + -n NWALK, --nwalk NWALK + Maximum number of codon steps to be tested on each + size of the gene to find the start and stop codon + during the annotation step. Default = 20 (60 bases) + --out_gb Do not create annotation output file in GenBank + format. --blastidentitynucl BLASTIDENTITYNUCL - Nucleotide percentage of identity or a hit to be - considered good. Default = 50 + Nucleotide identity percentage for a hit to be + retained. Default = 50 --blastidentityprot BLASTIDENTITYPROT - Amino acid percentage of identity or a hit to be - considered good. Default = 40 + Amino acid identity percentage for a hit to be + retained. Default = 40 --blastsize ALIGNCUTOFF - Percentage of covered span in blast best hit to be - considered good. Default = 30 + Percentage of overlap in blast best hit to be + retained. Default = 30 --circularsize CIRCULARSIZE Size to consider when checking for circularization. Default = 45 @@ -213,8 +158,7 @@ optional arguments: -cove COVECUTOFF, --covecutoff COVECUTOFF Cove cutoff for tRNAscan-SE. Default = 7 -o ORGANISMTYPE, --organism ORGANISMTYPE - What should the genome checking and annotation - consider as genetic code type. NCBI table (integer): + Organism genetic code following NCBI table (integer): 1. The Standard Code 2. The Vertebrate Mitochondrial Code 3. The Yeast Mitochondrial Code 4. The Mold, Protozoan, and Coelenterate Mitochondrial Code and the @@ -231,9 +175,57 @@ optional arguments: Mitochondrial Code 24. Pterobranchia Mitochondrial Code 25. Candidate Division SR1 and Gracilibacteria Code - -v, --version Version 1.0.2 - --example Print usual examples of use + -v, --version Version 1.1 + --example Print getting started examples ``` +# INPUTS + +Mitofinder needs several files to run depending on the method you have choosen (see above): +- [x] **Reference_file.gb** containing at least one mitochondrial genome of reference extracted from [NCBI](https://www.ncbi.nlm.nih.gov/) +- [ ] **left_reads.fastq.gz** containing the left reads of paired-end sequencing +- [ ] **right_reads.fastq.gz** containing the right reads of paired-end sequencing +- [ ] **SE_reads.fastq.gz** containing the reads of single-end sequencing +- [ ] **assembly.fasta** containing the assembly on which MitoFinder have to find and annotate mitochondrial contig.s + +# OUTPUTS +### Result folder + +Mitofinder returns several files for each mitochondrial contig found: +- [x] **[job_name]_final_genes.fasta** containing the final genes selected from all contigs by MitoFinder +- [x] **[job_name]_mtDNA_contig.fasta** containing a mitochondrial contig +- [x] **[job_name]_mtDNA_contig_final.gff** containing the final annotation for a given contig +- [x] **[job_name]_mtDNA_contig_final.tbl** containing the final annotation for a given contig (Genbank submission format) +- [x] **[job_name]_mtDNA_contig.gb** containing the final annotation for a given contig +- [x] **[job_name]_mtDNA_contig_genes.fasta** containing the annoted genes for a given contig +- [x] **[job_name]_mtDNA_contig.png** schematic representation of the annotation of the mtDNA contig + + +# UCE annotation +MitoFinder starts by assembling both mitochondrial and nuclear reads. It is only in a second step that mitochondrial contigs are identified and extracted. +MitoFinder thus provides UCE contigs already assembled and the annotation can be done from the following file: +- **[job_name]link.scafSeq** containing all assembled contigs from raw reads. + +To do so, we recommend the PHYLUCE pipeline, which is specifically designed to annotate ultraconserved elements (Faircloth 2015; Tutorial: https://phyluce.readthedocs.io/en/latest/tutorial-one.html#finding-uce-loci). +You can thus use the file **[job_name]link.scafSeq** and start the pipeline at the **"Finding UCE"** step. + +# Associated publication + +Allio, R., Schomaker-Bastos, A., Romiguier, J., Prosdocimi, F., Nabholz, B., & Delsuc, F. (2019). MitoFinder: efficient automated large-scale extraction of mitogenomic data in target enrichment phylogenomics. BioRxiv, 685412. https://doi.org/10.1101/685412 + + +# HOW TO GET REFERENCE MITOCHONDRIAL GENOMES FROM NCBI + +1. Go to [NCBI](https://www.ncbi.nlm.nih.gov/) +2. Select "Nucleotide" in the search bar +3. Search for mitochondrion genomes: +- [x] RefSeq (if available) +- [x] Sequence length from 12000 to 20000 +4. Download complete record in GenBank format + +![](/image/NCBI.png) + + + diff --git a/circularizationCheck.pyc b/circularizationCheck.pyc index ad7fa99a4a8a68f640c2e97b0f84329d6b10e646..9658cec88c65ff7898e62152ada729fdcda01561 100755 GIT binary patch delta 42 ucmZ24zfhio`7H8#$))FuF`$z@xz)UuLLhpl3AMn_qJCLmo#a4B;>UE`1T4 diff --git a/genbankOutput.py b/genbankOutput.py index 2b18007..47f8a40 100755 --- a/genbankOutput.py +++ b/genbankOutput.py @@ -31,7 +31,7 @@ from Bio.Data import CodonTable from decimal import Decimal -def genbankOutput(resultGbFile, resultFile, listOfFeaturesToOutput, buildCloroplast = False, dLoopSize = 800): +def genbankOutput(resultGbFile, resultFile, listOfFeaturesToOutput, buildCloroplast = False, dLoopSize = 800, nWalk = 20): ''' Creates a genbank file based on a fasta file given (resultfile) and a list of features that the genbank file should present (listoffeaturestooutput) @@ -68,21 +68,24 @@ def genbankOutput(resultGbFile, resultFile, listOfFeaturesToOutput, buildCloropl main_start_pos = SeqFeature.ExactPosition(thisFeatureAlignment.startBase) main_end_pos = SeqFeature.ExactPosition(thisFeatureAlignment.endBase) - + if main_feature_type == "gene": - codonDiff = ((main_end_pos - (main_start_pos + 1)) % 3) + codonDiff = ((main_end_pos - main_start_pos+1) % 3) if codonDiff == 2: main_end_pos += 1 elif codonDiff == 1: main_end_pos -= 1 + #print main_start_pos + #print main_end_pos + #print thisFeatureAlignment.frame + # 2. Use the locations do define a FeatureLocation if thisFeatureAlignment.frame < 0: strandToOutput = -1 else: strandToOutput = 1 - main_feature_location = SeqFeature.FeatureLocation(main_start_pos,main_end_pos,strand=strandToOutput) - + main_feature_location = SeqFeature.FeatureLocation(main_start_pos-1,main_end_pos,strand=strandToOutput) # 3. Create a SeqFeature main_feature = SeqFeature.SeqFeature(main_feature_location,type=main_feature_type, qualifiers=main_feature_qualifiers) ''' @@ -103,11 +106,10 @@ def genbankOutput(resultGbFile, resultFile, listOfFeaturesToOutput, buildCloropl lastFeatureAlignment = thisFeatureAlignment ''' - # 4. Append your newly created SeqFeature to your SeqRecord if main_feature_type == "gene": cds_qualifiers = dict(main_feature_qualifiers) - coding_dna = Seq(str(finalResults.seq[thisFeatureAlignment.startBase:thisFeatureAlignment.endBase]), IUPAC.unambiguous_dna) + coding_dna = Seq(str(finalResults.seq[thisFeatureAlignment.startBase-1:thisFeatureAlignment.endBase]), IUPAC.unambiguous_dna) if strandToOutput == -1: coding_dna = coding_dna.reverse_complement() translationTable = thisFeatureAlignment.translationTable @@ -120,8 +122,8 @@ def genbankOutput(resultGbFile, resultFile, listOfFeaturesToOutput, buildCloropl listOfStartCodons.append(startCodonTranslation) startCodons = tuple(listOfStartCodons) #need to make it a tuple so that startswith works with it stopCodons = ('*','$','#','+') - nWalkStart = 20 - nWalkStop = 20 + nWalkStart = nWalk + nWalkStop = nWalk ''' For genes in the -1 strand, we look for the stop codons at the start and the start codons at the end! ''' @@ -130,8 +132,8 @@ def genbankOutput(resultGbFile, resultFile, listOfFeaturesToOutput, buildCloropl tempStopCodons = stopCodons startCodons = tempStopCodons stopCodons = tempStartCodons - nWalkStart = 20 - nWalkStop = 20 + nWalkStart = nWalk + nWalkStop = nWalk try: ''' @@ -153,33 +155,45 @@ def genbankOutput(resultGbFile, resultFile, listOfFeaturesToOutput, buildCloropl and not coding_dna_TranslationBackward.startswith(startCodons) and n < nWalkStart and startBase - (3*(n+1)) >= 0: try: n += 1 - coding_dna_Backward = Seq(str(finalResults.seq[startBase - (3*n):endBase]), IUPAC.unambiguous_dna) + coding_dna_Backward = Seq(str(finalResults.seq[startBase - (3*n) - 1:endBase]), IUPAC.unambiguous_dna) if strandToOutput == -1: coding_dna_Backward = coding_dna_Backward.reverse_complement() coding_dna_TranslationBackward = coding_dna_Backward.translate(table=translationTable) - coding_dna_Forward = Seq(str(finalResults.seq[startBase + (3*n):endBase]), IUPAC.unambiguous_dna) + coding_dna_Forward = Seq(str(finalResults.seq[startBase -1 - (3*n):endBase]), IUPAC.unambiguous_dna) if strandToOutput == -1: + coding_dna_Forward = Seq(str(finalResults.seq[startBase -1 + (3*n):endBase]), IUPAC.unambiguous_dna) coding_dna_Forward = coding_dna_Forward.reverse_complement() coding_dna_TranslationForward = coding_dna_Forward.translate(table=translationTable) if strandToOutput == -1: coding_dna_TranslationBackward = coding_dna_TranslationBackward[::-1] coding_dna_TranslationForward = coding_dna_TranslationForward[::-1] + except: pass else: - if coding_dna_TranslationBackward.startswith(startCodons): + if coding_dna_TranslationForward.startswith(startCodons): + main_start_pos = SeqFeature.ExactPosition(startBase - (3*n)) + if strandToOutput == -1: + SeqFeature.ExactPosition(startBase + (3*n)) + #startBase += (3*n) #backup + #thisFeatureAlignment.startBase = startBase #backup + thisFeatureAlignment.startBase = main_start_pos #update + main_feature_location = SeqFeature.FeatureLocation(main_start_pos-1,main_end_pos,strand=strandToOutput) + elif coding_dna_TranslationBackward.startswith(startCodons): main_start_pos = SeqFeature.ExactPosition(startBase - (3*n)) - startBase += (3*n) - thisFeatureAlignment.startBase = startBase - main_feature_location = SeqFeature.FeatureLocation(main_start_pos,main_end_pos,strand=strandToOutput) - elif coding_dna_TranslationForward.startswith(startCodons): - main_start_pos = SeqFeature.ExactPosition(startBase + (3*n)) - startBase += (3*n) - thisFeatureAlignment.startBase = startBase - main_feature_location = SeqFeature.FeatureLocation(main_start_pos,main_end_pos,strand=strandToOutput) + #startBase += (3*n) #backup + #thisFeatureAlignment.startBase = startBase #backup + thisFeatureAlignment.startBase = main_start_pos #update + main_feature_location = SeqFeature.FeatureLocation(main_start_pos-1,main_end_pos,strand=strandToOutput) except: pass - + + ''' + Updating coding_dna with (new) coordinates + ''' + coding_dna = Seq(str(finalResults.seq[thisFeatureAlignment.startBase-1:thisFeatureAlignment.endBase]), IUPAC.unambiguous_dna) + if strandToOutput == -1: + coding_dna = coding_dna.reverse_complement() ''' Making sure it ends with * (stop codon) ''' @@ -199,16 +213,15 @@ def genbankOutput(resultGbFile, resultFile, listOfFeaturesToOutput, buildCloropl and not coding_dna_TranslationBackward.endswith(stopCodons) and n < nWalkStop and endBase + (3*(n+1)) <= len(finalResults): try: n += 1 - coding_dna_Backward = Seq(str(finalResults.seq[startBase:endBase - (3*n)]), IUPAC.unambiguous_dna) + coding_dna_Backward = Seq(str(finalResults.seq[startBase -1 :endBase - (3*n)]), IUPAC.unambiguous_dna) if strandToOutput == -1: + coding_dna_Backward = Seq(str(finalResults.seq[startBase -1 :endBase + (3*n)]), IUPAC.unambiguous_dna) coding_dna_Backward = coding_dna_Backward.reverse_complement() coding_dna_TranslationBackward = coding_dna_Backward.translate(table=translationTable) - - coding_dna_Forward = Seq(str(finalResults.seq[startBase:endBase + (3*n)]), IUPAC.unambiguous_dna) + coding_dna_Forward = Seq(str(finalResults.seq[startBase - 1:endBase + (3*n)]), IUPAC.unambiguous_dna) if strandToOutput == -1: coding_dna_Forward = coding_dna_Forward.reverse_complement() coding_dna_TranslationForward = coding_dna_Forward.translate(table=translationTable) - if strandToOutput == -1: coding_dna_TranslationBackward = coding_dna_TranslationBackward[::-1] coding_dna_TranslationForward = coding_dna_TranslationForward[::-1] @@ -217,18 +230,23 @@ def genbankOutput(resultGbFile, resultFile, listOfFeaturesToOutput, buildCloropl else: if coding_dna_TranslationBackward.endswith(stopCodons): main_end_pos = SeqFeature.ExactPosition(endBase - (3 * n)) - endBase -= (3 * (n-1)) - thisFeatureAlignment.endBase = endBase - main_feature_location = SeqFeature.FeatureLocation(main_start_pos,main_end_pos,strand=strandToOutput) + if strandToOutput == -1: + main_end_pos = SeqFeature.ExactPosition(endBase + (3 * n)) + #endBase -= (3 * (n-1)) #backup + #thisFeatureAlignment.endBase = endBase #backup + thisFeatureAlignment.endBase = main_end_pos #update + main_feature_location = SeqFeature.FeatureLocation(main_start_pos-1,main_end_pos,strand=strandToOutput) elif coding_dna_TranslationForward.endswith(stopCodons): main_end_pos = SeqFeature.ExactPosition(endBase + (3 * n)) - endBase += (3 * (n-1)) - thisFeatureAlignment.endBase = endBase - main_feature_location = SeqFeature.FeatureLocation(main_start_pos,main_end_pos,strand=strandToOutput) + #endBase += (3 * (n-1)) #backup + #thisFeatureAlignment.endBase = endBase #backup + thisFeatureAlignment.endBase = main_end_pos #update + main_feature_location = SeqFeature.FeatureLocation(main_start_pos-1,main_end_pos,strand=strandToOutput) except: pass + + coding_dna = Seq(str(finalResults.seq[thisFeatureAlignment.startBase -1 :thisFeatureAlignment.endBase]),IUPAC.unambiguous_dna) - coding_dna = Seq(str(finalResults.seq[thisFeatureAlignment.startBase:thisFeatureAlignment.endBase]),IUPAC.unambiguous_dna) if strandToOutput == 1: coding_dna_Translation = coding_dna.translate(table=translationTable) else: diff --git a/genbankOutput.pyc b/genbankOutput.pyc index 7bb886941f1ea5a7a5819b5f927b6e1a6445fe6c..15a6663235c7e14b3ebfc2403d232b83f11977d8 100755 GIT binary patch delta 3011 zcmbtWO=ufO6#i!Qvyv=Fwj^sM%Z@EOiIv2d(3B=^eXvPNZ-J15Od&NPTSBBFu)Gi( zcDe^O&_ihmf)ePp6iT5`Orbrr^w>k7JrtUAFS!*;>9Mrm%>HO4C8?ptyPA1F-@JKm zzIprmg(ol76Ms(UKDw~>_H6`zN&ar}*?;e0-6_~vyq~~Z1=u;YWsVENJIKJ#Fx-2O- zG^^6y0eTk=dk%62UFN!|1}dteT8GoB!t^R|8IB?3tX59LrMIO*v#a2a`ofU(;M5QF zUe4iul^n7JrwnIKSD(j-E9=0jYBYleO`Hnama3;Lt92NS+U!q;TSmL8isVrdL|ucN zgENnI9qt@zvVuD-tA&Hxa0#=(V(&m!)tWBf5pqq3P(ynWc1^kVAyjnpaQSbcW1_d6 z;Glg{NvINB(OFWuzDL74_Wm+jb^~$&ZVGZyziAu={BgpqL(*dbavAQj@^P2O?TLEm zq!0bw#e$IysQtzzE1|>-BbKYN^~|KZ3UWgI&*6T8*`Ru5OthjiSB5;L(B$c$0`iO= z@NBNBb0C{Ic-5eDn~~Sm_Vc~ZQpDGDFA_B2F6u)=2UMQK=H?^p>#6amhMv|$ zRP;PGxRSFxePkqS5&=C@2|8W@T6-HwAq3w$d|L5D_iE|j8W`qBolKG}TIr9JNinv8^e=Fc#oxPCU-2JZeEWCnZj5N>E2G4Jl#y zO^|jSru=c{g~ad)5uV@CTTv6k3$De5%t*d@9!6ZP3D+2pj!UCxALSujNng-$9^K0f zwsW;%U82+ay-Lz?d2!8DytroW@M*=9+uP0tya=;et}>VGVMAWjvLP=b8^sISsIAhH z%y>pW8)F7~0GSu|x1zc+1i!_C`B^>R`#bP{!~Yau z;~#zsoYT$TcRx4g?!(c&<$uo+_(pT(=&*2K93^Id5#s#dN%2-T#V^!u>(;xSqsyf) z4Dre!Uu_Obm8Q5l*sN^Lm<5qF(qh4=m-8Yo)(0O}-rRawBt%YZa9$8a{kx$@c~KJc zMnYtag0QHqCbD9Q8jHWH^hu`Hl}~aoWzBEC7J>NAA)E%GE(Dn$2Qk)-!LIda?j1(n O$Qh6T delta 2775 zcmbVO&1)M+6#vaiyDQ1E^{ri7b{zRjiVvxqK<$PU0xj*Og_b}hN`h&uAw#-fIUDD>7o1=BhMS z0?U3v8f)lhz*E@!HRURJOQ1E%moB*-t*ecG##gBQ^+ zqhFCkAV$e;igzA$QunK{4N+E)lLwKvBE$VzaZ9jGQ7n=N(awSs(j4p>+%jf)1s{>R zioG4U71;As^ES94)^q6{B6CwNZ^B-HH7i^PU?MUa)31t-4(C-bK_mN&5Rs12lKn-Y z6ozx?{h_q1CGZ-oGvM>`y%34_R>GYFC+`|Kxh)DGmn`G<1iJsVx_=-y7YT-+h5Hii zH!7+_WFRV8imdlEF+5C{Jhv`RyZBXbkf4;*Wv)u#%fg+n1o`HxazHFx6BXwT?ES^a z*`|f(#1^xTrMgQK^9I~`$qv%3g1F{3P&Nf)RFM7tLr+K~+ZypAkFDLcc$q~=qblD^ zx+`IoS7hWgjc+-|oe`|z!@e0QGHrN-n^WE85O-PP(lm-o$sHMAQ%^}{tGAY9jM2MI zPb-=t=O;Bdx}&g6K^8hOBf+Lfyv+Uf^G{T_4Y%%%bMI8NNnqpu0RyeB4vN3a(B2dH)$Hd6@j{c(L3t`_z*@)vt$#+|S0rLH>pOVm4@2)1ucUNoEBgq#g z%DI*bI5NtRBp=<(p@7b%S>B=pPZGLsxBaX8Syy`3yG{%oSHbz=1h^#NB>3ik0*{d} zcAQ78z?xBIwcChI_qhAr$A}+VO1;Kr5BoL=SWG!r87Tul% zRD>t`SM$`?MxRE+Q@Y>_vOUmON-%HAdG9#pO}$zm`;GCOY<<4>xe0F{El=((|4WB} zuQr#DKFl95^=6sA*p40)?lZPN{IT@GQKS5w!d@F5n@xTDn&((?^gqi=Y>qT3CC3al!_urqdVjP?@}xTa q)|k8WO4L~iK$N(1apWvU(phP;;k)Kn6^o*8(i3^@#)Ds-z4#A_&i#%6 diff --git a/geneChecker.pyc b/geneChecker.pyc index 56f8fd282b1c14847fad64afacc4847af99d02c7..61b14d8d9d1995ddda1c72bbf7e20e0f7f936e2f 100755 GIT binary patch delta 105 zcmbPV_q>*a`7L%OI@H a--O?ZK$#+%^4w5nvWYJzl=;Bco)G}zydNe2 delta 169 zcmaD}JHL*D`7H8#%ND8LcNb*lMuHml^6A=ow9p7OX}SnII%KSyU(pLlj+| hiEtUJn$4Gm--)0Jz0#29MiVkJ@#RDlT48I?2ml)CF{c0k diff --git a/geneChecker_fasta.py b/geneChecker_fasta.py index e1105fc..3375196 100755 --- a/geneChecker_fasta.py +++ b/geneChecker_fasta.py @@ -145,12 +145,12 @@ def geneCheck(fastaReference, resultFile, cutoffEquality_prot, cutoffEquality_nu for hsp in qhit.hsps: #hsp object checking, this contains the alignment info featureName = qhit.id if float(str(hsp.ident_num)+".00")/float(str(hsp.aln_span)+".00")*100 >= float(cutoffEquality_prot): - if hsp.aln_span*3+3 >= alignCutOff: + if hsp.aln_span*3 >= alignCutOff: if featureName in listOfImportantFeatures: targetFeature = listOfImportantFeatures[featureName] - startBase = min(hsp.query_range[0],hsp.query_range[1]) + startBase = min(hsp.query_range[0],hsp.query_range[1])+1 endBase = max(hsp.query_range[0],hsp.query_range[1]) - alignLen = endBase - startBase + alignLen = (endBase-startBase)+1 if featureName in listOfPresentFeatures: mainFeatureName = featureName mainFeatureFound = listOfPresentFeatures[mainFeatureName] @@ -171,7 +171,7 @@ def geneCheck(fastaReference, resultFile, cutoffEquality_prot, cutoffEquality_nu alignment.frame = featureFrame alignment.startBase = startBase alignment.endBase = endBase - alignment.seqFound = refSeq.seq[startBase:endBase] + alignment.seqFound = refSeq.seq[startBase-1:endBase] listOfPresentFeatures[featureName] = (listOfImportantFeatures[qhit.id], alignment, featureFrame <= -1) else: @@ -184,13 +184,14 @@ def geneCheck(fastaReference, resultFile, cutoffEquality_prot, cutoffEquality_nu alignment.frame = featureFrame alignment.startBase = startBase alignment.endBase = endBase - alignment.seqFound = refSeq.seq[startBase:endBase] + alignment.seqFound = refSeq.seq[(startBase-1):endBase] listOfPresentFeatures[featureName] = (listOfImportantFeatures[qhit.id], alignment, featureFrame <= -1) if alignLen >= (len(targetFeature*3)+3) * 0.99: #if we've already built a lot, dont even bother with finding splits listOfCompleteGenes.append(featureName) break + #exit() #copying the blast result in order for this info to be assessed later if the user desires shutil.copyfile("important_features.blast.xml", out_blast+"_ref.cds.blast.xml") os.remove("important_features.blast.xml") @@ -275,12 +276,13 @@ def geneCheck(fastaReference, resultFile, cutoffEquality_prot, cutoffEquality_nu alignment.seqFound = refSeq.seq[startBase:endBase] listOfPresentFeatures[featureName] = (listOfImportantFeatures[featureName], alignment, featureFrame == -1) break + shutil.copyfile("important_features.blast.xml", out_blast+"_ref.blast.xml") shutil.copyfile("important_features.fasta", out_blast+"_ref.fasta") os.remove("important_features.blast.xml") os.remove("important_features.fasta") - + return (listOfPresentFeatures, listOfImportantFeatures, listOfSplits, listOfCompleteGenes) def createImageOfAnnotation(sequenceObject, outputFile): @@ -308,7 +310,7 @@ def createImageOfAnnotation(sequenceObject, outputFile): for gbkFeature in sequenceObject.features: if gbkFeature.type == 'tRNA' or gbkFeature.type == 'CDS' or gbkFeature.type == 'rRNA' or gbkFeature.type == 'D-loop': - featureLen = gbkFeature.location.end - gbkFeature.location.start + featureLen = (gbkFeature.location.end - gbkFeature.location.start) + 1 featureRelativeSize = horizontalSize * featureLen / len(sequenceObject.seq) featureRelativeStart = (horizontalSize * gbkFeature.location.start / len(sequenceObject.seq)) + 1 @@ -388,6 +390,7 @@ def createImageOfAnnotation(sequenceObject, outputFile): percent_equality_prot=sys.argv[8] percent_equality_nucl=sys.argv[9] genbank=sys.argv[10] + nWalk=sys.argv[11] if sys.argv[1] == '-h' or sys.argv[1] == '--help': print 'Usage: genbank_reference fasta_file output_file organism_type(integer, default=2) alignCutOff(float, default=45) coveCutOff(7)' print 'Only the first, second, and third arguments are required.' @@ -490,7 +493,7 @@ def createImageOfAnnotation(sequenceObject, outputFile): listOfFeaturesToOutput.sort() print 'Total features found after Arwen: ',len(listOfFeaturesToOutput) - finalResults = genbankOutput.genbankOutput(outputFile, resultFile, listOfFeaturesToOutput, False, 900) + finalResults = genbankOutput.genbankOutput(outputFile, resultFile, listOfFeaturesToOutput, False, 900, nWalk) with open(outputFile, "w") as outputResult: count = SeqIO.write(finalResults, outputResult, "genbank") diff --git a/mitofinder b/mitofinder index de66b11..a289171 100755 --- a/mitofinder +++ b/mitofinder @@ -1,5 +1,5 @@ #!/usr/bin/python -#Version: 1.0.2 +#Version: 1.1 #Authors: Allio Remi & Schomaker-Bastos Alex #ISEM - CNRS - LAMPADA - IBQM - UFRJ @@ -65,7 +65,7 @@ if __name__ == "__main__": parser.add_argument('--metaspades', help='Use MetaSPAdes for assembly. ', default=False, dest='metaspades', action='store_true') # parser.add_argument('--annotate', help='Use Mitofinder to annotate a given mitochondrial (-c) ', dest='Annotate', action='store_true') - parser.add_argument('-j', '--jobname', help = 'Job name to be used throughout the process', default="MitoFinder_run", dest='processName') + parser.add_argument('-j', '--jobname', help = 'Sequence ID to be used throughout the process', default="", dest='processName') parser.add_argument('-1', '--Paired-end1', help='File with forward paired-end reads', default="", dest='PE1') parser.add_argument('-2', '--Paired-end2', help='File with reverse paired-end reads', default="", dest='PE2') parser.add_argument('-s', '--Single-end', help='File with single-end reads', default="", dest='SE') @@ -77,11 +77,13 @@ if __name__ == "__main__": default=100, dest='shortestContig') parser.add_argument('-p', '--processors', help='Number of threads Mitofinder will use at most.', type=int, default=4, dest='processorsToUse') - parser.add_argument('-r', '--refseq', help='Reference mitochondrial genome.s in GenBank format (.gb). You can put several reference mitgenomes together.', + parser.add_argument('-r', '--refseq', help='Reference mitochondrial genome in GenBank format (.gb).', default="", dest='refSeqFile') parser.add_argument('-e', '--blaste', help='e-value of blast program used for contig identification and annotation. Default = 0.000001', type=float, default=0.000001, dest='blasteVal') - parser.add_argument('--out_gb', help='Create annotation output file in GenBank format.', default=False, dest='genbk', action='store_true') + parser.add_argument('-n', '--nwalk', help='Maximum number of codon steps to be tested on each size of the gene to find the start and stop codon during the annotation step. Default = 20 (60 bases)', type=int, + default=20, dest='nWalk') + parser.add_argument('--out_gb', help='Do not create annotation output file in GenBank format.', default=True, dest='genbk', action='store_false') parser.add_argument('--blastidentitynucl', help='Nucleotide identity percentage for a hit to be retained. Default = 50', default=50, type=float, dest='blastIdentityNucl') parser.add_argument('--blastidentityprot', help='Amino acid identity percentage for a hit to be retained. Default = 40', @@ -114,7 +116,7 @@ if __name__ == "__main__": 24. Pterobranchia Mitochondrial Code\ 25. Candidate Division SR1 and Gracilibacteria Code", type=int, default=1, dest='organismType') - parser.add_argument('-v', '--version', help="Version 1.0.2", default=False, dest='versionCheck', action='store_true') + parser.add_argument('-v', '--version', help="Version 1.1", default=False, dest='versionCheck', action='store_true') parser.add_argument('--example', help="Print getting started examples", default=False, dest='example', action='store_true') args = parser.parse_args() @@ -128,10 +130,20 @@ if __name__ == "__main__": exit() if args.versionCheck == True: - print "MitoFinder version 1.0.2" + print "MitoFinder version 1.1" exit() - - Logfile=args.processName+".log" + + if args.PE1 == "" and args.PE2 == "" and args.SE == "" and args.Assembly == "" : + print "\nERROR: Read or assembly files are not specified.\n Please, use -1 -2 -s or -a option to specify input data." + exit() + if args.refSeqFile == "": + print "\nERROR: Reference file is required (-r option)" + exit() + if args.processName == "": + print "\nERROR: SeqID is required (-j option)" + exit() + + Logfile=args.processName+"_MitoFinder.log" Logfile=os.path.join(initial_path,Logfile) logfile=open(Logfile,"w") logfile.write('Command line: %s' % ' '.join(sys.argv)+"\n") @@ -145,15 +157,7 @@ if __name__ == "__main__": print 'Now running MitoFinder ...\n' print "Start time : "+datetime.now().strftime("%Y-%m-%d %H:%M:%S")+'\n' - if args.PE1 == "" and args.PE2 == "" and args.SE == "" and args.Assembly == "" : - print "\nERROR: Read or assembly files are not specified.\n Please, use -1 -2 -s or -a option to specify input data." - logfile.write("\nERROR: Read or assembly files are not specified.\n Please, use -1 -2 -s or -a option to specify input data.\n") - exit() - if args.refSeqFile == "": - print "ERROR: Reference file is required (-r option)" - logfile.write("ERROR: Reference file is required (-r option)") - exit() - + print "Job name = "+args.processName args.refSeqFile=os.path.join(initial_path,args.refSeqFile) @@ -240,10 +244,6 @@ if __name__ == "__main__": logfile.write("\n") if pathToMegahitFolder.lower() == 'default' and args.idba == False and args.metaspades == False: pathToMegahitFolder = os.path.join(module_dir, 'megahit/') - command = 'make -C '+pathToMegahitFolder - args1 = shlex.split(command) - make = Popen(args1, stdout=open("log","w")) - make.wait() logfile.write('WARNING: Megahit is still set in its default folder. Change it in the config file if you encounter problems running this script.\n') print 'WARNING: Megahit is still set in its default folder. Change it in the config file if you encounter problems running this script.' @@ -795,8 +795,8 @@ if __name__ == "__main__": #creating some stat file: print "\n" - print "Creating summary statistics for "+args.processName+"_best_contig.fasta" - logfile.write("\n\n"+"Creating summary statistics for "+args.processName+"_best_contig.fasta\n") + print "Creating summary statistics for the mtDNA contig" + logfile.write("\n\n"+"Creating summary statistics for the mtDNA contig\n") finalResults = SeqIO.read(open(resultFile, 'rU'), "fasta", generic_dna) finalStatsFile = open(pathOfFinalResults + args.processName + '.stats', 'w') @@ -809,19 +809,25 @@ if __name__ == "__main__": else: finalStatsFile.write("Circularization: No\n") - shutil.copyfile(resultFile, pathOfFinalResults+"/"+args.processName+"_best_contig.fasta") + command = module_dir+"/rename_fasta_seqID.py " + args.processName + " " + resultFile + " " + pathOfFinalResults+"/"+args.processName+"_mtDNA_contig.fasta" + " " + str(1) + args1 = shlex.split(command) + rename = Popen(args1, stdout=open(os.devnull, 'wb')) + rename.wait() + + os.remove(resultFile) + #shutil.copyfile(resultFile, pathOfFinalResults+"/"+args.processName+"_mtDNA_contig.fasta") # Annotating with gene_checker print "" - print "Annotating mitochondrial contigs" + print "Annotating mitochondrial contig" print "" logfile.write("\nAnnotating\n\n") if recordCount > 1: #if more than 1 ref - if os.path.isfile(pathtowork+'/ref_for_best_contig.fasta') == True: - os.remove(pathtowork+'/ref_for_best_contig.fasta') + if os.path.isfile(pathtowork+'/ref_for_mtDNA_contig.fasta') == True: + os.remove(pathtowork+'/ref_for_mtDNA_contig.fasta') for line in open(pathtowork+"/genes_list"): if line.rstrip() != "rrnL" and line.rstrip() != "rrnS": @@ -831,7 +837,7 @@ if __name__ == "__main__": formatDB = Popen(args1, stdout=open(os.devnull, 'wb')) formatDB.wait() with open(pathtowork+'/'+gene+'_blast_out.txt','w') as BlastResultGene: - command = blastFolder+"/blastx -db " + "ref_" + gene+ "_database.fasta" + " -query "+ resultFile + " -evalue " + str(blasteVal) + " -outfmt 6" + " -query_gencode " + str(args.organismType) + " -seg no" + command = blastFolder+"/blastx -db " + "ref_" + gene+ "_database.fasta" + " -query "+ pathOfFinalResults+"/"+args.processName+"_mtDNA_contig.fasta" + " -evalue " + str(blasteVal) + " -outfmt 6" + " -query_gencode " + str(args.organismType) + " -seg no" args1 = shlex.split(command) blast = Popen(args1, stdout=BlastResultGene) blast.wait() @@ -842,7 +848,7 @@ if __name__ == "__main__": formatDB = Popen(args1, stdout=open(os.devnull, 'wb')) formatDB.wait() with open(pathtowork+'/'+gene+'_blast_out.txt','w') as BlastResultGene: - command = blastFolder+"/blastn -db " + "ref_" + gene+ "_database.fasta"+ " -query "+ resultFile + " -evalue " + str(blasteVal) + " -outfmt 6 -perc_identity " + str(args.blastIdentityNucl) + " -dust no" + command = blastFolder+"/blastn -db " + "ref_" + gene+ "_database.fasta"+ " -query "+ pathOfFinalResults+"/"+args.processName+"_mtDNA_contig.fasta" + " -evalue " + str(blasteVal) + " -outfmt 6 -perc_identity " + str(args.blastIdentityNucl) + " -dust no" args1 = shlex.split(command) blast = Popen(args1, stdout=BlastResultGene) blast.wait() @@ -861,7 +867,7 @@ if __name__ == "__main__": dico_query[testedGene]=query bestScore=score - refFile=open(pathtowork+'/ref_for_best_contig.fasta','a') + refFile=open(pathtowork+'/ref_for_mtDNA_contig.fasta','a') for cle, valeur in dico_query.items(): for name, seq in read_fasta(open(pathtowork+"/ref_"+gene+ "_database.fasta")): if name.replace(">","") == valeur: @@ -869,26 +875,26 @@ if __name__ == "__main__": refFile.close() - command = module_dir+"/geneChecker_fasta.py " + pathtowork + "/ref_for_best_contig.fasta" + " " + pathOfFinalResults + "/" + args.processName+"_best_contig.fasta" + " " + args.processName+"_mito.gb" + " " + str(args.organismType) + " " + str(args.aligncutoff) + " " + str(args.coveCutOff) + " " + str(blasteVal) + " " + str(args.blastIdentityProt) + " " + str(args.blastIdentityNucl) + " " + str(args.genbk) + command = module_dir+"/geneChecker_fasta.py " + pathtowork + "/ref_for_mtDNA_contig.fasta" + " " + pathOfFinalResults + "/" + args.processName+"_mtDNA_contig.fasta" + " " + args.processName+"_mtDNA_contig.gb" + " " + str(args.organismType) + " " + str(args.aligncutoff) + " " + str(args.coveCutOff) + " " + str(blasteVal) + " " + str(args.blastIdentityProt) + " " + str(args.blastIdentityNucl) + " " + str(args.genbk)+ " " + str(args.nWalk) args1 = shlex.split(command) fifthStep = Popen(args1, cwd=pathOfFinalResults, stdout=open('geneChecker.log', 'a'), stderr=open('geneChecker_erreur.log', 'a')) fifthStep.wait() else: - best_ref=open(pathtowork + "/ref_for_best_contig.fasta","w") + best_ref=open(pathtowork + "/ref_for_mtDNA_contig.fasta","w") for line in open(pathtowork+"/genes_list"): gene=line.rstrip() for name, seq in read_fasta(open(pathtowork+"/ref_"+gene+ "_database.fasta")): best_ref.write(name+"\n"+seq+"\n") best_ref.close() - command = module_dir+"/geneChecker_fasta.py " + pathtowork + "/ref_for_best_contig.fasta" + " " + pathOfFinalResults + "/" + args.processName+"_best_contig.fasta" + " " + args.processName+"_mito.gb" + " " + str(args.organismType) + " " + str(args.aligncutoff) + " " + str(args.coveCutOff) + " " + str(blasteVal) + " " + str(args.blastIdentityProt) + " " + str(args.blastIdentityNucl) + " " + str(args.genbk) + command = module_dir+"/geneChecker_fasta.py " + pathtowork + "/ref_for_mtDNA_contig.fasta" + " " + pathOfFinalResults + "/" + args.processName+"_mtDNA_contig.fasta" + " " + args.processName+"_mtDNA_contig.gb" + " " + str(args.organismType) + " " + str(args.aligncutoff) + " " + str(args.coveCutOff) + " " + str(blasteVal) + " " + str(args.blastIdentityProt) + " " + str(args.blastIdentityNucl) + " " + str(args.genbk)+ " " + str(args.nWalk) args1 = shlex.split(command) fifthStep = Popen(args1, cwd=pathOfFinalResults, stdout=open('geneChecker.log', 'w'), stderr=open('geneChecker_erreur.log', 'w')) fifthStep.wait() - test_arwen=pathOfFinalResults + "/" + args.processName+"_best_contig.arwen" + test_arwen=pathOfFinalResults + "/" + args.processName+"_mtDNA_contig.arwen" if os.path.isfile(test_arwen) == True: print "tRNA annotation with Arwen run well.\n" logfile.write("tRNA annotation with Arwen run well.\n\n") @@ -896,7 +902,7 @@ if __name__ == "__main__": print "ERROR: tRNA annotation failed.\nPlease check "+ pathtowork + "/geneChecker_erreur.log or geneChecker.log to see what happened\nAborting\n" logfile.write("ERROR: tRNA annotation failed.\nPlease check "+ pathtowork + "/geneChecker_erreur.log or geneChecker.log to see what happened\nAborting\n\n") exit() - test_gene_checker=pathOfFinalResults+args.processName+"_mito.gb" + test_gene_checker=pathOfFinalResults+args.processName+"_mtDNA_contig.gb" if os.path.isfile(test_gene_checker) == True: print "Annotation completed\n" logfile.write("Annotation completed\n\n") @@ -985,7 +991,7 @@ if __name__ == "__main__": for line in open(pathtowork+"/"+'contig_list.txt','r'): pathOfResult = pathtowork+"/"+args.processName+'_contig_'+str(c)+'.fasta' - resultFile = args.processName + '_partial_mito_'+str(c)+'.fasta' + resultFile = args.processName + '_mtDNA_contig_'+str(c)+'.fasta' with open(resultFile, "w") as outputResult: #create draft file to be checked and annotated finalResults = SeqIO.read(open(pathOfResult, 'rU'), "fasta", generic_dna) @@ -999,32 +1005,38 @@ if __name__ == "__main__": #creating some stat file: print "\n" - print "Creating summary statistics for Partial_mito_"+str(c) + print "Creating summary statistics for mtDNA contig "+str(c) print "" - logfile.write("\n\n"+"Creating summary statistics for Partial_mito_"+str(c)+"\n\n") + logfile.write("\n\n"+"Creating summary statistics for mtDNA contig "+str(c)+"\n\n") finalResults = SeqIO.read(open(resultFile, 'rU'), "fasta", generic_dna) - finalStatsFile = open(pathOfFinalResults + args.processName + '_partial_mito_'+str(c)+'.stats', 'w') + finalStatsFile = open(pathOfFinalResults + args.processName + '_mtDNA_contig_'+str(c)+'.stats', 'w') finalStatsFile.write('Statistics for contig '+str(c)+':\n\n') finalStatsFile.write('Length: ' + str(len(finalResults.seq)) + "\n") finalStatsFile.write('GC content: ' + ("{0:.2f}".format(SeqUtils.GC(finalResults.seq))) + '%\n') finalStatsFile.write("Circularization: No\n") - shutil.copyfile(resultFile, pathOfFinalResults+"/"+args.processName+"_partial_mito_"+str(c)+".fasta") + command = module_dir+"/rename_fasta_seqID.py " + args.processName + " " + resultFile + " " + pathOfFinalResults+"/"+args.processName+"_mtDNA_contig_"+str(c)+".fasta" + " " + str(c) + args1 = shlex.split(command) + rename = Popen(args1, stdout=open(os.devnull, 'wb')) + rename.wait() + + os.remove(resultFile) + #shutil.copyfile(resultFile, pathOfFinalResults+"/"+args.processName+"_mtDNA_contig_"+str(c)+".fasta") # creating best ref file for annotation - command = blastFolder+"/makeblastdb -in " + pathOfFinalResults+"/"+args.processName+"_partial_mito_"+str(c)+".fasta" + " -dbtype nucl" #need to formatdb refseq first + command = blastFolder+"/makeblastdb -in " + pathOfFinalResults+"/"+args.processName+"_mtDNA_contig_"+str(c)+".fasta" + " -dbtype nucl" #need to formatdb refseq first args1 = shlex.split(command) formatDB = Popen(args1, stdout=open(os.devnull, 'wb')) formatDB.wait() if recordCount > 1: #if more than 1 ref - print "Looking for best reference genes for Partial_mito_"+str(c) + print "Looking for best reference genes for mtDNA contig "+str(c) print "" - logfile.write("Looking for best reference genes for Partial_mito_"+str(c)+"\n\n") + logfile.write("Looking for best reference genes for mtDNA contig "+str(c)+"\n\n") if os.path.isfile(pathtowork+'/ref_for_contig_'+str(c)+'.fasta') == True: os.remove(pathtowork+'/ref_for_contig_'+str(c)+'.fasta') @@ -1037,7 +1049,7 @@ if __name__ == "__main__": formatDB = Popen(args1, stdout=open(os.devnull, 'wb')) formatDB.wait() with open(pathtowork+'/'+gene+'_blast_out.txt','w') as BlastResultGene: - command = blastFolder+"/blastx -db " + "ref_" + gene+ "_database.fasta" + " -query "+ pathOfFinalResults+"/"+args.processName+"_partial_mito_"+str(c)+".fasta" + " -evalue " + str(blasteVal) + " -outfmt 6" + " -query_gencode " + str(args.organismType) + " -seg no" + command = blastFolder+"/blastx -db " + "ref_" + gene+ "_database.fasta" + " -query "+ pathOfFinalResults+"/"+args.processName+"_mtDNA_contig_"+str(c)+".fasta" + " -evalue " + str(blasteVal) + " -outfmt 6" + " -query_gencode " + str(args.organismType) + " -seg no" args1 = shlex.split(command) blast = Popen(args1, stdout=BlastResultGene) blast.wait() @@ -1048,7 +1060,7 @@ if __name__ == "__main__": formatDB = Popen(args1, stdout=open(os.devnull, 'wb')) formatDB.wait() with open(pathtowork+'/'+gene+'_blast_out.txt','w') as BlastResultGene: - command = blastFolder+"/blastn -db " + "ref_" + gene+ "_database.fasta"+ " -query "+ pathOfFinalResults+"/"+args.processName+"_partial_mito_"+str(c)+".fasta" + " -evalue " + str(blasteVal) + " -outfmt 6 -perc_identity " + str(args.blastIdentityNucl) + " -dust no" + command = blastFolder+"/blastn -db " + "ref_" + gene+ "_database.fasta"+ " -query "+ pathOfFinalResults+"/"+args.processName+"_mtDNA_contig_"+str(c)+".fasta" + " -evalue " + str(blasteVal) + " -outfmt 6 -perc_identity " + str(args.blastIdentityNucl) + " -dust no" args1 = shlex.split(command) blast = Popen(args1, stdout=BlastResultGene) blast.wait() @@ -1075,9 +1087,9 @@ if __name__ == "__main__": refFile.close() - print "Annotating Partial_mito_"+str(c) + print "Annotating mtDNA contig "+str(c) print "" - logfile.write("Annotating Partial_mito_"+str(c)+"\n\n") + logfile.write("Annotating mtDNA contig "+str(c)+"\n\n") # Annotating with gene_checker """QRY_dico={} @@ -1102,14 +1114,14 @@ if __name__ == "__main__": tmp.write("LOCUS"+x) tmp.close() - command = module_dir+"/geneChecker.py " + "../best_query.gb" + " " + resultFile + " " + args.processName+"_partial_mito_"+str(c)+".gb" + " " + str(args.organismType) + " " + str(args.aligncutoff) + " " + str(args.coveCutOff) + command = module_dir+"/geneChecker.py " + "../best_query.gb" + " " + resultFile + " " + args.processName+"_mtDNA_contig_"+str(c)+".gb" + " " + str(args.organismType) + " " + str(args.aligncutoff) + " " + str(args.coveCutOff) args1 = shlex.split(command) fifthStep = Popen(args1, cwd=pathOfFinalResults, stdout=open('geneChecker.log', 'a'), stderr=open('geneChecker_erreur.log', 'a')) fifthStep.wait() """ if recordCount > 1: - command = module_dir+"/geneChecker_fasta.py " + pathtowork + "/ref_for_contig_" + str(c) + ".fasta" + " " + pathOfFinalResults+"/"+args.processName+"_partial_mito_"+str(c)+".fasta" + " " + args.processName+"_partial_mito_"+str(c)+".gb" + " " + str(args.organismType) + " " + str(args.aligncutoff) + " " + str(args.coveCutOff) + " " + str(blasteVal) + " " + str(args.blastIdentityProt) + " " + str(args.blastIdentityNucl) + " " + str(args.genbk) + command = module_dir+"/geneChecker_fasta.py " + pathtowork + "/ref_for_contig_" + str(c) + ".fasta" + " " + pathOfFinalResults+"/"+args.processName+"_mtDNA_contig_"+str(c)+".fasta" + " " + args.processName+"_mtDNA_contig_"+str(c)+".gb" + " " + str(args.organismType) + " " + str(args.aligncutoff) + " " + str(args.coveCutOff) + " " + str(blasteVal) + " " + str(args.blastIdentityProt) + " " + str(args.blastIdentityNucl) + " " + str(args.genbk)+ " " + str(args.nWalk) args1 = shlex.split(command) fifthStep = Popen(args1, cwd=pathOfFinalResults, stdout=open('geneChecker.log', 'a'), stderr=open('geneChecker_erreur.log', 'a')) fifthStep.wait() @@ -1123,28 +1135,28 @@ if __name__ == "__main__": best_ref.write(name+"\n"+seq+"\n") best_ref.close() - command = module_dir+"/geneChecker_fasta.py " + pathtowork + "/ref_for_contigs.fasta" + " " + pathOfFinalResults+"/"+args.processName+"_partial_mito_"+str(c)+".fasta" + " " + args.processName+"_partial_mito_"+str(c)+".gb" + " " + str(args.organismType) + " " + str(args.aligncutoff) + " " + str(args.coveCutOff) + " " + str(blasteVal) + " " + str(args.blastIdentityProt) + " " + str(args.blastIdentityNucl) + " " + str(args.genbk) + command = module_dir+"/geneChecker_fasta.py " + pathtowork + "/ref_for_contigs.fasta" + " " + pathOfFinalResults+"/"+args.processName+"_mtDNA_contig_"+str(c)+".fasta" + " " + args.processName+"_mtDNA_contig_"+str(c)+".gb" + " " + str(args.organismType) + " " + str(args.aligncutoff) + " " + str(args.coveCutOff) + " " + str(blasteVal) + " " + str(args.blastIdentityProt) + " " + str(args.blastIdentityNucl) + " " + str(args.genbk)+ " " + str(args.nWalk) args1 = shlex.split(command) fifthStep = Popen(args1, cwd=pathOfFinalResults, stdout=open('geneChecker.log', 'w'), stderr=open('geneChecker_erreur.log', 'w')) fifthStep.wait() - test_arwen=pathOfFinalResults+"/"+args.processName+"_partial_mito_"+str(c)+".arwen" + test_arwen=pathOfFinalResults+"/"+args.processName+"_mtDNA_contig_"+str(c)+".arwen" if os.path.isfile(test_arwen) == True: - print "tRNA annotation with Arwen run well for partial mito "+str(c)+".\n" - logfile.write("tRNA annotation with Arwen run well for partial mito "+str(c)+".\n"+"\n") + print "tRNA annotation with Arwen run well for mtDNA contig "+str(c)+".\n" + logfile.write("tRNA annotation with Arwen run well for mtDNA contig "+str(c)+".\n"+"\n") else: print "ERROR: tRNA annotation with Arwen failed\nPlease check "+ pathtowork + "/geneChecker_erreur.log or geneChecker.log to see what happened\nAborting\n" logfile.write("ERROR: tRNA annotation with Arwen failed\nPlease check "+ pathtowork + "/geneChecker_erreur.log or geneChecker.log to see what happened\nAborting\n\n") exit() - test_gene_checker=pathOfFinalResults+args.processName+"_partial_mito_"+str(c)+".gb" + test_gene_checker=pathOfFinalResults+args.processName+"_mtDNA_contig_"+str(c)+".gb" if os.path.isfile(test_gene_checker) == True: print "Annotation completed\n" logfile.write("Annotation completed\n"+"\n") else: - print "ERROR: Gene annotation failed for partial mito "+str(c)+".\nPlease check "+ pathtowork + "/geneChecker_error.log to see what happened\nAborting\n" - logfile.write("ERROR: Gene annotation failed for partial mito "+str(c)+".\nPlease check "+ pathtowork + "/geneChecker_error.log to see what happened\nAborting\n"+"\n") + print "ERROR: Gene annotation failed for mtDNA contig "+str(c)+".\nPlease check "+ pathtowork + "/geneChecker_error.log to see what happened\nAborting\n" + logfile.write("ERROR: Gene annotation failed for mtDNA contig "+str(c)+".\nPlease check "+ pathtowork + "/geneChecker_error.log to see what happened\nAborting\n"+"\n") exit() c+=1 @@ -1153,8 +1165,8 @@ if __name__ == "__main__": else: if recordCount > 1: #if more than 1 ref - if os.path.isfile(pathtowork+'/ref_for_best_contig.fasta') == True: - os.remove(pathtowork+'/ref_for_best_contig.fasta') + if os.path.isfile(pathtowork+'/ref_for_mtDNA_contig.fasta') == True: + os.remove(pathtowork+'/ref_for_mtDNA_contig.fasta') for line in open(pathtowork+"/genes_list"): if line.rstrip() != "rrnL" and line.rstrip() != "rrnS": @@ -1194,7 +1206,7 @@ if __name__ == "__main__": dico_query[testedGene]=query bestScore=score - refFile=open(pathtowork+'/ref_for_best_contig.fasta','a') + refFile=open(pathtowork+'/ref_for_mtDNA_contig.fasta','a') for cle, valeur in dico_query.items(): for name, seq in read_fasta(open(pathtowork+"/ref_"+gene+ "_database.fasta")): if name.replace(">","") == valeur: @@ -1202,7 +1214,7 @@ if __name__ == "__main__": refFile.close() - command = module_dir+"/geneChecker_fasta.py " + pathtowork + "/ref_for_best_contig.fasta" + " " + args.CONTIG + " " + args.processName+"_mito.gb" + " " + str(args.organismType) + " " + str(args.aligncutoff) + " " + str(args.coveCutOff) + " " + str(blasteVal) + " " + str(args.blastIdentityProt) + " " + str(args.blastIdentityNucl) + " " + str(args.genbk) + command = module_dir+"/geneChecker_fasta.py " + pathtowork + "/ref_for_mtDNA_contig.fasta" + " " + args.CONTIG + " " + args.processName+"_mtDNA_contig.gb" + " " + str(args.organismType) + " " + str(args.aligncutoff) + " " + str(args.coveCutOff) + " " + str(blasteVal) + " " + str(args.blastIdentityProt) + " " + str(args.blastIdentityNucl) + " " + str(args.genbk)+ " " + str(args.nWalk) args1 = shlex.split(command) fifthStep = Popen(args1, cwd=pathOfFinalResults, stdout=open('geneChecker.log', 'a'), stderr=open('geneChecker_erreur.log', 'a')) fifthStep.wait() @@ -1211,14 +1223,14 @@ if __name__ == "__main__": print "Running Annotation\n" logfile.write("Running Annotation\n"+"\n") - best_ref=open(pathtowork + "/ref_for_best_contig.fasta","w") + best_ref=open(pathtowork + "/ref_for_mtDNA_contig.fasta","w") for line in open(pathtowork+"/genes_list"): gene=line.rstrip() for name, seq in read_fasta(open(pathtowork+"/ref_"+gene+ "_database.fasta")): best_ref.write(name+"\n"+seq+"\n") best_ref.close() - command = module_dir+"/geneChecker_fasta.py " + pathtowork + "/ref_for_best_contig.fasta" + " " + args.CONTIG + " " + args.processName+"_mito.gb" + " " + str(args.organismType) + " " + str(args.aligncutoff) + " " + str(args.coveCutOff) + " " + str(blasteVal) + " " + str(args.blastIdentityProt) + " " + str(args.blastIdentityNucl) + " " + str(args.genbk) + command = module_dir+"/geneChecker_fasta.py " + pathtowork + "/ref_for_mtDNA_contig.fasta" + " " + args.CONTIG + " " + args.processName+"_mtDNA_contig.gb" + " " + str(args.organismType) + " " + str(args.aligncutoff) + " " + str(args.coveCutOff) + " " + str(blasteVal) + " " + str(args.blastIdentityProt) + " " + str(args.blastIdentityNucl) + " " + str(args.genbk)+ " " + str(args.nWalk) args1 = shlex.split(command) fifthStep = Popen(args1, cwd=pathOfFinalResults,stdout=open('geneChecker.log', 'w'), stderr=open('geneChecker_erreur.log', 'w')) fifthStep.wait() @@ -1232,7 +1244,7 @@ if __name__ == "__main__": print "ERROR: tRNA annotation with Arwen failed\nPlease check "+ pathtowork + "/geneChecker_erreur.log or geneChecker.log to see what happened\nAborting\n" logfile.write( "ERROR: tRNA annotation with Arwen failed\nPlease check "+ pathtowork + "/geneChecker_erreur.log or geneChecker.log to see what happened\nAborting\n"+"\n") exit() - test_gene_checker=pathOfFinalResults+args.processName+"_mito.gb" + test_gene_checker=pathOfFinalResults+args.processName+"_mtDNA_contig.gb" if os.path.isfile(test_gene_checker) == True: print "Annotation is done\n" logfile.write("Annotation is done\n"+"\n") @@ -1275,11 +1287,19 @@ if __name__ == "__main__": logfile.write("\n") #sort gff - for f in glob.glob(pathOfFinalResults+"/*_raw.gff"): - command = module_dir+"/sort_gff.py " + f - args1 = shlex.split(command) - sort_gff = Popen(args1, cwd=pathOfFinalResults, stdout=open('geneChecker.log', 'a'), stderr=open('geneChecker_erreur.log', 'a')) - sort_gff.wait() + pathlist = glob.glob(pathOfFinalResults+"/*.gb") + if len(pathlist) == 1 : + for f in glob.glob(pathOfFinalResults+"/*_raw.gff"): + command = module_dir+"/sort_gff.py " + f + " " + args.processName +".1 " + str(args.organismType) + args1 = shlex.split(command) + sort_gff = Popen(args1, cwd=pathOfFinalResults, stdout=open('geneChecker.log', 'a'), stderr=open('geneChecker_erreur.log', 'a')) + sort_gff.wait() + else: + for f in glob.glob(pathOfFinalResults+"/*_raw.gff"): + command = module_dir+"/sort_gff.py " + f + " " + args.processName +"."+ f.split("_raw")[0][-1] + " " + str(args.organismType) + args1 = shlex.split(command) + sort_gff = Popen(args1, cwd=pathOfFinalResults, stdout=open('geneChecker.log', 'a'), stderr=open('geneChecker_erreur.log', 'a')) + sort_gff.wait() #check genes (doublon ?) @@ -1339,7 +1359,9 @@ if __name__ == "__main__": os.remove(f) shutil.copy(pathOfFinalResults+"Arwen.log", tmpfiles+"/") os.remove(pathOfFinalResults+"Arwen.log") - + for f in glob.glob(pathOfFinalResults+"*_raw.gff"): + shutil.copy(f, tmpfiles+"/") + os.remove(f) for f in glob.glob(pathtowork+"/*blast*"): shutil.copy(f, tmpfiles+"/") os.remove(f) diff --git a/rename_fasta_seqID.py b/rename_fasta_seqID.py new file mode 100755 index 0000000..2ecb989 --- /dev/null +++ b/rename_fasta_seqID.py @@ -0,0 +1,23 @@ +#! /usr/bin/env python + +import sys +import os.path + +def read_fasta(fp): + name, seq = None, [] + for line in fp: + line = line.rstrip() + if line.startswith(">"): + if name: yield (name, ''.join(seq)) + name, seq = line, [] + else: + seq.append(line) + if name: yield (name, ''.join(seq)) + +seqID=sys.argv[1] +fasta = open(sys.argv[2]) +fout = open(sys.argv[3], 'w') +c=sys.argv[4] + +for name, seq in read_fasta(fasta): + fout.write(">"+seqID+"."+str(c)+"\n"+seq+"\n") diff --git a/runIDBA.pyc b/runIDBA.pyc index cb78398a3d8272b95964ad5fb94ecf605825556d..9c95e443c1d4b9c3556af4487792c45d2aa16df2 100755 GIT binary patch delta 294 zcmdn4Iah;&`7}O3en(WFQ%Vf$h`8$__8y`arA48T1Lo)*dBha+)6b6RC z2q2?GfFTR$BA`BbpodKum1d}s1~~vI0W!XXnITII$eHZNBgJU8Ig=-adGkyD zWJWGF1_p+r)ZF~C)Xm|7rHqUQlg|oiO#UMz!?}uE{a5b<5 z3`WtV==Etk23H50!y(1Vh|JO`th);86l~e+3`u0mQ38<+!;K1IM%==~hY&CeLl}_; z0vkK9DTP_?$7Tgx0wTz49!7f{EYI)MqtKhomiUaeOIG=%w%GY^$~FTwcSM{MLtV^C zfK7`Uv%#DM+DW)czr8aS#181+T;p%`fJ~*?a1QpZ zTDheNXtgTkTCsZUvC#_ABI3pWuQnX-HFiqGu0q_5?48+G*2?i_JVl=Qb-Wi+iAnjC Wo*q_-O(%#>#G!mczxbti*YjT>gJARk diff --git a/runMegahit.pyc b/runMegahit.pyc index 409f2bcbd63e5b23dc8d2053a9ec4370ab4706d7..8af34e8f495bb747b7dd4c4d576a38304b0e456a 100755 GIT binary patch delta 204 zcmbQDI$MQ<`7SMyUoOCJlRl4W3s5A t5@X$D3&BhYUREAP9!7O0bw)WxE=CM zR+H-mG}z@8S~rt!I;2>@MFNOAxG diff --git a/runMetaspades.pyc b/runMetaspades.pyc index 56d017e5036e938f03e66414ef0420eb9a6d1c4c..e810c5ef378d7e81e430296b00a99e84f44f9ce2 100755 GIT binary patch delta 232 zcmZounX1ge{F#?)>z=8x8##Jd8JBFH%DRt9c^XRzBSQ@jLy8Q8c#14TGZRDqLB?7! zh8jVJ6gh@$CWfNTlgl_JvutKgQJpNnB|ce&Q&FLCGjkpTScy7VNiRzc2SW-snEiw~ zMPqUwrv#(s1mHIF delta 360 zcmb7-KTE?<6vfYddBG$mm}+SZ)S|Rtv?Zt@4*mf*agz=@h@!q0R48alad>sG;3POa z`Vrif><)eo!PQX^y6Q_tH>Z2<;hf(YR_|+;@$ESqH>1NB>%pW^Iy7Hd^^yXskro|R*-pLT$*mB%>(S=1GMMNMl>$Sml09)ZS?&SsD( zhkS5pL5%0x<6;5Qft-X!NCRz-Qa?Z^gjiI!c8jN6)RkukR98Rt?9M+|_XV(bkj>k; zC}IGRcA8Nm-Ev&RODq6!a(2>gwyx4$?}^c-`kIKwj(f}TQoN$3{fO?=wtrrBsKH%w aDP$p4$f1Z#@@P)0Ri@tj"): + if name: yield (name, ''.join(seq)) + name, seq = line, [] + else: + seq.append(line) + if name: yield (name, ''.join(seq)) + file1=open(sys.argv[1]) +fasta=open(sys.argv[1].split("_raw.gff")[0]+".fasta") +seqID=sys.argv[2] dico={} dicotrna={} dicof={} c=0 +for name, seq in read_fasta(fasta): + length=len(seq) + for line in open(sys.argv[1]): dico[line.rstrip().split("\t")[8]]=line.rstrip() @@ -45,14 +61,399 @@ sorted_x = sorted(dicof.items(), key=operator.itemgetter(0)) sorted_dict = collections.OrderedDict(sorted_x) -fout=sys.argv[1].split("_raw.gff")[0]+"_filtered.gff" -fout=open(fout, "w") +gout=sys.argv[1].split("_raw.gff")[0]+"_final.gff" +gout=open(gout, "w") +gout.write(seqID+"\t"+"mitofinder\tsource\t1\t"+str(length)+"\t.\t+\t.\tName=source "+seqID.replace("_"," ")+"\n") +tout=sys.argv[1].split("_raw.gff")[0]+"_final.tbl" +tout=open(tout, "w") +tout.write(">Feature "+seqID+"\n") +#tout.write("1\t"+str(length)+"\tREFERENCE\n") +#tout.write("\t\t\tMitoFinder\txxxxxx\n") for k, v in sorted_dict.items(): if not dicotrna.has_key(v.split("\t")[8]): - fout.write(v+"\n") - + col1=seqID + col2=v.split("\t")[1] + col3=v.split("\t")[2] + col4=v.split("\t")[3] + col5=v.split("\t")[4] + col6=v.split("\t")[5] + col7=v.split("\t")[6] + col8=v.split("\t")[7] + col9=v.split("\t")[8].rstrip() + if "COX1" == col9: + size=int(col5)-int(col4)+1 + if size%3 == 0: + ext="" + tout.write(col4+"\t"+col5+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write(col4+"\t"+col5+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tcytochrome c oxidase subunit I\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + else: + ext=" Note: Not a multiple of 3" + if col7 == "+": + tout.write(col4+"\t"+col5+">"+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write(col4+"\t"+col5+">"+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tcytochrome c oxidase subunit I\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + tout.write("\t\t\tNote\tPartial sequence\n") + if col7 == "-": + tout.write("<"+col4+"\t"+col5+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write("<"+col4+"\t"+col5+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tcytochrome c oxidase subunit I\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + tout.write("\t\t\tNote\tPartial sequence\n") + gout.write(col1+"\t"+col2+"\t"+"gene"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" gene"+ext+"\n") + gout.write(col1+"\t"+col2+"\t"+"CDS"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" CDS"+ext+"\n") + + if "COX2" == col9: + size=int(col5)-int(col4)+1 + if size%3 == 0: + ext="" + tout.write(col4+"\t"+col5+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write(col4+"\t"+col5+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tcytochrome c oxidase subunit II\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + else: + ext=" Note: Not a multiple of 3" + if col7 == "+": + tout.write(col4+"\t"+col5+">"+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write(col4+"\t"+col5+">"+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tcytochrome c oxidase subunit II\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + tout.write("\t\t\tNote\tPartial sequence\n") + if col7 == "-": + tout.write("<"+col4+"\t"+col5+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write("<"+col4+"\t"+col5+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tcytochrome c oxidase subunit II\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + tout.write("\t\t\tNote\tPartial sequence\n") + gout.write(col1+"\t"+col2+"\t"+"gene"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" gene"+ext+"\n") + gout.write(col1+"\t"+col2+"\t"+"CDS"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" CDS"+ext+"\n") + + if "COX3" == col9: + size=int(col5)-int(col4)+1 + if size%3 == 0: + ext="" + tout.write(col4+"\t"+col5+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write(col4+"\t"+col5+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tcytochrome c oxidase subunit III\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + else: + ext=" Note: Not a multiple of 3" + if col7 == "+": + tout.write(col4+"\t"+col5+">"+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write(col4+"\t"+col5+">"+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tcytochrome c oxidase subunit III\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + tout.write("\t\t\tNote\tPartial sequence\n") + if col7 == "-": + tout.write("<"+col4+"\t"+col5+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write("<"+col4+"\t"+col5+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tcytochrome c oxidase subunit III\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + tout.write("\t\t\tNote\tPartial sequence\n") + gout.write(col1+"\t"+col2+"\t"+"gene"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" gene"+ext+"\n") + gout.write(col1+"\t"+col2+"\t"+"CDS"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" CDS"+ext+"\n") + if "ND1" == col9: + size=int(col5)-int(col4)+1 + if size%3 == 0: + ext="" + tout.write(col4+"\t"+col5+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write(col4+"\t"+col5+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tNADH dehydrogenase subunit 1\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + else: + ext=" Note: Not a multiple of 3" + if col7 == "+": + tout.write(col4+"\t"+col5+">"+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write(col4+"\t"+col5+">"+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tNADH dehydrogenase subunit 1\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + tout.write("\t\t\tNote\tPartial sequence\n") + if col7 == "-": + tout.write("<"+col4+"\t"+col5+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write("<"+col4+"\t"+col5+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tNADH dehydrogenase subunit 1\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + tout.write("\t\t\tNote\tPartial sequence\n") + gout.write(col1+"\t"+col2+"\t"+"gene"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" gene"+ext+"\n") + gout.write(col1+"\t"+col2+"\t"+"CDS"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" CDS"+ext+"\n") + + if "ND2" == col9: + size=int(col5)-int(col4)+1 + if size%3 == 0: + ext="" + tout.write(col4+"\t"+col5+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write(col4+"\t"+col5+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tNADH dehydrogenase subunit 2\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + else: + ext=" Note: Not a multiple of 3" + if col7 == "+": + tout.write(col4+"\t"+col5+">"+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write(col4+"\t"+col5+">"+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tNADH dehydrogenase subunit 2\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + tout.write("\t\t\tNote\tPartial sequence\n") + if col7 == "-": + tout.write("<"+col4+"\t"+col5+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write("<"+col4+"\t"+col5+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tNADH dehydrogenase subunit 2\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + tout.write("\t\t\tNote\tPartial sequence\n") + gout.write(col1+"\t"+col2+"\t"+"gene"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" gene"+ext+"\n") + gout.write(col1+"\t"+col2+"\t"+"CDS"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" CDS"+ext+"\n") + + if "ND3" == col9: + size=int(col5)-int(col4)+1 + if size%3 == 0: + ext="" + tout.write(col4+"\t"+col5+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write(col4+"\t"+col5+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tNADH dehydrogenase subunit 3\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + else: + ext=" Note: Not a multiple of 3" + if col7 == "+": + tout.write(col4+"\t"+col5+">"+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write(col4+"\t"+col5+">"+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tNADH dehydrogenase subunit 3\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + tout.write("\t\t\tNote\tPartial sequence\n") + if col7 == "-": + tout.write("<"+col4+"\t"+col5+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write("<"+col4+"\t"+col5+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tNADH dehydrogenase subunit 3\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + tout.write("\t\t\tNote\tPartial sequence\n") + gout.write(col1+"\t"+col2+"\t"+"gene"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" gene"+ext+"\n") + gout.write(col1+"\t"+col2+"\t"+"CDS"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" CDS"+ext+"\n") + + if "ND4" == col9: + size=int(col5)-int(col4)+1 + if size%3 == 0: + ext="" + tout.write(col4+"\t"+col5+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write(col4+"\t"+col5+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tNADH dehydrogenase subunit 4\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + else: + ext=" Note: Not a multiple of 3" + if col7 == "+": + tout.write(col4+"\t"+col5+">"+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write(col4+"\t"+col5+">"+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tNADH dehydrogenase subunit 4\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + tout.write("\t\t\tNote\tPartial sequence\n") + if col7 == "-": + tout.write("<"+col4+"\t"+col5+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write("<"+col4+"\t"+col5+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tNADH dehydrogenase subunit 4\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + tout.write("\t\t\tNote\tPartial sequence\n") + gout.write(col1+"\t"+col2+"\t"+"gene"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" gene"+ext+"\n") + gout.write(col1+"\t"+col2+"\t"+"CDS"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" CDS"+ext+"\n") + + if "ND4L" == col9: + size=int(col5)-int(col4)+1 + if size%3 == 0: + ext="" + tout.write(col4+"\t"+col5+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write(col4+"\t"+col5+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tNADH dehydrogenase subunit 4L\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + else: + ext=" Note: Not a multiple of 3" + if col7 == "+": + tout.write(col4+"\t"+col5+">"+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write(col4+"\t"+col5+">"+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tNADH dehydrogenase subunit 4L\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + tout.write("\t\t\tNote\tPartial sequence\n") + if col7 == "-": + tout.write("<"+col4+"\t"+col5+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write("<"+col4+"\t"+col5+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tNADH dehydrogenase subunit 4L\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + tout.write("\t\t\tNote\tPartial sequence\n") + gout.write(col1+"\t"+col2+"\t"+"gene"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" gene"+ext+"\n") + gout.write(col1+"\t"+col2+"\t"+"CDS"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" CDS"+ext+"\n") + + if "ND5" == col9: + size=int(col5)-int(col4)+1 + if size%3 == 0: + ext="" + tout.write(col4+"\t"+col5+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write(col4+"\t"+col5+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tNADH dehydrogenase subunit 5\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + else: + ext=" Note: Not a multiple of 3" + if col7 == "+": + tout.write(col4+"\t"+col5+">"+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write(col4+"\t"+col5+">"+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tNADH dehydrogenase subunit 5\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + tout.write("\t\t\tNote\tPartial sequence\n") + if col7 == "-": + tout.write("<"+col4+"\t"+col5+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write("<"+col4+"\t"+col5+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tNADH dehydrogenase subunit 5\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + tout.write("\t\t\tNote\tPartial sequence\n") + gout.write(col1+"\t"+col2+"\t"+"gene"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" gene"+ext+"\n") + gout.write(col1+"\t"+col2+"\t"+"CDS"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" CDS"+ext+"\n") + + if "CYTB" == col9: + size=int(col5)-int(col4)+1 + if size%3 == 0: + ext="" + tout.write(col4+"\t"+col5+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write(col4+"\t"+col5+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tcytochrome b\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + else: + ext=" Note: Not a multiple of 3" + if col7 == "+": + tout.write(col4+"\t"+col5+">"+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write(col4+"\t"+col5+">"+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tcytochrome b\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + tout.write("\t\t\tNote\tPartial sequence\n") + if col7 == "-": + tout.write("<"+col4+"\t"+col5+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write("<"+col4+"\t"+col5+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tcytochrome b\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + tout.write("\t\t\tNote\tPartial sequence\n") + gout.write(col1+"\t"+col2+"\t"+"gene"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" gene"+ext+"\n") + gout.write(col1+"\t"+col2+"\t"+"CDS"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" CDS"+ext+"\n") + + if "ATP8" == col9: + size=int(col5)-int(col4)+1 + if size%3 == 0: + ext="" + tout.write(col4+"\t"+col5+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write(col4+"\t"+col5+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tATP synthase F0 subunit 8\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + else: + ext=" Note: Not a multiple of 3" + if col7 == "+": + tout.write(col4+"\t"+col5+">"+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write(col4+"\t"+col5+">"+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tATP synthase F0 subunit 8\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + tout.write("\t\t\tNote\tPartial sequence\n") + if col7 == "-": + tout.write("<"+col4+"\t"+col5+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write("<"+col4+"\t"+col5+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tATP synthase F0 subunit 8\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + tout.write("\t\t\tNote\tPartial sequence\n") + gout.write(col1+"\t"+col2+"\t"+"gene"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" gene"+ext+"\n") + gout.write(col1+"\t"+col2+"\t"+"CDS"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" CDS"+ext+"\n") + + if "ATP6" == col9: + size=int(col5)-int(col4)+1 + if size%3 == 0: + ext="" + tout.write(col4+"\t"+col5+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write(col4+"\t"+col5+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tATP synthase F0 subunit 6\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + else: + ext=" Note: Not a multiple of 3" + if col7 == "+": + tout.write(col4+"\t"+col5+">"+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write(col4+"\t"+col5+">"+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tATP synthase F0 subunit 6\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + tout.write("\t\t\tNote\tPartial sequence\n") + if col7 == "-": + tout.write("<"+col4+"\t"+col5+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write("<"+col4+"\t"+col5+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tATP synthase F0 subunit 6\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + tout.write("\t\t\tNote\tPartial sequence\n") + gout.write(col1+"\t"+col2+"\t"+"gene"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" gene"+ext+"\n") + gout.write(col1+"\t"+col2+"\t"+"CDS"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" CDS"+ext+"\n") + + if "tRNA" in col9: + size=int(col5)-int(col4)+1 + if size <= 120: + tout.write(col4+"\t"+col5+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write(col4+"\t"+col5+"\t"+"tRNA\n") + tout.write("\t\t\ttRNA\t"+col9+"\n") + gout.write(col1+"\t"+col2+"\t"+"gene"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" gene"+"\n") + gout.write(col1+"\t"+col2+"\t"+"tRNA"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" "+"\n") + if "trn" in col9: + size=int(col5)-int(col4)+1 + if size <= 120: + tout.write(col4+"\t"+col5+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write(col4+"\t"+col5+"\t"+"tRNA\n") + tout.write("\t\t\ttRNA\t"+col9+"\n") + gout.write(col1+"\t"+col2+"\t"+"gene"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" gene"+"\n") + gout.write(col1+"\t"+col2+"\t"+"tRNA"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" tRNA"+"\n") + if "rrnL" in col9: + tout.write(col4+"\t"+col5+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write(col4+"\t"+col5+"\t"+"rRNA\n") + tout.write("\t\t\trRNA\t"+col9+"\n") + tout.write("\t\t\tproduct\tlarge subunit ribosomal RNA\n") + gout.write(col1+"\t"+col2+"\t"+"gene"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" gene"+"\n") + gout.write(col1+"\t"+col2+"\t"+"rRNA"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" rRNA"+"\n") + if "rrnS" in col9: + tout.write(col4+"\t"+col5+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write(col4+"\t"+col5+"\t"+"rRNA\n") + tout.write("\t\t\trRNA\t"+col9+"\n") + tout.write("\t\t\tproduct\tsmall subunit ribosomal RNA\n") + gout.write(col1+"\t"+col2+"\t"+"gene"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" gene"+"\n") + gout.write(col1+"\t"+col2+"\t"+"rRNA"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" rRNA"+"\n") +gout.close() +tout.close() + diff --git a/tRNAscanChecker.pyc b/tRNAscanChecker.pyc index d47678a75ac1b38838e0cf9f2b96f4bc393eb442..00e3f296a5ba8eb9f9651d00ee1a8c04f55206c4 100755 GIT binary patch delta 237 zcmdnsbHJN}`7$p3O{9?nM!i z$s76SBJm~(1S0X&1z|jpp?d_w5yngo7m9-OvfwO<$&-Z(kW6qFfvVbkQ6!xMu1sWe ZjqEB`ByN^+0V9GtSx$915|2|&9RNZxOY8su delta 361 zcmX@$y}^fr`7