From 66b105d7adaf4efb597572d7ac24852e5e2e6504 Mon Sep 17 00:00:00 2001 From: gbouras13 Date: Thu, 20 Oct 2022 10:57:15 +1030 Subject: [PATCH 1/3] meta mode speed up trnascan --- README.md | 17 ++++++++-- bin/input_commands.py | 12 +++---- bin/pharokka.py | 26 ++++++++++---- bin/post_processing.py | 6 ++-- bin/processes.py | 77 +++++++++++++++++++++++++++++++++++------- 5 files changed, 107 insertions(+), 31 deletions(-) diff --git a/README.md b/README.md index 24f7663..cdcaca3 100644 --- a/README.md +++ b/README.md @@ -175,7 +175,8 @@ In v0.1.7, the ability to specify an E-value threshold for PHROGs CDS functional pharokka defaults to 1 thread. ``` -usage: pharokka.py [-h] -i INFILE [-o OUTDIR] [-d DATABASE] [-t THREADS] [-f] [-p PREFIX] [-l LOCUSTAG] [-g GENE_PREDICTOR] [-m] [-c CODING_TABLE] [-e EVALUE] [-V] +usage: pharokka.py [-h] -i INFILE [-o OUTDIR] [-d DATABASE] [-t THREADS] [-f] [-p PREFIX] [-l LOCUSTAG] + [-g GENE_PREDICTOR] [-m] [-c CODING_TABLE] [-e EVALUE] [-V] pharokka: fast phage annotation program @@ -196,7 +197,7 @@ optional arguments: User specified locus tag for the gff/gbk files. This is not required. A random locus tag will be generated instead. -g GENE_PREDICTOR, --gene_predictor GENE_PREDICTOR User specified gene predictor. Use "-g phanotate" or "-g prodigal". Defaults to phanotate (not required unless prodigal is desired). - -m, --meta Metagenomic option for Prodigal + -m, --meta meta mode for metavirome input samples -c CODING_TABLE, --coding_table CODING_TABLE translation table for prodigal. Defaults to 11. Experimental only. -e EVALUE, --evalue EVALUE @@ -218,11 +219,21 @@ On a standard 16GB RAM laptop specifying 8 threads, pharokka should take between # Benchmarking -The 673 crAss-like phage genomes were taken from Yutin, N., Benler, S., Shmakov, S.A. et al. Analysis of metagenome-assembled viral genomes from the human gut reveals diverse putative CrAss-like phages with unique genomic features. Nat Commun 12, 1044 (2021). https://doi.org/10.1038/s41467-021-21350-w +Pharokka (v1.1.0) has been benchmarked on an Intel Xeon CPU E5-4610 v2 @ 2.30 specifying 16 threads. +Enterbacteria Phage Lambda (Genbank accession J02459) Staphylococcus Phage SAOMS1 (Genbank Accession MW460250) and 673 crAss-like phage genomes in one multiFASTA input taken from Yutin, N., Benler, S., Shmakov, S.A. et al. Analysis of metagenome-assembled viral genomes from the human gut reveals diverse putative CrAss-like phages with unique genomic features. Nat Commun 12, 1044 (2021) https://doi.org/10.1038/s41467-021-21350-w. +| Time (min) | Pharokka PHANOTATE | Pharokka Prodigal | +|----------------------------------------|--------------------|-------------------| +| Enterobacteria Phage Lambda (48052bp) | 3.43 | 3.28 | +| Staphylococcus Phage SAOMS1 (140315bp) | 5 | 4.5 | +| 673 crAss-like Phage Genomes | 120 | 30 | + + +Pharokka scales well for large metavirome datasets. In fact, as the size of the input file increases, the extra time taken is required for running gene prediction (particularly PHANOTATE) and tRNA-scan SE - the time taken to conduct mmseqs2 searches remain small. + # Bugs and Suggestions diff --git a/bin/input_commands.py b/bin/input_commands.py index c7706f4..09cb6fe 100644 --- a/bin/input_commands.py +++ b/bin/input_commands.py @@ -22,7 +22,7 @@ def get_input(): parser.add_argument('-p', '--prefix', action="store", help='Prefix for output files. This is not required.', default='Default') parser.add_argument('-l', '--locustag', action="store", help='User specified locus tag for the gff/gbk files. This is not required. A random locus tag will be generated instead.', default='Default') parser.add_argument('-g', '--gene_predictor', action="store", help='User specified gene predictor. Use "-g phanotate" or "-g prodigal". Defaults to phanotate (not required unless prodigal is desired).', default='phanotate' ) - parser.add_argument('-m', '--meta', help='meta mode for metavirome imput samples', action="store_true") + parser.add_argument('-m', '--meta', help='meta mode for metavirome input samples', action="store_true") parser.add_argument('-c', '--coding_table', help='translation table for prodigal. Defaults to 11. Experimental only.', action="store", default = "11") parser.add_argument('-e', '--evalue', help='E-value threshold for mmseqs2 PHROGs database search. Defaults to 1E-05.', action="store", default = "1E-05") parser.add_argument('-V', '--version', help='Version', action='version', version=v) @@ -55,11 +55,11 @@ def instantiate_dirs(output_dir, meta, gene_predictor, force): if os.path.isdir(CARD_dir) == False: os.mkdir(CARD_dir) - # tmp dir for phanotate meta mode - phanotate_tmp_dir = os.path.join(output_dir, "phanotate_tmp/") - if meta == True and gene_predictor == 'phanotate': - if os.path.isdir(phanotate_tmp_dir) == False: - os.mkdir(phanotate_tmp_dir) + # tmp dir for meta mode trnascan and phanotate + input_tmp_dir = os.path.join(output_dir, "input_split_tmp/") + if meta == True: + if os.path.isdir(input_tmp_dir) == False: + os.mkdir(input_tmp_dir) return output_dir diff --git a/bin/pharokka.py b/bin/pharokka.py index 5aea75d..2a74a65 100755 --- a/bin/pharokka.py +++ b/bin/pharokka.py @@ -78,6 +78,11 @@ # validates meta mode input_commands.validata_meta(args.infile, args.meta) + # meta mode split input for trnascan and maybe phanotate + if args.meta == True: + num_fastas = processes.split_input_fasta(args.infile, out_dir) + + # CDS predicton # phanotate if gene_predictor == "phanotate": @@ -86,10 +91,9 @@ if args.meta == True: print("Applying meta mode.") logger.info("Applying meta mode.") - num_fastas = processes.split_input_fasta(args.infile, out_dir) - processes.run_phanotate_fasta(args.infile, out_dir, args.threads, num_fastas) - processes.run_phanotate_txt(args.infile, out_dir, args.threads, num_fastas) - processes.concat_phanotate(out_dir, num_fastas) + processes.run_phanotate_fasta_meta(args.infile, out_dir, args.threads, num_fastas) + processes.run_phanotate_txt_meta(args.infile, out_dir, args.threads, num_fastas) + processes.concat_phanotate_meta(out_dir, num_fastas) else: processes.run_phanotate(args.infile, out_dir, logger) if gene_predictor == "prodigal": @@ -100,8 +104,18 @@ logger.info("Translating gene predicted fastas.") processes.translate_fastas(out_dir,gene_predictor) - # run trna-scan and minced - processes.run_trna_scan(args.infile,args.threads, out_dir, logger) + + # run trna-scan meta mode if required + if args.meta == True: + print("Running tRNAscan-SE.") + logger.info("Starting tRNA-scanSE") + processes.run_trnascan_meta(args.infile, out_dir, args.threads, num_fastas) + processes.concat_trnascan_meta(out_dir, num_fastas) + else: + print("Running tRNAscan-SE. Applying meta mode.") + logger.info("Starting tRNA-scanSE. Applying meta mode.") + processes.run_trna_scan(args.infile,args.threads, out_dir, logger) + # run minced and aragorn processes.run_minced(args.infile, out_dir, prefix, logger) processes.run_aragorn(args.infile, out_dir, prefix, logger) diff --git a/bin/post_processing.py b/bin/post_processing.py index 011564a..ad35630 100644 --- a/bin/post_processing.py +++ b/bin/post_processing.py @@ -683,11 +683,11 @@ def remove_post_processing_files(out_dir, gene_predictor, meta): sp.run(["rm", "-rf", os.path.join(out_dir, "phrokka_tmp.gff") ]) if gene_predictor == "phanotate": sp.run(["rm", "-rf", os.path.join(out_dir, "phanotate_out.txt") ]) - # delete the tmp meta files - if meta == True: - sp.run(["rm", "-rf", os.path.join(out_dir, "phanotate_tmp/") ]) if gene_predictor == "prodigal": sp.run(["rm", "-rf", os.path.join(out_dir, "prodigal_out.gff") ]) + # delete the tmp meta files + if meta == True: + sp.run(["rm", "-rf", os.path.join(out_dir, "input_split_tmp/") ]) def get_crispr_count(out_dir, prefix): diff --git a/bin/processes.py b/bin/processes.py index fd79d6a..cd04577 100644 --- a/bin/processes.py +++ b/bin/processes.py @@ -50,17 +50,17 @@ def split_input_fasta(filepath_in, out_dir): # each fasta gets its own file so batch size of 1 batch_size = 1 - phanotate_tmp_dir = os.path.join(out_dir, "phanotate_tmp") + input_tmp_dir = os.path.join(out_dir, "input_split_tmp") for i, batch in enumerate(batch_iterator(record_iter, batch_size)): - filename = "phanotate_subprocess%i.fasta" % (i + 1) - with open(os.path.join(phanotate_tmp_dir, filename), "w") as handle: + filename = "input_subprocess%i.fasta" % (i + 1) + with open(os.path.join(input_tmp_dir, filename), "w") as handle: SeqIO.write(batch, handle, "fasta") return num_fastas -def run_phanotate_fasta(filepath_in, out_dir, threads, num_fastas): +def run_phanotate_fasta_meta(filepath_in, out_dir, threads, num_fastas): """ Runs phanotate to output fastas :param filepath_in: input filepath @@ -70,11 +70,11 @@ def run_phanotate_fasta(filepath_in, out_dir, threads, num_fastas): :return: """ - phanotate_tmp_dir = os.path.join(out_dir, "phanotate_tmp") + phanotate_tmp_dir = os.path.join(out_dir, "input_split_tmp") commands = [] for i in range(1,num_fastas+1): - in_file = "phanotate_subprocess" + str(i) +".fasta" + in_file = "input_subprocess" + str(i) +".fasta" out_file = "phanotate_out_tmp" + str(i) +".fasta" filepath_in = os.path.join(phanotate_tmp_dir,in_file) cmd = "phanotate.py " + filepath_in + " -o " + os.path.join(phanotate_tmp_dir, out_file) + " -f fasta" @@ -88,7 +88,7 @@ def run_phanotate_fasta(filepath_in, out_dir, threads, num_fastas): p.wait() -def run_phanotate_txt(filepath_in, out_dir, threads, num_fastas): +def run_phanotate_txt_meta(filepath_in, out_dir, threads, num_fastas): """ Runs phanotate to output text file :param filepath_in: input filepath @@ -98,12 +98,12 @@ def run_phanotate_txt(filepath_in, out_dir, threads, num_fastas): :return: """ - phanotate_tmp_dir = os.path.join(out_dir, "phanotate_tmp") + phanotate_tmp_dir = os.path.join(out_dir, "input_split_tmp") commands = [] for i in range(1,num_fastas+1): - in_file = "phanotate_subprocess" + str(i) +".fasta" + in_file = "input_subprocess" + str(i) +".fasta" out_file = "phanotate_out_tmp" + str(i) +".txt" filepath_in = os.path.join(phanotate_tmp_dir,in_file) cmd = "phanotate.py " + filepath_in + " -o " + os.path.join(phanotate_tmp_dir, out_file) + " -f tabular" @@ -115,7 +115,7 @@ def run_phanotate_txt(filepath_in, out_dir, threads, num_fastas): for p in procs: p.wait() -def concat_phanotate(out_dir, num_fastas): +def concat_phanotate_meta(out_dir, num_fastas): """ Concatenates phanotate output for downstream analysis :param out_dir: output directory @@ -123,7 +123,7 @@ def concat_phanotate(out_dir, num_fastas): :return: """ - phanotate_tmp_dir = os.path.join(out_dir, "phanotate_tmp") + phanotate_tmp_dir = os.path.join(out_dir, "input_split_tmp") tsvs = [] for i in range(1,int(num_fastas)+1): @@ -145,6 +145,59 @@ def concat_phanotate(out_dir, num_fastas): with open(fname) as infile: outfile.write(infile.read()) + +def run_trnascan_meta(filepath_in, out_dir, threads, num_fastas): + """ + Runs trnascan to output gffs one contig per thread + :param filepath_in: input filepath + :param out_dir: output directory + :param threads: threads + :param num_fastas: number of fastas in input multifasta + :return: + """ + + input_tmp_dir = os.path.join(out_dir, "input_split_tmp") + commands = [] + + for i in range(1,num_fastas+1): + in_file = "input_subprocess" + str(i) +".fasta" + out_file = "trnascan_tmp" + str(i) +".gff" + filepath_in = os.path.join(input_tmp_dir,in_file) + filepath_out = os.path.join(input_tmp_dir,out_file) + cmd = "tRNAscan-SE " + filepath_in + " --thread 1 -G -Q -j " + filepath_out + commands.append(cmd) + + n = int(threads) #the number of parallel processes you want + + for j in range(max(int(len(commands)/n)+1, 1)): + procs = [sp.Popen(i, shell=True , stderr=sp.PIPE, stdout=sp.DEVNULL) for i in commands[j*n: min((j+1)*n, len(commands))] ] + for p in procs: + p.wait() + +def concat_trnascan_meta(out_dir, num_fastas): + """ + Concatenates trnascan output for downstream analysis + :param out_dir: output directory + :param threads: threads + :return: + """ + + input_tmp_dir = os.path.join(out_dir, "input_split_tmp") + + gffs = [] + for i in range(1,int(num_fastas)+1): + out_gff = "trnascan_tmp" + str(i) +".gff" + gffs.append(os.path.join(input_tmp_dir, out_gff)) + + with open(os.path.join(out_dir, "trnascan_out.gff"), 'w') as outfile: + for fname in gffs: + with open(fname) as infile: + outfile.write(infile.read()) + + + + + ##### single contig mode ###### def run_phanotate(filepath_in, out_dir,logger): @@ -267,8 +320,6 @@ def run_trna_scan(filepath_in,threads, out_dir, logger): :param logger logger :return: """ - print("Running tRNAscan-SE.") - logger.info("Starting tRNA-scanSE") try: # needs stderr for trna scan trna = sp.Popen(["tRNAscan-SE", filepath_in, "--thread",threads, "-G", "-Q", "-j", os.path.join(out_dir, "trnascan_out.gff")], stderr=sp.PIPE, stdout=sp.DEVNULL) From 1893e27dc1404db8e1789aa1def9f1de21fe3863 Mon Sep 17 00:00:00 2001 From: gbouras13 Date: Thu, 20 Oct 2022 10:59:51 +1030 Subject: [PATCH 2/3] fix output to log --- bin/pharokka.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bin/pharokka.py b/bin/pharokka.py index 2a74a65..ae337b7 100755 --- a/bin/pharokka.py +++ b/bin/pharokka.py @@ -107,13 +107,13 @@ # run trna-scan meta mode if required if args.meta == True: - print("Running tRNAscan-SE.") - logger.info("Starting tRNA-scanSE") + print("Running tRNAscan-SE. Applying meta mode.") + logger.info("Starting tRNA-scanSE. Applying meta mode.") processes.run_trnascan_meta(args.infile, out_dir, args.threads, num_fastas) processes.concat_trnascan_meta(out_dir, num_fastas) else: - print("Running tRNAscan-SE. Applying meta mode.") - logger.info("Starting tRNA-scanSE. Applying meta mode.") + print("Running tRNAscan-SE.") + logger.info("Starting tRNA-scanSE") processes.run_trna_scan(args.infile,args.threads, out_dir, logger) # run minced and aragorn processes.run_minced(args.infile, out_dir, prefix, logger) From de91898da883aee14812ad40e89e21d793069ef0 Mon Sep 17 00:00:00 2001 From: gbouras13 <84495559+gbouras13@users.noreply.github.com> Date: Thu, 20 Oct 2022 11:03:10 +1030 Subject: [PATCH 3/3] Update README.md --- README.md | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index cdcaca3..3e9d927 100644 --- a/README.md +++ b/README.md @@ -225,14 +225,7 @@ Enterbacteria Phage Lambda (Genbank accession J02459) Staphylococcus Phage SAOMS -| Time (min) | Pharokka PHANOTATE | Pharokka Prodigal | -|----------------------------------------|--------------------|-------------------| -| Enterobacteria Phage Lambda (48052bp) | 3.43 | 3.28 | -| Staphylococcus Phage SAOMS1 (140315bp) | 5 | 4.5 | -| 673 crAss-like Phage Genomes | 120 | 30 | - - -Pharokka scales well for large metavirome datasets. In fact, as the size of the input file increases, the extra time taken is required for running gene prediction (particularly PHANOTATE) and tRNA-scan SE - the time taken to conduct mmseqs2 searches remain small. +Pharokka scales well for large metavirome datasets. In fact, as the size of the input file increases, the extra time taken is required for running gene prediction (particularly PHANOTATE) and tRNA-scan SE - the time taken to conduct mmseqs2 searches remain small due to its many vs many approach. # Bugs and Suggestions @@ -250,4 +243,4 @@ If you use pharokka, please also cite: * Bland C., Ramsey L., Sabree F., Lowe M., Brown K., Kyrpides N.C., Hugenholtz P. , "CRISPR Recognition Tool (CRT): a tool for automatic detection of clustered regularly interspaced palindromic repeats", BMC Bioinformatics, (2007), https://doi.org/10.1186/1471-2105-8-209. * Laslett D., Canback B., "ARAGORN, a program to detect tRNA genes and tmRNA genes in nucleotide sequences.", Nucleic Acids Research (2004) https://doi.org/10.1093/nar/gkh152. * Chen L., Yang J., Yao Z., Sun L., Shen Y., Jin Q., "VFDB: a reference database for bacterial virulence factors", Nucleic Acids Research (2005) https://doi.org/10.1093/nar/gki008. -* Alcock et al, "CARD 2020: antibiotic resistome surveillance with the comprehensive antibiotic resistance database." Nucleic Acids Research (2020) https:doi.org/10.1093/nar/gkz935. \ No newline at end of file +* Alcock et al, "CARD 2020: antibiotic resistome surveillance with the comprehensive antibiotic resistance database." Nucleic Acids Research (2020) https:doi.org/10.1093/nar/gkz935.