Skip to content

Commit

Permalink
Merge pull request #214 from gbouras13/1.1.0
Browse files Browse the repository at this point in the history
1.1.0
  • Loading branch information
gbouras13 authored Oct 20, 2022
2 parents a9bab00 + de91898 commit cda6d30
Show file tree
Hide file tree
Showing 5 changed files with 101 additions and 32 deletions.
12 changes: 8 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,8 @@ In v0.1.7, the ability to specify an E-value threshold for PHROGs CDS functional
pharokka defaults to 1 thread.

```
usage: pharokka.py [-h] -i INFILE [-o OUTDIR] [-d DATABASE] [-t THREADS] [-f] [-p PREFIX] [-l LOCUSTAG] [-g GENE_PREDICTOR] [-m] [-c CODING_TABLE] [-e EVALUE] [-V]
usage: pharokka.py [-h] -i INFILE [-o OUTDIR] [-d DATABASE] [-t THREADS] [-f] [-p PREFIX] [-l LOCUSTAG]
[-g GENE_PREDICTOR] [-m] [-c CODING_TABLE] [-e EVALUE] [-V]
pharokka: fast phage annotation program
Expand All @@ -196,7 +197,7 @@ optional arguments:
User specified locus tag for the gff/gbk files. This is not required. A random locus tag will be generated instead.
-g GENE_PREDICTOR, --gene_predictor GENE_PREDICTOR
User specified gene predictor. Use "-g phanotate" or "-g prodigal". Defaults to phanotate (not required unless prodigal is desired).
-m, --meta Metagenomic option for Prodigal
-m, --meta meta mode for metavirome input samples
-c CODING_TABLE, --coding_table CODING_TABLE
translation table for prodigal. Defaults to 11. Experimental only.
-e EVALUE, --evalue EVALUE
Expand All @@ -218,11 +219,14 @@ On a standard 16GB RAM laptop specifying 8 threads, pharokka should take between

# Benchmarking

The 673 crAss-like phage genomes were taken from Yutin, N., Benler, S., Shmakov, S.A. et al. Analysis of metagenome-assembled viral genomes from the human gut reveals diverse putative CrAss-like phages with unique genomic features. Nat Commun 12, 1044 (2021). https://doi.org/10.1038/s41467-021-21350-w
Pharokka (v1.1.0) has been benchmarked on an Intel Xeon CPU E5-4610 v2 @ 2.30 specifying 16 threads.

Enterbacteria Phage Lambda (Genbank accession J02459) Staphylococcus Phage SAOMS1 (Genbank Accession MW460250) and 673 crAss-like phage genomes in one multiFASTA input taken from Yutin, N., Benler, S., Shmakov, S.A. et al. Analysis of metagenome-assembled viral genomes from the human gut reveals diverse putative CrAss-like phages with unique genomic features. Nat Commun 12, 1044 (2021) https://doi.org/10.1038/s41467-021-21350-w.



Pharokka scales well for large metavirome datasets. In fact, as the size of the input file increases, the extra time taken is required for running gene prediction (particularly PHANOTATE) and tRNA-scan SE - the time taken to conduct mmseqs2 searches remain small due to its many vs many approach.


# Bugs and Suggestions

Expand All @@ -239,4 +243,4 @@ If you use pharokka, please also cite:
* Bland C., Ramsey L., Sabree F., Lowe M., Brown K., Kyrpides N.C., Hugenholtz P. , "CRISPR Recognition Tool (CRT): a tool for automatic detection of clustered regularly interspaced palindromic repeats", BMC Bioinformatics, (2007), https://doi.org/10.1186/1471-2105-8-209.
* Laslett D., Canback B., "ARAGORN, a program to detect tRNA genes and tmRNA genes in nucleotide sequences.", Nucleic Acids Research (2004) https://doi.org/10.1093/nar/gkh152.
* Chen L., Yang J., Yao Z., Sun L., Shen Y., Jin Q., "VFDB: a reference database for bacterial virulence factors", Nucleic Acids Research (2005) https://doi.org/10.1093/nar/gki008.
* Alcock et al, "CARD 2020: antibiotic resistome surveillance with the comprehensive antibiotic resistance database." Nucleic Acids Research (2020) https:doi.org/10.1093/nar/gkz935.
* Alcock et al, "CARD 2020: antibiotic resistome surveillance with the comprehensive antibiotic resistance database." Nucleic Acids Research (2020) https:doi.org/10.1093/nar/gkz935.
12 changes: 6 additions & 6 deletions bin/input_commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def get_input():
parser.add_argument('-p', '--prefix', action="store", help='Prefix for output files. This is not required.', default='Default')
parser.add_argument('-l', '--locustag', action="store", help='User specified locus tag for the gff/gbk files. This is not required. A random locus tag will be generated instead.', default='Default')
parser.add_argument('-g', '--gene_predictor', action="store", help='User specified gene predictor. Use "-g phanotate" or "-g prodigal". Defaults to phanotate (not required unless prodigal is desired).', default='phanotate' )
parser.add_argument('-m', '--meta', help='meta mode for metavirome imput samples', action="store_true")
parser.add_argument('-m', '--meta', help='meta mode for metavirome input samples', action="store_true")
parser.add_argument('-c', '--coding_table', help='translation table for prodigal. Defaults to 11. Experimental only.', action="store", default = "11")
parser.add_argument('-e', '--evalue', help='E-value threshold for mmseqs2 PHROGs database search. Defaults to 1E-05.', action="store", default = "1E-05")
parser.add_argument('-V', '--version', help='Version', action='version', version=v)
Expand Down Expand Up @@ -55,11 +55,11 @@ def instantiate_dirs(output_dir, meta, gene_predictor, force):
if os.path.isdir(CARD_dir) == False:
os.mkdir(CARD_dir)

# tmp dir for phanotate meta mode
phanotate_tmp_dir = os.path.join(output_dir, "phanotate_tmp/")
if meta == True and gene_predictor == 'phanotate':
if os.path.isdir(phanotate_tmp_dir) == False:
os.mkdir(phanotate_tmp_dir)
# tmp dir for meta mode trnascan and phanotate
input_tmp_dir = os.path.join(output_dir, "input_split_tmp/")
if meta == True:
if os.path.isdir(input_tmp_dir) == False:
os.mkdir(input_tmp_dir)

return output_dir

Expand Down
26 changes: 20 additions & 6 deletions bin/pharokka.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,11 @@
# validates meta mode
input_commands.validata_meta(args.infile, args.meta)

# meta mode split input for trnascan and maybe phanotate
if args.meta == True:
num_fastas = processes.split_input_fasta(args.infile, out_dir)


# CDS predicton
# phanotate
if gene_predictor == "phanotate":
Expand All @@ -86,10 +91,9 @@
if args.meta == True:
print("Applying meta mode.")
logger.info("Applying meta mode.")
num_fastas = processes.split_input_fasta(args.infile, out_dir)
processes.run_phanotate_fasta(args.infile, out_dir, args.threads, num_fastas)
processes.run_phanotate_txt(args.infile, out_dir, args.threads, num_fastas)
processes.concat_phanotate(out_dir, num_fastas)
processes.run_phanotate_fasta_meta(args.infile, out_dir, args.threads, num_fastas)
processes.run_phanotate_txt_meta(args.infile, out_dir, args.threads, num_fastas)
processes.concat_phanotate_meta(out_dir, num_fastas)
else:
processes.run_phanotate(args.infile, out_dir, logger)
if gene_predictor == "prodigal":
Expand All @@ -100,8 +104,18 @@
logger.info("Translating gene predicted fastas.")
processes.translate_fastas(out_dir,gene_predictor)

# run trna-scan and minced
processes.run_trna_scan(args.infile,args.threads, out_dir, logger)

# run trna-scan meta mode if required
if args.meta == True:
print("Running tRNAscan-SE. Applying meta mode.")
logger.info("Starting tRNA-scanSE. Applying meta mode.")
processes.run_trnascan_meta(args.infile, out_dir, args.threads, num_fastas)
processes.concat_trnascan_meta(out_dir, num_fastas)
else:
print("Running tRNAscan-SE.")
logger.info("Starting tRNA-scanSE")
processes.run_trna_scan(args.infile,args.threads, out_dir, logger)
# run minced and aragorn
processes.run_minced(args.infile, out_dir, prefix, logger)
processes.run_aragorn(args.infile, out_dir, prefix, logger)

Expand Down
6 changes: 3 additions & 3 deletions bin/post_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -683,11 +683,11 @@ def remove_post_processing_files(out_dir, gene_predictor, meta):
sp.run(["rm", "-rf", os.path.join(out_dir, "phrokka_tmp.gff") ])
if gene_predictor == "phanotate":
sp.run(["rm", "-rf", os.path.join(out_dir, "phanotate_out.txt") ])
# delete the tmp meta files
if meta == True:
sp.run(["rm", "-rf", os.path.join(out_dir, "phanotate_tmp/") ])
if gene_predictor == "prodigal":
sp.run(["rm", "-rf", os.path.join(out_dir, "prodigal_out.gff") ])
# delete the tmp meta files
if meta == True:
sp.run(["rm", "-rf", os.path.join(out_dir, "input_split_tmp/") ])


def get_crispr_count(out_dir, prefix):
Expand Down
77 changes: 64 additions & 13 deletions bin/processes.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,17 +50,17 @@ def split_input_fasta(filepath_in, out_dir):
# each fasta gets its own file so batch size of 1
batch_size = 1

phanotate_tmp_dir = os.path.join(out_dir, "phanotate_tmp")
input_tmp_dir = os.path.join(out_dir, "input_split_tmp")

for i, batch in enumerate(batch_iterator(record_iter, batch_size)):
filename = "phanotate_subprocess%i.fasta" % (i + 1)
with open(os.path.join(phanotate_tmp_dir, filename), "w") as handle:
filename = "input_subprocess%i.fasta" % (i + 1)
with open(os.path.join(input_tmp_dir, filename), "w") as handle:
SeqIO.write(batch, handle, "fasta")
return num_fastas



def run_phanotate_fasta(filepath_in, out_dir, threads, num_fastas):
def run_phanotate_fasta_meta(filepath_in, out_dir, threads, num_fastas):
"""
Runs phanotate to output fastas
:param filepath_in: input filepath
Expand All @@ -70,11 +70,11 @@ def run_phanotate_fasta(filepath_in, out_dir, threads, num_fastas):
:return:
"""

phanotate_tmp_dir = os.path.join(out_dir, "phanotate_tmp")
phanotate_tmp_dir = os.path.join(out_dir, "input_split_tmp")
commands = []

for i in range(1,num_fastas+1):
in_file = "phanotate_subprocess" + str(i) +".fasta"
in_file = "input_subprocess" + str(i) +".fasta"
out_file = "phanotate_out_tmp" + str(i) +".fasta"
filepath_in = os.path.join(phanotate_tmp_dir,in_file)
cmd = "phanotate.py " + filepath_in + " -o " + os.path.join(phanotate_tmp_dir, out_file) + " -f fasta"
Expand All @@ -88,7 +88,7 @@ def run_phanotate_fasta(filepath_in, out_dir, threads, num_fastas):
p.wait()


def run_phanotate_txt(filepath_in, out_dir, threads, num_fastas):
def run_phanotate_txt_meta(filepath_in, out_dir, threads, num_fastas):
"""
Runs phanotate to output text file
:param filepath_in: input filepath
Expand All @@ -98,12 +98,12 @@ def run_phanotate_txt(filepath_in, out_dir, threads, num_fastas):
:return:
"""

phanotate_tmp_dir = os.path.join(out_dir, "phanotate_tmp")
phanotate_tmp_dir = os.path.join(out_dir, "input_split_tmp")

commands = []

for i in range(1,num_fastas+1):
in_file = "phanotate_subprocess" + str(i) +".fasta"
in_file = "input_subprocess" + str(i) +".fasta"
out_file = "phanotate_out_tmp" + str(i) +".txt"
filepath_in = os.path.join(phanotate_tmp_dir,in_file)
cmd = "phanotate.py " + filepath_in + " -o " + os.path.join(phanotate_tmp_dir, out_file) + " -f tabular"
Expand All @@ -115,15 +115,15 @@ def run_phanotate_txt(filepath_in, out_dir, threads, num_fastas):
for p in procs:
p.wait()

def concat_phanotate(out_dir, num_fastas):
def concat_phanotate_meta(out_dir, num_fastas):
"""
Concatenates phanotate output for downstream analysis
:param out_dir: output directory
:param threads: threads
:return:
"""

phanotate_tmp_dir = os.path.join(out_dir, "phanotate_tmp")
phanotate_tmp_dir = os.path.join(out_dir, "input_split_tmp")

tsvs = []
for i in range(1,int(num_fastas)+1):
Expand All @@ -145,6 +145,59 @@ def concat_phanotate(out_dir, num_fastas):
with open(fname) as infile:
outfile.write(infile.read())


def run_trnascan_meta(filepath_in, out_dir, threads, num_fastas):
"""
Runs trnascan to output gffs one contig per thread
:param filepath_in: input filepath
:param out_dir: output directory
:param threads: threads
:param num_fastas: number of fastas in input multifasta
:return:
"""

input_tmp_dir = os.path.join(out_dir, "input_split_tmp")
commands = []

for i in range(1,num_fastas+1):
in_file = "input_subprocess" + str(i) +".fasta"
out_file = "trnascan_tmp" + str(i) +".gff"
filepath_in = os.path.join(input_tmp_dir,in_file)
filepath_out = os.path.join(input_tmp_dir,out_file)
cmd = "tRNAscan-SE " + filepath_in + " --thread 1 -G -Q -j " + filepath_out
commands.append(cmd)

n = int(threads) #the number of parallel processes you want

for j in range(max(int(len(commands)/n)+1, 1)):
procs = [sp.Popen(i, shell=True , stderr=sp.PIPE, stdout=sp.DEVNULL) for i in commands[j*n: min((j+1)*n, len(commands))] ]
for p in procs:
p.wait()

def concat_trnascan_meta(out_dir, num_fastas):
"""
Concatenates trnascan output for downstream analysis
:param out_dir: output directory
:param threads: threads
:return:
"""

input_tmp_dir = os.path.join(out_dir, "input_split_tmp")

gffs = []
for i in range(1,int(num_fastas)+1):
out_gff = "trnascan_tmp" + str(i) +".gff"
gffs.append(os.path.join(input_tmp_dir, out_gff))

with open(os.path.join(out_dir, "trnascan_out.gff"), 'w') as outfile:
for fname in gffs:
with open(fname) as infile:
outfile.write(infile.read())





##### single contig mode ######

def run_phanotate(filepath_in, out_dir,logger):
Expand Down Expand Up @@ -267,8 +320,6 @@ def run_trna_scan(filepath_in,threads, out_dir, logger):
:param logger logger
:return:
"""
print("Running tRNAscan-SE.")
logger.info("Starting tRNA-scanSE")
try:
# needs stderr for trna scan
trna = sp.Popen(["tRNAscan-SE", filepath_in, "--thread",threads, "-G", "-Q", "-j", os.path.join(out_dir, "trnascan_out.gff")], stderr=sp.PIPE, stdout=sp.DEVNULL)
Expand Down

0 comments on commit cda6d30

Please sign in to comment.