Skip to content

Commit

Permalink
Merge pull request #333 from gbouras13/dev
Browse files Browse the repository at this point in the history
v1.7.0
  • Loading branch information
gbouras13 authored Mar 4, 2024
2 parents fbf1b3f + ad21998 commit cc244ee
Show file tree
Hide file tree
Showing 21 changed files with 1,103 additions and 120 deletions.
6 changes: 6 additions & 0 deletions HISTORY.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
History
=======

1.7.0 (2024-01-17)
------------------

* Adds `pharokka_multiplotter.py` to plot multiple phage contigs at once
* Adds separate contig FASTA files if `-s -m` is run (in `single_fastas`)

1.6.1 (2024-01-17)
------------------

Expand Down
14 changes: 13 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ If you are looking for rapid standardised annotation of bacterial genomes, pleas
- [Paper](#paper)
- [Pharokka with Galaxy Europe Webserver](#pharokka-with-galaxy-europe-webserver)
- [Brief Overview](#brief-overview)
- [Pharokka v 1.7.0 Update](#pharokka-v-170-update)
- [Pharokka v 1.6.0 Update (11 January 2024)](#pharokka-v-160-update-11-january-2024)
- [Pharokka v 1.5.0 Update (20 September 2023)](#pharokka-v-150-update-20-september-2023)
- [Pharokka v 1.4.0 Update (27 August 2023)](#pharokka-v-140-update-27-august-2023)
Expand Down Expand Up @@ -97,6 +98,18 @@ So if you can't get `pharokka` to install on your machine for whatever reason or

`pharokka` uses [PHANOTATE](https://github.com/deprekate/PHANOTATE), the only gene prediction program tailored to bacteriophages, as the default program for gene prediction. [Prodigal](https://github.com/hyattpd/Prodigal) implemented with [pyrodigal](https://github.com/althonos/pyrodigal) and [Prodigal-gv](https://github.com/apcamargo/prodigal-gv) implemented with [pyrodigal-gv](https://github.com/althonos/pyrodigal-gv) are also available as alternatives. Following this, functional annotations are assigned by matching each predicted coding sequence (CDS) to the [PHROGs](https://phrogs.lmge.uca.fr), [CARD](https://card.mcmaster.ca) and [VFDB](http://www.mgc.ac.cn/VFs/main.htm) databases using [MMseqs2](https://github.com/soedinglab/MMseqs2). As of v1.4.0, `pharokka` will also match each CDS to the PHROGs database using more sensitive Hidden Markov Models using [PyHMMER](https://github.com/althonos/pyhmmer). Pharokka's main output is a GFF file suitable for using in downstream pangenomic pipelines like [Roary](https://sanger-pathogens.github.io/Roary/). `pharokka` also generates a `cds_functions.tsv` file, which includes counts of CDSs, tRNAs, tmRNAs, CRISPRs and functions assigned to CDSs according to the PHROGs database. See the full [usage](#usage) and check out the full [documentation](https://pharokka.readthedocs.io) for more details.

## Pharokka v 1.7.0 Update

You can run `pharokka_multiplotter.py` to plot as many phage(s) as you want.

It requires the `pharokka` output Genbank file (here, `pharokka.gbk`). It will save plots for each contig in the output directory (here `pharokka_plots_output_directory`).

e.g.

```
pharokka_multiplotter.py -g pharokka.gbk -o pharokka_plots_output_directory
```

## Pharokka v 1.6.0 Update (11 January 2024)

* Fixes a variety of bugs (#300 `pharokka_proteins.py` crashing if it found VFDB hits, #303 errors in the `.tbl` format, #316 errors with types and where custom HMM dbs had identical scored hits, #317 types and #320 deprecated GC function)
Expand Down Expand Up @@ -160,7 +173,6 @@ SAOMS1 phage (GenBank: MW460250.1) was isolated and sequenced by: Yerushalmy, O.

Please see [plotting](docs/plotting.md) for details on all plotting parameter options.


# Installation

## Conda Installation
Expand Down
52 changes: 18 additions & 34 deletions bin/pharokka.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,42 +11,21 @@
from custom_db import run_custom_pyhmmer
from databases import check_db_installation
from hmm import run_pyhmmer
from input_commands import (
check_dependencies,
get_input,
instantiate_dirs,
instantiate_split_output,
validate_and_extract_genbank,
validate_custom_hmm,
validate_fasta,
validate_gene_predictor,
validate_meta,
validate_terminase,
validate_threads,
)
from input_commands import (check_dependencies, get_input, instantiate_dirs,
instantiate_split_output,
validate_and_extract_genbank, validate_custom_hmm,
validate_fasta, validate_gene_predictor,
validate_meta, validate_terminase,
validate_threads)
from loguru import logger
from post_processing import Pharok, remove_post_processing_files
from processes import (
concat_phanotate_meta,
concat_trnascan_meta,
convert_gff_to_gbk,
reorient_terminase,
run_aragorn,
run_dnaapler,
run_mash_dist,
run_mash_sketch,
run_minced,
run_mmseqs,
run_phanotate,
run_phanotate_fasta_meta,
run_phanotate_txt_meta,
run_pyrodigal,
run_pyrodigal_gv,
run_trna_scan,
run_trnascan_meta,
split_input_fasta,
translate_fastas,
)
from processes import (concat_phanotate_meta, concat_trnascan_meta,
convert_gff_to_gbk, reorient_terminase, run_aragorn,
run_dnaapler, run_mash_dist, run_mash_sketch,
run_minced, run_mmseqs, run_phanotate,
run_phanotate_fasta_meta, run_phanotate_txt_meta,
run_pyrodigal, run_pyrodigal_gv, run_trna_scan,
run_trnascan_meta, split_input_fasta, translate_fastas)
from util import count_contigs, get_version

# add this to make sure of deprecation warning with biopython
Expand Down Expand Up @@ -455,6 +434,11 @@ def main():
pharok.update_fasta_headers()
pharok.update_final_output()

# output single gffs in meta mode
if args.split == True and args.meta == True:
# splits the faa into single .faa
pharok.split_faas_singles()

# extract terL
pharok.extract_terl()

Expand Down
264 changes: 264 additions & 0 deletions bin/pharokka_multiplotter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,264 @@
#!/usr/bin/env python3
import argparse
import os
import shutil
import sys
from argparse import RawTextHelpFormatter
from pathlib import Path

from loguru import logger
from plot import create_single_plot
from pycirclize.parser import Genbank
from util import get_version


def get_input():
"""gets input for pharokka_plotter.py
:return: args
"""
parser = argparse.ArgumentParser(
description="pharokka_multiplotter.py: pharokka plotting function for muliple phages",
formatter_class=RawTextHelpFormatter,
)
parser.add_argument(
"-g",
"--genbank",
action="store",
required=True,
help="Input genbank file from Pharokka.",
)
parser.add_argument(
"-o",
"--outdir",
action="store",
help="Pharokka output directory.",
required=True,
)
parser.add_argument(
"-f", "--force", help="Overwrites the output plot file.", action="store_true"
)
parser.add_argument(
"--label_hypotheticals",
help="Flag to label hypothetical or unknown proteins. By default these are not labelled.",
action="store_true",
)
parser.add_argument(
"--remove_other_features_labels",
help="Flag to remove labels for tRNA/tmRNA/CRISPRs. By default these are labelled. \nThey will still be plotted in black.",
action="store_true",
)
parser.add_argument(
"--title_size",
action="store",
default="20",
help="Controls title size. Must be an integer. Defaults to 20.",
)
parser.add_argument(
"--label_size",
action="store",
default="8",
help="Controls annotation label size. Must be an integer. Defaults to 8.",
)
parser.add_argument(
"--interval",
action="store",
default="5000",
help="Axis tick interval. Must be an integer. Must be an integer. Defaults to 5000.",
)
parser.add_argument(
"--truncate",
action="store",
default="20",
help="Number of characters to include in annoation labels before truncation with ellipsis. \nMust be an integer. Defaults to 20.",
)
parser.add_argument(
"--dpi",
action="store",
default="600",
help="Resultion (dots per inch). Must be an integer. Defaults to 600.",
)
parser.add_argument(
"--annotations",
action="store",
default="1",
help="Controls the proporition of annotations labelled. Must be a number between 0 and 1 inclusive. \n0 = no annotations, 0.5 = half of the annotations, 1 = all annotations. \nDefaults to 1. Chosen in order of CDS size.",
)
parser.add_argument(
"-t", "--plot_title", action="store", default="Phage", help="Plot name."
)
parser.add_argument(
"--label_ids",
action="store",
default="",
help="Text file with list of CDS IDs (from gff file) that are guaranteed to be labelled.",
)
args = parser.parse_args()
return args


if __name__ == "__main__":
args = get_input()
# preamble
logger.add(lambda _: sys.exit(1), level="ERROR")
logger.info(f"Starting Pharokka v{get_version()}")
logger.info("Running pharokka_multiplotter.py to plot your phages.")
logger.info("Command executed: {}", args)
logger.info("Repository homepage is https://github.com/gbouras13/pharokka")
logger.info("Written by George Bouras: [email protected]")
logger.info("Checking your inputs.")

try:
int(args.interval)
except:
logger.error(
f"--interval {args.interval} specified is not an integer. Please check your input and try again."
)

try:
int(args.label_size)
except:
logger.error(
f"--label_size {args.label_size} specified is not an integer. Please check your input and try again."
)

try:
int(args.title_size)
except:
logger.error(
f"--title_size {args.title_size} specified is not an integer. Please check your input and try again."
)

try:
int(args.dpi)
except:
logger.error(
f"--dpi {args.dpi} specified is not an integer. Please check your input and try again."
)

try:
float(args.annotations)
except:
logger.error(
f"--annotations {args.annotations} specified is not a float. Please check your input and try again."
)

if args.force == True:
if os.path.exists(args.outdir) == True:
logger.info(f"Removing {args.outdir} as --force was specified.")
shutil.rmtree(args.outdir)
else:
logger.warning(
f"--force was specified even though the output plot directory {args.outdir} does not already exist."
)
logger.warning("Continuing")
else:
if os.path.exists(args.outdir) == True:
logger.error(
f"Output directoey {args.outdir} already exists and force was not specified. Please specify -f or --force to overwrite the output directory."
)

# instantiate outdir
if Path(args.outdir).exists() is False:
Path(args.outdir).mkdir(parents=True, exist_ok=True)

# check label_ids

# list of all IDs that need to be labelled from file
label_force_list = []

if args.label_ids != "":
logger.info(
f"You have specified a file {args.label_ids} containing a list of CDS IDs to force label."
)
# check if it is a file
if os.path.isfile(args.label_ids) == False:
logger.error(f"{args.label_ids} was not found.")
# check if it contains text
try:
# Open the file in read mode
with open(Path(args.label_ids), "r") as file:
# Read the first character
first_char = file.read(1)

# read all the labels
with open(Path(args.label_ids)) as f:
ignore_dict = {x.rstrip().split()[0] for x in f}
# label force list
label_force_list = list(ignore_dict)

except FileNotFoundError:
logger.warning(
f"{args.label_id} contains no text. No contigs will be ignored"
)

logger.info("All files checked.")

# single threaded plots
threads = 1

gbk = Genbank(args.genbank)

# gets all contigs and seqs
gb_seq_dict = gbk.get_seqid2seq()

gb_size_dict = gbk.get_seqid2size()

contig_count = len(gb_seq_dict)

# gets all features - will get all regardless of type (tRNA etc from pharokka)
gb_feature_dict = gbk.get_seqid2features()

# check label_ids
# list of all IDs that need to be labelled from file
label_force_list = []

label_ids = args.label_ids

if label_ids != "":
logger.info(
f"You have specified a file {label_ids} containing a list of CDS IDs to force label."
)
# check if it is a file
if Path(label_ids).exists() is False:
logger.error(f"{label_ids} was not found.")
# check if it contains text
try:
# Open the file in read mode
with open(Path(label_ids), "r") as file:
# Read the first character
# will error if file is empty
first_char = file.read(1)

# read all the labels
with open(Path(label_ids)) as f:
ignore_dict = {x.rstrip().split()[0] for x in f}
# label force list
label_force_list = list(ignore_dict)

except FileNotFoundError:
logger.warning(f"{label_ids} contains no text. No contigs will be ignored")

# if there is 1 contig, then all the parameters will apply

for contig_id, contig_sequence in gb_seq_dict.items():
logger.info(f"Plotting {contig_id}")

create_single_plot(
contig_id,
contig_sequence,
contig_count,
gb_size_dict,
gb_feature_dict,
gbk,
int(args.interval),
float(args.annotations),
int(args.title_size),
args.plot_title,
int(args.truncate),
args.outdir,
int(args.dpi),
float(args.label_size),
args.label_hypotheticals,
args.remove_other_features_labels,
label_force_list,
)
Loading

0 comments on commit cc244ee

Please sign in to comment.