From 4fc637162f9f03c8922eba45c3ab66d6f3975725 Mon Sep 17 00:00:00 2001 From: Diana Lin Date: Thu, 9 Jun 2022 17:03:53 -0700 Subject: [PATCH] Add simple BLAST script that does not require AMPlify --- scripts/blast_simple.sh | 191 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 191 insertions(+) create mode 100755 scripts/blast_simple.sh diff --git a/scripts/blast_simple.sh b/scripts/blast_simple.sh new file mode 100755 index 0000000..ea4dc0b --- /dev/null +++ b/scripts/blast_simple.sh @@ -0,0 +1,191 @@ +#!/usr/bin/env bash +set -uo pipefail + +FULL_PROGRAM=$0 +PROGRAM=$(basename $FULL_PROGRAM) + +if [[ "$PROGRAM" == "slurm_script" ]]; then + FULL_PROGRAM=$(scontrol show job $SLURM_JOBID | awk '/Command=/ {print $1}' | awk -F "=" '{print $2}') + PROGRAM=$(basename ${FULL_PROGRAM}) +fi +args="$FULL_PROGRAM $*" + +# 0 - table function +function table() { + if column -L <(echo) &>/dev/null; then + cat | column -s $'\t' -t -L + else + cat | column -s $'\t' -t echo + fi +} + +# 1 - get_help function +function get_help() { + { + echo -e "PROGRAM: $PROGRAM\n" + echo "DESCRIPTION:" + echo -e "\ + \tCharacterizes the novelty of AMPs using BLASTp. Does not join with AMPlify results.\n \ + \n \ + \tOUTPUT:\n \ + \t-------\n \ + \t - amps.blast.summary.novel.final.tsv\n \ + \t - amps.blast.summary.known.final.tsv\n \ + \n \ + \tEXIT CODES:\n \ + \t-----------\n \ + \t - 0: successfully completed\n \ + \t - 1: general errors\n \ + \n \ + " | table + + echo "USAGE(S):" + echo -e "\ + \t$PROGRAM [-o ] [-t ] -d \n \ + " | table + + echo "OPTION(S):" + echo -e "\ + \t-b \tPath to BLAST executable if not in PATH\n \ + \t-d \tPre-fromatted BLAST nr database\t(e.g. /path/to/nr)\n \ + \t-h\tShow this help menu\n \ + \t-o \tOutput directory\n \ + \t-t \tNumber of threads\t(default = 8)\n \ + " | table + } 1>&2 + exit 1 +} + +# 1.5 - print_line function +function print_line() { + if command -v tput &>/dev/null; then + end=$(tput cols) + else + end=50 + fi + { + printf '%.0s=' $(seq 1 $end) + echo + } 1>&2 +} + +# 2 - print_error function +function print_error() { + { + echo -e "CALL: $args (wd: $(pwd))\n" + message="$1" + echo "ERROR: $message" + print_line + get_help + } 1>&2 +} +# 3 - no arguments given +if [[ "$#" -eq 0 ]]; then + get_help +fi + + +# 4 - default parameters +outdir=$(pwd) +num_threads=8 + +if ! command -v blastp &>/dev/null; then + RUN_BLASTP="" +else + RUN_BLASTP=$(command -v blastp) +fi + +while getopts :d:ho:t:b: opt; do + case $opt in + b) RUN_BLASTP=$(realpath $OPTARG);; + d) database=$(realpath $OPTARG);; + h) get_help ;; + o) outdir=$(realpath $OPTARG) ;; + t) num_threads=$OPTARG ;; + \?) print_error "Invalid option: -$OPTARG" ;; + esac +done + +shift $((OPTIND-1)) + +# 5 - wrong arguments given +if [[ "$#" -ne 1 ]]; then + print_error "Incorrect number of arguments." +fi + +# 6 - check inputs + + +if [[ ! -f $(realpath $1) ]]; then + print_error "Input file $(realpath $1) does not exist." +elif [[ ! -s $(realpath $1) ]]; then + print_error "Input file $(realpath $1) is empty." +fi + +{ + echo "HOSTNAME: $(hostname)" + echo -e "START: $(date)\n" + + echo -e "PATH=$PATH\n" + echo "CALL: $args (wd: $(pwd))" + echo "THREADS: $num_threads" + echo +} 1>&2 + +mkdir -p $outdir + +infile=$(realpath $1) +# NOW: +# database=/projects/btl/db/blast-20220119/nr + +# sort the files into those <= 30 and those > 30 +$RUN_BLASTP -version 2>&1 || exit 1 + +# blast it against NR database +seqtk comp $infile > $outdir/seqtk.comp.txt + +seqtk subseq $infile <(awk '{if($2>30) print $1}' $outdir/seqtk.comp.txt) > $outdir/amps.gt30.faa +seqtk subseq $infile <(awk '{if($2<=30) print $1}' $outdir/seqtk.comp.txt) > $outdir/amps.short.faa + +# blast it against NR database +echo "Running BLASTp..." 1>&2 +$RUN_BLASTP -db $database -query $outdir/amps.gt30.faa -out $outdir/amps.gt30.tsv -task blastp -outfmt '6 qaccver saccver stitle pident length mismatch gapopen qstart qend sstart send evalue bitscore qcovs' -num_threads $num_threads &> $outdir/blastp.log & +$RUN_BLASTP -db $database -query $outdir/amps.short.faa -out $outdir/amps.short.tsv -task blastp-short -outfmt '6 qaccver saccver stitle pident length mismatch gapopen qstart qend sstart send evalue bitscore qcovs' -num_threads $num_threads &> $outdir/blastp-short.log & + +wait +# add headers to combined blast + +echo "Finished running BLASTP." 1>&2 +echo "qaccver saccver stitle pident length mismatch gapopen qstart qend sstart send evalue bitscore qcovs" | sed 's/ /\t/g' > $outdir/amps.blast.tsv +cat $outdir/amps.gt30.tsv $outdir/amps.short.tsv >> $outdir/amps.blast.tsv + +echo "Known AMPs (100% pident and 100% qcovs) written to $outdir/known.txt and $outdir/known.faa" 1>&2 +awk -F "\t" '{if($4==100 && $NF==100) print $1}' $outdir/amps.blast.tsv | sort -u > $outdir/known.txt + +echo "Novel AMPs written to $outdir/novel.txt and $outdir/novel.faa" 1>&2 +grep -vwf $outdir/known.txt <(awk '{print $1}' $outdir/seqtk.comp.txt) > $outdir/novel.txt + +seqtk subseq $infile $outdir/known.txt > $outdir/known.faa +seqtk subseq $infile $outdir/novel.txt > $outdir/novel.faa + +echo "qaccver saccver stitle pident qcovs" | sed 's/ /\t/g' > $outdir/amps.blast.novel.tsv +grep -wf $outdir/novel.txt $outdir/amps.blast.tsv | cut -f1-4,14 -d $'\t'>> $outdir/amps.blast.novel.tsv +echo "qaccver saccver stitle pident qcovs" | sed 's/ /\t/g' > $outdir/amps.blast.known.tsv +grep -wf $outdir/known.txt $outdir/amps.blast.tsv | cut -f1-4,14 -d$'\t' >> $outdir/amps.blast.known.tsv + +# take top one for each JIRA ticket summary, and AMPlify results combination +echo "qaccver saccver stitle pident qcovs" | sed 's/ /\t/g' > $outdir/amps.blast.summary.novel.tsv +while read line; do + grep -w -m1 "$line" $outdir/amps.blast.tsv | cut -f1-4,14 -d $'\t' >> $outdir/amps.blast.summary.novel.tsv +done < $outdir/novel.txt +# +for i in $(grep -vwf <(awk '{print $1}' $outdir/amps.blast.novel.tsv) $outdir/novel.txt); do + echo -e "$i\tNA\tNA\tNA\tNA" >> $outdir/amps.blast.summary.novel.tsv +done + +seqtk subseq $infile <(grep -vwf <(awk '{print $1}' $outdir/amps.blast.novel.tsv) $outdir/novel.txt) > $outdir/no_hits.faa + +echo "qaccver saccver stitle pident qcovs" | sed 's/ /\t/g' > $outdir/amps.blast.summary.known.tsv +while read line; do + grep -w -m1 "$line" $outdir/amps.blast.tsv | cut -f1-4,14 -d $'\t' >> $outdir/amps.blast.summary.known.tsv +done < $outdir/known.txt