RunOnBamToBigWig.bsh

#!/usr/bin/bash
#
# Print the help menu to stdout.
usage() {
  cat <<'EOF'

Takes bam files from Run-On sequencing data as input and writes
bigWig files as output to the user-assigned output-dir.

Requirements in current working directory:
samtools, bedtools, bedGraphToBigWig, and sort-bed.

bash RunOnBamToBigWig.bsh [options]

options:

To get help:
-h, --help             Show this brief help menu.

Required options:
-SE, --SEQ=SE          Bam file from Single-end sequencing.
-PE, --SEQ=PE          Bam file from Paired-end sequencing.
-c, --chrom-info=PATH  Location of the chromInfo table.

I/O options:
-I, --bam=PREFIX.bam   Input bam file. If not specified, will take
                       *.bam in the current working directory as input
-T, --tmp=PATH         Path to a temporary storage directory.
-O, --output-dir=DIR   Specify a directory to store output in.

Required options for SE
-G, --SE_READ=RNA_5prime Single-end sequencing from 5' end of
                         nascent RNA, like GRO-seq.
-P, --SE_READ=RNA_3prime Single-end sequencing from 3' end of
                         nascent RNA, like PRO-seq.

Options for PE
--RNA5=R1_5prime    Specify the location of the 5' end of RNA
                    [default: R1_5prime].
--RNA3=R2_5prime    Specify the location of the 3' end of RNA
                    [default: R2_5prime].
                    Available options: R1_5prime: the 5' end of R1 reads
                                       R2_5prime: the 5' end of R2 reads
-5, --map5=TRUE     Report the 5' end of RNA [default on, --map5=TRUE].
-3, --map5=FALSE    Report the 3' end of RNA,
                    only available for PE [default off, --map5=TRUE].
-s, --opposite-strand=TRUE
                    Enable this option if the RNA are at the different strand
                    as the reads set at RNA5 [default: disable].

Optional operations:
--thread=1         Number of threads can be used [default: 1]

EOF
}

# Abort with message $2 unless the option being parsed is followed by a value.
#   $1  number of arguments left, counting the option itself
#   $2  message printed before exiting with status 1
require_value() {
  if (( $1 < 2 )); then
    echo "$2"
    exit 1
  fi
}

# Parse the command line and export the settings it selects.
#   Arguments: the script's arguments ("$@").
#   Exports:   SEQ, CHINFO, BAM_INPUT, TMPDIR, OUTPUT, thread, RNA5, RNA3,
#              MAP5, OPP, SE_OUTPUT, SE_READ (only those whose option is given).
#   Exits:     0 after -h/--help; 1 when -c/-I/-T/-O lacks its value.
# Long options take their value after the first '='; ${1#*=} extracts it
# without word-splitting, so values containing spaces or '=' survive.
parse_args() {
  while (( $# > 0 )); do
    case "$1" in
      -h|--help)
        usage
        exit 0
        ;;
      -SE)
        export SEQ="SE"
        shift
        ;;
      -PE)
        export SEQ="PE"
        shift
        ;;
      --SEQ*)
        # Documented in the help menu; the value is validated later.
        export SEQ="${1#*=}"
        shift
        ;;
      -c)
        require_value "$#" "no chromInfo specified"
        export CHINFO="$2"
        shift 2
        ;;
      --chrom-info*)
        export CHINFO="${1#*=}"
        shift
        ;;
      -I)
        require_value "$#" "no input bam file specified."
        export BAM_INPUT="$2"
        shift 2
        ;;
      --bam*)
        export BAM_INPUT="${1#*=}"
        shift
        ;;
      -T)
        require_value "$#" "no temp folder specified."
        export TMPDIR="$2"
        shift 2
        ;;
      --tmp*)
        export TMPDIR="${1#*=}"
        shift
        ;;
      -O)
        require_value "$#" "no output dir specified."
        export OUTPUT="$2"
        shift 2
        ;;
      --output-dir*)
        export OUTPUT="${1#*=}"
        shift
        ;;
      --thread*)
        export thread="${1#*=}"
        shift
        ;;
      --RNA5*) # which mate's 5' end marks the 5' end of the RNA
        export RNA5="${1#*=}"
        shift
        ;;
      --RNA3*) # which mate's 5' end marks the 3' end of the RNA
        export RNA3="${1#*=}"
        shift
        ;;
      -5)
        export MAP5="TRUE"
        shift
        ;;
      --map5*)
        export MAP5="${1#*=}"
        shift
        ;;
      -3)
        export MAP5="FALSE"
        shift
        ;;
      -s)
        export OPP="TRUE"
        shift
        ;;
      --opposite-strand*)
        export OPP="${1#*=}"
        shift
        ;;
      -G)
        export SE_OUTPUT="G"
        export RNA5="R1_5prime"
        export OPP="FALSE"
        export SE_READ="RNA_5prime"
        shift
        ;;
      -P)
        export SE_OUTPUT="P"
        export RNA3="R1_5prime"
        export OPP="TRUE"
        export SE_READ="RNA_3prime"
        shift
        ;;
      --SE_READ*)
        export SE_READ="${1#*=}"
        shift
        ;;
      *)
        # Parsing has always stopped at the first unknown argument; say so
        # instead of dropping the rest of the command line silently.
        echo "Warning: unrecognized argument '$1'; it and all following arguments are ignored." >&2
        break
        ;;
    esac
  done
}

parse_args "$@"


## CHECK ARGUMENTS.
# The chromInfo table and the sequencing layout are both mandatory.  Stop
# with a pointer to the help menu as soon as one of them is missing.
[[ -n "$CHINFO" ]] || {
  echo "--chrom-info is required."
  echo " use bash RunOnBamToBigWig.bsh --help."
  exit 1
}
[[ -n "$SEQ" ]] || {
  echo "Please specify the input data is SE or PE."
  echo " use bash RunOnBamToBigWig.bsh --help."
  exit 1
}

## INPUT & Parameters
# Without -I/--bam, fall back to every *.bam in the current working directory.
# The glob is expanded straight into an array instead of parsing `ls` output:
# when nothing matches, BAM_INPUT ends up as an EMPTY array (zero elements),
# which the element-count check further down can detect, and no "ls: cannot
# access" noise is printed.
if [ -z "$BAM_INPUT" ]; then
   echo "No input files specified.  Using *.bam"
   BAM_INPUT=(*.bam)
   # An unmatched glob is left as the literal string '*.bam'; that is not a file.
   [[ -e "${BAM_INPUT[0]}" ]] || BAM_INPUT=()
fi


# Cross-check the sequencing layout (SEQ) against the single-end options.
#   PE: -G/-P (which set SE_OUTPUT) are rejected.
#   SE: SE_READ selects the output flavour; one of -G/-P/--SE_READ is required.
#   anything else: SEQ is invalid.
case "$SEQ" in
  PE)
    if [[ -n "$SE_OUTPUT" ]]; then
      echo "-G and -P can only be used with -SE."
      echo " use bash RunOnBamToBigWig.bsh --help."
      exit 1
    fi
    ;;
  SE)
    case "$SE_READ" in
      RNA_5prime)
        SE_OUTPUT="G"
        RNA5="R1_5prime"
        OPP="FALSE"
        ;;
      RNA_3prime)
        SE_OUTPUT="P"
        RNA3="R1_5prime"
        # The P conversion writes each read on the opposite strand, and -P
        # itself sets OPP=TRUE; keep the reported value in line with both.
        OPP="TRUE"
        ;;
    esac
    if [[ -z "$SE_OUTPUT" ]]; then
      echo "Please specify output format for SE [-G or -P]"
      exit 1
    fi
    ;;
  *)
    echo "Please specify the input data is SE or PE."
    echo " use bash RunOnBamToBigWig.bsh --help."
    exit 1
    ;;
esac


# Check input file number
# BAM_INPUT may be an array or a plain string.  ${#BAM_INPUT[@]} is 1 for any
# set string, even an empty one, so counting elements misses "no input" when
# the list came from a command or option that produced nothing.  Test instead
# whether anything other than whitespace is left after joining all elements.
if [[ -z "${BAM_INPUT[*]//[[:space:]]/}" ]]; then
  echo "#####################################"
  echo " No files is in the correct format."
  echo "#####################################"
  exit 1
fi

# Create directory $1, including missing parents, unless it already exists.
# Exits with status 1 when the directory cannot be created, because every
# later step writes into it.
ensure_dir() {
  [[ -d "$1" ]] && return 0
  mkdir -p -- "$1" || {
    echo "Cannot create directory: $1" >&2
    exit 1
  }
}

# Output directory: default to ./My_output-<month_day_year>.
if [ -z "$OUTPUT" ]; then
  now=$(date +"%m_%d_%Y")
  OUTPUT=./My_output-${now}
  echo "No output path specified. Using ${OUTPUT}"
fi
ensure_dir "$OUTPUT"

# Parent of the scratch directory: default to the current directory.
if [ -z "$TMPDIR" ]; then
  TMPDIR="./"
fi
ensure_dir "$TMPDIR"

# Pick a random 32-character alphanumeric name for this run's scratch
# directory (it is created later).  LC_ALL=C lets tr treat the random bytes
# as bytes regardless of the user's locale.
tmp=$(LC_ALL=C tr -dc 'a-zA-Z0-9' < /dev/urandom | head -c 32)
TMPDIR=${TMPDIR%/}/$tmp

# Use a single thread unless --thread supplied a non-empty value.
: "${thread:=1}"


# Paired-end only: settle which mate marks the 5' end of the RNA.
#   - neither RNA5 nor RNA3 set      -> RNA5 defaults to R1_5prime
#   - RNA3 is R1_5prime / R2_5prime  -> RNA5 becomes the other mate
#   - RNA5 must end up as R1_5prime or R2_5prime, otherwise exit 1
if [[ "$SEQ" == "PE" ]]; then
  [[ -n "$RNA5" || -n "$RNA3" ]] || RNA5="R1_5prime"

  case "$RNA3" in
    R1_5prime) RNA5="R2_5prime" ;;
    R2_5prime) RNA5="R1_5prime" ;;
  esac

  case "$RNA5" in
    R1_5prime|R2_5prime)
      ;;
    *)
      echo "--RNA5 and --RNA3 value can only be R1_5prime or R2_5prime."
      echo " use bash RunOnBamToBigWig.bsh --help."
      exit 1
      ;;
  esac
fi

# Defaults: RNA on the same strand as the RNA5 read, report the 5' end.
if [ -z "$OPP" ]; then
  OPP="FALSE"
fi
if [ -z "$MAP5" ]; then
  MAP5="TRUE"
fi

# Single-end data can only report the 5' end of the read.
if [[ "${MAP5}" == "FALSE" && "$SEQ" == "SE" ]]; then
  echo "For single-end (SE), can only report the 5 prime end of reads (--map5=TRUE)"
  exit 1
fi

case "${MAP5}" in
  TRUE|FALSE)
    ;;
  *)
    echo "--map5 value can only be TRUE or FALSE."
    echo " use bash RunOnBamToBigWig.bsh --help."
    exit 1
    ;;
esac

# The paired-end conversion only has branches for OPP=TRUE and OPP=FALSE;
# any other spelling (e.g. "true") would run none of them and produce no BED
# file.  Reject it here, the same way --map5 is rejected.
case "${OPP}" in
  TRUE|FALSE)
    ;;
  *)
    echo "--opposite-strand value can only be TRUE or FALSE."
    echo " use bash RunOnBamToBigWig.bsh --help."
    exit 1
    ;;
esac

## Check all the bioinformatics tools can be called from current working directory.
# Stops at the first tool that `command -v` cannot resolve.
for tool in samtools bedtools bedGraphToBigWig sort-bed; do
  if ! command -v "${tool}" >/dev/null 2>&1; then
    echo >&2 "${tool} is required. Please make sure you can call the bioinformatics tools from your current working directory.  Aborting."
    exit 1
  fi
done

# From here on, copy everything written to stdout into the run log while
# still showing it on the terminal, then send stderr to the same place.
# The path is quoted so an output directory containing spaces or glob
# characters is passed to tee as one file name.
exec > >(tee "${OUTPUT}/RunOnBamToBigWig.log")
exec 2>&1

## Print out
# Report the effective settings: layout, the SE- or PE-specific choices,
# the reported end/strand, every input file, the working paths and threads.
printf '%s\n' " " "Processing ..."

printf '%s\n' "SEQ                       $SEQ"

case "$SEQ" in
  SE)
    printf '%s\n' \
      "SE_OUTPUT                 $SE_OUTPUT" \
      "SE_READ                   $SE_READ"
    ;;
  PE)
    printf '%s\n' \
      "Location of 5' of RNA     $RNA5" \
      "Location of 3' of RNA     $RNA3"
    ;;
esac

printf '%s\n' \
  "Report 5' ends            $MAP5" \
  "Report opposite strand    $OPP" \
  "" \
  "Input files/ paths:" \
  "chromInfo                 $CHINFO"

# BAM_INPUT is expanded unquoted on purpose: it may hold several
# whitespace-separated file names in a single string.
i=1
for name in ${BAM_INPUT[@]}; do
  printf '%s\n' "input file $i              ${name}"
  i=$((i + 1))
done

printf '%s\n' \
  "temp folder               $TMPDIR" \
  "output-dir                $OUTPUT" \
  " " \
  "Optional operations:" \
  "number of threads         $thread"


# Create this run's scratch directory.  Every intermediate file is written
# there, so stop right away if it cannot be created.
mkdir -p -- "${TMPDIR}" || {
  echo "Cannot create temp folder: ${TMPDIR}" >&2
  exit 1
}
# Also remove the scratch directory when the script ends early (error or
# interrupt), so large intermediates are not left behind.  -f keeps this
# quiet when the directory has already been removed by the time it runs.
trap 'rm -rf -- "${TMPDIR:?}"' EXIT

#############################################
## Write out the bigWigs.
##
## For every input BAM: name-sort it, reduce each read (SE) or read pair (PE)
## to the single base that marks the requested RNA end, drop unwanted
## chromosomes, and write strand-specific coverage to
## ${OUTPUT}/<prefix>_plus.bw and ${OUTPUT}/<prefix>_minus.bw.  Read counts go
## to ${OUTPUT}/<prefix>.align.log.
echo " "

# Print the file name in $1 without its directory and without its last
# extension (e.g. /data/a.sort.bam -> a.sort).
bam_prefix() {
  local base=${1##*/}
  printf '%s\n' "${base%.*}"
}

# Single-end: BED6 on stdin -> one-base BED6 on stdout, taken at the 5' end
# of each read (start for "+" reads, end for "-" reads).
#   $1  G : keep the strand of the read   (like GRO-seq)
#       P : write the opposite strand     (like PRO-seq)
# Records whose column 5 is not > 0 are dropped (column 5 is the MAPQ score
# according to the bedtools bamtobed documentation).
# Returns 2 without reading stdin when $1 is neither G nor P.
se_reads_to_bed() {
  local flip
  case "$1" in
    G) flip=0 ;;
    P) flip=1 ;;
    *) return 2 ;;
  esac
  awk -v flip="$flip" '
    BEGIN { OFS = "\t" }
    !($5 > 0) { next }
    $6 == "+" { print $1, $2, $2 + 1, $4, $5, (flip ? "-" : "+") }
    $6 == "-" { print $1, $3 - 1, $3, $4, $5, (flip ? "+" : "-") }
  '
}

# Paired-end: BEDPE (mate 1 first) on stdin -> one-base BED6 on stdout.
#   $1  R1_5prime | R2_5prime : mate whose 5' end is the 5' end of the RNA
#   $2  TRUE | FALSE          : TRUE = output the strand of the OTHER mate
#   $3  TRUE = report the 5' end of the RNA, anything else = the 3' end
# BEDPE columns used: mate 1 = start $2, end $3, strand $9;
#                     mate 2 = start $5, end $6, strand $10.
# The reported base is the 5' end of the "position" mate: the RNA5 mate for
# the RNA 5' end, the other mate for the RNA 3' end.  The chromosome is
# always taken from column 1.
# Returns 2 without reading stdin when $1 or $2 is not a listed value.
pe_reads_to_bed() {
  local rna5=$1 opp=$2 map5=$3
  local -a rna5_cols other_cols pos_cols
  local strand_col

  case "$rna5" in
    R1_5prime) rna5_cols=(2 3 9);  other_cols=(5 6 10) ;;
    R2_5prime) rna5_cols=(5 6 10); other_cols=(2 3 9) ;;
    *) return 2 ;;
  esac

  if [[ "$map5" == "TRUE" ]]; then
    pos_cols=("${rna5_cols[@]}")
  else
    pos_cols=("${other_cols[@]}")
  fi

  case "$opp" in
    FALSE) strand_col=${rna5_cols[2]} ;;
    TRUE)  strand_col=${other_cols[2]} ;;
    *) return 2 ;;
  esac

  awk -v s="${pos_cols[0]}" -v e="${pos_cols[1]}" -v k="${pos_cols[2]}" \
      -v o="$strand_col" '
    BEGIN { OFS = "\t" }
    $k == "+" { print $1, $s, $s + 1, $7, $8, $o }
    $k == "-" { print $1, $e - 1, $e, $7, $8, $o }
  '
}

# Drop records on rRNA, chrM and any chromosome whose name contains "_".
# Only the chromosome column is tested, so a read NAME that happens to
# contain one of these strings no longer removes the read.
drop_unwanted_chroms() {
  awk '$1 !~ /rRNA|chrM|_/'
}

# Turn ${TMPDIR}/<prefix>.bed.gz into ${OUTPUT}/<prefix>_plus.bw and
# ${OUTPUT}/<prefix>_minus.bw and append the read counts to
# ${OUTPUT}/<prefix>.align.log.
#   $1  prefix of the sample
# Globals read: TMPDIR, OUTPUT, CHINFO.
bed_to_bigwig() {
  local prefix=$1
  local log="${OUTPUT}/${prefix}.align.log"
  local bed="${TMPDIR}/${prefix}.bed.gz"
  local kept="${TMPDIR}/${prefix}.nr.rs.bed.gz"
  local plus="${TMPDIR}/${prefix}_plus.bedGraph"
  local minus_raw="${TMPDIR}/${prefix}_minus.noinv.bedGraph"
  local minus="${TMPDIR}/${prefix}_minus.bedGraph"

  echo 'Number of mappable reads:' >> "$log"
  zcat "$bed" | grep -c "" >> "$log"

  ## Remove rRNA, chrM and "_" chromosomes, then sort.
  zcat "$bed" | drop_unwanted_chroms | sort-bed - | gzip > "$kept"
  echo 'Number of mappable reads (excluding rRNA):' >> "$log"
  zcat "$kept" | grep -c "" >> "$log"

  ## Convert to bedGraph ... Can't gzip these, unfortunately.
  bedtools genomecov -bg -i "$kept" -g "${CHINFO}" -strand + > "$plus"
  bedtools genomecov -bg -i "$kept" -g "${CHINFO}" -strand - > "$minus_raw"

  ## Invert read counts on the minus strand.
  awk 'BEGIN{OFS="\t"} {print $1,$2,$3,-1*$4}' < "$minus_raw" > "$minus"

  ## Then to bigWig
  echo "Writing bigWigs:"
  bedGraphToBigWig "$plus" "${CHINFO}" "${OUTPUT}/${prefix}_plus.bw"
  bedGraphToBigWig "$minus" "${CHINFO}" "${OUTPUT}/${prefix}_minus.bw"
}

# The converters run inside pipelines, where their return status is lost, so
# probe them once with empty input and stop before any sorting starts if the
# selected mode is not one they handle.
case "$SEQ" in
  SE)
    se_reads_to_bed "$SE_OUTPUT" < /dev/null > /dev/null || {
      echo "Cannot convert SE reads: SE_OUTPUT must be G or P (got '${SE_OUTPUT}')." >&2
      exit 1
    }
    ;;
  PE)
    pe_reads_to_bed "$RNA5" "$OPP" "$MAP5" < /dev/null > /dev/null || {
      echo "Cannot convert PE reads: RNA5 must be R1_5prime or R2_5prime and OPP must be TRUE or FALSE (got RNA5='${RNA5}', OPP='${OPP}')." >&2
      exit 1
    }
    ;;
esac

if [[ "$SEQ" == "SE" || "$SEQ" == "PE" ]]; then
  # BAM_INPUT may be an array or one whitespace-separated string; the
  # expansion is left unquoted so both forms keep working.
  # shellcheck disable=SC2068
  for bam in ${BAM_INPUT[@]}; do
    prefix=$(bam_prefix "$bam")
    sorted="${TMPDIR}/${prefix}.sort.bam"

    echo "$prefix" > "${OUTPUT}/${prefix}.align.log"
    echo "Sorting $bam"
    samtools sort -n -@ "${thread}" "$bam" > "$sorted"

    if [[ "$SEQ" == "SE" ]]; then
      # In SE mode only the 5' end of the read is reported.
      bedtools bamtobed -i "$sorted" 2> "${TMPDIR}/${prefix}.kill.warnings" \
        | se_reads_to_bed "$SE_OUTPUT" \
        | gzip > "${TMPDIR}/${prefix}.bed.gz"
    else
      bedtools bamtobed -bedpe -mate1 -i "$sorted" 2> "${TMPDIR}/${prefix}.kill.warnings" \
        | pe_reads_to_bed "$RNA5" "$OPP" "$MAP5" \
        | gzip > "${TMPDIR}/${prefix}.bed.gz"
    fi

    bed_to_bigwig "$prefix"
  done
fi


# Remove this run's scratch directory.  The option comes before the operand
# (an option placed after it is only honoured by GNU rm), the path is quoted,
# and ${TMPDIR:?} refuses to run rm at all if TMPDIR is empty or unset.
rm -r -- "${TMPDIR:?}"