PSI_1.bash

#!/bin/bash 
# 
# Usage: The script generate:   
#    coverage: how many reads fallen into exonic part 
#    exclusion : how many reads spliced out for a certain exonic part  
#        calculate PSI( percentage splicing index) : The usage of a certain exonic part 
# Output:
#    prefix_exon.coverage: The coverage count for each exonic part 
#    prefix_exon.exclusion: The spliced read count for each exonic part     
#        prefix_exon.psi: The psi value for each exonic part 
#    prefix_exon.gff: The exonic part annotation without aggregate gene feature
#
# Author: 
#   Miao kui, kui.miao@duke-nus.edu.sg , Duke-NUS Graduate Medical School Singapore (Duke-NUS)
#   Sebastian Schafer, sebastian.schaefer@nhcs.com.sg, National Heart Centre Singapore
# 
# Please cite "Schafer, S., Miao, K., Benson, C.C., Heinig, M., Cook, S.A., and Hubner, N. 2015. Alternative splicing signatures in RNA-seq data: percent spliced in (PSI). Curr. Protoc. Hum. Genet. doi: 10.1002/0471142905.hg1116s87"
#
# version:1.0 

set -e
set -u
set -o pipefail

## print out help information
function showHelp { 
cat << EOF
Usage: PSI.sh Command
Description:
    Derived from http://www.currentprotocols.com/protocol/hg1116, adding <path_to_bedtools2.23> for specify bedtools2.23

Command: 
    CountInclusion      Counting exon inclusion coverage  
        Usage: PSI.sh CountInclusion <annotation.gtf> <alignment_file.bam> <baseName> <path_to_bedtools2.23>
              annotation.gtf     The gtf annotation file generated by dexseq_prepare_annotation.py
              alignment_file.bam     alignment_file.bam file 
              baseName               baseName of output file
      
    CountExclusion      Counting exon exclusion 
        Usage: PSI.sh CountExclusion <annotation> <junctions_file> <baseName> <path_to_bedtools2.23>
              annotation             The gtf annotation generated by dexseq_prepare_annotation.py
              junctions_file         bed format junctionfile generated by tophat/tophat2
              baseName               baseName of oupput file
    
    getPSI          Calculating PSI value 
        Usage: PSI.sh getPSI <readLength> <inclustion.txt> <exclusion.txt> <baseName> <path_to_bedtools2.23>
                  readlength          the length of the short read, which will be used for exon count correction 
              inclusion.txt          exon inclusion counts generated by CountInclusion 
              exclusion.txt          exon exclusion counts generated by CountExclusion
              baseName               baseName of output file
    StartPSI        Calculating PSI from the beginning
        Usage: PSI.sh StartPSI  <annotation.gtf> <readLength> <alignment_file.bam> <junctions_file> <baseName> <path_to_bedtools2.23>

    StartPSINoFilter        Calculating PSI from the beginning without filtering for junctions falling into known exons
        Usage: PSI.sh StartPSINoFilter  <annotation.gtf> <readLength> <alignment_file.bam> <junctions_file> <baseName> <path_to_bedtools2.23>
For details see: "Schafer, S., et al. 2015. Alternative splicing signatures in RNA-seq data: percent spliced in (PSI). Curr. Protoc. Hum. Genet."
EOF
exit 1 
} 


## Counting exon inclusion coverage for each exonic part. 
function CountInclusion {
    echo "Counting exon coverage...."
    
    ## extracting the exon annotation from input annotation file, and simplify the attribute field as "geneID:exonNumer" to track the exon 
    [ -f ${prefix}_exonic_parts.gff ] || awk '{OFS="\t"}{if ($3 == "exonic_part") print  $1,$2,$3,$4,$5,$6,$7,$8,$14":"$12}' $annotation | sed 's@[";]@@g' > ${prefix}_exonic_parts.gff
    
    ## counting the exon coverage  sort based on exon ID 
    ${BEDTOOLS} coverage -split -abam $reads -b ${prefix}_exonic_parts.gff | awk 'BEGIN{OFS = "\t"}{ print $1,$4,$5,$5-$4+1,$9,$10 }' | sort -k 5 > ${prefix}_exonic_parts.inclusion
    
    echo "Exon coverage counting finished!"
    echo 
} 

## Filtering junction. Filter out the junction records which was not overlaped with any of the anotated exonic part. 
function junctionFilter {
        cat $junctions | sed 's/,/\t/g' | awk 'BEGIN{OFS="\t"}{print $1,$2,$2+$13,$4,$5,$6}' > ${prefix}_left.bed
        cat $junctions | sed 's/,/\t/g' | awk 'BEGIN{OFS="\t"}{print $1,$3-$14,$3,$4,$5,$6}' > ${prefix}_right.bed
        ${BEDTOOLS} intersect -u -s -a ${prefix}_left.bed -b ${prefix}_exonic_parts.gff > ${prefix}_left.overlap
        ${BEDTOOLS} intersect -u -s -a ${prefix}_right.bed -b ${prefix}_exonic_parts.gff > ${prefix}_right.overlap
        cat ${prefix}_left.overlap ${prefix}_right.overlap | cut -f4 | sort | uniq -c | awk '{ if($1 == 2) print $2 }' > ${prefix}_filtered_junctions.txt
        grep -F -f ${prefix}_filtered_junctions.txt $junctions > ${prefix}_filtered_junctions.bed   
        rm ${prefix}_left.bed ${prefix}_right.bed ${prefix}_left.overlap ${prefix}_right.overlap ${prefix}_filtered_junctions.txt
}   


## Counting exon exclusion 
function CountExclusion {
    echo "Counting exclusion...."
    echo "junctions file is: $junctions"
    [ -f ${prefix}_exonic_parts.gff ] || awk '{OFS="\t"}{if ($3 == "exonic_part") print  $1,$2,$3,$4,$5,$6,$7,$8,$14":"$12}' $annotation | sed 's@[";]@@g' > ${prefix}_exonic_parts.gff
    ##  We create an intron list from Tophat junctions
    grep -v description $junctions | sed 's/,/\t/g' | awk '{OFS="\t"}{print $1,$2+$13,$3-$14,$4,$5,$6}' > ${prefix}_intron.bed

    ##  Extract all Introns belonging to an exon and summarize read counts for each exon
    ${BEDTOOLS} intersect -wao -f 1.0 -s -a ${prefix}_exonic_parts.gff -b ${prefix}_intron.bed | awk 'BEGIN{OFS = "\t"}{ $16 == 0? s[$9] += 0:s[$9] += $14 }END{ for (i in s) {print i,s[i]} }' | sort -k 1 > ${prefix}_exonic_parts.exclusion
    rm  ${prefix}_intron.bed    
    echo "Exclusion counting finished! "
    echo 
}


## Calculating the PSI value based on inclusion and exclusion number 
function CountPSI {
    echo "Calculating PSI value..."

    ## checking the inclusion and exclusion input file, if the exonID are not sorted or different, then exit with status 9 
    cut -f5 $inclusion > ${prefix}_exonID1.txt 
    cut -f1 $exclusion > ${prefix}_exonID2.txt 
    diff ${prefix}_exonID1.txt ${prefix}_exonID2.txt > /dev/null  || ( echo "Unsorted exonID exit" &&  return 9 )
    rm ${prefix}_exonID1.txt ${prefix}_exonID2.txt
    
    ## Calculating PSI 
    paste  $inclusion $exclusion | awk -v "len=$readLength" -v "prefix=$prefix" 'BEGIN{OFS = "\t"; print "exon_ID",prefix"_length",prefix"_inclusion",prefix"_exclusion",prefix"_PSI" }{NIR=$6/($4+len-1);NER=$8/(len-1)}{print $5,$4,$6,$8,(NIR+NER==0)? "NA":NIR/(NIR + NER)}' > ${prefix}_exonic_parts.psi 
    echo "PSI calculating finished!"
}


function main {
    command=$1
    case $command in 
    CountInclusion)
        [ $# -lt 5 ] &&  showHelp
        annotation=$2
        reads=$3
        prefix=$4
        BEDTOOLS=$5
        CountInclusion
        ;;
    CountExclusion)
        [ $# -lt 5 ] && showHelp
        annotation=$2
        junctions=$3
        prefix=$4
        BEDTOOLS=$5
        junctionFilter
        junctions=${prefix}_filtered_junctions.bed
        CountExclusion
        ;;
    getPSI)
        [ $# -lt 6 ] && showHelp
        readLength=$2
        inclusion=$3
        exclusion=$4
        prefix=$5
        BEDTOOLS=$6
        CountPSI
        ;; 
    StartPSI)
        [ $# -lt 7 ] && showHelp
        annotation=$2
        readLength=$3
        reads=$4
        junctions=$5
        prefix=$6
        BEDTOOLS=$7
        CountInclusion
        junctionFilter
        junctions=${prefix}_filtered_junctions.bed
        CountExclusion
        inclusion=${prefix}_exonic_parts.inclusion
        exclusion=${prefix}_exonic_parts.exclusion
        CountPSI
        ;;
    StartPSINoFilter)
        [ $# -lt 7 ] && showHelp
        annotation=$2
        readLength=$3
        reads=$4
        junctions=$5
        prefix=$6
        BEDTOOLS=$7
        CountInclusion
        CountExclusion
        inclusion=${prefix}_exonic_parts.inclusion
        exclusion=${prefix}_exonic_parts.exclusion 
        CountPSI
        ;;
    *)
        echo "Wrong options !"
        showHelp
    esac 
}

## Running 
main $*