-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPSI_1.bash
187 lines (165 loc) · 8.04 KB
/
PSI_1.bash
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
#!/bin/bash
#
# Usage: The script generate:
# coverage: how many reads fallen into exonic part
# exclusion : how many reads spliced out for a certain exonic part
# calculate PSI( percentage splicing index) : The usage of a certain exonic part
# Output:
# prefix_exon.coverage: The coverage count for each exonic part
# prefix_exon.exclusion: The spliced read count for each exonic part
# prefix_exon.psi: The psi value for each exonic part
# prefix_exon.gff: The exonic part annotation without aggregate gene feature
#
# Author:
# Miao kui, [email protected] , Duke-NUS Graduate Medical School Singapore (Duke-NUS)
# Sebastian Schafer, [email protected], National Heart Centre Singapore
#
# Please cite "Schafer, S., Miao, K., Benson, C.C., Heinig, M., Cook, S.A., and Hubner, N. 2015. Alternative splicing signatures in RNA-seq data: percent spliced in (PSI). Curr. Protoc. Hum. Genet. doi: 10.1002/0471142905.hg1116s87"
#
# version:1.0
set -e
set -u
set -o pipefail
## print out help information
function showHelp {
cat << EOF
Usage: PSI.sh Command
Description:
Derived from http://www.currentprotocols.com/protocol/hg1116, adding <path_to_bedtools2.23> for specify bedtools2.23
Command:
CountInclusion Counting exon inclusion coverage
Usage: PSI.sh CountInclusion <annotation.gtf> <alignment_file.bam> <baseName> <path_to_bedtools2.23>
annotation.gtf The gtf annotation file generated by dexseq_prepare_annotation.py
alignment_file.bam alignment_file.bam file
baseName baseName of output file
CountExclusion Counting exon exclusion
Usage: PSI.sh CountExclusion <annotation> <junctions_file> <baseName> <path_to_bedtools2.23>
annotation The gtf annotation generated by dexseq_prepare_annotation.py
junctions_file bed format junctionfile generated by tophat/tophat2
baseName baseName of oupput file
getPSI Calculating PSI value
Usage: PSI.sh getPSI <readLength> <inclustion.txt> <exclusion.txt> <baseName> <path_to_bedtools2.23>
readlength the length of the short read, which will be used for exon count correction
inclusion.txt exon inclusion counts generated by CountInclusion
exclusion.txt exon exclusion counts generated by CountExclusion
baseName baseName of output file
StartPSI Calculating PSI from the beginning
Usage: PSI.sh StartPSI <annotation.gtf> <readLength> <alignment_file.bam> <junctions_file> <baseName> <path_to_bedtools2.23>
StartPSINoFilter Calculating PSI from the beginning without filtering for junctions falling into known exons
Usage: PSI.sh StartPSINoFilter <annotation.gtf> <readLength> <alignment_file.bam> <junctions_file> <baseName> <path_to_bedtools2.23>
For details see: "Schafer, S., et al. 2015. Alternative splicing signatures in RNA-seq data: percent spliced in (PSI). Curr. Protoc. Hum. Genet."
EOF
exit 1
}
## Counting exon inclusion coverage for each exonic part.
function CountInclusion {
echo "Counting exon coverage...."
## extracting the exon annotation from input annotation file, and simplify the attribute field as "geneID:exonNumer" to track the exon
[ -f ${prefix}_exonic_parts.gff ] || awk '{OFS="\t"}{if ($3 == "exonic_part") print $1,$2,$3,$4,$5,$6,$7,$8,$14":"$12}' $annotation | sed 's@[";]@@g' > ${prefix}_exonic_parts.gff
## counting the exon coverage sort based on exon ID
${BEDTOOLS} coverage -split -abam $reads -b ${prefix}_exonic_parts.gff | awk 'BEGIN{OFS = "\t"}{ print $1,$4,$5,$5-$4+1,$9,$10 }' | sort -k 5 > ${prefix}_exonic_parts.inclusion
echo "Exon coverage counting finished!"
echo
}
## Filtering junction. Filter out the junction records which was not overlaped with any of the anotated exonic part.
function junctionFilter {
cat $junctions | sed 's/,/\t/g' | awk 'BEGIN{OFS="\t"}{print $1,$2,$2+$13,$4,$5,$6}' > ${prefix}_left.bed
cat $junctions | sed 's/,/\t/g' | awk 'BEGIN{OFS="\t"}{print $1,$3-$14,$3,$4,$5,$6}' > ${prefix}_right.bed
${BEDTOOLS} intersect -u -s -a ${prefix}_left.bed -b ${prefix}_exonic_parts.gff > ${prefix}_left.overlap
${BEDTOOLS} intersect -u -s -a ${prefix}_right.bed -b ${prefix}_exonic_parts.gff > ${prefix}_right.overlap
cat ${prefix}_left.overlap ${prefix}_right.overlap | cut -f4 | sort | uniq -c | awk '{ if($1 == 2) print $2 }' > ${prefix}_filtered_junctions.txt
grep -F -f ${prefix}_filtered_junctions.txt $junctions > ${prefix}_filtered_junctions.bed
rm ${prefix}_left.bed ${prefix}_right.bed ${prefix}_left.overlap ${prefix}_right.overlap ${prefix}_filtered_junctions.txt
}
## Counting exon exclusion
function CountExclusion {
echo "Counting exclusion...."
echo "junctions file is: $junctions"
[ -f ${prefix}_exonic_parts.gff ] || awk '{OFS="\t"}{if ($3 == "exonic_part") print $1,$2,$3,$4,$5,$6,$7,$8,$14":"$12}' $annotation | sed 's@[";]@@g' > ${prefix}_exonic_parts.gff
## We create an intron list from Tophat junctions
grep -v description $junctions | sed 's/,/\t/g' | awk '{OFS="\t"}{print $1,$2+$13,$3-$14,$4,$5,$6}' > ${prefix}_intron.bed
## Extract all Introns belonging to an exon and summarize read counts for each exon
${BEDTOOLS} intersect -wao -f 1.0 -s -a ${prefix}_exonic_parts.gff -b ${prefix}_intron.bed | awk 'BEGIN{OFS = "\t"}{ $16 == 0? s[$9] += 0:s[$9] += $14 }END{ for (i in s) {print i,s[i]} }' | sort -k 1 > ${prefix}_exonic_parts.exclusion
rm ${prefix}_intron.bed
echo "Exclusion counting finished! "
echo
}
## Calculating the PSI value based on inclusion and exclusion number
function CountPSI {
echo "Calculating PSI value..."
## checking the inclusion and exclusion input file, if the exonID are not sorted or different, then exit with status 9
cut -f5 $inclusion > ${prefix}_exonID1.txt
cut -f1 $exclusion > ${prefix}_exonID2.txt
diff ${prefix}_exonID1.txt ${prefix}_exonID2.txt > /dev/null || ( echo "Unsorted exonID exit" && return 9 )
rm ${prefix}_exonID1.txt ${prefix}_exonID2.txt
## Calculating PSI
paste $inclusion $exclusion | awk -v "len=$readLength" -v "prefix=$prefix" 'BEGIN{OFS = "\t"; print "exon_ID",prefix"_length",prefix"_inclusion",prefix"_exclusion",prefix"_PSI" }{NIR=$6/($4+len-1);NER=$8/(len-1)}{print $5,$4,$6,$8,(NIR+NER==0)? "NA":NIR/(NIR + NER)}' > ${prefix}_exonic_parts.psi
echo "PSI calculating finished!"
}
function main {
command=$1
case $command in
CountInclusion)
[ $# -lt 5 ] && showHelp
annotation=$2
reads=$3
prefix=$4
BEDTOOLS=$5
CountInclusion
;;
CountExclusion)
[ $# -lt 5 ] && showHelp
annotation=$2
junctions=$3
prefix=$4
BEDTOOLS=$5
junctionFilter
junctions=${prefix}_filtered_junctions.bed
CountExclusion
;;
getPSI)
[ $# -lt 6 ] && showHelp
readLength=$2
inclusion=$3
exclusion=$4
prefix=$5
BEDTOOLS=$6
CountPSI
;;
StartPSI)
[ $# -lt 7 ] && showHelp
annotation=$2
readLength=$3
reads=$4
junctions=$5
prefix=$6
BEDTOOLS=$7
CountInclusion
junctionFilter
junctions=${prefix}_filtered_junctions.bed
CountExclusion
inclusion=${prefix}_exonic_parts.inclusion
exclusion=${prefix}_exonic_parts.exclusion
CountPSI
;;
StartPSINoFilter)
[ $# -lt 7 ] && showHelp
annotation=$2
readLength=$3
reads=$4
junctions=$5
prefix=$6
BEDTOOLS=$7
CountInclusion
CountExclusion
inclusion=${prefix}_exonic_parts.inclusion
exclusion=${prefix}_exonic_parts.exclusion
CountPSI
;;
*)
echo "Wrong options !"
showHelp
esac
}
## Running
main $*