-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathexomedepth_analysis.R
176 lines (145 loc) · 8.7 KB
/
exomedepth_analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
library("ExomeDepth")
#define cohort to analyse
#########################
#note that the cohort name should be the name of the folder where case data are stored in $EPHEMERAL
cohort="DCM"
#cohort="HCM"
#########################
#define genes of interest
#HVOL
geneset = c("ACTC1","ACTN2","BAG3","CSRP3","DES","DSP","JPH2","LMNA","MYBPC3","MYH7","MYL2","MYL3","NEXN","PLN","RBM20","SCN5A","TNNC1","TNNI3","TNNT2","TPM1","TRIM63","TTN","VCL")
#DCM
#geneset = c("ACTC1","ACTN2","BAG3","DES","DSP","JPH2","LMNA","MYH7","NEXN","PLN","RBM20","SCN5A","TNNC1","TNNI3","TNNT2","TPM1","TTN","VCL")
#HCM
#geneset = c("ACTC1","ACTN2","CSRP3","JPH2","MYBPC3","MYH7","MYL2","MYL3","PLN","TNNC1","TNNI3","TNNT2","TPM1","TRIM63")
#import target data and add exon identifiers
target_bed = read.table("/rds/general/user/fmazzaro/home/WORK/Large_variants/resources/target/TSC_40bpOverhang_hg38.bed")
colnames(target_bed) = c("chromosome","start","end","name")
exon_id = c()
for(g in unique(target_bed$name)){
ei = c()
gs = subset(target_bed,target_bed$name==g)
for(i in 1:nrow(gs)){
ei = c(ei,paste(g,"_",i,sep=""))
}
exon_id = c(exon_id,ei)
}
target_bed = cbind(target_bed,exon_id)
#define reference genome
fasta="/rds/general/user/fmazzaro/home/WORK/Large_variants/resources/reference/Homo_sapiens_assembly38.fasta"
#generate counts objects and rename samples only in absence of a saved workspace
if(!paste("ExomeDepth_",cohort,".RData",sep="") %in% list.files("/rds/general/user/fmazzaro/home/WORK/Large_variants/analysis/scripts")){
setwd(paste("/rds/general/user/fmazzaro/ephemeral/",cohort,sep=""))
if(cohort == "DCM" | cohort == "HCM"){
bamfiles_cases=subset(list.files(),!grepl("bai",list.files()))
bamfiles_controls = paste("/rds/general/user/fmazzaro/ephemeral/HVOL/",subset(list.files("/rds/general/user/fmazzaro/ephemeral/HVOL"),!grepl("bai",list.files("/rds/general/user/fmazzaro/ephemeral/HVOL"))),sep="")
bamfiles = c(bamfiles_cases,bamfiles_controls)
}
else{
hvolfiles = subset(list.files("/rds/general/user/fmazzaro/ephemeral/HVOL"),!grepl("bai",list.files("/rds/general/user/fmazzaro/ephemeral/HVOL")))
toinclude = which(hvolfiles %in% c("14DB03367.bam","20SH04518.bam")) #these are hvol to be used as cases because other callers detected variants in them
hvols_indexes = 1:1805
hvols_left = subset(hvols_indexes,!hvols_indexes %in% toinclude)
set.seed(491) #set seed for reproducibility
hvols_picked_cases = sort(c(toinclude,sample(hvols_left,900,replace=F)))
hvols_controls = subset(hvols_indexes,!hvols_indexes %in% hvols_picked_cases)
bamfiles_cases=paste("/rds/general/user/fmazzaro/ephemeral/HVOL/",subset(list.files("/rds/general/user/fmazzaro/ephemeral/HVOL"),!grepl("bai",list.files("/rds/general/user/fmazzaro/ephemeral/HVOL"))),sep="")[hvols_picked_cases]
bamfiles_controls = paste("/rds/general/user/fmazzaro/ephemeral/HVOL/",subset(list.files("/rds/general/user/fmazzaro/ephemeral/HVOL"),!grepl("bai",list.files("/rds/general/user/fmazzaro/ephemeral/HVOL"))),sep="")[hvols_controls]
bamfiles = c(bamfiles_cases,bamfiles_controls)
my.counts = getBamCounts(bed.frame=target_bed,bam.files=bamfiles,include.chr=FALSE,referenceFasta=fasta)
new.colnames = substr(colnames(my.counts)[6:ncol(my.counts)],2,10)
colnames(my.counts)[6:ncol(my.counts)] = new.colnames
#clean the workspace by keeping only the my.counts object
rm(list=setdiff(ls(),c("my.counts","bamfiles","bamfiles_cases","bamfiles_controls")))
save.image(paste("/rds/general/user/fmazzaro/home/WORK/Large_variants/analysis/scripts/ExomeDepth_",cohort,".RData",sep=""))
}
else{
load(paste("/rds/general/user/fmazzaro/home/WORK/Large_variants/analysis/scripts/ExomeDepth_",cohort,".RData",sep=""))}
#remove chr prefix from the bed table
target_bed$chromosome = as.character(substr(as.character(target_bed$chromosome),4,5))
#select the most appropriate reference set from HVOL for each case and call CNVs
cnvlist = c()
for(i in 6:(6+length(bamfiles_cases)-1)){
print(colnames(my.counts)[i])
test.sample.name = colnames(my.counts)[i]
ref.sample.names = colnames(my.counts)[(6+length(bamfiles_cases)):ncol(my.counts)]
test.sample = as.vector(my.counts[,test.sample.name])
ref.samples = as.matrix(my.counts[,ref.sample.names])
#identify ideal reference samples
my.choice = select.reference.set(test.counts=test.sample,reference.counts=ref.samples)
if(length(my.choice$reference.choice)==1){
my.matrix = as.matrix(my.counts[,my.choice$reference.choice,drop=FALSE])
}
else{
my.matrix = as.matrix(my.counts[,my.choice$reference.choice])
}
my.reference.selected = apply(X=my.matrix,MAR=1,FUN=sum)
#apply beta-binomial distribution to the full set of exons
all.exons = new("ExomeDepth",test=test.sample,reference=my.reference.selected,formula="cbind(test, reference) ~ 1")
#call CNVs and sort them by confidence level
all.exons = CallCNVs(x=all.exons,transition.probability=10^-4,chromosome=substr(as.character(my.counts$chromosome),4,5),start=my.counts$start,end=my.counts$end,name=my.counts$exon)
if(length([email protected]$BF)>0){
cnv.calls = [email protected][order([email protected]$BF,decreasing=TRUE),]
#annotate CNV calls manually (as the AnnotateExtra function from ExomeDepth seems to add NAs instead of gene names)
sampleID = c()
genes = c()
exons = c()
for(cnv in 1:nrow(cnv.calls)){
sampleID = c(sampleID,test.sample.name)
exonsinvolved = subset(target_bed,(target_bed$chromosome==cnv.calls$chromosome[cnv] & ((target_bed$start<=cnv.calls$start[cnv] & target_bed$end>=cnv.calls$start[cnv]) | (target_bed$start>=cnv.calls$start[cnv] & target_bed$end<=cnv.calls$end[cnv]) | (target_bed$start<=cnv.calls$start[cnv] & target_bed$end>=cnv.calls$end[cnv]) | (target_bed$start<=cnv.calls$start[cnv] & target_bed$end>=cnv.calls$end[cnv]))))
exonids = paste(exonsinvolved$exon_id,collapse=",")
genesinvolved = paste(unique(exonsinvolved$name),collapse=",")
genes = c(genes,genesinvolved)
exons = c(exons,exonids)
#once processed the last CNV, keep only CNVs that hit genes of interest
if(cnv == nrow(cnv.calls)){
tokeep = c()
for(y in 1:nrow(cnv.calls)){
genesofcnv = unlist(strsplit(as.character(genes[y]),","))
if(sum(genesofcnv %in% geneset)>0){
tokeep = c(tokeep,TRUE)
}
else{
tokeep = c(tokeep,FALSE)
}
}
}
}
cnv.calls = cnv.calls[as.logical(tokeep),]
sampleID = sampleID[as.logical(tokeep)]
genes = genes[as.logical(tokeep)]
exons = exons[as.logical(tokeep)]
if(length(genesinvolved)>0){
cnv.calls = cbind(cnv.calls,sampleID,genes,exons)
cnvlist = rbind(cnvlist,cnv.calls)
}
}
}
#create a file with only high-PSI variants in TTN
write.table(cnvlist,paste("/rds/general/user/fmazzaro/home/WORK/Large_variants/analysis/results/exomedepth_",cohort,"_l_RESULTS_ttnall.tsv",sep=""),col.names=T,row.names=F,quote=F,sep="\t")
ttn_ex = read.table("/rds/general/user/fmazzaro/home/WORK/Large_variants/resources/ttnexons/TTN_highPSI_GRCh38.bed",sep="\t")
colnames(ttn_ex) = c("CHROM","START","END","EX_NO","PSI_DCM")
cnvlist$start = as.numeric(as.character(cnvlist$start))
cnvlist$end = as.numeric(as.character(cnvlist$end))
ttn_ex$START = as.numeric(as.character(ttn_ex$START))
ttn_ex$END = as.numeric(as.character(ttn_ex$END))
ttn_ex$PSI_DCM = as.numeric(as.character(ttn_ex$PSI_DCM))
edttn = subset(cnvlist,grepl("TTN",cnvlist$genes))
ednottn = subset(cnvlist,!grepl("TTN",cnvlist$genes))
tokeep = c()
for(v in 1:nrow(edttn)){
n_ex = length(which((ttn_ex$START<=edttn$start[v] & ttn_ex$END>=edttn$end[v]) | (ttn_ex$START>=edttn$start[v] & ttn_ex$START<=edttn$end[v] & ttn_ex$END>=edttn$end[v]) | (ttn_ex$END<=edttn$end[v] & ttn_ex$END>=edttn$start[v] & ttn_ex$START<=edttn$start[v]) | (ttn_ex$START>=edttn$start[v] & ttn_ex$END<=edttn$end[v])))
if(n_ex>0){
tokeep = c(tokeep,T)
}
else{
tokeep = c(tokeep,F)
}
}
edttn = edttn[as.logical(tokeep),]
cnvlist_ttnhigh = rbind(edttn,ednottn)
cnvlist_ttnhigh = cnvlist_ttnhigh[order(cnvlist_ttnhigh$chromosome,cnvlist_ttnhigh$start,decreasing=F),]
#note: if you want to filter for Bayes Factor here, add a subset command to write only variants above threshold here (empirical threshold: 26.3) or do it manually later
write.table(cnvlist_ttnhigh,paste("/rds/general/user/fmazzaro/home/WORK/Large_variants/analysis/results/exomedepth_",cohort,"_l_RESULTS_ttnhighpsi.tsv",sep=""),col.names=T,row.names=F,quote=F,sep="\t")
rm(list=setdiff(ls(),c("my.counts","bamfiles","bamfiles_cases","bamfiles_controls","cnvlist","cnvlist_ttnhigh","cohort")))
save.image(paste("/rds/general/user/fmazzaro/home/WORK/Large_variants/analysis/scripts/ExomeDepth_",cohort,".RData",sep=""))