forked from FredHutch/SEACR
-
Notifications
You must be signed in to change notification settings - Fork 0
/
SEACR_1.3.R
153 lines (142 loc) · 6.83 KB
/
SEACR_1.3.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
#!usr/bin/Rscript
## Collect arguments
args <- commandArgs(TRUE)
## Default setting when no arguments passed
if(length(args) < 4) {
args <- c("--help")
}
## Help section
if("--help" %in% args) {
cat("
Calculate area under the curve threshold for CUT&RUN peaks
Arguments:
--exp=someValue - Input AUC values from experiment CUT&RUN
--ctrl=someValue - Input AUC values from control CUT&RUN
--norm=[yes|no] - Whether to normalize control and experimental files
--output=someValue - Output prefix
")
q(save="no")
}
## Parse arguments (we expect the form --arg=value)
parseArgs <- function(x) strsplit(sub("^--", "", x), "=")
argsDF <- as.data.frame(do.call("rbind", parseArgs(args)))
argsL <- as.list(as.character(argsDF$V2))
names(argsL) <- argsDF$V1
invis <- gc(verbose=FALSE)
## Arg1 default
#if(is.null(args[1])){
if(is.null(argsL$exp) | is.null(argsL$ctrl) | is.null(argsL$output) | is.null(argsL$norm)) {
stop("Argument is missing!
Calculate area under the curve threshold for CUT&RUN peaks
Arguments:
--exp=someValue - Input AUC values from experiment CUT&RUN
--ctrl=someValue - Input AUC values from control CUT&RUN
--norm=[yes|no] - Whether to normalize control and experimental files
--output=someValue - Output prefix
")
q(save="no")
}
exp<-read.table(argsL$exp)
expvec<-exp$V1
expmax<-exp$V2
rm(exp)
suppressWarnings(numtest<-as.numeric(argsL$ctrl))
invis <- gc(verbose=FALSE)
if(is.na(numtest)){ ## If 2nd field is a bedgraph, calculate empirical threshold
# print("Ctrl is a file")
ctrl<-read.table(argsL$ctrl)
ctrlvec<-ctrl$V1
ctrlmax<-ctrl$V2
rm(ctrl)
invis <- gc(verbose=FALSE)
if(argsL$norm=="yes"){ ## Calculate peaks of density plots to generate normalization factor
dist2d<-function(a,b,c){v1<- b - c; v2<- a - b; m<-cbind(v1,v2); d<-det(m)/sqrt(sum(v1*v1))}
expframe<-data.frame(count=seq(1,0,length=length(expvec)), quant=sort(expvec,decreasing=TRUE)/max(expvec), value=sort(expvec,decreasing=TRUE))
expframe$diff<-abs(expframe$count-expframe$quant)
expframe<-expframe[expframe$diff > 0.9*max(expframe$diff),]
expframe$dist<-apply(expframe,1,function(x) dist2d(c(x[1],x[2]),0,1))
ctrlframe<-data.frame(count=seq(1,0,length=length(ctrlvec)), quant=sort(ctrlvec,decreasing=TRUE)/max(ctrlvec), value=sort(ctrlvec,decreasing=TRUE))
ctrlframe$diff<-abs(ctrlframe$count-ctrlframe$quant)
ctrlframe<-ctrlframe[ctrlframe$diff > 0.9*max(ctrlframe$diff),]
ctrlframe$dist<-apply(ctrlframe,1,function(x) dist2d(c(x[1],x[2]),0,1))
if(ctrlframe$value[ctrlframe$dist==max(ctrlframe$dist)][1] > sort(ctrlvec)[as.integer(0.9*length(ctrlvec))]){
ctrlvalue<-ctrlframe$value[ctrlframe$dist==max(ctrlframe$dist)][1]
}else{
ctrlvalue<-sort(ctrlvec)[as.integer(0.9*length(ctrlvec))] ## Added 7/15/19 to improve memory performance
}
if(expframe$value[expframe$dist==max(expframe$dist)][1] > sort(expvec)[as.integer(0.9*length(expvec))]){
expvalue<-expframe$value[expframe$dist==max(expframe$dist)][1]
}else{
expvalue<-sort(expvec)[as.integer(0.9*length(expvec))] ## Added 7/15/19 to improve memory performance
}
ctrltest<-density(ctrlvec[ctrlvec <= ctrlvalue]) ## New for SEACR_1.1
exptest<-density(expvec[expvec <= expvalue]) ## New for SEACR_1.1
constant<-(exptest$x[exptest$y==max(exptest$y)])/(ctrltest$x[ctrltest$y==max(ctrltest$y)])
ctrlvec<-ctrlvec*constant
} ## Calculate total signal and max signal thresholds
both<-c(expvec,ctrlvec)
pctremain<-function(x) (length(expvec)-(ecdf(expvec)(x)*length(expvec)))/(length(both)-(ecdf(both)(x)*length(both)))
x<-sort(unique(both)) ## New for SEACR_1.1
x0<-x[which(na.omit(pctremain(x[pctremain(x) < 1])) == max(na.omit(pctremain(x[pctremain(x) < 1]))))] ## New for SEACR_1.1
z<-x[x <= x0[1]] ## New for SEACR_1.1
z2<-z[abs(((pctremain(x0)+min(pctremain(z)))/2)-pctremain(z))==min(abs(((pctremain(x0)+min(pctremain(z)))/2)-pctremain(z)))] ## New for SEACR_1.1
if(x0[1]!=z2[1]){ ## Added 7/15/19 to avoid omitting z when x0==z2
z<-z[z > z2[1]] ## New for SEACR_1.1
z0<-z[abs(z-(max(z)-((1/2)*(max(z)-min(z)))))==min(abs(z-(max(z)-((1/2)*(max(z)-min(z))))))] ## New for SEACR_1.1
}else{ ## Added 7/15/19 to avoid omitting z when x0==z2
z0<-x0 ## Added 7/15/19 to avoid omitting z when x0==z2
} ## Added 7/15/19 to avoid omitting z when x0==z2
## The following code segment was added to avoid spurious high thresholding when the peak of a lower threshold is within 95% of the peak of the maximum threshold
frame<-data.frame(thresh=x[1:(length(x)-1)], pct=pctremain(x[1:(length(x)-1)]), diff=abs(diff(pctremain(x))))
frame<-na.omit(frame)
i<-2
output<-0
while(output==0){
test3<-as.numeric(paste(c(0,".",rep(9,i)),sep="",collapse=""))
output<-as.numeric(quantile(frame$diff, test3))
# print(output)
i<-i+1
}
a<-frame$thresh[frame$diff != 0 & frame$diff < quantile(frame$diff, test3)]
a0<-a[which(na.omit(pctremain(a[pctremain(a) < 1])) == max(na.omit(pctremain(a[pctremain(a) < 1]))))]
b<-a[a <= a0[1]]
b2<-b[abs(((pctremain(a0)+min(pctremain(b)))/2)-pctremain(b))==min(abs(((pctremain(a0)+min(pctremain(b)))/2)-pctremain(b)))]
if(a0[1]!=b2[1]){ ## Added 7/15/19 to avoid omitting b when a0==b2
b<-b[b > b2[1]]
b0<-b[abs(b-(max(b)-((1/2)*(max(b)-min(b)))))==min(abs(b-(max(b)-((1/2)*(max(b)-min(b))))))]
}else{ ## Added 7/15/19 to avoid omitting b when a0==b2
b0<-a0 ## Added 7/15/19 to avoid omitting b when a0==b2
} ## Added 7/15/19 to avoid omitting b when a0==b2
if(max(na.omit(pctremain(a[pctremain(a) < 1])))/max(na.omit(pctremain(x[pctremain(x) < 1]))) > 0.95){
x0<-a0
z0<-b0
}
both2<-c(expmax,ctrlmax)
d<-sort(unique(both2))
pctremain2<-function(x) 1-(ecdf(expmax)(x)-ecdf(ctrlmax)(x))
if(length(d[pctremain2(d) > 1]) > 0){
d0<-min(d[pctremain2(d) > 1])
}else{
d0<-1
}
invis <- gc(verbose=FALSE)
fdr<-c(1-pctremain(x0[1]), 1-pctremain(z0[1])) ## New for SEACR_1.1
}else{ ## If 2nd field is numeric, calculate percentile threshold
# print("Ctrl is numeric")
test<-ecdf(expvec)(expvec)
frame<-data.frame(values=expvec, percentile=1-test)
test2<-ecdf(expmax)(expmax)
frame2<-data.frame(values=expmax, percentile=1-test2)
ctrl<-as.vector(as.numeric(paste(0,argsL$ctrl,sep="")))
x0<-min(frame$values[frame$percentile <= ctrl[1]])
z0<-min(frame2$values[frame2$percentile <= ctrl[1]])
d0<-0
fdr<-ctrl[1] ## New for SEACR_1.1
}
invis <- gc(verbose=FALSE)
write.table(c(x0[1],z0[1],d0[1]), file=paste(argsL$output, ".threshold.txt", sep=""), sep="\t", quote=FALSE, row.names=FALSE, col.names=FALSE)
if(argsL$norm=="yes"){
write.table(constant, file=paste(argsL$output, ".norm.txt", sep=""), sep="\t", quote=FALSE, row.names=FALSE, col.names=FALSE) #Added 7/19/18 to ensure norm value is multiplied by ctrl
}
invis <- gc(verbose=FALSE)
write.table(fdr, file=paste(argsL$output, ".fdr.txt", sep=""), sep="\t", quote=FALSE, row.names=FALSE, col.names=FALSE) #Added 5/15/19 to report empirical FDR for threshold detection