From 3874b19ccc102f4b3af54e3e0a376c51fa13d00b Mon Sep 17 00:00:00 2001 From: mpmeers Date: Sun, 6 Oct 2019 03:10:41 -0400 Subject: [PATCH 1/8] SEACR_1.1.sh update Fixed a bug in lines 166 and 168 in which misplaced brackets caused the misreporting of the max signal region terminal coordinate for merged signal blocks --- SEACR_1.1.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/SEACR_1.1.sh b/SEACR_1.1.sh index 9e119bd..b123c56 100755 --- a/SEACR_1.1.sh +++ b/SEACR_1.1.sh @@ -163,9 +163,9 @@ mean=`awk '{s+=$3-$2; t++}END{print s/(t*10)}' $password.auc.threshold.bed` if [[ -f $2 ]] then - awk -v value=$mean 'BEGIN{s=1}; {if(s==1){chr=$1; start=$2; stop=$3; auc=$4; max=$5; coord=$6; s++}else{if(chr==$1 && $2 < stop+value){stop=$3; auc=auc+$4; if($5 > max){max=$5; coord=$6}else if($5==max){split(coord,t,"-"); split($6,u,"-"); coord=t[1]"-"}u[2]}else{print chr"\t"start"\t"stop"\t"auc"\t"max"\t"coord; chr=$1; start=$2; stop=$3; auc=$4; max=$5; coord=$6}}}' $password.auc.threshold.bed | bedtools intersect -wa -v -a - -b $password2.auc.threshold.bed > $5.auc.threshold.merge.bed + awk -v value=$mean 'BEGIN{s=1}; {if(s==1){chr=$1; start=$2; stop=$3; auc=$4; max=$5; coord=$6; s++}else{if(chr==$1 && $2 < stop+value){stop=$3; auc=auc+$4; if($5 > max){max=$5; coord=$6}else if($5==max){split(coord,t,"-"); split($6,u,"-"); coord=t[1]"-"u[2]}}else{print chr"\t"start"\t"stop"\t"auc"\t"max"\t"coord; chr=$1; start=$2; stop=$3; auc=$4; max=$5; coord=$6}}}' $password.auc.threshold.bed | bedtools intersect -wa -v -a - -b $password2.auc.threshold.bed > $5.auc.threshold.merge.bed else - awk -v value=$mean 'BEGIN{s=1}; {if(s==1){chr=$1; start=$2; stop=$3; auc=$4; max=$5; coord=$6; s++}else{if(chr==$1 && $2 < stop+value){stop=$3; auc=auc+$4; if($5 > max){max=$5; coord=$6}else if($5==max){split(coord,t,"-"); split($6,u,"-"); coord=t[1]"-"}u[2]}else{print chr"\t"start"\t"stop"\t"auc"\t"max"\t"coord; chr=$1; start=$2; stop=$3; auc=$4; max=$5; coord=$6}}}' 
$password.auc.threshold.bed > $5.auc.threshold.merge.bed + awk -v value=$mean 'BEGIN{s=1}; {if(s==1){chr=$1; start=$2; stop=$3; auc=$4; max=$5; coord=$6; s++}else{if(chr==$1 && $2 < stop+value){stop=$3; auc=auc+$4; if($5 > max){max=$5; coord=$6}else if($5==max){split(coord,t,"-"); split($6,u,"-"); coord=t[1]"-"u[2]}}else{print chr"\t"start"\t"stop"\t"auc"\t"max"\t"coord; chr=$1; start=$2; stop=$3; auc=$4; max=$5; coord=$6}}}' $password.auc.threshold.bed > $5.auc.threshold.merge.bed fi if [[ $height == "relaxed" ]] From aeeb313e168ad8ea12f3fe8db74aa2a11ba9f282 Mon Sep 17 00:00:00 2001 From: mpmeers Date: Tue, 29 Oct 2019 02:50:21 -0400 Subject: [PATCH 2/8] SEACR_1.1.sh update Added a counter to keep track of the number of component bedgraph lines that compose each signal block, to be used in SEACR_1.1.R as a means to filter out spurious signal blocks that contain too few bedgraph lines. --- SEACR_1.1.sh | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/SEACR_1.1.sh b/SEACR_1.1.sh index b123c56..75a5fe9 100755 --- a/SEACR_1.1.sh +++ b/SEACR_1.1.sh @@ -98,15 +98,17 @@ fi echo "Creating experimental AUC file: $(date)" -awk 'BEGIN{s=1}; {if(s==1){s++}else if(s==2){chr=$1; start=$2; stop=$3; max=$4; coord=$1":"$2"-"$3; auc=$4*($3-$2); s++}else{if(chr==$1 && $2==stop){stop=$3; auc=auc+($4*($3-$2)); if ($4 > max){max=$4; coord=$1":"$2"-"$3}else if($4 == max){split(coord,t,"-"); coord=t[1]"-"$3}}else{print chr"\t"start"\t"stop"\t"auc"\t"max"\t"coord; chr=$1; start=$2; stop=$3; max=$4; coord=$1":"$2"-"$3; auc=$4*($3-$2)}}}' $1 > $password.auc.bed -cut -f 4,5 $password.auc.bed > $password.auc +awk 'BEGIN{s=1}; {if(s==1){s++}else if(s==2){chr=$1; start=$2; stop=$3; max=$4; coord=$1":"$2"-"$3; auc=$4*($3-$2); num=1; s++}else{if(chr==$1 && $2==stop){num++; stop=$3; auc=auc+($4*($3-$2)); if ($4 > max){max=$4; coord=$1":"$2"-"$3 +}else if($4 == max){split(coord,t,"-"); coord=t[1]"-"$3}}else{print 
chr"\t"start"\t"stop"\t"auc"\t"max"\t"coord"\t"num; chr=$1; start=$2; stop=$3; max=$4; coord=$1":"$2"-"$3; auc=$4*($3-$2); num=1}}}' $1 > $password.auc.bed +cut -f 4,7 $password.auc.bed > $password.auc if [[ -f $2 ]] then - echo "Creating control AUC file: $(date)" + echo "Creating control AUC file: $(date)" - awk 'BEGIN{s=1}; {if(s==1){s++}else if(s==2){chr=$1; start=$2; stop=$3; max=$4; coord=$1":"$2"-"$3; auc=$4*($3-$2); s++}else{if(chr==$1 && $2==stop){stop=$3; auc=auc+($4*($3-$2)); if ($4 > max){max=$4; coord=$1":"$2"-"$3}else if($4 == max){split(coord,t,"-"); coord=t[1]"-"$3}}else{print chr"\t"start"\t"stop"\t"auc"\t"max"\t"coord; chr=$1; start=$2; stop=$3; max=$4; coord=$1":"$2"-"$3; auc=$4*($3-$2)}}}' $2 > $password2.auc.bed - cut -f 4,5 $password2.auc.bed > $password2.auc + awk 'BEGIN{s=1}; {if(s==1){s++}else if(s==2){chr=$1; start=$2; stop=$3; max=$4; coord=$1":"$2"-"$3; auc=$4*($3-$2); num=1; s++}else{if(chr==$1 && $2==stop){num++; stop=$3; auc=auc+($4*($3-$2)); if ($4 > max){max=$4; coord=$1":"$2"-" +$3}else if($4 == max){split(coord,t,"-"); coord=t[1]"-"$3}}else{print chr"\t"start"\t"stop"\t"auc"\t"max"\t"coord"\t"num; chr=$1; start=$2; stop=$3; max=$4; coord=$1":"$2"-"$3; auc=$4*($3-$2); num=1}}}' $2 > $password2.auc.bed + cut -f 4,7 $password2.auc.bed > $password2.auc fi # module load R ## For use on cluster @@ -133,16 +135,17 @@ fdr2=`cat $password.fdr.txt | sed -n '2p'` ## Added 5/15/19 for SEACR_1.1 #thresh=`cat $exp.threshold.txt` thresh=`cat $password.threshold.txt | sed -n '1p'` thresh2=`cat $password.threshold.txt | sed -n '2p'` +thresh3=`cat $password.threshold.txt | sed -n '3p'` echo "Creating thresholded feature file: $(date)" -if [[ $height == "relaxed" ]] +if [[ $height == "relaxed" ]] then - echo "Empirical false discovery rate = $fdr2" - awk -v value=$thresh2 '$4 > value {print $0}' $password.auc.bed > $password.auc.threshold.bed + echo "Empirical false discovery rate = $fdr2" + awk -v value=$thresh2 -v value2=$thresh3 '$4 > value && $7 
> value2 {print $0}' $password.auc.bed | cut -f 1,2,3,4,5,6 > $password.auc.threshold.bed else - echo "Empirical false discovery rate = $fdr" - awk -v value=$thresh '$4 > value {print $0}' $password.auc.bed > $password.auc.threshold.bed + echo "Empirical false discovery rate = $fdr" + awk -v value=$thresh -v value2=$thresh3 '$4 > value && $7 > value2 {print $0}' $password.auc.bed | cut -f 1,2,3,4,5,6 > $password.auc.threshold.bed fi if [[ -f $2 ]] @@ -150,7 +153,7 @@ then if [[ $norm == "norm" ]] #If normalizing, multiply control bedgraph by normalization constant then constant=`cat $password.norm.txt | sed -n '1p'` - awk -v mult=$constant 'BEGIN{OFS="\t"}; {$4=$4*mult; print $0}' $password2.auc.bed > $password2.auc2.bed + awk -v mult=$constant 'BEGIN{OFS="\t"}; {$4=$4*mult; print $0}' $password2.auc.bed | cut -f 1,2,3,4,5,6 > $password2.auc2.bed mv $password2.auc2.bed $password2.auc.bed fi awk -v value=$thresh '$4 > value {print $0}' $password2.auc.bed > $password2.auc.threshold.bed From f5d83367bc08c08ea38e2f9d26d7b719af16a3ef Mon Sep 17 00:00:00 2001 From: mpmeers Date: Tue, 29 Oct 2019 02:54:29 -0400 Subject: [PATCH 3/8] SEACR_1.1.R update Added a function to calculate the minimum threshold of lines per signal block at which there is a smaller percentage of target signal blocks remaining than control. This is meant to be used as a filter for signal blocks that pass the total signal threshold despite being composed of very few bedgraph lines, which are unlikely to be true peaks. 
--- SEACR_1.1.R | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/SEACR_1.1.R b/SEACR_1.1.R index 72fa123..0a3637f 100644 --- a/SEACR_1.1.R +++ b/SEACR_1.1.R @@ -55,6 +55,7 @@ if(is.na(numtest)){ ## If 2nd field is a bedgraph, calculate empirical threshold # print("Ctrl is a file") ctrl<-read.table(argsL$ctrl) ctrlvec<-ctrl$V1 + ctrlmax<-ctrl$V2 rm(ctrl) invis <- gc(verbose=FALSE) if(argsL$norm=="yes"){ ## Calculate peaks of density plots to generate normalization factor @@ -104,6 +105,10 @@ if(is.na(numtest)){ ## If 2nd field is a bedgraph, calculate empirical threshold x0<-a0 z0<-b0 } + both2<-c(expmax,ctrlmax) + d<-sort(unique(both2)) + pctremain2<-function(x) 1-(ecdf(expmax)(x)-ecdf(ctrlmax)(x)) + d0<-min(d[pctremain2(d) > 1]) invis <- gc(verbose=FALSE) fdr<-c(1-pctremain(x0[1]), 1-pctremain(z0[1])) ## New for SEACR_1.1 }else{ ## If 2nd field is numeric, calculate percentile threshold @@ -118,7 +123,7 @@ if(is.na(numtest)){ ## If 2nd field is a bedgraph, calculate empirical threshold fdr<-ctrl[1] ## New for SEACR_1.1 } invis <- gc(verbose=FALSE) -write.table(c(x0[1],z0[1]), file=paste(argsL$output, ".threshold.txt", sep=""), sep="\t", quote=FALSE, row.names=FALSE, col.names=FALSE) +write.table(c(x0[1],z0[1],d0[1]), file=paste(argsL$output, ".threshold.txt", sep=""), sep="\t", quote=FALSE, row.names=FALSE, col.names=FALSE) if(argsL$norm=="yes"){ write.table(constant, file=paste(argsL$output, ".norm.txt", sep=""), sep="\t", quote=FALSE, row.names=FALSE, col.names=FALSE) #Added 7/19/18 to ensure norm value is multiplied by ctrl } From 58628ebdca7a516c60c966d64fa4d365ed6e1dff Mon Sep 17 00:00:00 2001 From: mpmeers Date: Wed, 30 Oct 2019 09:45:46 -0700 Subject: [PATCH 4/8] Update README.md Added a sort command to sort the bed file by position in the example code under "Preparing input bedgraph files". 
--- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8a8fea0..39c0698 100644 --- a/README.md +++ b/README.md @@ -44,7 +44,7 @@ Here is some example code for converting from a paired-end BAM to a fragment bed bedtools bamtobed -bedpe -i $sample.bam > $sample.bed awk '$1==$4 && $6-$2 < 1000 {print $0}' $sample.bed > $sample.clean.bed - cut -f 1,2,6 $sample.clean.bed > $sample.fragments.bed + cut -f 1,2,6 $sample.clean.bed | sort -k1,1 -k2,2n -k3,3n > $sample.fragments.bed bedtools genomecov -bg -i $sample.fragments.bed -g my.genome > $sample.fragments.bedgraph ## Output file: From 2d120b562fd37d2f8abc3692d0da9e2350b62b14 Mon Sep 17 00:00:00 2001 From: mpmeers Date: Wed, 30 Oct 2019 14:37:33 -0700 Subject: [PATCH 5/8] SEACR_1.1.R update Change how the dataframe for density plotting is truncated (previously a hard-coded 90% cutoff): a dataframe of list quantile (i.e. line #/max line#) vs. value quantile (i.e. value/max value) is derived, and the threshold is selected by finding the dataframe pair for which the orthogonal distance below the line defined by (0,0);(1,1) is maximized. Also, the density dataframe is now made to be as long as the integer value closest to 1/10 the length of the input dataframe. 
--- SEACR_1.1.R | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/SEACR_1.1.R b/SEACR_1.1.R index 0a3637f..f013939 100644 --- a/SEACR_1.1.R +++ b/SEACR_1.1.R @@ -59,8 +59,25 @@ if(is.na(numtest)){ ## If 2nd field is a bedgraph, calculate empirical threshold rm(ctrl) invis <- gc(verbose=FALSE) if(argsL$norm=="yes"){ ## Calculate peaks of density plots to generate normalization factor - ctrlvalue<-sort(ctrlvec)[as.integer(0.9*length(ctrlvec))] ## Added 7/15/19 to improve memory performance - expvalue<-sort(expvec)[as.integer(0.9*length(expvec))] ## Added 7/15/19 to improve memory performance + dist2d<-function(a,b,c){v1<- b - c; v2<- a - b; m<-cbind(v1,v2); d<-det(m)/sqrt(sum(v1*v1))} + expframe<-data.frame(count=seq(1,0,length=length(expvec)), quant=sort(expvec,decreasing=TRUE)/max(expvec), value=sort(expvec,decreasing=TRUE)) + expframe$diff<-abs(expframe$count-expframe$quant) + expframe<-expframe[expframe$diff > 0.9*max(expframe$diff),] + expframe$dist<-apply(expframe,1,function(x) dist2d(c(x[1],x[2]),0,1)) + ctrlframe<-data.frame(count=seq(1,0,length=length(ctrlvec)), quant=sort(ctrlvec,decreasing=TRUE)/max(ctrlvec), value=sort(ctrlvec,decreasing=TRUE)) + ctrlframe$diff<-abs(ctrlframe$count-ctrlframe$quant) + ctrlframe<-ctrlframe[ctrlframe$diff > 0.9*max(ctrlframe$diff),] + ctrlframe$dist<-apply(ctrlframe,1,function(x) dist2d(c(x[1],x[2]),0,1)) + if(ctrlframe$value[ctrlframe$dist==max(ctrlframe$dist)][1] > sort(ctrlvec)[as.integer(0.9*length(ctrlvec))]){ + ctrlvalue<-ctrlframe$value[ctrlframe$dist==max(ctrlframe$dist)][1] + }else{ + ctrlvalue<-sort(ctrlvec)[as.integer(0.9*length(ctrlvec))] ## Added 7/15/19 to improve memory performance + } + if(expframe$value[expframe$dist==max(expframe$dist)][1] > sort(expvec)[as.integer(0.9*length(expvec))]){ + expvalue<-expframe$value[expframe$dist==max(expframe$dist)][1] + }else{ + expvalue<-sort(expvec)[as.integer(0.9*length(expvec))] ## Added 7/15/19 to improve memory performance + } 
ctrltest<-density(ctrlvec[ctrlvec <= ctrlvalue]) ## New for SEACR_1.1 exptest<-density(expvec[expvec <= expvalue]) ## New for SEACR_1.1 constant<-(exptest$x[exptest$y==max(exptest$y)])/(ctrltest$x[ctrltest$y==max(ctrltest$y)]) From 9c10d0fb1536460999a35ab8babdcb4447e8be0a Mon Sep 17 00:00:00 2001 From: mpmeers Date: Wed, 30 Oct 2019 14:47:29 -0700 Subject: [PATCH 6/8] Update README.md Described v1.2 updates --- README.md | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 39c0698..b2e19eb 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,12 @@ A web interface for SEACR analysis can be found at https://seacr.fredhutch.org ## Recent changes +### v1.2 + +- Fixed a bug in lines 166 and 168 in which misplaced brackets caused the misreporting of the max signal region terminal coordinate for merged signal blocks +- Added a counter to keep track of the number of component bedgraph lines that compose each signal block, and a function to calculate the minimum threshold of lines per signal block at which there is a smaller percentage of target signal blocks remaining than control. This is meant to be used as a filter for signal blocks that pass the total signal threshold despite being composed of very few bedgraph lines, which are unlikely to be true peaks. +- Changed how the dataframe for density plotting is truncated (previously a hard-coded 90% cutoff): a dataframe of list quantile (i.e. line #/max line#) vs. value quantile (i.e. value/max value) is derived, and the threshold is selected by finding the dataframe pair for which the orthogonal distance below the line defined by (0,0);(1,1) is maximized. + ### v1.1 - Changed "union" and "AUC" modes to "relaxed" and "stringent" modes, respectively. 
- Removed maximum signal threshold from "relaxed" mode and replaced it with an alternate total signal threshold that uses the point halfway between the knee and the peak of the total signal curve as described in the manuscript text. This change improves performance at high read depth. @@ -22,7 +28,7 @@ A web interface for SEACR analysis can be found at https://seacr.fredhutch.org ## Usage: - bash SEACR_1.1.sh experimental bedgraph [control bedgraph | numeric threshold] ["norm" | "non"] ["relaxed" | "stringent"] output prefix + bash SEACR_1.2.sh experimental bedgraph [control bedgraph | numeric threshold] ["norm" | "non"] ["relaxed" | "stringent"] output prefix ## Description of input fields: @@ -70,11 +76,11 @@ Field 6: Region representing the farthest upstream and farthest downstream bases ## Examples: - bash SEACR_1.1.sh target.bedgraph IgG.bedgraph norm stringent output + bash SEACR_1.2.sh target.bedgraph IgG.bedgraph norm stringent output Calls enriched regions in target data using normalized IgG control track with stringent threshold - bash SEACR_1.1.sh target.bedgraph IgG.bedgraph non relaxed output + bash SEACR_1.2.sh target.bedgraph IgG.bedgraph non relaxed output Calls enriched regions in target data using non-normalized IgG control track with relaxed threshold - bash SEACR_1.1.sh target.bedgraph 0.01 non stringent output + bash SEACR_1.2.sh target.bedgraph 0.01 non stringent output Calls enriched regions in target data by selecting the top 1% of regions by AUC From f5e2326d000a74cc7d0aee8b6f8a6388012715c9 Mon Sep 17 00:00:00 2001 From: mpmeers Date: Wed, 30 Oct 2019 14:49:32 -0700 Subject: [PATCH 7/8] Rename SEACR_1.1.R to SEACR_1.2.R --- SEACR_1.1.R => SEACR_1.2.R | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename SEACR_1.1.R => SEACR_1.2.R (100%) diff --git a/SEACR_1.1.R b/SEACR_1.2.R similarity index 100% rename from SEACR_1.1.R rename to SEACR_1.2.R From acf811695c800ae2962a7b74850b3fdb54c9eb20 Mon Sep 17 00:00:00 2001 From: mpmeers Date: Wed, 
30 Oct 2019 14:50:16 -0700 Subject: [PATCH 8/8] Update and rename SEACR_1.1.sh to SEACR_1.2.sh --- SEACR_1.1.sh => SEACR_1.2.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) rename SEACR_1.1.sh => SEACR_1.2.sh (96%) diff --git a/SEACR_1.1.sh b/SEACR_1.2.sh similarity index 96% rename from SEACR_1.1.sh rename to SEACR_1.2.sh index 75a5fe9..e27f034 100755 --- a/SEACR_1.1.sh +++ b/SEACR_1.2.sh @@ -7,7 +7,7 @@ then echo " SEACR: Sparse Enrichment Analysis for CUT&RUN - Usage: bash SEACR_1.1.sh <experimental bedgraph>.bg [<control bedgraph>.bg | <numeric threshold>] ["norm" | "non"] ["relaxed" | "stringent"] output prefix + Usage: bash SEACR_1.2.sh <experimental bedgraph>.bg [<control bedgraph>.bg | <numeric threshold>] ["norm" | "non"] ["relaxed" | "stringent"] output prefix Description of input fields: @@ -42,12 +42,12 @@ then Field 6: Region representing the farthest upstream and farthest downstream bases within the denoted coordinates that are represented by the maximum bedgraph signal Examples: - bash SEACR_1.1.sh target.bedgraph IgG.bedgraph norm stringent output + bash SEACR_1.2.sh target.bedgraph IgG.bedgraph norm stringent output Calls enriched regions in target data using normalized IgG control track with stringent threshold - bash SEACR_1.1.sh target.bedgraph IgG.bedgraph non relaxed output + bash SEACR_1.2.sh target.bedgraph IgG.bedgraph non relaxed output Calls enriched regions in target data using non-normalized IgG control track with relaxed threshold - bash SEACR_1.1.sh target.bedgraph 0.01 non stringent output + bash SEACR_1.2.sh target.bedgraph 0.01 non stringent output Calls enriched regions in target data by selecting the top 1% of regions by area under the curve (AUC) " exit 1