From c28dfb771e5aaabd676a910deef2211423ad0fae Mon Sep 17 00:00:00 2001 From: Nahiyan Malik Date: Fri, 27 Sep 2013 23:44:42 -0400 Subject: [PATCH] Added all files --- FactorStateClusterNormalizedPlot.r | 64 ++++ FactorStateClusterPlot.r | 97 ++++++ FactorStateIntersect.sh | 8 + FactorStateTable.pl | 97 ++++++ FactorStateTableNormalized.pl | 43 +++ FactorStateTally.sh | 24 ++ NormalizedStats.pl | 54 ++++ README.md | 6 +- RawDataToTables.r | 34 +++ SplitBroadHMM.pl | 23 ++ all-bedfilesplot.pl | 11 + all-processbedfile-summit.pl | 26 ++ allbedcat.pl | 19 ++ allstats-bedcat.pl | 30 ++ bedfile-statsjob-qsub.sh | 11 + bedfileplot.r | 78 +++++ bedfiles-run-cluster.txt | 150 +++++++++ bedfiles-run.txt | 8 + bedfilesplit.sh | 7 + normalize-bash.sh | 8 + normalize-qsub-remaining.sh | 3 + normalize-qsub.sh | 10 + overallplots-normalized.r | 436 ++++++++++++++++++++++++++ overallplots.r | 475 +++++++++++++++++++++++++++++ overallplotsdata-normalized.pl | 99 ++++++ overallplotsdata.pl | 99 ++++++ processbedfile-summit.pl | 26 ++ processbedfile.pl | 25 ++ readwigbedgraph.pl | 24 ++ run-normalize-todo.txt | 76 +++++ run-normalize.txt | 76 +++++ submit-job-on-cluster.txt | 1 + toolbedgraph-bash.sh | 3 + toolbedgraph.pl | 197 ++++++++++++ 34 files changed, 2346 insertions(+), 2 deletions(-) create mode 100644 FactorStateClusterNormalizedPlot.r create mode 100644 FactorStateClusterPlot.r create mode 100644 FactorStateIntersect.sh create mode 100644 FactorStateTable.pl create mode 100644 FactorStateTableNormalized.pl create mode 100644 FactorStateTally.sh create mode 100644 NormalizedStats.pl create mode 100644 RawDataToTables.r create mode 100644 SplitBroadHMM.pl create mode 100644 all-bedfilesplot.pl create mode 100644 all-processbedfile-summit.pl create mode 100644 allbedcat.pl create mode 100644 allstats-bedcat.pl create mode 100644 bedfile-statsjob-qsub.sh create mode 100644 bedfileplot.r create mode 100644 bedfiles-run-cluster.txt create mode 100644 bedfiles-run.txt create mode 100644 bedfilesplit.sh create mode 100644 normalize-bash.sh create mode 100644 normalize-qsub-remaining.sh create mode 100644 normalize-qsub.sh create mode 100644 overallplots-normalized.r create mode 100644 overallplots.r create mode 100644 overallplotsdata-normalized.pl create mode 100644 overallplotsdata.pl create mode 100644 processbedfile-summit.pl create mode 100644 processbedfile.pl create mode 100644 readwigbedgraph.pl create mode 100644 run-normalize-todo.txt create mode 100644 run-normalize.txt create mode 100644 submit-job-on-cluster.txt create mode 100644 toolbedgraph-bash.sh create mode 100644 toolbedgraph.pl diff --git a/FactorStateClusterNormalizedPlot.r b/FactorStateClusterNormalizedPlot.r new file mode 100644 index 0000000..f0964ae --- /dev/null +++ b/FactorStateClusterNormalizedPlot.r @@ -0,0 +1,64 @@ +library(ggplot2) +library(gplots) +library(RColorBrewer) + +factorStateTable <- read.table("stats/factor-state-table.txt", header = TRUE, as.is=TRUE, row.names=1); +# print(factorStateTable) +class(factorStateTable) + +factorStateTallyOverMeanTable <- read.table("stats/factor-state-sample-mean-table.txt", header = TRUE, as.is=TRUE, row.names=1); + +# factorStateTallyOverMeanTable <- log(factorStateTallyOverMeanTable) + +stepColours <- read.table("stats/step-colours.txt", header = FALSE, as.is=TRUE) +stepColoursRow <- as.matrix(as.data.frame(t(stepColours), stringsAsFactors=FALSE)); +print(stepColours) +print(stepColoursRow) +class(stepColours) +class(stepColoursRow) + +for (i in 1:ncol(stepColoursRow)) { + stepColoursRow[1,i] <- paste("#",stepColoursRow[1,i],sep="") +} + +print(stepColoursRow) + + +# print(max(factorStateTable[,2:ncol(factorStateTable)])) +# print(sum(factorStateTable[,1:15])) + +# print(factorStatePercentTable) + +tallyovermeanMatrix <- data.matrix(factorStateTallyOverMeanTable) + +# tallyovermeanMatrix <- log2(tallyovermeanMatrix) + +print(log(factorStateTallyOverMeanTable)) + +# test <- data.matrix(factorStateTallyOverMeanTable) +# log2(test) +# log2(tallyovermeanMatrix) + + + +rowClust <- hclust(dist(tallyovermeanMatrix)) + +# Plot the data table as heatmap and the cluster results as dendrograms. +png("stats/cluster-heatmap-tallyovermean.png", width=2000, height=1500, res=100) +mycl <- cutree(rowClust, k=6); +# mycolhc <- c("#b8860b", "#ffd700", "#cd5c5c", "#32cd32", "#00bfff", "#ff4500") +mycolhc <- c("#ff9933", "#990000", "#990099", "#009900", "#33CCFF", "#FF0000", "#826561") +mycolhc <- mycolhc[as.vector(mycl)]; +hm <- heatmap.2(tallyovermeanMatrix, trace="none", RowSideColors=mycolhc, cexRow=0.75, cexCol=1.0, key=TRUE, keysize=1.5, margins=c(10,22)) +dev.off() +# print(hm) +# print(mycolhc) +# print(hm$Rowv) + +sink("stats/heatmap-cluster-colours-tallyovermean.txt") +cat(mycolhc) +sink() + + + +# Cut the tree at specific height and color the corresponding clusters in the heatmap color bar. \ No newline at end of file diff --git a/FactorStateClusterPlot.r b/FactorStateClusterPlot.r new file mode 100644 index 0000000..0ba73ab --- /dev/null +++ b/FactorStateClusterPlot.r @@ -0,0 +1,97 @@ +library(ggplot2) +library(gplots) +library(RColorBrewer) + +factorStateTable <- read.table("stats/factor-state-table.txt", header = TRUE, as.is=TRUE, row.names=1); +# print(factorStateTable) +class(factorStateTable) + +factorStatePercentTable <- read.table("stats/factor-state-percent-table.txt", header = TRUE, as.is=TRUE, row.names=1); + +stepColours <- read.table("stats/step-colours.txt", header = FALSE, as.is=TRUE) +stepColoursRow <- as.matrix(as.data.frame(t(stepColours), stringsAsFactors=FALSE)); +print(stepColours) +print(stepColoursRow) +class(stepColours) +class(stepColoursRow) + +for (i in 1:ncol(stepColoursRow)) { + stepColoursRow[1,i] <- paste("#",stepColoursRow[1,i],sep="") +} + +print(stepColoursRow) + + +# print(max(factorStateTable[,2:ncol(factorStateTable)])) +# print(sum(factorStateTable[,1:15])) + +# print(factorStatePercentTable) + +percentMatrix <- data.matrix(factorStatePercentTable) + +################ + +# ## The colors you specified. +# stepColours <- stepColoursRow +# ## Defining breaks for the color scale +# # stepBreaks <- c(0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1) + +# library(sparcl) + +# png("stats/cluster-heatmap-morecolours.png", width=2000, height=1500, res=100) +# rc <- rainbow(nrow(percentMatrix), start = 0, end = .3) +# cc <- rainbow(ncol(percentMatrix), start = 0, end = .3) + +# hc <- hclust(dist(percentMatrix)) +# dd <- as.dendrogram(hc) + +# hm <- heatmap.2(percentMatrix, #Rowv=NA, Colv=NA, +# # col = stepColours, ## using your colors +# # breaks = stepBreaks, ## using your breaks +# # dendrogram = "none", ## to suppress warnings +# cexRow=0.75, cexCol=1.0, key=TRUE, keysize=1.5, +# margins=c(10,22), +# trace="none") +# # ColorDendrogram(hm,main="My Simulated Data",branchlength=3) +# # legend("topright", fill = stepColours, +# # legend = c("0 to 10%", "10%+ to 20%", "20%+ to 30%", "30%+ to 40%", "40%+ to 50%", "50%+ to 60%", "60%+ to 70%", "70%+ to 80%", "80%+ to 90%", "90%+ to 100%")) +# # mar.orig <- par()$mar # save the original values +# # par(mar = c(4,4,50,4)) # set your new values EACH 1 IS 15 PX +# # par(mar = mar.orig) # put the original values back + +# dev.off() + +################## + +# mydatascale <- t(scale(t(percentMatrix))) # Centers and scales data. +rowClust <- hclust(dist(percentMatrix)) + +# print(rowClust[]) +# # colClust <- hclust(as.dist(percentMatrix)) +# hr <- hclust(as.dist(1-cor(t(mydatascale), method="pearson")), method="complete") # Cluster rows by Pearson correlation. +# hc <- hclust(as.dist(1-cor(mydatascale, method="spearman")), method="complete") +# # Clusters columns by Spearman correlation. +# heatmap(percentMatrix)# Rowv=as.dendrogram(hr), Colv=as.dendrogram(hc)) + + + + + # Plot the data table as heatmap and the cluster results as dendrograms. +png("stats/cluster-heatmap-morecolours.png", width=2000, height=1500, res=100) +mycl <- cutree(rowClust, k=6); +# mycolhc <- c("#b8860b", "#ffd700", "#cd5c5c", "#32cd32", "#00bfff", "#ff4500") +mycolhc <- c("#ff9933", "#990000", "#990099", "#009900", "#33CCFF", "#FF0000") +mycolhc <- mycolhc[as.vector(mycl)]; +hm <- heatmap.2(percentMatrix, RowSideColors=mycolhc, trace="none", cexRow=0.75, cexCol=1.0, key=TRUE, keysize=1.5, margins=c(10,22)) +dev.off() +print(hm) +print(mycolhc) +print(hm$Rowv) + +sink("stats/heatmap-cluster-colours.txt") +cat(mycolhc) +sink() + + + + # Cut the tree at specific height and color the corresponding clusters in the heatmap color bar. \ No newline at end of file diff --git a/FactorStateIntersect.sh b/FactorStateIntersect.sh new file mode 100644 index 0000000..923457d --- /dev/null +++ b/FactorStateIntersect.sh @@ -0,0 +1,8 @@ +cat "stats/run.txt" | while read tf; do + mkdir broadhmm/$tf + echo $tf + for state in {1..15} + do + bedtools intersect -a bedfiles/$tf -b broadhmm/$state.bed > broadhmm/$tf/$state-intersect.bed + done +done \ No newline at end of file diff --git a/FactorStateTable.pl b/FactorStateTable.pl new file mode 100644 index 0000000..bc174bc --- /dev/null +++ b/FactorStateTable.pl @@ -0,0 +1,97 @@ +#!/usr/bin/perl +use warnings; +# use integer; +use List::Util qw(sum); +use Math::BigFloat; + +open runfile, "stats/run.txt"; +$writeTablePath = "stats/factor-state-table.txt"; +$writeTablePercentPath = "stats/factor-state-percent-table.txt"; +open (writeFactorStateTableFile, ">".$writeTablePath); +open (writeFactorStateTablePercentFile, ">".$writeTablePercentPath); + +print writeFactorStateTableFile "factor\t1_Active_Promoter\t2_Weak_Promoter\t3_Poised_Promoter\t4_Strong_Enhancer\t5_Strong_Enhancer\t6_Weak_Enhancer\t7_Weak_Enhancer\t8_Insulator\t9_Txn_Transition\t10_Txn_Elongation\t11_Weak_Txn\t12_Repressed\t13_Heterochrom/lo\t14_Repetitive/CNV\t15_Repetitive/CNV\n"; +# print writeFactorStateTableFile "state1\tstate2\tstate3\tstate4\tstate5\tstate6\tstate7\tstate8\tstate9\tstate10\tstate11\tstate12\tstate13\tstate14\tstate15\n"; + +# print writeFactorStateTablePercentFile "factor\tstate1\tstate2\tstate3\tstate4\tstate5\tstate6\tstate7\tstate8\tstate9\tstate10\tstate11\tstate12\tstate13\tstate14\tstate15\n"; +print writeFactorStateTablePercentFile "factor\t1_Active_Promoter\t2_Weak_Promoter\t3_Poised_Promoter\t4_Strong_Enhancer\t5_Strong_Enhancer\t6_Weak_Enhancer\t7_Weak_Enhancer\t8_Insulator\t9_Txn_Transition\t10_Txn_Elongation\t11_Weak_Txn\t12_Repressed\t13_Heterochrom/lo\t14_Repetitive/CNV\t15_Repetitive/CNV\n"; + + +my @factorTotals = (); + +while () { + chomp; + my $factor = $_; + print "$factor\n"; + + my $factorSum = 0; + + print writeFactorStateTableFile "$factor\t"; + open tallyfile, "broadhmm/$factor/factor-tally.txt"; + + while () { + chomp; + my $stateTally = $_; + print writeFactorStateTableFile "$stateTally\t"; + # push(@factorValues, $stateTally); + $factorSum += $stateTally; + } + + close tallyfile; + print writeFactorStateTableFile "\n"; + # $factorSum = sum(@factorValues); + + print writeFactorStateTablePercentFile "$factor\t"; + open tallyfile, "broadhmm/$factor/factor-tally.txt"; + + while () { + chomp; + my $stateTally = $_; + my $statePercent = $stateTally / $factorSum; + print ref $stateTally; + + print "factorsum: $factorSum\n"; + print "$stateTally\n"; + print "$statePercent\n"; + + print writeFactorStateTablePercentFile "$statePercent\t"; + } + + close tallyfile; + print writeFactorStateTablePercentFile "\n"; + + # push(@factorTotals, sum(@factorValues)); + +} + +# foreach (@factorTotals) +# { +# print "$_\n"; +# } + +# print $factorTotals[1]; + + + + + + + + + + +# for ($state = 1; $state < 16; $state++) { +# print "$state\n"; +# open broadhmmFile, $broadhmmPath; +# while () { +# chomp; +# my @line = split(/\t/,"$_"); + +# # print index($line[3], $state); + +# if (index($line[3],"$state"."_") == 0) { +# # print "$line[3]\n"; +# print writeSplitState "$_\n"; +# } +# } +# } \ No newline at end of file diff --git a/FactorStateTableNormalized.pl b/FactorStateTableNormalized.pl new file mode 100644 index 0000000..3e1be8f --- /dev/null +++ b/FactorStateTableNormalized.pl @@ -0,0 +1,43 @@ +#!/usr/bin/perl +use warnings; +# use integer; +use List::Util qw(sum); +use Math::BigFloat; + +open runfile, "stats/run.txt"; +$writeTableSampleMeanPath = "stats/factor-state-sample-mean-table.txt"; +open (writeFactorStateTableSampleMeanFile, ">".$writeTableSampleMeanPath); + +print writeFactorStateTableSampleMeanFile "factor\t1_Active_Promoter\t2_Weak_Promoter\t3_Poised_Promoter\t4_Strong_Enhancer\t5_Strong_Enhancer\t6_Weak_Enhancer\t7_Weak_Enhancer\t8_Insulator\t9_Txn_Transition\t10_Txn_Elongation\t11_Weak_Txn\t12_Repressed\t13_Heterochrom/lo\n"; + +while () { + chomp; + my $factor = $_; + print writeFactorStateTableSampleMeanFile "$factor\t"; + + my @factorTally = (); + my @factorMean = (); + + open factorTallyFile, "broadhmm/$factor/factor-tally.txt"; + open factorMeanFile, "broadhmm/$factor/normalized/factor-sample-mean.txt"; + + while () { + chomp; + # print "$_\n"; + push (@factorTally, $_); + } + + while () { + chomp; + # print "$_\n"; + push (@factorMean, $_); + } + + # print $factorMean[0]; + + for ($line = 0; $line < 13; $line++) { + my $tallyOverMean = $factorTally[$line] / $factorMean[$line]; + print writeFactorStateTableSampleMeanFile "$tallyOverMean\t"; + } + print writeFactorStateTableSampleMeanFile "\n"; +} \ No newline at end of file diff --git a/FactorStateTally.sh b/FactorStateTally.sh new file mode 100644 index 0000000..77ec769 --- /dev/null +++ b/FactorStateTally.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +# tally for each state + +for state in {1..15} +do + echo $state + cat "stats/run.txt" | while read tf; do + tally=`wc broadhmm/$tf/$state-intersect.bed | awk {'print $1'}` + echo -e "$tf\t$tally" + done > broadhmm/$state-tally.txt +done + + +# for easier processing later, tally for each factor according to state + +cat "stats/run.txt" | while read tf; do + echo "$tf" + for state in {1..15} + do + tally=`wc broadhmm/$tf/$state-intersect.bed | awk {'print $1'}` + echo -e "$tally" + done > broadhmm/$tf/factor-tally.txt +done \ No newline at end of file diff --git a/NormalizedStats.pl b/NormalizedStats.pl new file mode 100644 index 0000000..41be7a6 --- /dev/null +++ b/NormalizedStats.pl @@ -0,0 +1,54 @@ +#!/usr/bin/perl + +my $runfilePath = "stats/run.txt"; +open runfile, "stats/run.txt"; + +# print index("this is a string", "t"); + + +while() { + chomp; + my $factor = $_; + # print "$factor\n"; + + open (writefileSampleMean, ">broadhmm/$factor/normalized/factor-sample-mean.txt"); + open (writefileRealStat, ">broadhmm/$factor/normalized/factor-real-stat.txt"); + + for ($state = 1; $state < 16; $state++) { + # print "$state\n"; + open factorStateFile, "broadhmm/$factor/normalized/$state.txt"; + + # print "broadhmm/$factor/normalized/$state.txt\n"; + my $factorStateSampleMean = 0; + my $factorStateRealStat = 0; + + my $counter = 1; + + while () { + chomp; + my $line = $_; + # print "$line\n"; + + if(index($line, "The sample mean") != -1) { + my @line = split(/\s/,"$_"); + $factorStateSampleMean = $line[5]; + print "FOUND\n"; + # print "@line\n"; + # print "$line[5]\n"; + } + + if(index($line, "The real stat") != -1) { + my @line = split(/\s/,"$_"); + $factorStateRealStat = $line[5]; + print "FOUND\n"; + # print "@line\n"; + # print "$line[5]\n"; + } + } + print "$factorStateSampleMean\n"; + print "$factorStateRealStat\n"; + + print writefileSampleMean "$factorStateSampleMean\n"; + print writefileRealStat "$factorStateRealStat\n"; + } +} \ No newline at end of file diff --git a/README.md b/README.md index 42502bf..e430204 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,6 @@ -nucleosomepositioning +Nucleosome Positioning Tool ===================== -Nucleosome Positioning +A tool to find and visualize the closest nucleosomes to transcription binding sites. + +Languages: Perl, R, Python, Bash diff --git a/RawDataToTables.r b/RawDataToTables.r new file mode 100644 index 0000000..2031a9f --- /dev/null +++ b/RawDataToTables.r @@ -0,0 +1,34 @@ +library(plyr) + +rawDistanceRows <- readLines("stats/violin-distance.txt") +rawIntensityRows <- readLines("stats/violin-intensity.txt") + +# split by sep +rawDistanceRowsSplits <- strsplit(rawDistanceRows, "\t") +# Make each into a list of dataframes (for rbind.fill) +rawDistanceRowsSplits <- lapply(rawDistanceRowsSplits, function(x)as.data.frame(t(x))) +# now bind +distanceRowsModified <- rbind.fill(rawDistanceRowsSplits) +distanceColumnsModified <- as.data.frame(t(as.matrix(distanceRowsModified))); + +write.table(distanceRowsModified, "stats/violin-distance-modifedrows.txt"); +write.table(distanceColumnsModified, "stats/violin-distance-modifiedcolumns.txt"); + + + +# split by sep +rawIntensityRowsSplits <- strsplit(rawIntensityRows, "\t") +# Make each into a list of dataframes (for rbind.fill) +rawIntensityRowsSplits <- lapply(rawIntensityRowsSplits, function(x)as.data.frame(t(x))) +# now bind +intensityRowsModified <- rbind.fill(rawIntensityRowsSplits) +intensityColumnsModified <- as.data.frame(t(as.matrix(intensityRowsModified))); + +write.table(intensityRowsModified, "stats/violin-intensity-modifedrows.txt"); +write.table(intensityColumnsModified, "stats/violin-intensity-modifiedcolumns.txt"); + + +dim(distanceRowsModified) +dim(distanceColumnsModified) +dim(intensityRowsModified) +dim(intensityColumnsModified) \ No newline at end of file diff --git a/SplitBroadHMM.pl b/SplitBroadHMM.pl new file mode 100644 index 0000000..264dd69 --- /dev/null +++ b/SplitBroadHMM.pl @@ -0,0 +1,23 @@ +#!/usr/bin/perl +use warnings; +use integer; + +$broadhmmPath = "broadhmm/wgEncodeBroadHmmGm12878HMM.bed"; + +for ($state = 1; $state < 16; $state++) { +print "$state\n"; +$writePath = "broadhmm/$state.bed"; +open (writeSplitState, ">$writePath"); + open broadhmmFile, $broadhmmPath; + while () { + chomp; + my @line = split(/\t/,"$_"); + + # print index($line[3], $state); + + if (index($line[3],"$state"."_") == 0) { + # print "$line[3]\n"; + print writeSplitState "$_\n"; + } + } +} \ No newline at end of file diff --git a/all-bedfilesplot.pl b/all-bedfilesplot.pl new file mode 100644 index 0000000..384d8c0 --- /dev/null +++ b/all-bedfilesplot.pl @@ -0,0 +1,11 @@ +#!/usr/bin/perl +use warnings; + +open statsList, "stats/run.txt"; + +while () { + chomp; + my $tf = "$_"; + print "$tf\n"; + system("Rscript bedfileplot.r $tf"); +} \ No newline at end of file diff --git a/all-processbedfile-summit.pl b/all-processbedfile-summit.pl new file mode 100644 index 0000000..793090e --- /dev/null +++ b/all-processbedfile-summit.pl @@ -0,0 +1,26 @@ +#!/usr/bin/perl +use warnings; + +open bedFilesList, "bedfiles-run.txt"; + +while(){ + chomp; + my $bedfile = "$_"; + print("$bedfile"); + my $bedfilePath = "bedfiles/$bedfile"; + + mkdir "$bedfilePath-parts-summit"; + my $writeFileName = "$bedfilePath-parts-summit/$bedfile-modified-summit.txt"; + open openfile, $bedfilePath; + open (writefile, ">$writeFileName"); + + while(){ + chomp; + my $line = "$_\n"; + my @entries = split("\t"); + my $middle = $entries[1] + $entries[9] - 1; + #print "$writeFileName\n"; + print writefile "$entries[0]\t$entries[1]\t$entries[2]\t$middle\n"; + } + system("bash bedfilesplit.sh $bedfile"); +} \ No newline at end of file diff --git a/allbedcat.pl b/allbedcat.pl new file mode 100644 index 0000000..e425cb7 --- /dev/null +++ b/allbedcat.pl @@ -0,0 +1,19 @@ +#!/usr/bin/perl +use warnings; + +my $statsDirectory = "stats/".$ARGV[0]."/bedfileparts-gteq1.5peak-summit"; + +system("cat $statsDirectory/bed*-stats.txt > $statsDirectory/all-bed-stats.txt"); + +open allbedstatread, $statsDirectory."/all-bed-stats.txt"; +open (bedwrite010, ">".$statsDirectory."/all-bed-stats-010.txt"); + +while () { + my @line = split(/\t/,"$_"); + my $distance = $line[0]; + my $intensity = $line[1]; + + if ($intensity <= 10) { + print bedwrite010 "$distance\t$intensity"; + } +} \ No newline at end of file diff --git a/allstats-bedcat.pl b/allstats-bedcat.pl new file mode 100644 index 0000000..b0fc870 --- /dev/null +++ b/allstats-bedcat.pl @@ -0,0 +1,30 @@ +#!/usr/bin/perl +use warnings; + +open statsList, "stats/run.txt"; + +while() { + chomp; + my $tf = "$_"; + print "$tf\n"; + + my $statsDirectory = "stats/$tf/bedfileparts-gteq1.5peak-summit"; + system("cat $statsDirectory/bed*-stats.txt > $statsDirectory/all-bed-stats.txt"); + + open allbedstatread, $statsDirectory."/all-bed-stats.txt"; + open (bedwrite010, ">".$statsDirectory."/all-bed-stats-010.txt"); + + while () { + chomp; + my @line = split(/\t/,"$_"); + my $distance = $line[0]; + my $intensity = $line[1]; + + if ($intensity <= 10) { + print bedwrite010 "$distance\t$intensity\n"; + } + } + + close allbedstatread; + close bedwrite010; +} \ No newline at end of file diff --git a/bedfile-statsjob-qsub.sh b/bedfile-statsjob-qsub.sh new file mode 100644 index 0000000..de08246 --- /dev/null +++ b/bedfile-statsjob-qsub.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +tf=$1; + +mkdir stats/$tf; + +for split in `ls -1 bedfiles/$tf-parts-summit/bedfile* | xargs -n 1 basename` +do + qsub -S /bin/bash -cwd -o stats/$tf/$split.log -e stats/$tf/$split-error.log -N $tf-$split-job toolbedgraph-bash.sh $tf $split + #bash toolbedgraph-bash.sh $tf $split +done \ No newline at end of file diff --git a/bedfileplot.r b/bedfileplot.r new file mode 100644 index 0000000..96ed59c --- /dev/null +++ b/bedfileplot.r @@ -0,0 +1,78 @@ +library(hexbin) +library(lattice) + +args <- commandArgs(trailingOnly = TRUE) +tf <- args[1] + +statFile <- paste("stats/", tf ,"/bedfileparts-gteq1.5peak-summit/all-bed-stats.txt", sep="") +zeroTenScatter <- paste("stats/", tf ,"/bedfileparts-gteq1.5peak-summit/010scatterplot.png", sep="") +zeroTenHeatmap <- paste("stats/", tf ,"/bedfileparts-gteq1.5peak-summit/010heatmap.png", sep="") +zeroUnlScatter <- paste("stats/", tf ,"/bedfileparts-gteq1.5peak-summit/0Unlscatterplot.png", sep="") +zeroUnlHeatmap <- paste("stats/", tf ,"/bedfileparts-gteq1.5peak-summit/0Unlheatmap.png", sep="") +zeroTenSmoothscatter <- paste("stats/", tf ,"/bedfileparts-gteq1.5peak-summit/010smoothscatter.png", sep="") +zeroUnlSmoothscatter <- paste("stats/", tf ,"/bedfileparts-gteq1.5peak-summit/0Unlsmoothscatter.png", sep="") +zeroTenSpline <- paste("stats/", tf ,"/bedfileparts-gteq1.5peak-summit/010spline.png", sep="") + +zeroTenScatterTitle <- paste(tf, " - 0 to 10 Scatter plot" , sep=""); +zeroTenHeatmapTitle <- paste(tf, " - 0 to 10 Heat map" , sep=""); +zeroTenSmoothscatterTitle <- paste(tf, " - 0 to 10 Smooth Scatter", sep=""); +zeroUnlScatterTitle <- paste(tf, " - 0 to Unl Scatter plot" , sep=""); +zeroUnlHeatmapTitle <- paste(tf, " - 0 to Unl Heat map" , sep=""); +zeroUnlSmoothscatterTitle <- paste(tf, " - 0 to Unl Smooth Scatter", sep=""); +zeroTenSplineTitle <- paste(tf, " - 0 to 10 Spline", sep=""); + +data <- read.table(statFile, as.is=TRUE); + +xpts010 <- data[,1][ data[,2] <= 10 & data[,2]>=0] +ypts010 <- data[,2][ data[,2] <= 10 & data[,2]>=0] + +spl010 <- smooth.spline(xpts010, ypts010) +xSplPts <- spl010[1] +ySplPts <- spl010[2] +ySplPtsUnlist <- unlist(ySplPts) +xSplPtsUnlist <- unlist(xSplPts) + +maxXIndex <- which.max(ySplPtsUnlist) +maxX <- xSplPtsUnlist[maxXIndex]; +maxY <- max(ySplPtsUnlist) + +xpts0Unl <- data[,1] +ypts0Unl <- data[,2] + +bin010 <- hexbin(x=xpts010, y=ypts010); +bin0Unl <- hexbin(x=xpts0Unl, y=ypts0Unl); + +png(zeroTenScatter, width = 1000, height = 1000); +plot(xpts010, ypts010, main = zeroTenScatterTitle, xlab = "Distance (bp)", ylab = "Intensity", col=rgb(0,0,0,100,maxColorValue=255), pch=16); +dev.off(); + +png(zeroTenHeatmap, width = 1000, height = 1000); +plot(bin010, main = zeroTenHeatmapTitle, xlab = "Distance (bp)", ylab = "Intensity"); +dev.off(); + +png(zeroTenSmoothscatter, width = 1000, height = 1000); +smoothScatter(xpts010, ypts010, nbin = 1000, main = zeroTenSmoothscatterTitle, xlab = "Distance (bp)", ylab = "Intensity"); +lines(spl010); +dev.off(); + +png(zeroTenSpline, width = 1000, height = 1000); +plot(spl010, main = zeroTenSplineTitle, xlab = "Distance (bp)", ylab = "Intensity", type = "l"); +dev.off(); + +png(zeroUnlScatter, width = 1000, height = 1000); +plot(xpts0Unl, ypts0Unl, main = zeroUnlScatterTitle, xlab = "Distance (bp)", ylab = "Intensity", col=rgb(0,0,0,100,maxColorValue=255), pch=16); +dev.off(); + +png(zeroUnlHeatmap, width = 1000, height = 1000); +plot(bin0Unl, main = zeroUnlHeatmapTitle, xlab = "Distance (bp)", ylab = "Intensity"); +dev.off(); + +png(zeroUnlSmoothscatter, width = 1000, height = 1000); +smoothScatter(xpts0Unl, ypts0Unl, nbin = 1000, main = zeroUnlSmoothscatterTitle, xlab = "Distance (bp)", ylab = "Intensity"); +dev.off(); + +sink(paste("stats/", tf ,"/bedfileparts-gteq1.5peak-summit/splinemaxpeak.txt", sep="")) +cat(maxX) +cat("\t") +cat(maxY) +sink() \ No newline at end of file diff --git a/bedfiles-run-cluster.txt b/bedfiles-run-cluster.txt new file mode 100644 index 0000000..b894f06 --- /dev/null +++ b/bedfiles-run-cluster.txt @@ -0,0 +1,150 @@ +x wgEncodeHaibTfbsGm12878Atf2sc81188V0422111PkRep1.broadPeak +x wgEncodeHaibTfbsGm12878Atf2sc81188V0422111PkRep2.broadPeak +x wgEncodeHaibTfbsGm12878Atf3Pcr1xPkRep1.broadPeak +x wgEncodeHaibTfbsGm12878Atf3Pcr1xPkRep2.broadPeak +x wgEncodeHaibTfbsGm12878BatfPcr1xPkRep1.broadPeak +x wgEncodeHaibTfbsGm12878BatfPcr1xPkRep2.broadPeak +(x time) wgEncodeHaibTfbsGm12878Bcl11aPcr1xPkRep1.broadPeak +x wgEncodeHaibTfbsGm12878Bcl11aPcr1xPkRep2.broadPeak +x wgEncodeHaibTfbsGm12878Bcl3V0416101PkRep1.broadPeak +x wgEncodeHaibTfbsGm12878Bcl3V0416101PkRep2.broadPeak +x wgEncodeHaibTfbsGm12878Bclaf101388V0416101PkRep1.broadPeak +x wgEncodeHaibTfbsGm12878Bclaf101388V0416101PkRep2.broadPeak +x wgEncodeHaibTfbsGm12878Cebpbsc150V0422111PkRep1.broadPeak +x wgEncodeHaibTfbsGm12878Cebpbsc150V0422111PkRep2.broadPeak +x wgEncodeHaibTfbsGm12878Creb1sc240V0422111PkRep1.broadPeak +x wgEncodeHaibTfbsGm12878Creb1sc240V0422111PkRep2.broadPeak +x wgEncodeHaibTfbsGm12878Ebfsc137065Pcr1xPkRep1.broadPeak +x wgEncodeHaibTfbsGm12878Ebfsc137065Pcr1xPkRep2.broadPeak +x wgEncodeHaibTfbsGm12878Egr1Pcr2xPkRep3.broadPeak +x wgEncodeHaibTfbsGm12878Egr1V0416101PkRep1.broadPeak +x wgEncodeHaibTfbsGm12878Egr1V0416101PkRep2.broadPeak +x wgEncodeHaibTfbsGm12878Elf1sc631V0416101PkRep1.broadPeak +x wgEncodeHaibTfbsGm12878Elf1sc631V0416101PkRep2.broadPeak +x wgEncodeHaibTfbsGm12878Ets1Pcr1xPkRep1.broadPeak +x wgEncodeHaibTfbsGm12878Ets1Pcr1xPkRep1V2.broadPeak +x wgEncodeHaibTfbsGm12878Ets1Pcr1xPkRep2.broadPeak +x wgEncodeHaibTfbsGm12878Ets1Pcr1xPkRep2V2.broadPeak +x wgEncodeHaibTfbsGm12878Foxm1sc502V0422111PkRep1.broadPeak +x wgEncodeHaibTfbsGm12878Foxm1sc502V0422111PkRep2.broadPeak +x wgEncodeHaibTfbsGm12878GabpPcr2xPkRep1.broadPeak +x wgEncodeHaibTfbsGm12878GabpPcr2xPkRep2.broadPeak +x wgEncodeHaibTfbsGm12878Irf4sc6059Pcr1xPkRep1.broadPeak +x wgEncodeHaibTfbsGm12878Irf4sc6059Pcr1xPkRep2.broadPeak +x wgEncodeHaibTfbsGm12878Mef2aPcr1xPkRep1.broadPeak +x wgEncodeHaibTfbsGm12878Mef2aPcr1xPkRep2.broadPeak +x wgEncodeHaibTfbsGm12878Mef2csc13268V0416101PkRep1.broadPeak +x wgEncodeHaibTfbsGm12878Mef2csc13268V0416101PkRep2.broadPeak +x wgEncodeHaibTfbsGm12878Mta3sc81325V0422111PkRep1.broadPeak +x wgEncodeHaibTfbsGm12878Mta3sc81325V0422111PkRep2.broadPeak +x wgEncodeHaibTfbsGm12878Nfatc1sc17834V0422111PkRep1.broadPeak +x wgEncodeHaibTfbsGm12878Nfatc1sc17834V0422111PkRep2.broadPeak +x wgEncodeHaibTfbsGm12878Nficsc81335V0422111PkRep1.broadPeak +x wgEncodeHaibTfbsGm12878Nficsc81335V0422111PkRep2.broadPeak +x wgEncodeHaibTfbsGm12878NrsfPcr1xPkRep1.broadPeak +x wgEncodeHaibTfbsGm12878NrsfPcr1xPkRep2.broadPeak +x wgEncodeHaibTfbsGm12878NrsfPcr2xPkRep1.broadPeak +x wgEncodeHaibTfbsGm12878NrsfPcr2xPkRep2.broadPeak +x wgEncodeHaibTfbsGm12878P300Pcr1xPkRep1.broadPeak +x wgEncodeHaibTfbsGm12878P300Pcr1xPkRep2.broadPeak +x wgEncodeHaibTfbsGm12878Pax5c20Pcr1xPkRep1.broadPeak +x wgEncodeHaibTfbsGm12878Pax5c20Pcr1xPkRep2.broadPeak +x wgEncodeHaibTfbsGm12878Pax5n19Pcr1xPkRep1.broadPeak +x wgEncodeHaibTfbsGm12878Pax5n19Pcr1xPkRep2.broadPeak +x wgEncodeHaibTfbsGm12878Pbx3Pcr1xPkRep1.broadPeak +x wgEncodeHaibTfbsGm12878Pbx3Pcr1xPkRep2.broadPeak +x wgEncodeHaibTfbsGm12878Pmlsc71910V0422111PkRep1.broadPeak +x wgEncodeHaibTfbsGm12878Pmlsc71910V0422111PkRep2.broadPeak +x wgEncodeHaibTfbsGm12878Pol24h8Pcr1xPkRep1.broadPeak +x wgEncodeHaibTfbsGm12878Pol24h8Pcr1xPkRep2.broadPeak +x wgEncodeHaibTfbsGm12878Pol2Pcr2xPkRep1.broadPeak +wgEncodeHaibTfbsGm12878Pol2Pcr2xPkRep2.broadPeak +wgEncodeHaibTfbsGm12878Pou2f2Pcr1xPkRep1.broadPeak +wgEncodeHaibTfbsGm12878Pou2f2Pcr1xPkRep2.broadPeak +wgEncodeHaibTfbsGm12878Pou2f2Pcr1xPkRep3.broadPeak +wgEncodeHaibTfbsGm12878Pu1Pcr1xPkRep1.broadPeak +wgEncodeHaibTfbsGm12878Pu1Pcr1xPkRep2.broadPeak +wgEncodeHaibTfbsGm12878Pu1Pcr1xPkRep3.broadPeak +wgEncodeHaibTfbsGm12878Rad21V0416101PkRep1.broadPeak +wgEncodeHaibTfbsGm12878Rad21V0416101PkRep2.broadPeak +wgEncodeHaibTfbsGm12878Runx3sc101553V0422111PkRep1.broadPeak +wgEncodeHaibTfbsGm12878Runx3sc101553V0422111PkRep2.broadPeak +wgEncodeHaibTfbsGm12878RxraPcr1xPkRep1.broadPeak +wgEncodeHaibTfbsGm12878RxraPcr1xPkRep2.broadPeak +wgEncodeHaibTfbsGm12878Six5Pcr1xPkRep1.broadPeak +wgEncodeHaibTfbsGm12878Six5Pcr1xPkRep2.broadPeak +wgEncodeHaibTfbsGm12878Sp1Pcr1xPkRep1.broadPeak +wgEncodeHaibTfbsGm12878Sp1Pcr1xPkRep2.broadPeak +wgEncodeHaibTfbsGm12878SrfPcr2xPkRep1.broadPeak +wgEncodeHaibTfbsGm12878SrfPcr2xPkRep2.broadPeak +wgEncodeHaibTfbsGm12878SrfV0416101PkRep1.broadPeak +wgEncodeHaibTfbsGm12878SrfV0416101PkRep2.broadPeak +wgEncodeHaibTfbsGm12878Stat5asc74442V0422111PkRep1.broadPeak +wgEncodeHaibTfbsGm12878Stat5asc74442V0422111PkRep2.broadPeak +wgEncodeHaibTfbsGm12878Taf1Pcr1xPkRep1.broadPeak +wgEncodeHaibTfbsGm12878Taf1Pcr1xPkRep2.broadPeak +wgEncodeHaibTfbsGm12878Tcf12Pcr1xPkRep1.broadPeak +wgEncodeHaibTfbsGm12878Tcf12Pcr1xPkRep2.broadPeak +wgEncodeHaibTfbsGm12878Tcf3Pcr1xPkRep1.broadPeak +wgEncodeHaibTfbsGm12878Tcf3Pcr1xPkRep2.broadPeak +wgEncodeHaibTfbsGm12878Usf1Pcr2xPkRep1.broadPeak +wgEncodeHaibTfbsGm12878Usf1Pcr2xPkRep2.broadPeak +wgEncodeHaibTfbsGm12878Yy1sc281Pcr1xPkRep1.broadPeak +wgEncodeHaibTfbsGm12878Yy1sc281Pcr1xPkRep2.broadPeak +wgEncodeHaibTfbsGm12878Zbtb33Pcr1xPkRep1.broadPeak +wgEncodeHaibTfbsGm12878Zbtb33Pcr1xPkRep2.broadPeak +wgEncodeHaibTfbsGm12878Zeb1sc25388V0416102PkRep1.broadPeak +wgEncodeHaibTfbsGm12878Zeb1sc25388V0416102PkRep2.broadPeak +wgEncodeSydhTfbsGm12878Bhlhe40cIggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878Brca1a300IggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878Cdpsc6327IggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878CfosStdPk.narrowPeak +wgEncodeSydhTfbsGm12878Chd1a301218aIggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878Chd2ab68301IggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878Corestsc30189IggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878Ctcfsc15914c20StdPk.narrowPeak +wgEncodeSydhTfbsGm12878E2f4IggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878Ebf1sc137065StdPk.narrowPeak +wgEncodeSydhTfbsGm12878Elk112771IggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878ErraIggrabPk.narrowPeak +wgEncodeSydhTfbsGm12878Gcn5StdPk.narrowPeak +wgEncodeSydhTfbsGm12878Ikzf1iknuclaStdPk.narrowPeak +wgEncodeSydhTfbsGm12878Irf3IggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878JundIggrabPk.narrowPeak +wgEncodeSydhTfbsGm12878JundStdPk.narrowPeak +wgEncodeSydhTfbsGm12878MafkIggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878MaxIggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878MaxStdPk.narrowPeak +wgEncodeSydhTfbsGm12878Mazab85725IggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878Mxi1IggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878Nfe2sc22827StdPk.narrowPeak +wgEncodeSydhTfbsGm12878NfkbTnfaIggrabPk.narrowPeak +wgEncodeSydhTfbsGm12878NfyaIggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878NfybIggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878Nrf1IggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878P300bStdPk.narrowPeak +wgEncodeSydhTfbsGm12878P300IggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878P300sc584IggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878Pol2IggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878Pol2s2IggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878Pol2StdPk.narrowPeak +wgEncodeSydhTfbsGm12878Pol3StdPk.narrowPeak +wgEncodeSydhTfbsGm12878Rad21IggrabPk.narrowPeak +wgEncodeSydhTfbsGm12878Rfx5200401194IggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878Sin3anb6001263IggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878Smc3ab9263IggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878Spt20StdPk.narrowPeak +wgEncodeSydhTfbsGm12878Srebp1IggrabPk.narrowPeak +wgEncodeSydhTfbsGm12878Srebp2IggrabPk.narrowPeak +wgEncodeSydhTfbsGm12878Stat1StdPk.narrowPeak +wgEncodeSydhTfbsGm12878Stat3IggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878Tblr1ab24550IggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878TbpIggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878Tr4StdPk.narrowPeak +wgEncodeSydhTfbsGm12878Usf2IggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878WhipIggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878Yy1StdPk.narrowPeak +wgEncodeSydhTfbsGm12878Znf143166181apStdPk.narrowPeak +wgEncodeSydhTfbsGm12878Znf274StdPk.narrowPeak +wgEncodeSydhTfbsGm12878Znf384hpa004051IggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878Zzz3StdPk.narrowPeak diff --git a/bedfiles-run.txt b/bedfiles-run.txt new file mode 100644 index 0000000..3b08fab --- /dev/null +++ b/bedfiles-run.txt @@ -0,0 +1,8 @@ +wgEncodeSydhTfbsGm12878MaxIggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878Ctcfsc15914c20StdPk.narrowPeak +wgEncodeSydhTfbsGm12878P300bStdPk.narrowPeak +wgEncodeSydhTfbsGm12878Znf143166181apStdPk.narrowPeak +wgEncodeSydhTfbsGm12878Rad21IggrabPk.narrowPeak +wgEncodeSydhTfbsGm12878Smc3ab9263IggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878TbpIggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878Usf2IggmusPk.narrowPeak \ No newline at end of file diff --git a/bedfilesplit.sh b/bedfilesplit.sh new file mode 100644 index 0000000..f1c9058 --- /dev/null +++ b/bedfilesplit.sh @@ -0,0 +1,7 @@ +tf=$1; + +bedfileDir="bedfiles/$tf-parts-summit"; + +echo $bedfileDir; + +split -l 4000 $bedfileDir/$tf-modified-summit.txt $bedfileDir/bedfile; \ No newline at end of file diff --git a/normalize-bash.sh b/normalize-bash.sh new file mode 100644 index 0000000..b37bfb3 --- /dev/null +++ b/normalize-bash.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +tf=$1; +state=$2; + +rm broadhmm/$tf/normalized/$state.txt; +python normalize/block_bootstrap.py -1 bedfiles/$tf -2 broadhmm/$state.bed -d normalize/wgEncodeUwDnaseGm12878HotspotsRep1.broadPeak -r 0.041 -n 1000 -v -B -o broadhmm/$tf/normalized/$state.txt +#-1 and -2 switched from before \ No newline at end of file diff --git a/normalize-qsub-remaining.sh b/normalize-qsub-remaining.sh new file mode 100644 index 0000000..8821d45 --- /dev/null +++ b/normalize-qsub-remaining.sh @@ -0,0 +1,3 @@ +cat "run-normalize-todo.txt" | while read tf; do + bash normalize-qsub.sh $tf +done \ No newline at end of file diff --git a/normalize-qsub.sh b/normalize-qsub.sh new file mode 100644 index 0000000..aee9091 --- /dev/null +++ b/normalize-qsub.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +tf=$1; + +mkdir -p broadhmm/$tf/normalized; + +for state in {1..15} +do + qsub -S /bin/bash -cwd -o broadhmm/$tf/normalized/$state.log -e broadhmm/$tf/normalized/$state-error.log -N $tf-$state-normalize-job normalize-bash.sh $tf $state +done \ No newline at end of file diff --git a/overallplots-normalized.r b/overallplots-normalized.r new file mode 100644 index 0000000..ff94ee5 --- /dev/null +++ b/overallplots-normalized.r @@ -0,0 +1,436 @@ +# terminal arguments: +# Rscript overallplots-normalized.r #doesn't make any plots +# Rscript overallplots-normalized.r all #makes an overall scatterplot for all factors +# Rscript overallplots-normalized.r all 100 200 3 4 # makes a zoomed in plot of x from 100 to 200 and y from 3 to 4 +# Rscript overallplots-normalized.r wgEncodeHaibTfbsGm12878Usf1Pcr2xPkRep1.broadPeak # example factor name, highlights the specific factor on the overall scatterplot +# Rscript overallplots-normalized.r wgEncodeHaibTfbsGm12878Usf1Pcr2xPkRep1.broadPeak 100 200 3 4 # zoomed in version, if factor is on window, will be highlighted + +library(sm) +library(calibrate) +library(directlabels) +library(lattice) +library(ggplot2) +library(gridExtra) + + +vioplot2 <- function(x,...,range=1.5,h=NULL,ylim=NULL,names=NULL, horizontal=FALSE, + col="magenta", border="black", lty=1, lwd=1, rectCol="black", colMed="white", pchMed=19, at, add=FALSE, wex=1, + drawRect=TRUE) +{ + # process multiple datas + datas <- list(x,...) + n <- length(datas) + + if(missing(at)) at <- 1:n + + # pass 1 + # + # - calculate base range + # - estimate density + # + + # setup parameters for density estimation + upper <- vector(mode="numeric",length=n) + lower <- vector(mode="numeric",length=n) + q1 <- vector(mode="numeric",length=n) + q3 <- vector(mode="numeric",length=n) + med <- vector(mode="numeric",length=n) + base <- vector(mode="list",length=n) + height <- vector(mode="list",length=n) + baserange <- c(Inf,-Inf) + + # global args for sm.density function-call + args <- list(display="none") + + if (!(is.null(h))) + args <- c(args, h=h) + + + for(i in 1:n) { + + data<-datas[[i]] + + # calculate plot parameters + # 1- and 3-quantile, median, IQR, upper- and lower-adjacent + + data.min <- min(data) + data.max <- max(data) + q1[i]<-quantile(data,0.25) + q3[i]<-quantile(data,0.75) + med[i]<-median(data) + iqd <- q3[i]-q1[i] + upper[i] <- min( q3[i] + range*iqd, data.max ) + lower[i] <- max( q1[i] - range*iqd, data.min ) + + + # strategy: + # xmin = min(lower, data.min)) + # ymax = max(upper, data.max)) + # + + est.xlim <- c( min(lower[i], data.min), max(upper[i], data.max) ) + + # estimate density curve + + smout <- do.call("sm.density", c( list(data, xlim=est.xlim), args ) ) + + + # calculate stretch factor + # + # the plots density heights is defined in range 0.0 ... 0.5 + # we scale maximum estimated point to 0.4 per data + # + + hscale <- 0.4/max(smout$estimate) * wex + + + # add density curve x,y pair to lists + + base[[i]] <- smout$eval.points + height[[i]] <- smout$estimate * hscale + + + # calculate min,max base ranges + + t <- range(base[[i]]) + baserange[1] <- min(baserange[1],t[1]) + baserange[2] <- max(baserange[2],t[2]) + + } + + # pass 2 + # + # - plot graphics + + # setup parameters for plot + + if(!add){ + xlim <- if(n==1) + at + c(-.5, .5) + else + range(at) + min(diff(at))/2 * c(-1,1) + + if (is.null(ylim)) { + ylim <- baserange + } + } + if (is.null(names)) { + label <- 1:n + } else { + label <- names + } + + boxwidth <- 0.05 * wex + + + # setup plot + + if(!add) + plot.new() + if(!horizontal) { + if(!add){ + plot.window(xlim = xlim, ylim = ylim) + axis(2) + axis(1,at = at, label=label, las = 2 ) #ADDED LAS = 2 FOR VERTICAL AXIS LABELS + } + + box() + + for(i in 1:n) { + + # plot left/right density curve + + polygon( c(at[i]-height[[i]], rev(at[i]+height[[i]])), + c(base[[i]], rev(base[[i]])), + col = col[i], border=border, lty=lty, lwd=lwd) #ADDED COL[I] FOR MULTIPLE COLOURS + + + if(drawRect){ + # plot IQR + lines( at[c( i, i)], c(lower[i], upper[i]) ,lwd=lwd, lty=lty) + + # plot 50% KI box + + rect( at[i]-boxwidth/2, q1[i], at[i]+boxwidth/2, q3[i], col=rectCol) + + # plot median point + + points( at[i], med[i], pch=pchMed, col=colMed ) + } + } + + } + else { + if(!add){ + plot.window(xlim = ylim, ylim = xlim) + axis(1) + axis(2,at = at, label=label ) + } + + box() + for(i in 1:n) { + + # plot left/right density curve + + polygon( c(base[[i]], rev(base[[i]])), + c(at[i]-height[[i]], rev(at[i]+height[[i]])), + col = col, border=border, lty=lty, lwd=lwd) + + + if(drawRect){ + # plot IQR + lines( c(lower[i], upper[i]), at[c(i,i)] ,lwd=lwd, lty=lty) + + # plot 50% KI box + + rect( q1[i], at[i]-boxwidth/2, q3[i], at[i]+boxwidth/2, col=rectCol) + + # plot median point + points( med[i], at[i], pch=pchMed, col=colMed ) + } + } + + + } + + + invisible (list( upper=upper, lower=lower, median=med, q1=q1, q3=q3)) +} + +# end of vioplot2 + + +args <- commandArgs(trailingOnly = TRUE) +print(args) +print(args[1]) + +highlightName <- "" +highlightFactor <- "" + +if (is.na(args[1]) == FALSE) { + highlightName <- paste("-", args[1], sep="") + highlightFactor <- args[1] +} + +x1 <- 0 +x2 <- 0 +y1 <- 0 +y2 <- 0 + +if (is.na(args[2]) == FALSE & is.na(args[3]) == FALSE & is.na(args[4]) == FALSE & is.na(args[5]) == FALSE) { + x1 <- as.numeric(args[2]) + x2 <- as.numeric(args[3]) + y1 <- as.numeric(args[4]) + y2 <- as.numeric(args[5]) +} + +# print(x1) + +tfList <- read.table("stats/run.txt", as.is=TRUE, col.names=c("tfNames")); +tfListRow <- as.data.frame(t(as.matrix(tfList))); +tfListChar <- data.frame(lapply(tfListRow, as.character), stringsAsFactors=FALSE) + +scatterPlotData <- read.table("stats/scatter-plot-data-normalized.txt", as.is=TRUE, col.names=c("scatterDistance", "scatterIntensity", "tfColour", "scatterLabel", "scatterNumberLabel", "scatterNumberNameLabel", "scatterPointColour", "scatterName")); +print(scatterPlotData) +scatterPlotData["factorNames"] <- tfList + +# # print(scatterPlotData$scatterPointColour) + +for (i in 1:nrow(scatterPlotData)) { + scatterPlotData$scatterPointColour[i] <- paste("#",scatterPlotData$scatterPointColour[i],sep="") + # rest is the same +} + +chooseColorsOverall <- function(data, maxX, maxY) { + # print(dim(data)) + rowsNo <- dim(data)[1] + colsNo <- dim(data)[2] + # print(rowsNo) + # print(colsNo) + + # print(data[,7]) + + print(data$scatterPointColour) + + # maxX <- max(data[,1]) + # print(maxX) + # maxY <- max(data[,2]) + # print(maxY) + + colours <- c() + if (length(grep("#-", data$scatterPointColour[1])) != 0) { + print("here"); + for (i in 1:rowsNo) { + + if (data$factorNames[9] == highlightFactor) { + # print("red") + colours <- append(colours, rgb(green=0, red=1, blue=0)) + } else { + # print("else colour") + x <- data$scatterDistance[1] + y <- data$scatterIntensity[2] + + x <- 1-x/maxX + # y <- 1-((max(y)/min(y))*(y/(max(y)))) + y <- 1-y/maxY + y <- y + 0.3 + # print(rgb(green=y, red=y, blue=x)) + colours <- append(colours, rgb(green=y, red=y, blue=x)) + } + } + } else { + print("there"); + colours <- data$scatterPointColour + } + # print(colours) + return(colours) +} + +choosePointSize <- function(data) { + rowsNo <- dim(data)[1] + colsNo <- dim(data)[2] + sizes <- c() + + for (i in 1:rowsNo) { + if (data$factorNames[i] == highlightFactor) { + sizes <- append(sizes, 8) + } else { + sizes <- append(sizes, 4) + } + } + return(sizes) +} + +addZoomPlot <- function(limX1, limX2, limY1, limY2) { + scatterPlotDataZoom <- subset(scatterPlotData, scatterDistance <= limX2 & scatterDistance >= limX1 & scatterIntensity <= limY2 & scatterIntensity>=limY1) + fileName <- paste("stats/OverallScatterPlot-Numbered",highlightName, "-",limX1,"-",limX2,"-",limY1,"-",limY2,"-normalized.png",sep="") + print(fileName) + png(fileName, width = 2000, height = 1500, res = 100); + ggplotZoom <- ggplot(scatterPlotDataZoom, aes(x=scatterDistance, y=scatterIntensity, label=scatterNumberLabel)) + + geom_point(colour=chooseColorsOverall(scatterPlotDataZoom, max(scatterPlotData$scatterDistance), max(scatterPlotData$scatterIntensity)), size=choosePointSize(scatterPlotDataZoom)) + + geom_text(hjust=0, vjust=0) + + xlab("Distance (bp)") + + ylab("Intensity") + + coord_cartesian(xlim=c(limX1,limX2), ylim=c(limY1,limY2)) + + ggtitle("Intensity vs. Distance Plot") + # ggplotZoom + grid.arrange(ggplotZoom, legend = tableGrob(scatterPlotDataZoom$scatterNumberNameLabel)) + # direct.label(ggplotXYlim) +} + +# print(is.na(args[1])) +# print(is.na(args[2])) +# print(is.na(args[3])) +# print(is.na(args[4])) +# print(is.na(args[5])) + +# print(class(args[4])) + + +# PLOTS FROM INPUT FROM TERMINAL + +if (is.na(args[2]) == TRUE & (highlightFactor == "all" | highlightFactor != "")) { + # print("here") + png(paste("stats/OverallScatterPlot-Numbered-Labels", highlightName, "-normalized.png", sep=""), width = 4000, height = 3500, res = 80); + ggplotNumLabels <- ggplot(scatterPlotData, aes(x=scatterDistance, y=scatterIntensity, label=scatterNumberLabel)) + + geom_point(colour=chooseColorsOverall(scatterPlotData, max(scatterPlotData$scatterDistance), max(scatterPlotData$scatterIntensity)), size=choosePointSize(scatterPlotData)) + + geom_text(hjust=0, vjust=0) + + xlab("Distance (bp)") + + ylab("Intensity") + + ggtitle("Intensity vs. Distance Plot") + # ggplotNumLabels + grid.arrange(ggplotNumLabels, legend = tableGrob(scatterPlotData$scatterNumberNameLabel)) + dev.off() + + png(paste("stats/OverallScatterPlot-non-Numbered", highlightName, "-normalized.png", sep=""), width = 2000, height = 1500, res = 80); + ggplotNum <- ggplot(scatterPlotData, aes(x=scatterDistance, y=scatterIntensity, label=scatterNumberLabel)) + + geom_point(colour=chooseColorsOverall(scatterPlotData, max(scatterPlotData$scatterDistance), max(scatterPlotData$scatterIntensity)), size=choosePointSize(scatterPlotData)) + + # geom_text(hjust=0, vjust=0) + + xlab("Distance (bp)") + + ylab("Intensity") + + ggtitle("Intensity vs. Distance Plot") + ggplotNum + + # direct.label(ggplotNumLabels) +} else if (is.na(args[2]) == FALSE & is.na(args[3]) == FALSE & is.na(args[4]) == FALSE & is.na(args[5]) == FALSE) {# & class(args[2]) == "numeric" & class(args[3]) == "numeric" & class(args[4]) == "numeric" & class(args[5]) == "numeric") { + # print("there") + addZoomPlot(x1, x2, y1, y2) +} +dev.off() + + + + +# THESE PLOTS BELOW WERE FIRST MADE FOR PLOTTING DIRECTLY FROM THE SCRIPT + +# png(paste("stats/OverallScatterPlot-Numbered-Labels", highlightName, ".png", sep=""), width = 4000, height = 3500, res = 80); +# ggplotNumLabels <- ggplot(scatterPlotData, aes(x=scatterDistance, y=scatterIntensity, label=scatterNumberLabel)) + +# geom_point(colour=chooseColorsOverall(scatterPlotData, max(scatterPlotData$scatterDistance), max(scatterPlotData$scatterIntensity)), size=choosePointSize(scatterPlotData)) + +# geom_text(hjust=0, vjust=0) + +# xlab("Distance (bp)") + +# ylab("Intensity") + +# ggtitle("Intensity vs. Distance Plot") +# # ggplotNumLabels +# grid.arrange(ggplotNumLabels, legend = tableGrob(scatterPlotData$scatterNumberNameLabel)) +# # direct.label(ggplotNumLabels) +# dev.off() + +# png(paste("stats/OverallScatterPlot-Numbered", highlightName, ".png", sep=""), width = 2000, height = 1500, res = 80); +# ggplotNum <- ggplot(scatterPlotData, aes(x=scatterDistance, y=scatterIntensity, label=scatterNumberLabel)) + +# geom_point(colour=chooseColorsOverall(scatterPlotData, max(scatterPlotData$scatterDistance), max(scatterPlotData$scatterIntensity)), size=choosePointSize(scatterPlotData)) + +# geom_text(hjust=0, vjust=0) + +# xlab("Distance (bp)") + +# ylab("Intensity") + +# ggtitle("Intensity vs. Distance Plot") +# ggplotNum +# # direct.label(ggplotNum) +# dev.off() + + + + + + + + +# #adds extra zoomed in plot. params: (low distance, high distance, low intensity, high intensity) +# addZoomPlot(-10, 100, 2.25, 3.25) +# dev.off() +# addZoomPlot(100, 200, 3, 3.25) +# dev.off() +# addZoomPlot(100, 200, 2.25, 3.25) +# dev.off() +# addZoomPlot(400, 510, 2.25, 2.75) +# dev.off() + + +#VIOLIN PLOTS + +#VIOLIN PLOTS DATG + +# violinDistanceData <- read.table("stats/violin-distance.txt", fill=TRUE, nrows = 60);#;, na.strings = "0"); +# columnViolinDistanceData <- read.table("stats/violin-distance-modifiedcolumns.txt"); +# columnViolinIntensityData <- read.table("stats/violin-intensity-modifiedcolumns.txt"); +# names(columnViolinDistanceData)[1] <- "x"; +# names(columnViolinIntensityData)[1] <- "x"; + +# dim(columnViolinDistanceData); +# dim(columnViolinIntensityData); + +# png("stats/OveralllDistanceViolinPlot.png", width = 3000, height = 1800, res=80); +# # png("stats/AllDistanceViolinPlot.png", width = 2000, height = 2390); +# mar.orig <- par()$mar # save the original values +# par(mar = c(30,4,4,4)) # set your new values EACH 1 IS 15 PX +# do.call(vioplot2, c(lapply(columnViolinDistanceData, na.omit),list(names=tfListChar),list(col=scatterPlotData$tfColour))); +# title("Distances at Highest Spline Intensities"); +# par(mar = mar.orig) # put the original values back +# dev.off() + +# png("stats/OverallIntensityViolinPlot.png", width = 3000, height = 1800, res=80); +# # png("stats/AllIntensityViolinPlot.png", width = 2000, height = 2390); +# mar.orig <- par()$mar # save the original values +# par(mar = c(30,4,4,4)) # set your new values EACH 1 IS 15 PX FOR DEFAULT RES +# do.call(vioplot2, c(lapply(columnViolinIntensityData, na.omit),list(names=tfListChar),list(col=scatterPlotData$tfColour))); +# title("Highest Spline Intensities"); +# par(mar = mar.orig) # put the original values back +# dev.off() diff --git a/overallplots.r b/overallplots.r new file mode 100644 index 0000000..f138261 --- /dev/null +++ b/overallplots.r @@ -0,0 +1,475 @@ +# terminal arguments: +# Rscript overallplots.r #doesn't make any plots +# Rscript overallplots.r all #makes an overall scatterplot for all factors +# Rscript overallplots.r all 100 200 3 4 # makes a zoomed in plot of x from 100 to 200 and y from 3 to 4 +# Rscript overallplots.r wgEncodeHaibTfbsGm12878Usf1Pcr2xPkRep1.broadPeak # example factor name, highlights the specific factor on the overall scatterplot +# Rscript overallplots.r wgEncodeHaibTfbsGm12878Usf1Pcr2xPkRep1.broadPeak 100 200 3 4 # zoomed in version, if factor is on window, will be highlighted + +library(sm) +library(calibrate) +library(directlabels) +library(lattice) +library(ggplot2) +library(gridExtra) + + +vioplot2 <- function(x,...,range=1.5,h=NULL,ylim=NULL,names=NULL, horizontal=FALSE, + col="magenta", border="black", lty=1, lwd=1, rectCol="black", colMed="white", pchMed=19, at, add=FALSE, wex=1, + drawRect=TRUE) +{ + # process multiple datas + datas <- list(x,...) + n <- length(datas) + + if(missing(at)) at <- 1:n + + # pass 1 + # + # - calculate base range + # - estimate density + # + + # setup parameters for density estimation + upper <- vector(mode="numeric",length=n) + lower <- vector(mode="numeric",length=n) + q1 <- vector(mode="numeric",length=n) + q3 <- vector(mode="numeric",length=n) + med <- vector(mode="numeric",length=n) + base <- vector(mode="list",length=n) + height <- vector(mode="list",length=n) + baserange <- c(Inf,-Inf) + + # global args for sm.density function-call + args <- list(display="none") + + if (!(is.null(h))) + args <- c(args, h=h) + + + for(i in 1:n) { + + data<-datas[[i]] + + # calculate plot parameters + # 1- and 3-quantile, median, IQR, upper- and lower-adjacent + + data.min <- min(data) + data.max <- max(data) + q1[i]<-quantile(data,0.25) + q3[i]<-quantile(data,0.75) + med[i]<-median(data) + iqd <- q3[i]-q1[i] + upper[i] <- min( q3[i] + range*iqd, data.max ) + lower[i] <- max( q1[i] - range*iqd, data.min ) + + + # strategy: + # xmin = min(lower, data.min)) + # ymax = max(upper, data.max)) + # + + est.xlim <- c( min(lower[i], data.min), max(upper[i], data.max) ) + + # estimate density curve + + smout <- do.call("sm.density", c( list(data, xlim=est.xlim), args ) ) + + + # calculate stretch factor + # + # the plots density heights is defined in range 0.0 ... 0.5 + # we scale maximum estimated point to 0.4 per data + # + + hscale <- 0.4/max(smout$estimate) * wex + + + # add density curve x,y pair to lists + + base[[i]] <- smout$eval.points + height[[i]] <- smout$estimate * hscale + + + # calculate min,max base ranges + + t <- range(base[[i]]) + baserange[1] <- min(baserange[1],t[1]) + baserange[2] <- max(baserange[2],t[2]) + + } + + # pass 2 + # + # - plot graphics + + # setup parameters for plot + + if(!add){ + xlim <- if(n==1) + at + c(-.5, .5) + else + range(at) + min(diff(at))/2 * c(-1,1) + + if (is.null(ylim)) { + ylim <- baserange + } + } + if (is.null(names)) { + label <- 1:n + } else { + label <- names + } + + boxwidth <- 0.05 * wex + + + # setup plot + + if(!add) + plot.new() + if(!horizontal) { + if(!add){ + plot.window(xlim = xlim, ylim = ylim) + axis(2) + axis(1,at = at, label=label, las = 2 ) #ADDED LAS = 2 FOR VERTICAL AXIS LABELS + } + + box() + + for(i in 1:n) { + + # plot left/right density curve + + polygon( c(at[i]-height[[i]], rev(at[i]+height[[i]])), + c(base[[i]], rev(base[[i]])), + col = col[i], border=border, lty=lty, lwd=lwd) #ADDED COL[I] FOR MULTIPLE COLOURS + + + if(drawRect){ + # plot IQR + lines( at[c( i, i)], c(lower[i], upper[i]) ,lwd=lwd, lty=lty) + + # plot 50% KI box + + rect( at[i]-boxwidth/2, q1[i], at[i]+boxwidth/2, q3[i], col=rectCol) + + # plot median point + + points( at[i], med[i], pch=pchMed, col=colMed ) + } + } + + } + else { + if(!add){ + plot.window(xlim = ylim, ylim = xlim) + axis(1) + axis(2,at = at, label=label ) + } + + box() + for(i in 1:n) { + + # plot left/right density curve + + polygon( c(base[[i]], rev(base[[i]])), + c(at[i]-height[[i]], rev(at[i]+height[[i]])), + col = col, border=border, lty=lty, lwd=lwd) + + + if(drawRect){ + # plot IQR + lines( c(lower[i], upper[i]), at[c(i,i)] ,lwd=lwd, lty=lty) + + # plot 50% KI box + + rect( q1[i], at[i]-boxwidth/2, q3[i], at[i]+boxwidth/2, col=rectCol) + + # plot median point + points( med[i], at[i], pch=pchMed, col=colMed ) + } + } + + + } + + + invisible (list( upper=upper, lower=lower, median=med, q1=q1, q3=q3)) +} + +# end of vioplot2 + + +args <- commandArgs(trailingOnly = TRUE) +print(args) +print(args[1]) + +highlightName <- "" +highlightFactor <- "" + +if (is.na(args[1]) == FALSE) { + highlightName <- paste("-", args[1], sep="") + highlightFactor <- args[1] +} + +x1 <- 0 +x2 <- 0 +y1 <- 0 +y2 <- 0 + +if (is.na(args[2]) == FALSE & is.na(args[3]) == FALSE & is.na(args[4]) == FALSE & is.na(args[5]) == FALSE) { + x1 <- as.numeric(args[2]) + x2 <- as.numeric(args[3]) + y1 <- as.numeric(args[4]) + y2 <- as.numeric(args[5]) +} + +# print(x1) + +tfList <- read.table("stats/run.txt", as.is=TRUE, col.names=c("tfNames")); +tfListRow <- as.data.frame(t(as.matrix(tfList))); +tfListChar <- data.frame(lapply(tfListRow, as.character), stringsAsFactors=FALSE) + +scatterPlotData <- read.table("stats/scatter-plot-data.txt", as.is=TRUE, col.names=c("scatterDistance", "scatterIntensity", "tfColour", "scatterLabel", "scatterNumberLabel", "scatterNumberNameLabel", "scatterPointColour", "scatterName")); +# print(scatterPlotData) +scatterPlotData["factorNames"] <- tfList + +# # print(scatterPlotData$scatterPointColour) + +for (i in 1:nrow(scatterPlotData)) { + scatterPlotData$scatterPointColour[i] <- paste("#",scatterPlotData$scatterPointColour[i],sep="") + # rest is the same +} + +# print(scatterPlotData$scatterPointColour); + + + +#INTERESTING +# chooseColors <- function(x, y) { +# return(rgb(green=y/max(y), blue=y/max(y), red=x/max(x))) +# } + +# chooseColors <- function(x, y) { +# return(rgb(green=0, red=y/max(y), blue=x/max(x))) +# } + +# chooseColors2 <- function(x, y, maxX, maxY) { +# x <- 1-x/maxX +# # y <- 1-((max(y)/min(y))*(y/(max(y)))) +# y <- 1-y/maxY +# y <- y + 0.3 +# print(rgb(green=y, red=y, blue=x)) +# return(rgb(green=y, red=y, blue=x)) +# } + +# chooseColors3 <- function(x, y, maxX, maxY, name) { + +# if (name == highlightFactor) { +# print(name) +# print("IN"); +# return (rgb(green=0, red=1, blue=0)) +# } else { +# print("OUT"); +# x <- 1-x/maxX +# # y <- 1-((max(y)/min(y))*(y/(max(y)))) +# y <- 1-y/maxY +# y <- y + 0.3 +# print(rgb(green=y, red=y, blue=x)) +# return(rgb(green=y, red=y, blue=x)) +# } +# } + +chooseColorsOverall <- function(data, maxX, maxY) { + # print(dim(data)) + rowsNo <- dim(data)[1] + colsNo <- dim(data)[2] + # print(rowsNo) + # print(colsNo) + + # print(data[,7]) + + print(data$scatterPointColour) + + # maxX <- max(data[,1]) + # print(maxX) + # maxY <- max(data[,2]) + # print(maxY) + + colours <- c() + if (length(grep("#-", data$scatterPointColour[1])) != 0) { + print("here"); + for (i in 1:rowsNo) { + + if (data$factorNames[9] == highlightFactor) { + # print("red") + colours <- append(colours, rgb(green=0, red=1, blue=0)) + } else { + # print("else colour") + x <- data$scatterDistance[1] + y <- data$scatterIntensity[2] + + x <- 1-x/maxX + # y <- 1-((max(y)/min(y))*(y/(max(y)))) + y <- 1-y/maxY + y <- y + 0.3 + # print(rgb(green=y, red=y, blue=x)) + colours <- append(colours, rgb(green=y, red=y, blue=x)) + } + } + } else { + print("there"); + colours <- data$scatterPointColour + } + # print(colours) + return(colours) +} + +choosePointSize <- function(data) { + rowsNo <- dim(data)[1] + colsNo <- dim(data)[2] + sizes <- c() + + for (i in 1:rowsNo) { + if (data$factorNames[i] == highlightFactor) { + sizes <- append(sizes, 8) + } else { + sizes <- append(sizes, 4) + } + } + return(sizes) +} + +addZoomPlot <- function(limX1, limX2, limY1, limY2) { + scatterPlotDataZoom <- subset(scatterPlotData, scatterDistance <= limX2 & scatterDistance >= limX1 & scatterIntensity <= limY2 & scatterIntensity>=limY1) + fileName <- paste("stats/OverallScatterPlot-Numbered",highlightName, "-",limX1,"-",limX2,"-",limY1,"-",limY2,".png",sep="") + print(fileName) + png(fileName, width = 2000, height = 1500, res = 100); + ggplotZoom <- ggplot(scatterPlotDataZoom, aes(x=scatterDistance, y=scatterIntensity, label=scatterNumberLabel)) + + geom_point(colour=chooseColorsOverall(scatterPlotDataZoom, max(scatterPlotData$scatterDistance), max(scatterPlotData$scatterIntensity)), size=choosePointSize(scatterPlotDataZoom)) + + geom_text(hjust=0, vjust=0) + + xlab("Distance (bp)") + + ylab("Intensity") + + coord_cartesian(xlim=c(limX1,limX2), ylim=c(limY1,limY2)) + + ggtitle("Intensity vs. Distance Plot") + # ggplotZoom + grid.arrange(ggplotZoom, legend = tableGrob(scatterPlotDataZoom$scatterNumberNameLabel)) + # direct.label(ggplotXYlim) +} + +# print(is.na(args[1])) +# print(is.na(args[2])) +# print(is.na(args[3])) +# print(is.na(args[4])) +# print(is.na(args[5])) + +# print(class(args[4])) + + +# PLOTS FROM INPUT FROM TERMINAL + +if (is.na(args[2]) == TRUE & (highlightFactor == "all" | highlightFactor != "")) { + # print("here") + png(paste("stats/OverallScatterPlot-Numbered-Labels", highlightName, ".png", sep=""), width = 4000, height = 3500, res = 80); + ggplotNumLabels <- ggplot(scatterPlotData, aes(x=scatterDistance, y=scatterIntensity, label=scatterNumberLabel)) + + geom_point(colour=chooseColorsOverall(scatterPlotData, max(scatterPlotData$scatterDistance), max(scatterPlotData$scatterIntensity)), size=choosePointSize(scatterPlotData)) + + geom_text(hjust=0, vjust=0) + + xlab("Distance (bp)") + + ylab("Intensity") + + ggtitle("Intensity vs. Distance Plot") + # ggplotNumLabels + grid.arrange(ggplotNumLabels, legend = tableGrob(scatterPlotData$scatterNumberNameLabel)) + dev.off() + + png(paste("stats/OverallScatterPlot-non-Numbered", highlightName, ".png", sep=""), width = 2000, height = 1500, res = 80); + ggplotNum <- ggplot(scatterPlotData, aes(x=scatterDistance, y=scatterIntensity, label=scatterNumberLabel)) + + geom_point(colour=chooseColorsOverall(scatterPlotData, max(scatterPlotData$scatterDistance), max(scatterPlotData$scatterIntensity)), size=choosePointSize(scatterPlotData)) + + # geom_text(hjust=0, vjust=0) + + xlab("Distance (bp)") + + ylab("Intensity") + + ggtitle("Intensity vs. Distance Plot") + ggplotNum + + # direct.label(ggplotNumLabels) +} else if (is.na(args[2]) == FALSE & is.na(args[3]) == FALSE & is.na(args[4]) == FALSE & is.na(args[5]) == FALSE) {# & class(args[2]) == "numeric" & class(args[3]) == "numeric" & class(args[4]) == "numeric" & class(args[5]) == "numeric") { + # print("there") + addZoomPlot(x1, x2, y1, y2) +} +dev.off() + + + + +# THESE PLOTS BELOW WERE FIRST MADE FOR PLOTTING DIRECTLY FROM THE SCRIPT + +# png(paste("stats/OverallScatterPlot-Numbered-Labels", highlightName, ".png", sep=""), width = 4000, height = 3500, res = 80); +# ggplotNumLabels <- ggplot(scatterPlotData, aes(x=scatterDistance, y=scatterIntensity, label=scatterNumberLabel)) + +# geom_point(colour=chooseColorsOverall(scatterPlotData, max(scatterPlotData$scatterDistance), max(scatterPlotData$scatterIntensity)), size=choosePointSize(scatterPlotData)) + +# geom_text(hjust=0, vjust=0) + +# xlab("Distance (bp)") + +# ylab("Intensity") + +# ggtitle("Intensity vs. Distance Plot") +# # ggplotNumLabels +# grid.arrange(ggplotNumLabels, legend = tableGrob(scatterPlotData$scatterNumberNameLabel)) +# # direct.label(ggplotNumLabels) +# dev.off() + +# png(paste("stats/OverallScatterPlot-Numbered", highlightName, ".png", sep=""), width = 2000, height = 1500, res = 80); +# ggplotNum <- ggplot(scatterPlotData, aes(x=scatterDistance, y=scatterIntensity, label=scatterNumberLabel)) + +# geom_point(colour=chooseColorsOverall(scatterPlotData, max(scatterPlotData$scatterDistance), max(scatterPlotData$scatterIntensity)), size=choosePointSize(scatterPlotData)) + +# geom_text(hjust=0, vjust=0) + +# xlab("Distance (bp)") + +# ylab("Intensity") + +# ggtitle("Intensity vs. Distance Plot") +# ggplotNum +# # direct.label(ggplotNum) +# dev.off() + + + + + + + + +# #adds extra zoomed in plot. params: (low distance, high distance, low intensity, high intensity) +# addZoomPlot(-10, 100, 2.25, 3.25) +# dev.off() +# addZoomPlot(100, 200, 3, 3.25) +# dev.off() +# addZoomPlot(100, 200, 2.25, 3.25) +# dev.off() +# addZoomPlot(400, 510, 2.25, 2.75) +# dev.off() + + +#VIOLIN PLOTS + +#VIOLIN PLOTS DATG + +# violinDistanceData <- read.table("stats/violin-distance.txt", fill=TRUE, nrows = 60);#;, na.strings = "0"); +# columnViolinDistanceData <- read.table("stats/violin-distance-modifiedcolumns.txt"); +# columnViolinIntensityData <- read.table("stats/violin-intensity-modifiedcolumns.txt"); +# names(columnViolinDistanceData)[1] <- "x"; +# names(columnViolinIntensityData)[1] <- "x"; + +# dim(columnViolinDistanceData); +# dim(columnViolinIntensityData); + +# png("stats/OveralllDistanceViolinPlot.png", width = 3000, height = 1800, res=80); +# # png("stats/AllDistanceViolinPlot.png", width = 2000, height = 2390); +# mar.orig <- par()$mar # save the original values +# par(mar = c(30,4,4,4)) # set your new values EACH 1 IS 15 PX +# do.call(vioplot2, c(lapply(columnViolinDistanceData, na.omit),list(names=tfListChar),list(col=scatterPlotData$tfColour))); +# title("Distances at Highest Spline Intensities"); +# par(mar = mar.orig) # put the original values back +# dev.off() + +# png("stats/OverallIntensityViolinPlot.png", width = 3000, height = 1800, res=80); +# # png("stats/AllIntensityViolinPlot.png", width = 2000, height = 2390); +# mar.orig <- par()$mar # save the original values +# par(mar = c(30,4,4,4)) # set your new values EACH 1 IS 15 PX FOR DEFAULT RES +# do.call(vioplot2, c(lapply(columnViolinIntensityData, na.omit),list(names=tfListChar),list(col=scatterPlotData$tfColour))); +# title("Highest Spline Intensities"); +# par(mar = mar.orig) # put the original values back +# dev.off() diff --git a/overallplotsdata-normalized.pl b/overallplotsdata-normalized.pl new file mode 100644 index 0000000..b00ec53 --- /dev/null +++ b/overallplotsdata-normalized.pl @@ -0,0 +1,99 @@ +#!/usr/bin/perl +use warnings; +use integer; +use POSIX; + +my $statsDirectory = "stats"; + +open runfile, "$statsDirectory/run.txt";# or die $!; + +# open (writefileViolinplotIntensity, ">".$statsDirectory."/violin-intensity.txt");# or die $!; +# open (writefileViolinplotDistance, ">".$statsDirectory."/violin-distance.txt");# or die $!; + +open (writefileScatter, ">".$statsDirectory."/scatter-plot-data-normalized.txt"); +open (writefileScatterColour, ">".$statsDirectory."/scatter-plot-data-colour-normalized.txt"); + +#check cluster colour file +my $colourbarFile = 'stats/heatmap-cluster-colours-tallyovermean.txt'; +my $clusterColourFlag = 0; +my @clusterColours = (); +if (-e $colourbarFile) { + print "Cluster colour file exists.\n"; + $clusterColourFlag = 1; + open (clusterColourFile, $colourbarFile); + + while () { + @clusterColours = split(/#/,"$_"); + } +} else { + print "Cluster colour file doesn't exist.\n"; +} + +my $counter = 1; + +while (){ + chomp; + my $currTf = $_; + print "$currTf\n"; + # print "$statsDirectory/$currTf/bedfileparts-gteq1.5peak-summit/all-bed-stats-010.txt\n"; + + open currTfAllBedStatsFile, "$statsDirectory/$currTf/bedfileparts-gteq1.5peak-summit/all-bed-stats-010.txt";# or die $!; + + open currTfMaxpeakFile, "$statsDirectory/$currTf/bedfileparts-gteq1.5peak-summit/splinemaxpeak.txt"; + + my $distance = 0; + my $intensity = 0; + + while () { + chomp; + my @line = split(/\t/,"$_"); + + $distance = $line[0]; + $intensity = $line[1]; + + # print "$distance\t$intensity\n"; + + # print writefileViolinplotDistance "$distance\t"; + # print writefileViolinplotIntensity "$intensity\t"; + + } + + while () { + chomp; + # print "$_\n"; + my @maxPeakLines = split(/\t/,"$_"); + my $violinColour = ""; + my $scatterLabel = ""; + + if ($maxPeakLines[1] >= 3) { + $violinColour = "blue"; + $scatterLabel = $currTf; + } else { + $violinColour = "orange"; + $scatterLabel = "."; + } + + my $scatterNumberLabel = "$counter"; + my $scatterNumberNameLabel = "$counter-$currTf"; + + my $pointColourHex = "-"; + if ($clusterColourFlag == 1) { + $pointColourHex = $clusterColours[$counter]; + # print "$pointColourHex\n"; + } + + # print "$currTf\t$pointColourHex\n"; + + print writefileScatter "$_\t$violinColour\t$scatterLabel\t$scatterNumberLabel\t$scatterNumberNameLabel\t$pointColourHex\t$currTf"; + # print writefileScatterColour "#"."$pointColourHex\n"; + } + + # print "$currTf\t$distance\t$intensity\n"; + + # print writefileViolinplotDistance "\n"; + # print writefileViolinplotIntensity "\n"; + print writefileScatter "\n"; + + $counter++; + +} \ No newline at end of file diff --git a/overallplotsdata.pl b/overallplotsdata.pl new file mode 100644 index 0000000..6d56ad6 --- /dev/null +++ b/overallplotsdata.pl @@ -0,0 +1,99 @@ +#!/usr/bin/perl +use warnings; +use integer; +use POSIX; + +my $statsDirectory = "stats"; + +open runfile, "$statsDirectory/run.txt";# or die $!; + +# open (writefileViolinplotIntensity, ">".$statsDirectory."/violin-intensity.txt");# or die $!; +# open (writefileViolinplotDistance, ">".$statsDirectory."/violin-distance.txt");# or die $!; + +open (writefileScatter, ">".$statsDirectory."/scatter-plot-data.txt"); +open (writefileScatterColour, ">".$statsDirectory."/scatter-plot-data-colour.txt"); + +#check cluster colour file +my $colourbarFile = 'stats/heatmap-cluster-colours.txt'; +my $clusterColourFlag = 0; +my @clusterColours = (); +if (-e $colourbarFile) { + print "Cluster colour file exists.\n"; + $clusterColourFlag = 1; + open (clusterColourFile, $colourbarFile); + + while () { + @clusterColours = split(/#/,"$_"); + } +} else { + print "Cluster colour file doesn't exist.\n"; +} + +my $counter = 1; + +while (){ + chomp; + my $currTf = $_; + print "$currTf\n"; + # print "$statsDirectory/$currTf/bedfileparts-gteq1.5peak-summit/all-bed-stats-010.txt\n"; + + open currTfAllBedStatsFile, "$statsDirectory/$currTf/bedfileparts-gteq1.5peak-summit/all-bed-stats-010.txt";# or die $!; + + open currTfMaxpeakFile, "$statsDirectory/$currTf/bedfileparts-gteq1.5peak-summit/splinemaxpeak.txt"; + + my $distance = 0; + my $intensity = 0; + + while () { + chomp; + my @line = split(/\t/,"$_"); + + $distance = $line[0]; + $intensity = $line[1]; + + # print "$distance\t$intensity\n"; + + # print writefileViolinplotDistance "$distance\t"; + # print writefileViolinplotIntensity "$intensity\t"; + + } + + while () { + chomp; + # print "$_\n"; + my @maxPeakLines = split(/\t/,"$_"); + my $violinColour = ""; + my $scatterLabel = ""; + + if ($maxPeakLines[1] >= 3) { + $violinColour = "blue"; + $scatterLabel = $currTf; + } else { + $violinColour = "orange"; + $scatterLabel = "."; + } + + my $scatterNumberLabel = "$counter"; + my $scatterNumberNameLabel = "$counter-$currTf"; + + my $pointColourHex = "-"; + if ($clusterColourFlag == 1) { + $pointColourHex = $clusterColours[$counter]; + # print "$pointColourHex\n"; + } + + # print "$currTf\t$pointColourHex\n"; + + print writefileScatter "$_\t$violinColour\t$scatterLabel\t$scatterNumberLabel\t$scatterNumberNameLabel\t$pointColourHex\t$currTf"; + # print writefileScatterColour "#"."$pointColourHex\n"; + } + + # print "$currTf\t$distance\t$intensity\n"; + + # print writefileViolinplotDistance "\n"; + # print writefileViolinplotIntensity "\n"; + print writefileScatter "\n"; + + $counter++; + +} \ No newline at end of file diff --git a/processbedfile-summit.pl b/processbedfile-summit.pl new file mode 100644 index 0000000..60f5f4a --- /dev/null +++ b/processbedfile-summit.pl @@ -0,0 +1,26 @@ +#!/usr/bin/perl +use integer; +use warnings; + + +my $tf = $ARGV[0]; +my $bedfile = "$tf"; +my $bedfilePath = "bedfiles/$bedfile"; + +mkdir "$bedfilePath-parts-summit"; +my $writeFileName = "bedfiles/$bedfile-parts-summit/$bedfile-modified-summit.txt"; +open openfile, $bedfilePath; + +open (writefile, ">$writeFileName"); + +while(){ + chomp; + my $line = "$_\n"; + my @entries = split("\t"); + my $middle = $entries[1] + $entries[9] - 1; + #print "$writeFileName\n"; + print writefile "$entries[0]\t$entries[1]\t$entries[2]\t$middle\n"; +} + +system("pwd"); +system("bash bedfilesplit.sh $tf"); \ No newline at end of file diff --git a/processbedfile.pl b/processbedfile.pl new file mode 100644 index 0000000..9b5355e --- /dev/null +++ b/processbedfile.pl @@ -0,0 +1,25 @@ +#!/usr/bin/perl +use integer; +use warnings; + + +#system("rm -rf $dirname"); + +my $bedfile = $ARGV[0]; + +my $bedfilePath = "testbedfiles/$bedfile"; +my $writeFileName = "testbedfiles/$bedfile-modified.txt"; +open openfile, $bedfilePath; +#system("rm -rf $writeFileName"); +#mkdir $dirname; + +open (writefile, ">$writeFileName"); + +while(){ + chomp; + my $line = "$_\n"; + my @entries = split("\t"); + my $middle = ($entries[1] + $entries[2])/2; + #print "$writeFileName\n"; + print writefile "$entries[0]\t$entries[1]\t$entries[2]\t$middle\n"; +} \ No newline at end of file diff --git a/readwigbedgraph.pl b/readwigbedgraph.pl new file mode 100644 index 0000000..bf70002 --- /dev/null +++ b/readwigbedgraph.pl @@ -0,0 +1,24 @@ +#!/usr/bin/perl +use warnings; + +my $wigfile = $ARGV[0]; + +open FILE, "wigfiles/$wigfile"; +my $dirname = "wigfiles/$wigfile-parts"; +system("rm -f $dirname/*"); +mkdir $dirname; + +while (){ + chomp; + $line = "$_\n"; + if ($line =~ /(^.*section )(chr.*)(:.*$)/) { + $curfile = "$dirname/$2.txt"; + print "$curfile\n"; + close(writefile); + open (writefile, ">>$curfile"); + print "$1\n"; + } elsif ($curfile) { + print writefile $line; + } +} +close (writefile); \ No newline at end of file diff --git a/run-normalize-todo.txt b/run-normalize-todo.txt new file mode 100644 index 0000000..c82ea29 --- /dev/null +++ b/run-normalize-todo.txt @@ -0,0 +1,76 @@ +wgEncodeHaibTfbsGm12878Elf1sc631V0416101PkRep1.broadPeak +wgEncodeHaibTfbsGm12878GabpPcr2xPkRep1.broadPeak +wgEncodeSydhTfbsGm12878Elk112771IggmusPk.narrowPeak +wgEncodeHaibTfbsGm12878Creb1sc240V0422111PkRep1.broadPeak +wgEncodeSydhTfbsGm12878Sin3anb6001263IggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878Mxi1IggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878E2f4IggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878Mazab85725IggmusPk.narrowPeak +wgEncodeHaibTfbsGm12878Egr1V0416101PkRep1.broadPeak +wgEncodeHaibTfbsGm12878Pmlsc71910V0422111PkRep1.broadPeak +wgEncodeHaibTfbsGm12878Pou2f2Pcr1xPkRep1.broadPeak +wgEncodeSydhTfbsGm12878Pol2s2IggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878Pol2IggmusPk.narrowPeak +wgEncodeHaibTfbsGm12878Taf1Pcr1xPkRep1.broadPeak +wgEncodeSydhTfbsGm12878TbpIggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878Srebp2IggrabPk.narrowPeak +wgEncodeSydhTfbsGm12878Corestsc30189IggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878Stat1StdPk.narrowPeak +wgEncodeSydhTfbsGm12878MafkIggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878ErraIggrabPk.narrowPeak +wgEncodeSydhTfbsGm12878Srebp1IggrabPk.narrowPeak +wgEncodeSydhTfbsGm12878Chd1a301218aIggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878WhipIggmusPk.narrowPeak +wgEncodeHaibTfbsGm12878Bclaf101388V0416101PkRep1.broadPeak +wgEncodeHaibTfbsGm12878RxraPcr1xPkRep1.broadPeak +wgEncodeSydhTfbsGm12878MaxIggmusPk.narrowPeak +wgEncodeHaibTfbsGm12878Zeb1sc25388V0416102PkRep1.broadPeak +wgEncodeHaibTfbsGm12878SrfV0416101PkRep1.broadPeak +wgEncodeHaibTfbsGm12878Zbtb33Pcr1xPkRep1.broadPeak +wgEncodeHaibTfbsGm12878Ets1Pcr1xPkRep1.broadPeak +wgEncodeSydhTfbsGm12878Chd2ab68301IggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878Brca1a300IggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878Znf143166181apStdPk.narrowPeak +wgEncodeHaibTfbsGm12878Six5Pcr1xPkRep1.broadPeak +wgEncodeHaibTfbsGm12878Yy1sc281Pcr1xPkRep1.broadPeak +wgEncodeSydhTfbsGm12878Nrf1IggmusPk.narrowPeak +wgEncodeHaibTfbsGm12878Foxm1sc502V0422111PkRep1.broadPeak +wgEncodeHaibTfbsGm12878Nficsc81335V0422111PkRep1.broadPeak +wgEncodeHaibTfbsGm12878Atf2sc81188V0422111PkRep1.broadPeak +wgEncodeHaibTfbsGm12878Runx3sc101553V0422111PkRep1.broadPeak +wgEncodeHaibTfbsGm12878Mef2aPcr1xPkRep1.broadPeak +wgEncodeSydhTfbsGm12878Tblr1ab24550IggmusPk.narrowPeak +wgEncodeHaibTfbsGm12878Bcl3V0416101PkRep1.broadPeak +wgEncodeHaibTfbsGm12878Cebpbsc150V0422111PkRep1.broadPeak +wgEncodeHaibTfbsGm12878Mta3sc81325V0422111PkRep1.broadPeak +wgEncodeHaibTfbsGm12878Stat5asc74442V0422111PkRep1.broadPeak +wgEncodeHaibTfbsGm12878Nfatc1sc17834V0422111PkRep1.broadPeak +wgEncodeHaibTfbsGm12878Irf4sc6059Pcr1xPkRep1.broadPeak +wgEncodeHaibTfbsGm12878BatfPcr1xPkRep1.broadPeak +wgEncodeHaibTfbsGm12878Bcl11aPcr1xPkRep1.broadPeak +wgEncodeSydhTfbsGm12878P300bStdPk.narrowPeak +wgEncodeSydhTfbsGm12878Ikzf1iknuclaStdPk.narrowPeak +wgEncodeHaibTfbsGm12878Mef2csc13268V0416101PkRep1.broadPeak +wgEncodeSydhTfbsGm12878Stat3IggmusPk.narrowPeak +wgEncodeHaibTfbsGm12878Tcf12Pcr1xPkRep1.broadPeak +wgEncodeHaibTfbsGm12878Tcf3Pcr1xPkRep1.broadPeak +wgEncodeHaibTfbsGm12878Pax5c20Pcr1xPkRep1.broadPeak +wgEncodeSydhTfbsGm12878Cdpsc6327IggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878Ebf1sc137065StdPk.narrowPeak +wgEncodeSydhTfbsGm12878JundStdPk.narrowPeak +wgEncodeHaibTfbsGm12878Pu1Pcr1xPkRep1.broadPeak +wgEncodeHaibTfbsGm12878Sp1Pcr1xPkRep1.broadPeak +wgEncodeSydhTfbsGm12878NfybIggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878NfyaIggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878Irf3IggmusPk.narrowPeak +wgEncodeHaibTfbsGm12878Pbx3Pcr1xPkRep1.broadPeak +wgEncodeSydhTfbsGm12878CfosStdPk.narrowPeak +wgEncodeSydhTfbsGm12878Rfx5200401194IggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878Usf2IggmusPk.narrowPeak +wgEncodeHaibTfbsGm12878Usf1Pcr2xPkRep1.broadPeak +wgEncodeHaibTfbsGm12878Atf3Pcr1xPkRep1.broadPeak +wgEncodeSydhTfbsGm12878Nfe2sc22827StdPk.narrowPeak +wgEncodeSydhTfbsGm12878Bhlhe40cIggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878Smc3ab9263IggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878Rad21IggrabPk.narrowPeak +wgEncodeSydhTfbsGm12878Ctcfsc15914c20StdPk.narrowPeak diff --git a/run-normalize.txt b/run-normalize.txt new file mode 100644 index 0000000..abcfc1b --- /dev/null +++ b/run-normalize.txt @@ -0,0 +1,76 @@ +x wgEncodeHaibTfbsGm12878Elf1sc631V0416101PkRep1.broadPeak +x wgEncodeHaibTfbsGm12878GabpPcr2xPkRep1.broadPeak +x wgEncodeSydhTfbsGm12878Elk112771IggmusPk.narrowPeak +x wgEncodeHaibTfbsGm12878Creb1sc240V0422111PkRep1.broadPeak +x wgEncodeSydhTfbsGm12878Sin3anb6001263IggmusPk.narrowPeak +x wgEncodeSydhTfbsGm12878Mxi1IggmusPk.narrowPeak +x wgEncodeSydhTfbsGm12878E2f4IggmusPk.narrowPeak +x wgEncodeSydhTfbsGm12878Mazab85725IggmusPk.narrowPeak +x wgEncodeHaibTfbsGm12878Egr1V0416101PkRep1.broadPeak +x wgEncodeHaibTfbsGm12878Pmlsc71910V0422111PkRep1.broadPeak +x wgEncodeHaibTfbsGm12878Pou2f2Pcr1xPkRep1.broadPeak +wgEncodeSydhTfbsGm12878Pol2s2IggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878Pol2IggmusPk.narrowPeak +wgEncodeHaibTfbsGm12878Taf1Pcr1xPkRep1.broadPeak +wgEncodeSydhTfbsGm12878TbpIggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878Srebp2IggrabPk.narrowPeak +wgEncodeSydhTfbsGm12878Corestsc30189IggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878Stat1StdPk.narrowPeak +wgEncodeSydhTfbsGm12878MafkIggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878ErraIggrabPk.narrowPeak +wgEncodeSydhTfbsGm12878Srebp1IggrabPk.narrowPeak +wgEncodeSydhTfbsGm12878Chd1a301218aIggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878WhipIggmusPk.narrowPeak +wgEncodeHaibTfbsGm12878Bclaf101388V0416101PkRep1.broadPeak +wgEncodeHaibTfbsGm12878RxraPcr1xPkRep1.broadPeak +wgEncodeSydhTfbsGm12878MaxIggmusPk.narrowPeak +wgEncodeHaibTfbsGm12878Zeb1sc25388V0416102PkRep1.broadPeak +wgEncodeHaibTfbsGm12878SrfV0416101PkRep1.broadPeak +wgEncodeHaibTfbsGm12878Zbtb33Pcr1xPkRep1.broadPeak +wgEncodeHaibTfbsGm12878Ets1Pcr1xPkRep1.broadPeak +wgEncodeSydhTfbsGm12878Chd2ab68301IggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878Brca1a300IggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878Znf143166181apStdPk.narrowPeak +wgEncodeHaibTfbsGm12878Six5Pcr1xPkRep1.broadPeak +wgEncodeHaibTfbsGm12878Yy1sc281Pcr1xPkRep1.broadPeak +wgEncodeSydhTfbsGm12878Nrf1IggmusPk.narrowPeak +wgEncodeHaibTfbsGm12878Foxm1sc502V0422111PkRep1.broadPeak +wgEncodeHaibTfbsGm12878Nficsc81335V0422111PkRep1.broadPeak +wgEncodeHaibTfbsGm12878Atf2sc81188V0422111PkRep1.broadPeak +wgEncodeHaibTfbsGm12878Runx3sc101553V0422111PkRep1.broadPeak +wgEncodeHaibTfbsGm12878Mef2aPcr1xPkRep1.broadPeak +wgEncodeSydhTfbsGm12878Tblr1ab24550IggmusPk.narrowPeak +wgEncodeHaibTfbsGm12878Bcl3V0416101PkRep1.broadPeak +wgEncodeHaibTfbsGm12878Cebpbsc150V0422111PkRep1.broadPeak +wgEncodeHaibTfbsGm12878Mta3sc81325V0422111PkRep1.broadPeak +wgEncodeHaibTfbsGm12878Stat5asc74442V0422111PkRep1.broadPeak +wgEncodeHaibTfbsGm12878Nfatc1sc17834V0422111PkRep1.broadPeak +wgEncodeHaibTfbsGm12878Irf4sc6059Pcr1xPkRep1.broadPeak +wgEncodeHaibTfbsGm12878BatfPcr1xPkRep1.broadPeak +wgEncodeHaibTfbsGm12878Bcl11aPcr1xPkRep1.broadPeak +wgEncodeSydhTfbsGm12878P300bStdPk.narrowPeak +wgEncodeSydhTfbsGm12878Ikzf1iknuclaStdPk.narrowPeak +wgEncodeHaibTfbsGm12878Mef2csc13268V0416101PkRep1.broadPeak +wgEncodeSydhTfbsGm12878Stat3IggmusPk.narrowPeak +wgEncodeHaibTfbsGm12878Tcf12Pcr1xPkRep1.broadPeak +wgEncodeHaibTfbsGm12878Tcf3Pcr1xPkRep1.broadPeak +wgEncodeHaibTfbsGm12878Pax5c20Pcr1xPkRep1.broadPeak +wgEncodeSydhTfbsGm12878Cdpsc6327IggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878Ebf1sc137065StdPk.narrowPeak +wgEncodeSydhTfbsGm12878JundStdPk.narrowPeak +wgEncodeHaibTfbsGm12878Pu1Pcr1xPkRep1.broadPeak +wgEncodeHaibTfbsGm12878Sp1Pcr1xPkRep1.broadPeak +wgEncodeSydhTfbsGm12878NfybIggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878NfyaIggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878Irf3IggmusPk.narrowPeak +wgEncodeHaibTfbsGm12878Pbx3Pcr1xPkRep1.broadPeak +wgEncodeSydhTfbsGm12878CfosStdPk.narrowPeak +wgEncodeSydhTfbsGm12878Rfx5200401194IggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878Usf2IggmusPk.narrowPeak +wgEncodeHaibTfbsGm12878Usf1Pcr2xPkRep1.broadPeak +wgEncodeHaibTfbsGm12878Atf3Pcr1xPkRep1.broadPeak +wgEncodeSydhTfbsGm12878Nfe2sc22827StdPk.narrowPeak +wgEncodeSydhTfbsGm12878Bhlhe40cIggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878Smc3ab9263IggmusPk.narrowPeak +wgEncodeSydhTfbsGm12878Rad21IggrabPk.narrowPeak +wgEncodeSydhTfbsGm12878Ctcfsc15914c20StdPk.narrowPeak diff --git a/submit-job-on-cluster.txt b/submit-job-on-cluster.txt new file mode 100644 index 0000000..a8a1639 --- /dev/null +++ b/submit-job-on-cluster.txt @@ -0,0 +1 @@ +bash bedfile-statsjob-qsub.sh #bedfilename wgEncodeHaibTfbsGm12878BatfPcr1xPkRep1.broadPeak \ No newline at end of file diff --git a/toolbedgraph-bash.sh b/toolbedgraph-bash.sh new file mode 100644 index 0000000..e00eeb6 --- /dev/null +++ b/toolbedgraph-bash.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +time perl toolbedgraph.pl $1 $2 \ No newline at end of file diff --git a/toolbedgraph.pl b/toolbedgraph.pl new file mode 100644 index 0000000..3ae08a3 --- /dev/null +++ b/toolbedgraph.pl @@ -0,0 +1,197 @@ +#!/usr/bin/perl +use POSIX; +#use Statistics::R; +use warnings; +use File::Path qw(make_path remove_tree); + +my $tf = $ARGV[0]; +my $bedName = $ARGV[1]; + +my $bedDirectory = "bedfiles/$tf-parts-summit/"; +my $statsDirectory = "stats/$tf/bedfileparts-gteq1.5peak-summit/"; +make_path $statsDirectory; +my $wigDirectory = "wigfiles/wgEncodeSydhNsomeGm12878Sig.wig-parts/"; +(my $bedNameWithoutExt = $bedName) =~ s/\.[^.]+$//; +my $statsName = $bedNameWithoutExt."-stats.txt"; + +print "bedName: $bedName\t statsName: $statsName\n"; + +open bedfile, $bedDirectory.$bedName; +open (writefile, ">".$statsDirectory.$statsName); + +my $bedCount = 0; +my $peakCount = 0; +my $plusMinusRange = 500; +my $peakThreshold = 1.5; +# my $R = Statistics::R->new(); +# $R->startR; + +while (){ + $bedCount++; + print "\nBEDCOUNT: $bedCount\n"; + chomp; + my @line = split(/\t/,"$_"); + + my $chr = $line[0]; + my $location = $line[3]; + my $locationFlag = substr($location, 0, -4); + + # $lastDigit = substr($location, -1); + # $roundUp = (10 - $lastDigit) + 1; + + my $wigRangeBegin = ($location - $plusMinusRange);# + $roundUp; + my $wigRangeEnd = ($location + $plusMinusRange);# + $roundUp; + + # print "$line[0]\n"; + # print "$line[3]\n"; + + #print "lastdigit \t $lastDigit\t roundUp \t $roundUp\n"; + print "$chr\t$location\n"; + print "$wigRangeBegin\t$wigRangeEnd\n"; + + open readwigfile, $wigDirectory."$line[0].txt"; + + my @wigArrayLocations = (); + my @wigArrayPeaks = (); + # my $middleIndex = 0; + # my $loopTally = 0; + + # # in case the high peak is rather small, at least there will be a peak + # my $guaranteePeak = 0; + + print "location flag: $locationFlag\n"; + + while (){ + chomp; + if ($_ =~ m/chr.*\t$locationFlag....\t.*$/) { + my @wigLines = split(/\t/,"$_"); + + #print "IN IN IN\n"; + + #@wigLines = $_; + # print "@wigLines\n"; + if ((($wigLines[1] + $wigLines[2])/2) >= $wigRangeBegin && (($wigLines[1] + $wigLines[2])/2) <= $wigRangeEnd){ + #print "IN\n"; + push(@wigArrayLocations, ($wigLines[1]+$wigLines[2])/2); + push(@wigArrayPeaks, $wigLines[3]); + + # if ($wigLines[3] > $guaranteePeak) { + # $guaranteePeak = $wigLines[3]; + # } + + + # if (@wigLines[0] == $location){ + # $middleIndex = $loopTally; + # } + + # $loopTally++; + #print "@wigArray"; + } elsif (scalar(@wigArrayLocations) > 1) { + last; + } + } + #print "$wigRangeBegin $wigRangeEnd"; + #print "$_\n"; + } + close(readwigfile); + + # -------- this used to show + + # print "middle index: \t $middleIndex\n"; + + # #print "@wigArrayLocations\n"; + # print "@wigArrayPeaks\n"; + # print "size: " . scalar(@wigArrayPeaks) . "\n"; + + # ------- end show + + my $wigArrayIndices = scalar(@wigArrayPeaks); + + # -------- this used to show + + # for ($i=0; $i<$wigArrayIndices; $i++){ + # print "$wigArrayLocations[$i]\t$wigArrayPeaks[$i]\n"; + # } + + # ------- end show + + # print scalar(@wigArrayLocations); + # print scalar(@wigArrayPeaks); + + + # filter out sequential duplicate values + my @orig_index = 0; + my @deduped = $wigArrayPeaks[0]; + for my $index ( 1..$#wigArrayPeaks ) { + if ( $wigArrayPeaks[$index] != $wigArrayPeaks[$index-1] ) { + push @deduped, $wigArrayPeaks[$index]; + push @orig_index, $index; + } + } + + # print "@wigArrayLocations\n"; + + my @maxima = (); + for my $index ( 0..$#deduped ) { + #my peakThreshold = 0; # figure out what the threshold should be + #print "ADDING TO MAXIMA\n"; + if ($deduped[$index] > $deduped[$index-1] && $deduped[$index] > $deduped[$index+1] && $deduped[$index] >= $peakThreshold) { #make variable for peak threshold + push @maxima, $orig_index[$index]; + } + } + + # if ($wigArrayPeaks[0] > $wigArrayPeaks[1]){ + # unshift(@maxima,0); + # } + + # if ($wigArrayPeaks[-1] > $wigArrayPeaks[-2]){ + # push(@maxima,-1); + # } + + print "********* MAXIMA INDICES *********\n"; + + foreach (@maxima) { + #print "$_\n"; + print "$wigArrayLocations[$_]\t$wigArrayPeaks[$_]\n"; + } + + my $minIndex = 0; + my $minDistance = abs($location - $wigArrayLocations[0]); + #print "wigarraylocation:\t$wigArrayLocations[0]\tbefore maxima loop:\t$minDistance\n"; + my $minDistancePeak = 0; + + print "@maxima\n"; + + foreach (@maxima) { + if (abs($location - $wigArrayLocations[$_]) < $minDistance) { + $minIndex = $_; + $minDistance = abs($location - $wigArrayLocations[$_]); + #print "in maxima loop:\t$minDistance\n"; + $minDistancePeak = $wigArrayPeaks[$_]; + } + } + + if ($minDistance == $location) { + $minDistance = $plusMinusRange; + } + + print "minIndex: $minIndex\n"; + print "minDistance: $minDistance\n"; + print "minDistancePeak: $minDistancePeak\n"; + + if ($minDistancePeak > 0) { + print writefile "$minDistance\t$minDistancePeak\n"; + $peakCount++; + } +} +close(bedfile); + +my $nonPeakCount = $bedCount-$peakCount; + +open statfile, ">".$statsDirectory."overview-$statsName.txt"; +print statfile "File:\t$statsName\n"; +print statfile "Total ranges:\t$bedCount\n"; +print statfile "Total peaks:\t$peakCount\t".(($peakCount/$bedCount)*100)."%\n"; +print statfile "Total non-peaks:\t".($bedCount-$peakCount)."\t".((($nonPeakCount)/$bedCount)*100)."%\n"; +close(statfile); +