improvements

pdimens · May 2, 2024 · 2b9072d · 2b9072d
1 parent e56a9a6
commit 2b9072d
Showing 1 changed file with 63 additions and 88 deletions.
diff --git a/src/harpy/reports/HapCut2.Rmd b/src/harpy/reports/HapCut2.Rmd
@@ -20,7 +20,7 @@ using<-function(...) {
         lapply(need,require,character.only=TRUE)
     }
 }
-using("flexdashboard","dplyr","ggplot2","DT","scales", "ggridges")
+using("flexdashboard","dplyr","ggplot2","DT","scales", "highcharter","ggridges")
 
 #library(flexdashboard)
 #library(dplyr)
@@ -49,8 +49,8 @@ script src = "https://ajax.googleapis.com/ajax/libs/jquery/3.7.1/jquery.min.js"
 ```
 
 ```{r echo= FALSE, message = FALSE, warning = FALSE}
-#file <- "~/blocks.summary.gz"
-file <- snakemake@input[[1]]
+file <- "/home/pdimens/Documents/harpy/Phase/reports/blocks.summary.gz"
+#file <- snakemake@input[[1]]
 df <- read.table(
   file, header = T,
   colClasses = c("factor","factor","integer","integer", "integer")
@@ -61,6 +61,7 @@ if(nrow(df) < 1){
 }
 df$pos_end <- df$pos_start + df$block_length
 df <- df[, c(1,2,3,4,6,5)]
+levels(df$sample) <- basename(levels(df$sample))
 ```
 
 ```{r echo = FALSE, message = FALSE, warning = FALSE}
@@ -74,8 +75,10 @@ if (nrow(contigs) > 30){
 } else {
     .contigs <- contigs$contig
 }
-pltheight <- round(1.2 * (length(.contigs)), digits = 0)
+#pltheight <- round(1.2 * (length(.contigs)), digits = 0)
 ridgeheight <- 1 + round(0.7 * (length(.contigs)), digits = 0)
+#pltheight.samples <- <- round(1.2 * (length(levels(df$sample))), digits = 0)
+ridgeheight.samples <- 1 + round(0.7 * (length(levels(df$sample))), digits = 0)
 ```
 
 # General Stats
@@ -131,62 +134,57 @@ valueBox(scales::comma(overall$max_blocksize[1]), caption = "Longest Haplotype (
 <h2> Distribution of Haplotype Lengths </h2>
 The plots below show the distribution of haplotype length (in base pairs). 
 Phasing typically has a few very large haplotypes, resulting in extreme 
-right-tails in these distributions, making regular visualization
-difficult to plot meaningfully. Therefore, the plot on the left is the distribution of raw (unscaled) haplotype lengths and the one on the right is presented as **log-scaled lengths**
+right-tails in these distributions, therefore, the X-axis is **log-scaled lengths**
 to collapse the right tail for better readability. 
 
 ## dist plot
 ### the dist plpt {.no-title}
-```{r warning=FALSE, message=FALSE, echo= FALSE, out.width="100%", fig.height=2.7}
-ggplot(df, aes(block_length, after_stat(count))) + 
-  geom_histogram(color = "#8897a7", fill = "#ABBDD1") +
-  theme_minimal() +
-  theme(
-    panel.grid.minor.y = element_blank(),
-    panel.grid.minor.x = element_blank()
-    ) +
-  scale_y_continuous(labels = label_comma()) +
-  scale_x_continuous(labels = label_comma()) +
-  xlab("Haplotype Length") +
-  ylab("Count") +
-  labs(title = "Distribution of Haplotype Lengths")
-```
-
-### log dist plot {.no-title}
-```{r warning=FALSE, message=FALSE, echo= FALSE, out.width="100%", fig.height=2.7}
-ggplot(df, aes(block_length, after_stat(count))) + 
-  geom_density(color = "#859f9a", fill = "#bfe4dc") +
-  theme_minimal() +
-  theme(
-    panel.grid.minor.y = element_blank(),
-    panel.grid.minor.x = element_blank()
-    ) +
-   scale_y_continuous(labels = label_comma()) +
-   scale_x_log10(
-    breaks = trans_breaks("log10", function(x) 10^x),
-    labels = trans_format("log10", math_format(10^.x))
-  ) +
-  xlab("Haplotype Length (log scale)") +
-  ylab("Count") +
-  labs(title = "Distribution of Haplotype Lengths")
+```{r warning=FALSE, message=FALSE, echo= FALSE, out.width="100%"}
+hs <- hist(
+  df$block_length,
+  breaks = 500,
+  plot = F
+)
+hs <- data.frame(val = hs$mids, freq = hs$counts)
+hchart(hs, "areaspline", hcaes(x = val, y = freq), color = "#7eb495", name = "count") |>
+  hc_xAxis(title = list(text = "haplotype length in base pairs (log scale)"), type = "logarithmic") |>
+  hc_title(text = "Haplotype block lengths") |>
+  hc_yAxis(title = list(text = "count")) |>
+  hc_tooltip(crosshairs = TRUE) |>
+  hc_exporting(enabled = T, filename = "haplotype.hist",
+    buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG")))
+  )
 ```
 
 ## Per contig data desc
-### table per contig {.no0-title}
+### table per contig {.no-title}
 <h2> Stats Per Contig </h2>
-The table below provides details on the outcome of haplotype phasing on a per-contig basis.
-
-### distribution per contig {.no-title}
 The plot below shows the distribution of haplotype length (in base pairs) for 
 each contig, up to 30 of the largest contigs, the size of whom is inferred/assumed
 from the end position of the last haplotype. Phasing typically results in extreme
-right-tails in these distributions, making regular visualization
-difficult to plot meaningfully. The plot presented below has **log-scaled lengths**
+right-tails in these distributions, therefore the plot presented below has **log-scaled lengths**
 to collapse the right tails for better readability. 
+The table below provides details on the outcome of haplotype phasing on a per-contig basis.
 
 ## Per contig data
+### per contig plot {.no-title}
+```{r warning=FALSE, message=FALSE, echo= FALSE, fig.height=ridgeheight, fig.width=8}
+ggplot(df, aes(x = block_length, y = contig, fill = after_stat(x))) +
+  geom_density_ridges_gradient(scale = 1.4) +
+  scale_fill_viridis_c(option = "C", trans = "log10") +
+  theme_minimal() +
+  theme(legend.position = "none") +
+  scale_x_log10(
+     breaks = trans_breaks("log10", function(x) 10^x),
+     labels = trans_format("log10", math_format(10^.x))
+  ) +
+  labs(title = "Distribution of Haplotype Lengths by Contig", fill = "Haplotype Length (bp)") +
+  xlab("Haplotype Length (log scale)") +
+  ylab("")
+```
+
 ### stats per contig {.no-title}
-```{r echo= FALSE, message = FALSE, warning = FALSE}
+```{r echo= FALSE, message = FALSE, warning = FALSE, out.width="100%"}
 percontig <- df %>% group_by(contig) %>% summarise(
     n_haplo = round(length(n_snp)),
     mean_snps = round(mean(n_snp), digits = 0),
@@ -202,31 +200,9 @@ DT::datatable(
   extensions = 'Buttons',
   options = list(dom = 'Brtip', buttons = c('csv'), scrollX = TRUE, pageLength = length(levels(df$contig))),
   colnames = c("Contig", "Total Haplotypes", "Mean SNPs", "Median SNPs", "Mean Haplotype Length", "Median Haplotype Length", "Largest Haplotype"),
-  autoHideNavigation = T,
-  fillContainer = T
+  fillContainer = F
   )
 ```
-### per contig plot {.no-title}
-```{r warning=FALSE, message=FALSE, echo= FALSE, fig.height=pltheight, out.width="100%", fig.width=8}
-ggplot(df, aes(block_length, after_stat(count))) + 
-  geom_density(color = "#af9caf", fill = "#DBC3DB") +
-  theme_light() +
-  theme(
-    panel.grid.minor.y = element_blank(),
-    panel.grid.minor.x = element_blank(),
-    strip.background =element_rect(fill="grey90", color = "grey70"),
-    strip.text = element_text(colour = 'grey40')
-    ) +
-  scale_y_continuous(labels = label_comma()) +
-  scale_x_log10(
-     breaks = trans_breaks("log10", function(x) 10^x),
-     labels = trans_format("log10", math_format(10^.x))
-  ) +
-  facet_wrap(~contig, ncol = 3, scales = "free") +
-  xlab("Haplotype Length (log scale)") +
-  ylab("Count") +
-  labs(title = "Distribution of Haplotype Lengths Per Contig")
-```
 
 # Per-Sample Stats
 ## per sample desc
@@ -242,8 +218,24 @@ visualization difficult to plot meaningfully. The plot presented below has
 **log-scaled lengths** to collapse the right tails for better readability. 
 
 ## per sample 
+### the ridgeplot {.no-title}
+```{r warning=FALSE, message=FALSE, echo= FALSE, fig.height=ridgeheight.samples, fig.width=8}
+ggplot(df, aes(x = block_length, y = sample, fill = after_stat(x))) +
+  geom_density_ridges_gradient(scale = 1.4) +
+  scale_fill_viridis_c(option = "G", trans = "log10") +
+  theme_minimal() +
+  theme(legend.position = "none") +
+  scale_x_log10(
+     breaks = trans_breaks("log10", function(x) 10^x),
+     labels = trans_format("log10", math_format(10^.x))
+  ) +
+  labs(title = "Distribution of Haplotype Lengths by Sample", fill = "Haplotype Length (bp)") +
+  xlab("Haplotype Length (log scale)") +
+  ylab("")
+```
+
 ### stats per sample {.no-title}
-```{r echo= FALSE, message = FALSE, warning = FALSE}
+```{r echo= FALSE, message = FALSE, warning = FALSE, out.width="100%"}
 persample <- df %>% group_by(sample) %>% summarise(
     n_haplo = sum(length(n_snp)),
     mean_snps = round(mean(n_snp), digits = 0),
@@ -259,23 +251,6 @@ DT::datatable(
   extensions = 'Buttons',
   options = list(dom = 'Brtip', buttons = c('csv'), scrollX = TRUE, pageLength = length(levels(df$sample))),
   colnames = c("Sample", "Haplotypes", "Mean SNPs", "Median SNPs", "Mean Haplotype Length", "Median Haplotype Length", "Largest Haplotype"),
-  autoHideNavigation = T,
-  fillContainer = T,
+  fillContainer = F
 )
-```
-
-### the ridgeplot {.no-title}
-```{r warning=FALSE, message=FALSE, echo= FALSE, fig.height=ridgeheight, fig.width=8}
-ggplot(df, aes(x = block_length, y = sample, fill = stat(x))) +
-  geom_density_ridges_gradient(scale = 1.4) +
-  scale_fill_viridis_c(option = "G", trans = "log10") +
-  theme_minimal() +
-  theme(legend.position = "none") +
-  scale_x_log10(
-     breaks = trans_breaks("log10", function(x) 10^x),
-     labels = trans_format("log10", math_format(10^.x))
-  ) +
-  labs(title = "Distribution of Haplotype Lengths by Sample", fill = "Haplotype Length (bp)") +
-  xlab("Haplotype Length (log scale)") +
-  ylab("")
-```
+```