From 2b9072dcdb90e03c69be1771dd2568cad55fc1b1 Mon Sep 17 00:00:00 2001 From: pdimens Date: Thu, 2 May 2024 12:13:05 -0400 Subject: [PATCH] improvements --- src/harpy/reports/HapCut2.Rmd | 151 ++++++++++++++-------------------- 1 file changed, 63 insertions(+), 88 deletions(-) diff --git a/src/harpy/reports/HapCut2.Rmd b/src/harpy/reports/HapCut2.Rmd index 926c077b8..1b6b5cb50 100644 --- a/src/harpy/reports/HapCut2.Rmd +++ b/src/harpy/reports/HapCut2.Rmd @@ -20,7 +20,7 @@ using<-function(...) { lapply(need,require,character.only=TRUE) } } -using("flexdashboard","dplyr","ggplot2","DT","scales", "ggridges") +using("flexdashboard","dplyr","ggplot2","DT","scales", "highcharter","ggridges") #library(flexdashboard) #library(dplyr) @@ -49,8 +49,8 @@ script src = "https://ajax.googleapis.com/ajax/libs/jquery/3.7.1/jquery.min.js" ``` ```{r echo= FALSE, message = FALSE, warning = FALSE} -#file <- "~/blocks.summary.gz" -file <- snakemake@input[[1]] +file <- "/home/pdimens/Documents/harpy/Phase/reports/blocks.summary.gz" +#file <- snakemake@input[[1]] df <- read.table( file, header = T, colClasses = c("factor","factor","integer","integer", "integer") @@ -61,6 +61,7 @@ if(nrow(df) < 1){ } df$pos_end <- df$pos_start + df$block_length df <- df[, c(1,2,3,4,6,5)] +levels(df$sample) <- basename(levels(df$sample)) ``` ```{r echo = FALSE, message = FALSE, warning = FALSE} @@ -74,8 +75,10 @@ if (nrow(contigs) > 30){ } else { .contigs <- contigs$contig } -pltheight <- round(1.2 * (length(.contigs)), digits = 0) +#pltheight <- round(1.2 * (length(.contigs)), digits = 0) ridgeheight <- 1 + round(0.7 * (length(.contigs)), digits = 0) +#pltheight.samples <- <- round(1.2 * (length(levels(df$sample))), digits = 0) +ridgeheight.samples <- 1 + round(0.7 * (length(levels(df$sample))), digits = 0) ``` # General Stats @@ -131,62 +134,57 @@ valueBox(scales::comma(overall$max_blocksize[1]), caption = "Longest Haplotype (

Distribution of Haplotype Lengths

The plots below show the distribution of haplotype length (in base pairs). Phasing typically has a few very large haplotypes, resulting in extreme -right-tails in these distributions, making regular visualization -difficult to plot meaningfully. Therefore, the plot on the left is the distribution of raw (unscaled) haplotype lengths and the one on the right is presented as **log-scaled lengths** +right-tails in these distributions, therefore, the X-axis is **log-scaled lengths** to collapse the right tail for better readability. ## dist plot ### the dist plpt {.no-title} -```{r warning=FALSE, message=FALSE, echo= FALSE, out.width="100%", fig.height=2.7} -ggplot(df, aes(block_length, after_stat(count))) + - geom_histogram(color = "#8897a7", fill = "#ABBDD1") + - theme_minimal() + - theme( - panel.grid.minor.y = element_blank(), - panel.grid.minor.x = element_blank() - ) + - scale_y_continuous(labels = label_comma()) + - scale_x_continuous(labels = label_comma()) + - xlab("Haplotype Length") + - ylab("Count") + - labs(title = "Distribution of Haplotype Lengths") -``` - -### log dist plot {.no-title} -```{r warning=FALSE, message=FALSE, echo= FALSE, out.width="100%", fig.height=2.7} -ggplot(df, aes(block_length, after_stat(count))) + - geom_density(color = "#859f9a", fill = "#bfe4dc") + - theme_minimal() + - theme( - panel.grid.minor.y = element_blank(), - panel.grid.minor.x = element_blank() - ) + - scale_y_continuous(labels = label_comma()) + - scale_x_log10( - breaks = trans_breaks("log10", function(x) 10^x), - labels = trans_format("log10", math_format(10^.x)) - ) + - xlab("Haplotype Length (log scale)") + - ylab("Count") + - labs(title = "Distribution of Haplotype Lengths") +```{r warning=FALSE, message=FALSE, echo= FALSE, out.width="100%"} +hs <- hist( + df$block_length, + breaks = 500, + plot = F +) +hs <- data.frame(val = hs$mids, freq = hs$counts) +hchart(hs, "areaspline", hcaes(x = val, y = freq), color = "#7eb495", name = "count") |> + hc_xAxis(title = list(text = "haplotype length in base pairs (log scale)"), type = "logarithmic") |> + hc_title(text = "Haplotype block lengths") |> + hc_yAxis(title = list(text = "count")) |> + hc_tooltip(crosshairs = TRUE) |> + hc_exporting(enabled = T, filename = "haplotype.hist", + buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + ) ``` ## Per contig data desc -### table per contig {.no0-title} +### table per contig {.no-title}

Stats Per Contig

-The table below provides details on the outcome of haplotype phasing on a per-contig basis. - -### distribution per contig {.no-title} The plot below shows the distribution of haplotype length (in base pairs) for each contig, up to 30 of the largest contigs, the size of whom is inferred/assumed from the end position of the last haplotype. Phasing typically results in extreme -right-tails in these distributions, making regular visualization -difficult to plot meaningfully. The plot presented below has **log-scaled lengths** +right-tails in these distributions, therefore the plot presented below has **log-scaled lengths** to collapse the right tails for better readability. +The table below provides details on the outcome of haplotype phasing on a per-contig basis. ## Per contig data +### per contig plot {.no-title} +```{r warning=FALSE, message=FALSE, echo= FALSE, fig.height=ridgeheight, fig.width=8} +ggplot(df, aes(x = block_length, y = contig, fill = after_stat(x))) + + geom_density_ridges_gradient(scale = 1.4) + + scale_fill_viridis_c(option = "C", trans = "log10") + + theme_minimal() + + theme(legend.position = "none") + + scale_x_log10( + breaks = trans_breaks("log10", function(x) 10^x), + labels = trans_format("log10", math_format(10^.x)) + ) + + labs(title = "Distribution of Haplotype Lengths by Contig", fill = "Haplotype Length (bp)") + + xlab("Haplotype Length (log scale)") + + ylab("") +``` + ### stats per contig {.no-title} -```{r echo= FALSE, message = FALSE, warning = FALSE} +```{r echo= FALSE, message = FALSE, warning = FALSE, out.width="100%"} percontig <- df %>% group_by(contig) %>% summarise( n_haplo = round(length(n_snp)), mean_snps = round(mean(n_snp), digits = 0), @@ -202,31 +200,9 @@ DT::datatable( extensions = 'Buttons', options = list(dom = 'Brtip', buttons = c('csv'), scrollX = TRUE, pageLength = length(levels(df$contig))), colnames = c("Contig", "Total Haplotypes", "Mean SNPs", "Median SNPs", "Mean Haplotype Length", "Median Haplotype Length", "Largest Haplotype"), - autoHideNavigation = T, - fillContainer = T + fillContainer = F ) ``` -### per contig plot {.no-title} -```{r warning=FALSE, message=FALSE, echo= FALSE, fig.height=pltheight, out.width="100%", fig.width=8} -ggplot(df, aes(block_length, after_stat(count))) + - geom_density(color = "#af9caf", fill = "#DBC3DB") + - theme_light() + - theme( - panel.grid.minor.y = element_blank(), - panel.grid.minor.x = element_blank(), - strip.background =element_rect(fill="grey90", color = "grey70"), - strip.text = element_text(colour = 'grey40') - ) + - scale_y_continuous(labels = label_comma()) + - scale_x_log10( - breaks = trans_breaks("log10", function(x) 10^x), - labels = trans_format("log10", math_format(10^.x)) - ) + - facet_wrap(~contig, ncol = 3, scales = "free") + - xlab("Haplotype Length (log scale)") + - ylab("Count") + - labs(title = "Distribution of Haplotype Lengths Per Contig") -``` # Per-Sample Stats ## per sample desc @@ -242,8 +218,24 @@ visualization difficult to plot meaningfully. The plot presented below has **log-scaled lengths** to collapse the right tails for better readability. ## per sample +### the ridgeplot {.no-title} +```{r warning=FALSE, message=FALSE, echo= FALSE, fig.height=ridgeheight.samples, fig.width=8} +ggplot(df, aes(x = block_length, y = sample, fill = after_stat(x))) + + geom_density_ridges_gradient(scale = 1.4) + + scale_fill_viridis_c(option = "G", trans = "log10") + + theme_minimal() + + theme(legend.position = "none") + + scale_x_log10( + breaks = trans_breaks("log10", function(x) 10^x), + labels = trans_format("log10", math_format(10^.x)) + ) + + labs(title = "Distribution of Haplotype Lengths by Sample", fill = "Haplotype Length (bp)") + + xlab("Haplotype Length (log scale)") + + ylab("") +``` + ### stats per sample {.no-title} -```{r echo= FALSE, message = FALSE, warning = FALSE} +```{r echo= FALSE, message = FALSE, warning = FALSE, out.width="100%"} persample <- df %>% group_by(sample) %>% summarise( n_haplo = sum(length(n_snp)), mean_snps = round(mean(n_snp), digits = 0), @@ -259,23 +251,6 @@ DT::datatable( extensions = 'Buttons', options = list(dom = 'Brtip', buttons = c('csv'), scrollX = TRUE, pageLength = length(levels(df$sample))), colnames = c("Sample", "Haplotypes", "Mean SNPs", "Median SNPs", "Mean Haplotype Length", "Median Haplotype Length", "Largest Haplotype"), - autoHideNavigation = T, - fillContainer = T, + fillContainer = F ) -``` - -### the ridgeplot {.no-title} -```{r warning=FALSE, message=FALSE, echo= FALSE, fig.height=ridgeheight, fig.width=8} -ggplot(df, aes(x = block_length, y = sample, fill = stat(x))) + - geom_density_ridges_gradient(scale = 1.4) + - scale_fill_viridis_c(option = "G", trans = "log10") + - theme_minimal() + - theme(legend.position = "none") + - scale_x_log10( - breaks = trans_breaks("log10", function(x) 10^x), - labels = trans_format("log10", math_format(10^.x)) - ) + - labs(title = "Distribution of Haplotype Lengths by Sample", fill = "Haplotype Length (bp)") + - xlab("Haplotype Length (log scale)") + - ylab("") -``` +``` \ No newline at end of file