Skip to content

Commit

Permalink
improvements
Browse files Browse the repository at this point in the history
  • Loading branch information
pdimens committed May 2, 2024
1 parent e56a9a6 commit 2b9072d
Showing 1 changed file with 63 additions and 88 deletions.
151 changes: 63 additions & 88 deletions src/harpy/reports/HapCut2.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ using<-function(...) {
lapply(need,require,character.only=TRUE)
}
}
using("flexdashboard","dplyr","ggplot2","DT","scales", "ggridges")
using("flexdashboard","dplyr","ggplot2","DT","scales", "highcharter","ggridges")
#library(flexdashboard)
#library(dplyr)
Expand Down Expand Up @@ -49,8 +49,8 @@ script src = "https://ajax.googleapis.com/ajax/libs/jquery/3.7.1/jquery.min.js"
```

```{r echo= FALSE, message = FALSE, warning = FALSE}
#file <- "~/blocks.summary.gz"
file <- snakemake@input[[1]]
file <- "/home/pdimens/Documents/harpy/Phase/reports/blocks.summary.gz"
#file <- snakemake@input[[1]]
df <- read.table(
file, header = T,
colClasses = c("factor","factor","integer","integer", "integer")
Expand All @@ -61,6 +61,7 @@ if(nrow(df) < 1){
}
df$pos_end <- df$pos_start + df$block_length
df <- df[, c(1,2,3,4,6,5)]
levels(df$sample) <- basename(levels(df$sample))
```

```{r echo = FALSE, message = FALSE, warning = FALSE}
Expand All @@ -74,8 +75,10 @@ if (nrow(contigs) > 30){
} else {
.contigs <- contigs$contig
}
pltheight <- round(1.2 * (length(.contigs)), digits = 0)
#pltheight <- round(1.2 * (length(.contigs)), digits = 0)
ridgeheight <- 1 + round(0.7 * (length(.contigs)), digits = 0)
#pltheight.samples <- <- round(1.2 * (length(levels(df$sample))), digits = 0)
ridgeheight.samples <- 1 + round(0.7 * (length(levels(df$sample))), digits = 0)
```

# General Stats
Expand Down Expand Up @@ -131,62 +134,57 @@ valueBox(scales::comma(overall$max_blocksize[1]), caption = "Longest Haplotype (
<h2> Distribution of Haplotype Lengths </h2>
The plots below show the distribution of haplotype length (in base pairs).
Phasing typically has a few very large haplotypes, resulting in extreme
right-tails in these distributions, making regular visualization
difficult to plot meaningfully. Therefore, the plot on the left is the distribution of raw (unscaled) haplotype lengths and the one on the right is presented as **log-scaled lengths**
right-tails in these distributions, therefore, the X-axis is **log-scaled lengths**
to collapse the right tail for better readability.

## dist plot
### the dist plpt {.no-title}
```{r warning=FALSE, message=FALSE, echo= FALSE, out.width="100%", fig.height=2.7}
ggplot(df, aes(block_length, after_stat(count))) +
geom_histogram(color = "#8897a7", fill = "#ABBDD1") +
theme_minimal() +
theme(
panel.grid.minor.y = element_blank(),
panel.grid.minor.x = element_blank()
) +
scale_y_continuous(labels = label_comma()) +
scale_x_continuous(labels = label_comma()) +
xlab("Haplotype Length") +
ylab("Count") +
labs(title = "Distribution of Haplotype Lengths")
```

### log dist plot {.no-title}
```{r warning=FALSE, message=FALSE, echo= FALSE, out.width="100%", fig.height=2.7}
ggplot(df, aes(block_length, after_stat(count))) +
geom_density(color = "#859f9a", fill = "#bfe4dc") +
theme_minimal() +
theme(
panel.grid.minor.y = element_blank(),
panel.grid.minor.x = element_blank()
) +
scale_y_continuous(labels = label_comma()) +
scale_x_log10(
breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x))
) +
xlab("Haplotype Length (log scale)") +
ylab("Count") +
labs(title = "Distribution of Haplotype Lengths")
```{r warning=FALSE, message=FALSE, echo= FALSE, out.width="100%"}
hs <- hist(
df$block_length,
breaks = 500,
plot = F
)
hs <- data.frame(val = hs$mids, freq = hs$counts)
hchart(hs, "areaspline", hcaes(x = val, y = freq), color = "#7eb495", name = "count") |>
hc_xAxis(title = list(text = "haplotype length in base pairs (log scale)"), type = "logarithmic") |>
hc_title(text = "Haplotype block lengths") |>
hc_yAxis(title = list(text = "count")) |>
hc_tooltip(crosshairs = TRUE) |>
hc_exporting(enabled = T, filename = "haplotype.hist",
buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG")))
)
```

## Per contig data desc
### table per contig {.no0-title}
### table per contig {.no-title}
<h2> Stats Per Contig </h2>
The table below provides details on the outcome of haplotype phasing on a per-contig basis.

### distribution per contig {.no-title}
The plot below shows the distribution of haplotype length (in base pairs) for
each contig, up to 30 of the largest contigs, the size of whom is inferred/assumed
from the end position of the last haplotype. Phasing typically results in extreme
right-tails in these distributions, making regular visualization
difficult to plot meaningfully. The plot presented below has **log-scaled lengths**
right-tails in these distributions, therefore the plot presented below has **log-scaled lengths**
to collapse the right tails for better readability.
The table below provides details on the outcome of haplotype phasing on a per-contig basis.

## Per contig data
### per contig plot {.no-title}
```{r warning=FALSE, message=FALSE, echo= FALSE, fig.height=ridgeheight, fig.width=8}
ggplot(df, aes(x = block_length, y = contig, fill = after_stat(x))) +
geom_density_ridges_gradient(scale = 1.4) +
scale_fill_viridis_c(option = "C", trans = "log10") +
theme_minimal() +
theme(legend.position = "none") +
scale_x_log10(
breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x))
) +
labs(title = "Distribution of Haplotype Lengths by Contig", fill = "Haplotype Length (bp)") +
xlab("Haplotype Length (log scale)") +
ylab("")
```

### stats per contig {.no-title}
```{r echo= FALSE, message = FALSE, warning = FALSE}
```{r echo= FALSE, message = FALSE, warning = FALSE, out.width="100%"}
percontig <- df %>% group_by(contig) %>% summarise(
n_haplo = round(length(n_snp)),
mean_snps = round(mean(n_snp), digits = 0),
Expand All @@ -202,31 +200,9 @@ DT::datatable(
extensions = 'Buttons',
options = list(dom = 'Brtip', buttons = c('csv'), scrollX = TRUE, pageLength = length(levels(df$contig))),
colnames = c("Contig", "Total Haplotypes", "Mean SNPs", "Median SNPs", "Mean Haplotype Length", "Median Haplotype Length", "Largest Haplotype"),
autoHideNavigation = T,
fillContainer = T
fillContainer = F
)
```
### per contig plot {.no-title}
```{r warning=FALSE, message=FALSE, echo= FALSE, fig.height=pltheight, out.width="100%", fig.width=8}
ggplot(df, aes(block_length, after_stat(count))) +
geom_density(color = "#af9caf", fill = "#DBC3DB") +
theme_light() +
theme(
panel.grid.minor.y = element_blank(),
panel.grid.minor.x = element_blank(),
strip.background =element_rect(fill="grey90", color = "grey70"),
strip.text = element_text(colour = 'grey40')
) +
scale_y_continuous(labels = label_comma()) +
scale_x_log10(
breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x))
) +
facet_wrap(~contig, ncol = 3, scales = "free") +
xlab("Haplotype Length (log scale)") +
ylab("Count") +
labs(title = "Distribution of Haplotype Lengths Per Contig")
```

# Per-Sample Stats
## per sample desc
Expand All @@ -242,8 +218,24 @@ visualization difficult to plot meaningfully. The plot presented below has
**log-scaled lengths** to collapse the right tails for better readability.

## per sample
### the ridgeplot {.no-title}
```{r warning=FALSE, message=FALSE, echo= FALSE, fig.height=ridgeheight.samples, fig.width=8}
ggplot(df, aes(x = block_length, y = sample, fill = after_stat(x))) +
geom_density_ridges_gradient(scale = 1.4) +
scale_fill_viridis_c(option = "G", trans = "log10") +
theme_minimal() +
theme(legend.position = "none") +
scale_x_log10(
breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x))
) +
labs(title = "Distribution of Haplotype Lengths by Sample", fill = "Haplotype Length (bp)") +
xlab("Haplotype Length (log scale)") +
ylab("")
```

### stats per sample {.no-title}
```{r echo= FALSE, message = FALSE, warning = FALSE}
```{r echo= FALSE, message = FALSE, warning = FALSE, out.width="100%"}
persample <- df %>% group_by(sample) %>% summarise(
n_haplo = sum(length(n_snp)),
mean_snps = round(mean(n_snp), digits = 0),
Expand All @@ -259,23 +251,6 @@ DT::datatable(
extensions = 'Buttons',
options = list(dom = 'Brtip', buttons = c('csv'), scrollX = TRUE, pageLength = length(levels(df$sample))),
colnames = c("Sample", "Haplotypes", "Mean SNPs", "Median SNPs", "Mean Haplotype Length", "Median Haplotype Length", "Largest Haplotype"),
autoHideNavigation = T,
fillContainer = T,
fillContainer = F
)
```

### the ridgeplot {.no-title}
```{r warning=FALSE, message=FALSE, echo= FALSE, fig.height=ridgeheight, fig.width=8}
ggplot(df, aes(x = block_length, y = sample, fill = stat(x))) +
geom_density_ridges_gradient(scale = 1.4) +
scale_fill_viridis_c(option = "G", trans = "log10") +
theme_minimal() +
theme(legend.position = "none") +
scale_x_log10(
breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x))
) +
labs(title = "Distribution of Haplotype Lengths by Sample", fill = "Haplotype Length (bp)") +
xlab("Haplotype Length (log scale)") +
ylab("")
```
```

0 comments on commit 2b9072d

Please sign in to comment.