exogenous-rna-profiles.qmd

---
title: "Adamson smallRNA - pegRNA"
author: "Lance Parsons"
format:
  html:
    toc: true
    code-fold: true
    df-print: paged
    embed-resources: true
editor: source
---

# Load libraries

This project uses [`renv`](https://rstudio.github.io/renv/articles/renv.html)
to keep track of installed packages. Install `renv` if not installed and load
dependencies with `renv::restore()`.

```r
install.packages("renv")
renv::restore()
```

```{r}
#| label: load-packages
#| include: false
#| message: false
library(readr)
library(dplyr)
library(tidyr)
library(reshape2)
library(GenomicRanges)
library(pheatmap)
library(tibble)
library(ggplot2)
library(stringr)
library(cowplot)
library(markdown)
library(RColorBrewer)
library(GenomicAlignments)
library(reshape2)
```

# Read data

1. Get list of samples

```{r}
samples <- read_tsv("config/samples.tsv", show_col_types = FALSE)
units <- read_tsv("config/units.tsv", show_col_types = FALSE)
sample_units <- dplyr::left_join(samples, units, by = "sample_name") %>%
  unite(sample_unit, sample_name, unit_name, remove = FALSE)
sample_units
```

2. Read Samtools `idxstats` to get human coverage for normalization

Notes:

* The counts include the total number of reads aligned, they 
  are not limited to uniquely aligned reads.
* The counts are reads, not pairs or fragments

```{r}
idxstats_exogenousrna_dir <-
  "results/samtools_idxstats/exogenous_rna/"

idxstats_human_dir <-
  "results/samtools_idxstats/Homo_sapiens.GRCh38.dna.primary_assembly/"

bowtie2_human_logs <-
  "results/logs/bowtie2/Homo_sapiens.GRCh38.dna.primary_assembly/"

idxstats <- tibble()

for (row in seq_len(nrow(sample_units))) {
  sample <- sample_units[row, ]$sample_unit

  # Read `idsxstats` for exogenous mapped reads
  exogenous_rna_stats <- read_tsv(
    file.path(idxstats_exogenousrna_dir, sprintf("%s.bam.idxstats", sample)),
    col_names = c(
      "sequence_name", "sequence_length",
      "mapped_reads", "unmapped_reads"
    ),
    col_types = "ciii"
  )
  exogenous_rna_mapped_reads <- exogenous_rna_stats %>%
    filter(!sequence_name %in% c("*")) %>%
    select(sequence_name, mapped_reads) %>%
    mutate(sample = sample)

  # Read `idxstats` for human mapped reads
  human_stats <- read_tsv(
    file.path(idxstats_human_dir, sprintf("%s.bam.idxstats", sample)),
    col_names = c(
      "sequence_name", "sequence_length",
      "mapped_reads", "unmapped_reads"
    ),
    col_types = "ciii"
  )
  grch38_mapped_reads <- human_stats %>%
    filter(!sequence_name %in% c("*")) %>%
    select(mapped_reads) %>%
    sum()
  grch38_mapped_reads <- tibble(
    sequence_name = "grch38_mapped_reads",
    mapped_reads = grch38_mapped_reads,
    sample = sample
  )

  # Read bowtie2 logs for unmapped reads
  bowtie2_log <- readLines(
    file.path(bowtie2_human_logs, sprintf("%s.log", sample))
  )
  total_pairs <- strtoi(str_split(bowtie2_log[1], " ")[[1]][1])
  total_reads <- total_pairs * 2
  unmapped_reads <- tibble(
    sequence_name = "unmapped",
    mapped_reads = total_reads - grch38_mapped_reads$mapped_reads,
    sample = sample
  )

  # Consolidate counts for rows
  idxstats <- rbind(
    idxstats,
    exogenous_rna_mapped_reads,
    grch38_mapped_reads,
    unmapped_reads
  )
}
idxstats
```

# Coverage

## Concordant vs Discordant paired reads

Concordant pairs are pairs of reads that:

* Align on the same pegRNA
* Align within 500 bp of each other
* Align in the expected forward-reverse orientation (`--> .. <--`)

Discordant reads aligned but whose mate:

* Did not align (on the pegRNA)
* Aligned more than 500 bp away
* Aligned in an unexpected orientation

```{r fig.width=10, fig.height=10}
## Config and function definition

bam_dir <- "results/alignments/exogenous_rna/sorted"

last_day <- 0
cols <- brewer.pal(n = 5, name = "RdBu")

concordant_cell_line_colors <- list(
  "Parental" = "#CA0020",
  "P1E10" = "#0571B0"
)

discordant_cell_line_colors <- list(
  "Parental" = "#F4A582",
  "P1E10" = "#92C5DE"
)

# Exogenous RNA mixtures
rna_mixes <- tibble()
for (mix in c("mastermix1", "mastermix2", "PJY103", "PJY300")) {
  t <- readDNAStringSet(sprintf("data/references/%s.fa", mix))
  rna_mixes <- rbind(rna_mixes, tibble(
    exogenous_rna = mix,
    rna_species = word(t@ranges@NAMES, 1),
    start = 1,
    end = t@ranges@width
  ))
}
rna_mixes <- rna_mixes %>%
  rows_update(tibble(exogenous_rna = "PJY103", start = 331, end = 460),
    by = "exogenous_rna"
  ) %>%
  rows_update(tibble(exogenous_rna = "PJY300", start = 331, end = 497),
    by = "exogenous_rna"
  )
rna_mixes

source("pegrna_plots.R")
```

## PJY103 / PJY300 (Batch 2)

```{r fig.width=20, fig.height=10}
#| warning: false
sample_list <- rna_mixes %>% filter(grepl("PJY103|PJY300", rna_species))

for (start_offset in c(NA, 5)) {
  for (norm_factor in c("exogenous_rna_mapped_reads", "grch38_mapped_reads")) {
    for (i in seq_len(nrow(sample_list))) {
      rna_species <- sample_list[[i, "rna_species"]]
      mix <- sample_list[[i, "exogenous_rna"]]

      pegrna_plots(
        sequence_name = rna_species,
        normalization_factor = norm_factor,
        mix = mix,
        ylab = sprintf("%s coverage (normalized to %s)", mix, norm_factor),
        vlines = seq(sample_list[i, ]$start, sample_list[i, ]$end, 2),
        start_offset = start_offset
      )
    }
  }
}
```

## VEGFA

```{r fig.width=20, fig.height=10}
sample_list <- rna_mixes %>% filter(grepl("VEGFA", rna_species))

for (start_offset in c(NA, 5)) {
  for (norm_factor in c("exogenous_rna_mapped_reads", "grch38_mapped_reads")) {
    for (i in seq_len(nrow(sample_list))) {
      rna_species <- sample_list[[i, "rna_species"]]
      mix <- sample_list[[i, "exogenous_rna"]]

      pegrna_plots(
        sequence_name = rna_species,
        normalization_factor = norm_factor,
        mix = mix,
        ylab = sprintf("%s coverage (normalized to %s)", mix, norm_factor),
        vlines = seq(sample_list[i, ]$start, sample_list[i, ]$end, 2),
        start_offset = start_offset
      )
    }
  }
}
```

## FANCF

```{r fig.width=20, fig.height=10}
sample_list <- rna_mixes %>% filter(grepl("FANCF", rna_species))

for (start_offset in c(NA, 5)) {
  for (norm_factor in c("exogenous_rna_mapped_reads", "grch38_mapped_reads")) {
    for (i in seq_len(nrow(sample_list))) {
      rna_species <- sample_list[i, ]$rna_species
      pegrna_plots(
        sequence_name = rna_species,
        normalization_factor = norm_factor,
        ylab = sprintf("Coverage (normalized to %s)", norm_factor),
        vlines = seq(sample_list[i, ]$start, sample_list[i, ]$end, 2),
        start_offset = start_offset
      )
    }
  }
}
```

## HEK3

```{r fig.width=20, fig.height=10}
sample_list <- rna_mixes %>% filter(grepl("HEK3", rna_species))

for (start_offset in c(NA, 5)) {
  for (norm_factor in c("exogenous_rna_mapped_reads", "grch38_mapped_reads")) {
    for (i in seq_len(nrow(sample_list))) {
      rna_species <- sample_list[i, ]$rna_species
      pegrna_plots(
        sequence_name = rna_species,
        normalization_factor = norm_factor,
        ylab = sprintf("Coverage (normalized to %s)", norm_factor),
        vlines = seq(sample_list[i, ]$start, sample_list[i, ]$end, 2),
        start_offset = start_offset
      )
    }
  }
}
```


## DNMT1

```{r fig.width=20, fig.height=10}
sample_list <- rna_mixes %>% filter(grepl("DNMT1", rna_species))

for (start_offset in c(NA, 5)) {
  for (norm_factor in c("exogenous_rna_mapped_reads", "grch38_mapped_reads")) {
    for (i in seq_len(nrow(sample_list))) {
      rna_species <- sample_list[i, ]$rna_species
      pegrna_plots(
        sequence_name = rna_species,
        normalization_factor = norm_factor,
        ylab = sprintf("Coverage (normalized to %s)", norm_factor),
        vlines = seq(sample_list[i, ]$start, sample_list[i, ]$end, 2),
        start_offset = start_offset
      )
    }
  }
}
```


## RUNX1

```{r fig.width=20, fig.height=10}
sample_list <- rna_mixes %>% filter(grepl("RUNX1", rna_species))

for (start_offset in c(NA, 5)) {
  for (norm_factor in c("exogenous_rna_mapped_reads", "grch38_mapped_reads")) {
    for (i in seq_len(nrow(sample_list))) {
      rna_species <- sample_list[i, ]$rna_species
      pegrna_plots(
        sequence_name = rna_species,
        normalization_factor = norm_factor,
        ylab = sprintf("Coverage (normalized to %s)", norm_factor),
        vlines = seq(sample_list[i, ]$start, sample_list[i, ]$end, 2),
        start_offset = start_offset
      )
    }
  }
}
```

## EMX1

```{r fig.width=20, fig.height=10}
sample_list <- rna_mixes %>% filter(grepl("EMX1", rna_species))

for (start_offset in c(NA, 5)) {
  for (norm_factor in c("exogenous_rna_mapped_reads", "grch38_mapped_reads")) {
    for (i in seq_len(nrow(sample_list))) {
      rna_species <- sample_list[i, ]$rna_species
      pegrna_plots(
        sequence_name = rna_species,
        normalization_factor = norm_factor,
        ylab = sprintf("Coverage (normalized to %s)", norm_factor),
        vlines = seq(sample_list[i, ]$start, sample_list[i, ]$end, 2),
        start_offset = start_offset
      )
    }
  }
}
```

## RNF2

```{r fig.width=20, fig.height=10}
sample_list <- rna_mixes %>% filter(grepl("RNF2", rna_species))

for (start_offset in c(NA, 5)) {
  for (norm_factor in c("exogenous_rna_mapped_reads", "grch38_mapped_reads")) {
    for (i in seq_len(nrow(sample_list))) {
      rna_species <- sample_list[i, ]$rna_species
      pegrna_plots(
        sequence_name = rna_species,
        normalization_factor = norm_factor,
        ylab = sprintf("Coverage (normalized to %s)", norm_factor),
        vlines = seq(sample_list[i, ]$start, sample_list[i, ]$end, 2),
        start_offset = start_offset
      )
    }
  }
}
```

# Strandedness

```{r}
#| warning: false
source("pegrna_alignment_strandedness.R")
for (rna_species in rna_mixes %>%
  select(rna_species) %>%
  unique() %>%
  pull()) {
  stranded_counts <- pegrna_alignment_strandedness(sequence_name = rna_species)
  p <- ggplot(
    data = stranded_counts,
    aes(x = sample_unit, y = count, fill = strand)
  ) +
    geom_bar(stat = "identity") +
    labs(title = rna_species) +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
  print(p)
}
```