HBCC_DLPFC_FEATURECOUNTS_GRCh38_CQN.Rmd

---
title: "HBCC - DLPFC - FEATURECOUNTS - GRCh38 - CQN"
author: "Kelsey Montgomery, Gabriel Hoffman, Thanneer Perumal"
date: "`r date()`"
output: html_document
---

```{r knit2synapse, eval=FALSE}
# Knit this document and publish the rendered report to a Synapse folder.
# The chunk is declared with eval=FALSE, so it is not run while knitting.
library(synapser)
library(knit2synapse)

synLogin()

knit2synapse::createAndKnitToFolderEntity(
  file = "HBCC_DLPFC_FEATURECOUNTS_GRCh38_CQN.Rmd",
  parentId = "syn21435176",
  folderName = "HBCC - DLPFC - DREAM - FEATURECOUNTS - GRCh38 - CQN"
)
```


```{r libs, echo=FALSE, warning=FALSE, message=FALSE, include=FALSE}
## It is assumed your working directory is where this file is located

## Load required libraries
library(CovariateAnalysis) # get the package from devtools::install_github('th1vairam/CovariateAnalysis@dev')
library(data.table)
library(tidyr)
library(plyr)
library(dplyr)
library(stringr)
library(mvBIC)

library(ggplot2)
library(limma)
library(psych)
library(edgeR)
library(cqn)
library(lme4)
library(biomaRt)
library(ComplexHeatmap)
library(statmod)

library(variancePartition)
library(synapser)
library(knitr)
library(githubr) # get the package from devtools::install_github('brian-bot/githubr')

synLogin()

library(doParallel)
library(foreach)
library(future)

# Start a cluster and register it as the foreach parallel back end.
# Half of the cores that remain after reserving two are requested. The value
# is floored and clamped to at least one worker because the raw expression
# can be fractional or below 1 on machines with three or fewer cores, and
# makeCluster() needs a worker count of at least 1.
nWorkers <- max(1L, as.integer(floor((availableCores() - 2) / 2)))
cl = makeCluster(nWorkers)

registerDoParallel(cl)

options(xtable.type="html")

# Chunk defaults: hide code, warnings and messages; stop on errors; cache
# chunk results unless a chunk overrides it
knitr::opts_chunk$set(
  echo=FALSE,
  warning=FALSE,
  message=FALSE,
  error = FALSE,
  tidy = FALSE,
  cache = TRUE)
```

```{r synapse.parameters, include=FALSE, cache=TRUE}
# Synapse parent folder plus the activity name/description passed to synStore()
parentId <- "syn21435176"
activityName <- "Covariate adjustments"
activityDescription <- "Covariate analysis of GRCh38 feature counts with CQN normalisation (DLPFC)"

# Path of this document inside the GitHub repository
thisFileName <- "HBCC_DLPFC_FEATURECOUNTS_GRCh38_CQN.Rmd"

# Look up the repository at the master branch and build a permalink to this file
thisRepo <- getRepo(
  repository = "CommonMindConsortium/covarr-de",
  ref = "branch",
  refName = "master"
)
thisFile <- getPermlink(
  repository = thisRepo,
  repositoryPath = thisFileName
)
```
### Data download
#### Obtain effective count matrix and metadata from synapse.
```{r download.data, cache=TRUE}
# Fetch a Synapse entity and read the downloaded file with fread().
#   id: Synapse id of the entity to fetch.
# Returns the file contents as a plain data.frame (data.table = FALSE).
downloadFile <- function(id) {
  fread(synGet(id)$path, data.table = FALSE)
}

# Same as downloadFile(), but fetches one specific version of the entity.
#   id:      Synapse id of the entity to fetch.
#   version: version passed on to synGet().
# Returns the file contents as a plain data.frame (data.table = FALSE).
downloadFile_version <- function(id, version) {
  fread(synGet(id, version = version)$path, data.table = FALSE)
}

# RNA-seq QC metadata, pinned to version 19 and restricted to DLPFC rows.
# ALL_USED_IDs collects every Synapse id that is read in this chunk.
METADATA_QC_DLPFC_ID <- "syn16816488"
ALL_USED_IDs <- METADATA_QC_DLPFC_ID
metadata <- downloadFile_version(METADATA_QC_DLPFC_ID, version = 19) %>%
  dplyr::select(
    Individual_ID, Institution, Cohort, Reported_Gender, Sex, Ethnicity,
    ageOfDeath, `PMI_(in_hours)`, Dx, Sample_RNA_ID,
    one_of(
      "rnaSeq_isolation:RIN",
      "rnaSeq_dissection:Brain_Region",
      "rnaSeq_report:Ribozero_Batch",
      "rnaSeq_report:Library_Batch",
      "rnaSeq_report:Flowcell_Batch",
      "rnaSeq_report:Exclude?",
      "rnaSeq_report:Mapped_Reads",
      "rnaSeq_report:Intragenic_Rate",
      "rnaSeq_report:Intronic_Rate",
      "rnaSeq_report:Intergenic_Rate",
      "rnaSeq_report:Genes_Detected",
      "rnaSeq_report:Expression_Profiling_Efficiency",
      "rnaSeq_report:rRNA_Rate",
      "rnaSeq_report:Total_Reads",
      "rnaSeq_report:Percent_Aligned",
      "rnaSeq_report:Transcripts_Detected"
    )
  ) %>%
  dplyr::filter(`rnaSeq_dissection:Brain_Region` %in% "DLPFC")

## Reprocessed counts (DLPFC); the genomic coordinate columns are dropped
counts <- "syn21754352"
ALL_USED_IDs <- c(ALL_USED_IDs, counts)
COUNT <- data.frame(downloadFile(counts))
for (coordinateColumn in c("Chr", "Start", "End", "Strand")) {
  COUNT[[coordinateColumn]] <- NULL
}

# Keep gene lengths in their own table, keyed by gene_id, then drop the
# Length column from the count table
gene_lengths <- dplyr::rename(COUNT[, c("Geneid", "Length")], gene_id = Geneid)
COUNT$Length <- NULL

# Ancestry vectors calculated using gemtools; the ID column is renamed so it
# shares a name with the genotype sample id column
ANCESTRY_ID <- "syn9922992"
ALL_USED_IDs <- c(ALL_USED_IDs, ANCESTRY_ID)
ANCESTRY <- plyr::rename(
  downloadFile(ANCESTRY_ID),
  c(ID = "SNP_report:Genotyping_Sample_ID")
)

# Genotype ids joined to ancestry; only rows whose exclude flag is NA are kept
GENOTYPE_ID <- "syn16816490"
ALL_USED_IDs <- c(ALL_USED_IDs, GENOTYPE_ID)
GENOTYPE <- downloadFile(GENOTYPE_ID) %>%
  dplyr::select(
    Individual_ID,
    `SNP_report:Genotyping_Sample_ID`,
    `SNP_report:Exclude?`
  ) %>%
  dplyr::inner_join(ANCESTRY) %>%
  dplyr::filter(is.na(`SNP_report:Exclude?`))
```

### Data preprocessing
```{r preprocess.data}
# Join RNA-seq metadata with genotype/ancestry data, shorten the column
# names, and keep NIMH-HBCC samples that have a column in COUNT. Dx levels
# "BP" and "undetermined" are collapsed into "Other".
METADATA <- metadata %>%
  dplyr::left_join(GENOTYPE) %>%
  dplyr::rename(
    Region = `rnaSeq_dissection:Brain_Region`,
    PMI = `PMI_(in_hours)`,
    RIN = `rnaSeq_isolation:RIN`,
    ReportExclude = `rnaSeq_report:Exclude?`,
    GenotypeExclude = `SNP_report:Exclude?`,
    SampleID = Sample_RNA_ID,
    LibraryBatch = `rnaSeq_report:Library_Batch`,
    FlowcellBatch = `rnaSeq_report:Flowcell_Batch`,
    RibozeroBatch = `rnaSeq_report:Ribozero_Batch`,
    MappedReads = `rnaSeq_report:Mapped_Reads`,
    IntragenicRate = `rnaSeq_report:Intragenic_Rate`,
    IntronicRate = `rnaSeq_report:Intronic_Rate`,
    IntergenicRate = `rnaSeq_report:Intergenic_Rate`,
    GenesDetected = `rnaSeq_report:Genes_Detected`,
    ExpProfEfficiency = `rnaSeq_report:Expression_Profiling_Efficiency`,
    rRNARate = `rnaSeq_report:rRNA_Rate`,
    TotalReads = `rnaSeq_report:Total_Reads`,
    AlignmentRate = `rnaSeq_report:Percent_Aligned`,
    TranscriptsDetected = `rnaSeq_report:Transcripts_Detected`
  ) %>%
  dplyr::select(
    SampleID, Individual_ID, Institution, Cohort, Reported_Gender, Sex,
    Ethnicity, ageOfDeath, PMI, Dx,
    RIN, ReportExclude, GenotypeExclude, EV.1, EV.2, EV.3, EV.4, EV.5,
    LibraryBatch, FlowcellBatch, RibozeroBatch, MappedReads, TotalReads,
    GenesDetected, AlignmentRate,
    IntragenicRate, IntergenicRate, IntronicRate, ExpProfEfficiency,
    rRNARate, TranscriptsDetected
  ) %>%
  dplyr::filter(
    Cohort %in% "NIMH-HBCC",
    SampleID %in% colnames(COUNT),
    !is.na(SampleID)
  ) %>%
  dplyr::mutate(
    Dx = forcats::fct_recode(
      Dx,
      Other = "BP", Other = "undetermined", Control = "Control", SCZ = "SCZ"
    )
  )

# Each step below collects the offending sample ids in `ind`, reports them,
# and removes them from METADATA before the next check runs.

# Samples flagged in the RNA-seq report or the genotype report
ind <- with(METADATA, SampleID[which(ReportExclude == 1 | GenotypeExclude)])
writeLines(paste('Following', length(ind), 'samples are marked exclude'))
writeLines(paste(ind, collapse = ', '))
METADATA <- dplyr::filter(METADATA, !(SampleID %in% ind))

# Samples without ethnicity, institution or diagnosis
ind <- with(METADATA, SampleID[is.na(Ethnicity) | is.na(Institution) | is.na(Dx)])
writeLines(paste('Following', length(ind), 'counts are missing any metadata'))
writeLines(paste(ind, collapse = ', '))
METADATA <- dplyr::filter(METADATA, !(SampleID %in% ind))

# Samples without post-mortem interval
ind <- with(METADATA, SampleID[is.na(PMI)])
writeLines(paste('Following', length(ind), 'counts are missing PMI'))
writeLines(paste(ind, collapse = ', '))
METADATA <- dplyr::filter(METADATA, !(SampleID %in% ind))

# Samples without reported gender
ind <- with(METADATA, SampleID[is.na(Reported_Gender)])
writeLines(paste('Following', length(ind), 'counts are missing gender'))
writeLines(paste(ind, collapse = ', '))
METADATA <- dplyr::filter(METADATA, !(SampleID %in% ind))

# Samples without age of death
ind <- with(METADATA, SampleID[is.na(ageOfDeath)])
writeLines(paste('Following', length(ind), 'counts are missing age of death'))
writeLines(paste(ind, collapse = ', '))
METADATA <- dplyr::filter(METADATA, !(SampleID %in% ind))

# Samples without the first ancestry component
ind <- with(METADATA, SampleID[is.na(EV.1)])
writeLines(paste('Following', length(ind), 'counts are missing ancestry information'))
writeLines(paste(ind, collapse = ', '))
METADATA <- dplyr::filter(METADATA, !(SampleID %in% ind))
```

```{r preprocess.data1, results='asis'}
# Samples present in both the metadata and the count matrix
indToRetain <- intersect(METADATA$SampleID, colnames(COUNT))

# Index counts by gene id and keep the retained samples, in that order
rownames(COUNT) <- COUNT$Geneid
COUNT <- COUNT[, indToRetain]

# Index metadata by sample id and put the rows in the same order as COUNT
rownames(METADATA) <- METADATA$SampleID
METADATA <- METADATA[indToRetain, ]

# Table with the number of samples per diagnosis
kable(dplyr::summarise(group_by(METADATA, Dx), count = n()))
```


```{r gene.param, include = F}
## Gene annotation (GC content, biotype, chromosome) from biomart
# Split each gene id at the dot so the part before it can be used as the
# Ensembl gene id in the biomart query; the remainder goes into `position`
backgroundGenes <- data.frame(gene_id = rownames(COUNT)) %>%
  dplyr::mutate(id = gene_id) %>%
  tidyr::separate(id, into = c("ensembl_gene_id", "position"), sep = "\\.")

# Biomart connection to the human gene dataset.
# NOTE(review): the original comment tags this host as Ensembl Release 99
# (January 2020), but the host name is not a dated archive address - confirm
# which release it actually serves.
mart <- useMart(
  biomart = "ENSEMBL_MART_ENSEMBL",
  host = "uswest.ensembl.org",
  dataset = "hsapiens_gene_ensembl"
)

# Annotation for every background gene
Ensemble2HGNC <- getBM(
  attributes = c(
    "ensembl_gene_id",
    "hgnc_symbol",
    "percentage_gene_gc_content",
    "gene_biotype",
    "chromosome_name"
  ),
  filters = "ensembl_gene_id",
  values = backgroundGenes$ensembl_gene_id,
  mart = mart
)

# One row per gene_id with GC content and chromosome, indexed by gene_id
GENE.GC.CONT <- Ensemble2HGNC %>%
  dplyr::left_join(backgroundGenes) %>%
  dplyr::select(gene_id, percentage_gene_gc_content, chromosome_name) %>%
  unique()
rownames(GENE.GC.CONT) <- GENE.GC.CONT$gene_id
```
### Sex Chromosome-specific Gene Expression Patterns

```{r Reported_Gender_plots}
# Expose gene ids as a column so counts can be joined to the annotation
COUNT$gene_id <- rownames(COUNT)

# Long table of log counts for genes on chromosomes X and Y, one row per
# gene and sample, with each sample's reported gender attached
REPORTED.GENDER.COUNTS <- GENE.GC.CONT %>%
  left_join(COUNT) %>%
  dplyr::select(-one_of("percentage_gene_gc_content")) %>%
  filter(chromosome_name %in% c("X", "Y")) %>%
  tidyr::gather(key = item, value = value, -c(gene_id, chromosome_name)) %>%
  dplyr::mutate(value = log(value)) %>%
  dplyr::rename(`counts(log)` = value, SampleID = item) %>%
  left_join(METADATA[, c("SampleID", "Reported_Gender")]) %>%
  dplyr::rename(`Reported Gender` = Reported_Gender)

my.theme <- theme_bw() %+replace%
  theme(
    legend.position = 'top',
    axis.text.x = element_text(angle = 90, hjust = 1),
    plot.title = element_text(hjust = 0.5)
  )

# Box plots of log counts by reported gender, one panel per sex chromosome
p <- list(
  ggplot(
    filter(REPORTED.GENDER.COUNTS, chromosome_name == "X"),
    aes(x = `Reported Gender`, y = `counts(log)`)
  ) + geom_boxplot() + ggtitle('X') + my.theme,
  ggplot(
    filter(REPORTED.GENDER.COUNTS, chromosome_name == "Y"),
    aes(x = `Reported Gender`, y = `counts(log)`)
  ) + geom_boxplot() + ggtitle('Y') + my.theme
)
multiplot(plotlist = p, cols = 2)

## XIST (ENSG00000229807.11) against UTY (ENSG00000183878.15)
# One row per sample with both genes as columns; log(0) = -Inf is reset to 0
FILT <- REPORTED.GENDER.COUNTS %>%
  filter(gene_id %in% c("ENSG00000229807.11", "ENSG00000183878.15")) %>%
  dplyr::select(-one_of("chromosome_name")) %>%
  tidyr::spread(key = gene_id, value = `counts(log)`) %>%
  mutate(
    XIST = as.numeric(`ENSG00000229807.11`),
    UTY = as.numeric(`ENSG00000183878.15`),
    UTY = ifelse(UTY == -Inf, 0, UTY),
    XIST = ifelse(XIST == -Inf, 0, XIST)
  )

p <- ggplot(FILT, aes(x = XIST, y = UTY)) +
  geom_point(aes(color = `Reported Gender`)) +
  ggtitle("Sex Check HBCC Cohort") +
  theme(plot.title = element_text(hjust = 0.5, size = 15)) +
  labs(colour = "Reported Gender")
p
```


### Covariates clustering
Determine relationship between covariates. 
```{r covariates.clustering}
# Covariates treated as categorical and as continuous
FactorCovariates <- c('Individual_ID', "Reported_Gender", "LibraryBatch", "Dx", "FlowcellBatch", "RibozeroBatch")
ContCovariates <- c("ageOfDeath", "PMI", "RIN", "MappedReads", "IntragenicRate", "IntronicRate", "IntergenicRate",
                    "GenesDetected", "ExpProfEfficiency", "rRNARate", "TotalReads", "AlignmentRate",
                    "EV.1", "EV.2", "EV.3", "EV.4", "EV.5")

# Covariate table, one row per sample, indexed by sample id
COVARIATES <- METADATA[, c(FactorCovariates, ContCovariates), drop = FALSE]
rownames(COVARIATES) <- METADATA$SampleID

# Factor covariates: replace every non-alphanumeric character with "_" and
# convert to factor. str_replace_all() is vectorised, so each column is
# cleaned in a single call instead of element by element.
COVARIATES[, FactorCovariates] <- lapply(COVARIATES[, FactorCovariates], function(x) {
  factor(str_replace_all(as.character(x), '[^[:alnum:]]', '_'))
})

# Continuous covariates: strip "," and "%" before converting to numeric
COVARIATES[, ContCovariates] <- lapply(COVARIATES[, ContCovariates], function(x) {
  as.numeric(gsub('[\\,\\%]', '', x))
})

# Add in RIN^2 values
COVARIATES$RIN2 <- COVARIATES$RIN^2
ContCovariates <- c(ContCovariates, 'RIN2')
```
Correlation/association between covariates; estimates with a p-value above 0.05 are shown as 0 in the heatmap
```{r covariates.correlation, fig.width=10, fig.height=8}
# Pairwise association statistics between all covariates
COVARIATES.CORRELATION <- getAssociationStatistics(COVARIATES, PVAL = 0.05)

# Zero out estimates whose p-value is above 0.05 so they plot as white
tmp <- replace(
  COVARIATES.CORRELATION$ESTIMATE,
  COVARIATES.CORRELATION$PVAL > 0.05,
  0
)

# Heatmap of the remaining estimates on a blue-white-red scale over [-1, 1]
h <- Heatmap(
  tmp,
  col = colorRamp2(c(-1, 0, 1), c('blue', 'white', 'red')),
  name = 'AssocEstimate'
)
ComplexHeatmap::draw(h, heatmap_legend_side = 'left')
```

## Explore metadata
```{r data.explore, fig.width = 12, fig.height = 12}
my.theme <- theme_bw() %+replace%
  theme(
    legend.position = 'top',
    axis.text.x = element_text(angle = 90, hjust = 1),
    plot.title = element_text(hjust = 0.5)
  )

# One box plot per covariate, each split by diagnosis
p <- list(
  # RIN
  ggplot(COVARIATES, aes(x = Dx, y = RIN)) +
    geom_boxplot() + ggtitle('RIN') + my.theme,
  # Age of death
  ggplot(COVARIATES, aes(x = Dx, y = ageOfDeath)) +
    geom_boxplot() + ggtitle('AgeOfDeath') + my.theme,
  # PMI
  ggplot(COVARIATES, aes(x = Dx, y = PMI)) +
    geom_boxplot() + ggtitle('PMI (in hours)') + my.theme,
  # Intronic rate
  ggplot(COVARIATES, aes(x = Dx, y = IntronicRate)) +
    geom_boxplot() + ggtitle('Intronic Rate') + my.theme,
  # Intergenic rate
  ggplot(COVARIATES, aes(x = Dx, y = IntergenicRate)) +
    geom_boxplot() + ggtitle('Intergenic Rate') + my.theme,
  # Intragenic rate
  ggplot(COVARIATES, aes(x = Dx, y = IntragenicRate)) +
    geom_boxplot() + ggtitle('Intragenic Rate') + my.theme,
  # Mapped reads
  ggplot(COVARIATES, aes(x = Dx, y = MappedReads)) +
    geom_boxplot() + ggtitle('Mapped Reads') + my.theme,
  # Total reads
  ggplot(COVARIATES, aes(x = Dx, y = TotalReads)) +
    geom_boxplot() + ggtitle('Total Reads') + my.theme,
  # rRNA rate
  ggplot(COVARIATES, aes(x = Dx, y = rRNARate)) +
    geom_boxplot() + ggtitle('rRNARate') + my.theme
)

# Arrange the nine plots in a three-column grid
multiplot(plotlist = p, cols = 3)

# Institution
# vcd::mosaic(~ Institution + Dx, data = COVARIATES)

# Gender
# vcd::mosaic(~ Reported_Gender + Dx, data = COVARIATES)
```
### Filter genes
Remove genes that have fewer than 1 cpm in at least 50% of the samples of every Dx and reported gender group. Also remove genes with missing gene length or percentage GC content.


```{r cpmnormalisation}
# Move gene ids from the helper column back to the row names only
rownames(COUNT) <- COUNT$gene_id
COUNT$gene_id <- NULL

# Run the CPM filter separately within every Dx x Reported_Gender group and
# collect the genes that pass in each group
genesToAnalyze <- dlply(METADATA, .(Dx, Reported_Gender), .fun = function(mtd, count) {
  processed.counts <- getGeneFilteredGeneExprMatrix(count[, mtd$SampleID],
                                                    MIN_GENE_CPM = 1,
                                                    MIN_SAMPLE_PERCENT_WITH_MIN_GENE_CPM = 0.5)
  processed.counts$filteredExprMatrix$genes
}, COUNT)

# Union over groups, restricted to genes with known GC content and length
genesToAnalyze <- unlist(genesToAnalyze) %>%
  unique() %>%
  intersect(GENE.GC.CONT$gene_id[!is.na(GENE.GC.CONT$percentage_gene_gc_content)]) %>%
  intersect(gene_lengths$gene_id[!is.na(gene_lengths$Length)])

# Rebuild the expression object for the selected genes; the thresholds are 0
# so no further genes are filtered out here
PROCESSED_COUNTS <- getGeneFilteredGeneExprMatrix(COUNT[genesToAnalyze, ],
                                                  MIN_GENE_CPM = 0,
                                                  MIN_SAMPLE_PERCENT_WITH_MIN_GENE_CPM = 0)

# Fraction of analysed genes per biotype (biotypes with more than 100 genes).
# Ensemble2HGNC can hold several rows for one gene (for example more than one
# hgnc_symbol), so gene/biotype pairs are de-duplicated before counting;
# otherwise such genes would be counted more than once.
pct.pc <- PROCESSED_COUNTS$filteredExprMatrix$genes %>%
  dplyr::rename(gene_id = genes) %>%
  left_join(backgroundGenes) %>%
  left_join(Ensemble2HGNC) %>%
  dplyr::distinct(gene_id, gene_biotype) %>%
  group_by(gene_biotype) %>%
  dplyr::summarise(fraction = n()) %>%
  filter(fraction > 100) %>%
  dplyr::mutate(fraction = fraction / nrow(PROCESSED_COUNTS$filteredExprMatrix$genes))
```
`r dim(PROCESSED_COUNTS$filteredExprMatrix$genes)[1]` genes are used for the analysis. 

Following is the distribution of biotypes
`r kable(pct.pc)`

### Outlier analysis
Detect outlier samples based on their expression (logCPM) pattern
```{r detect.outliers, fig.height=8, fig.width=8, results='asis', cache = FALSE}
# voom-transformed expression of the filtered counts. It is computed once
# and reused for the PCA and for the long-format table further down.
voomLogCPM <- voom(PROCESSED_COUNTS$filteredExprMatrix$counts)$E

# Find principal components of expression to plot
PC <- prcomp(voomLogCPM, scale. = TRUE, center = TRUE)

# First two PCs, one row per sample (rotation is indexed by sample here)
plotdata <- data.frame(SampleID = rownames(PC$rotation),
                       PC1 = PC$rotation[, 1],
                       PC2 = PC$rotation[, 2])

# Fraction of variance carried by PC1 and PC2
eigen <- PC$sdev^2
pc1 <- eigen[1] / sum(eigen)
pc2 <- eigen[2] / sum(eigen)

# Identify outliers - samples more than 4 SDs from the mean on PC1 or PC2.
# For each PC the samples below the lower bound are listed before those
# above the upper bound; PC1 hits come before PC2 hits.
outliers <- unlist(lapply(c("PC1", "PC2"), function(pc) {
  score <- plotdata[[pc]]
  tooLow <- which(score < mean(score) - 4 * sd(score))
  tooHigh <- which(score > mean(score) + 4 * sd(score))
  as.character(plotdata$SampleID[c(tooLow, tooHigh)])
}))

# Attach covariates; only outlier samples get a text label in the plot
plotdata <- left_join(plotdata, rownameToFirstColumn(COVARIATES, "SampleID")) %>%
  dplyr::mutate(label = SampleID) %>%
  dplyr::mutate(label = ifelse((label %in% outliers), label, NA)) %>%
  dplyr::mutate(Institution = "NIMH-HBCC") %>%
  dplyr::mutate(Tissue = "DLPFC")

p <- ggplot(plotdata, aes(x = PC1, y = PC2))
p <- p + geom_point(aes(shape = Tissue, color = Institution, size = ageOfDeath))
p <- p + theme_bw() %+replace% theme(legend.position = "right")
p <- p + geom_text(aes(label = label), size = 4, hjust = 0)
p

# Long-format expression with covariates, for plotting aberrant
# distributions (the plot itself is currently commented out)
tmp1 <- voomLogCPM %>%
  rownameToFirstColumn('ensembl_gene_id') %>%
  tidyr::gather(SampleID, logCPM, -ensembl_gene_id) %>%
  left_join(COVARIATES %>%
              rownameToFirstColumn('SampleID'))
# p = ggplot(tmp1 %>%
#              dplyr::filter(SampleID %in% indToRemove),
#            aes(x = logCPM, color = SampleID)) + geom_density() 
# p = p + theme_bw() %+replace% theme(legend.position = 'none') + facet_grid(Dx~., scale = 'free')
# p

# Drop the outlier samples from the counts and from the covariates
indToRetain <- setdiff(colnames(PROCESSED_COUNTS$filteredExprMatrix$counts), outliers)
PROCESSED_COUNTS$filteredExprMatrix$counts <- PROCESSED_COUNTS$filteredExprMatrix$counts[, indToRetain]
COVARIATES <- COVARIATES[indToRetain, ]

# Number of remaining samples per diagnosis
tmp <- COVARIATES %>%
  group_by(Dx) %>%
  dplyr::summarise(count = n())

NEW.COUNTS <- PROCESSED_COUNTS$filteredExprMatrix$counts
```
Processing `r dim(PROCESSED_COUNTS$filteredExprMatrix)[1]` genes in `r dim(PROCESSED_COUNTS$filteredExprMatrix)[2]` samples from `r length(unique(COVARIATES$Individual_ID))` unique individuals

Based on the expression pattern following samples were tagged as outliers: `r paste(outliers, collapse = ', ')` 

Distribution of samples are:
`r kable(tmp)`

### Library Normalisation
Initial library normalisation using cqn
```{r cqn}
# Index gene lengths by gene id so they can be looked up by row name
gene_lengths <- as.data.frame(
  tibble::column_to_rownames(gene_lengths, var = 'gene_id')
)

# Ids of the genes kept for analysis
analysedGenes <- PROCESSED_COUNTS$filteredExprMatrix$genes$genes

# Run cqn with GC content as the covariate and a smooth gene length effect
CQN.GENE_EXPRESSION <- cqn(
  NEW.COUNTS,
  x = GENE.GC.CONT[analysedGenes, 'percentage_gene_gc_content'],
  lengths = gene_lengths[analysedGenes, 'Length'],
  lengthMethod = "smooth",
  verbose = FALSE
)

# Expression matrix used downstream: cqn's y plus its offset
CQN.GENE_EXPRESSION$E <- CQN.GENE_EXPRESSION$y + CQN.GENE_EXPRESSION$offset
```

### Co-expression distribution
Coexpression of genes 
```{r coexp1, cache=FALSE, fig.height=5, fig.width=5}
# Correlation between every pair of genes (genes become columns after t())
cr <- cor(t(CQN.GENE_EXPRESSION$E))

# Histogram of all entries of the gene-gene correlation matrix
hist(
  cr,
  main = 'Distribution of correlation between genes',
  xlab = 'Correlation'
)
```

### Sample clustering
```{r decompse.normalise.data, fig.height=8, fig.width=8, results='asis', cache=FALSE}
# PCA of the CQN-normalised expression matrix
PC <- prcomp(CQN.GENE_EXPRESSION$E, scale. = T, center = T)

# First two PCs per sample, joined to the covariates; Institution and Tissue
# are constant labels for this data set
plotdata <- data.frame(
  SampleID = rownames(PC$rotation),
  PC1 = PC$rotation[, 1],
  PC2 = PC$rotation[, 2]
) %>%
  left_join(rownameToFirstColumn(COVARIATES, 'SampleID')) %>%
  dplyr::mutate(Institution = "NIMH-HBCC") %>%
  dplyr::mutate(Tissue = "DLPFC")

# Scatter plot of PC1 against PC2 with point size mapped to age of death
p <- ggplot(plotdata, aes(x = PC1, y = PC2)) +
  geom_point(aes(color = Institution, shape = Tissue, size = ageOfDeath)) +
  theme_bw() +
  theme(legend.position = "right") +
  facet_grid(Tissue ~ .)
# p <- p + geom_text(aes(label= SampleID), size=4, hjust=0)
p
```
Tree based clustering of samples
```{r decompse.normalise.data.1, fig.height=6, fig.width=10, results='asis'}
# Euclidean tree based analysis
# Numeric codes for the covariates shown as colour bars; NA is coded as 0
COVARIATES.tmp = data.matrix(COVARIATES[,c("Reported_Gender", "LibraryBatch", "Dx")])
COVARIATES.tmp[is.na(COVARIATES.tmp)] = 0

# Cluster samples on the Euclidean distance between their expression
# profiles. dist() computes pairwise distances between the rows of
# t(E), i.e. between samples. as.dist() must not be used here: it would
# reinterpret the samples-by-genes expression values themselves as a
# distance matrix.
tree = hclust(dist(t(CQN.GENE_EXPRESSION$E)))
cols = WGCNA::labels2colors(COVARIATES.tmp);

# NOTE(review): abHeight = 0.80 is kept from the original call; confirm the
# reference height is still meaningful on the Euclidean distance scale.
WGCNA::plotDendroAndColors(tree, 
                           colors = cols, 
                           dendroLabels = FALSE, 
                           abHeight = 0.80, 
                           main = "Sample dendrogram",
                           groupLabels = colnames(COVARIATES.tmp))
```

```{r temp, include = F}
# Close the current graphics device, if one is open, and run garbage
# collection. dev.off() raises an error when only the null device exists,
# so it is only called when dev.list() reports an open device.
if (!is.null(dev.list())) {
  dev.off()
}
gc()
```

### Distribution of samples (log cpm)
```{r lcpm.dist, cache=FALSE}
# Plot aberrant distribution of the normalised expression values
# Long format: one row per gene and sample, with the sample's covariates
tmp1 = (CQN.GENE_EXPRESSION$E) %>%
  rownameToFirstColumn('gene_id') %>%
  tidyr::gather(SampleID, logCPM, -gene_id) %>%
  left_join(COVARIATES %>%
              rownameToFirstColumn('SampleID'))

# One density curve per sample, faceted by diagnosis. The legend is
# suppressed because there is one colour per sample; ggplot2 expects the
# lowercase value 'none' for this.
p = ggplot(tmp1, aes(x = logCPM, color = SampleID)) + geom_density() 
p = p + theme_bw() %+replace% theme(legend.position = 'none') + facet_grid(.~Dx, scale = 'free')
p
```

### Significant Covariates
Correlation between pca of unadjusted mRNA expression and covariates is used to find significant covariates
```{r preAdjusted.covariates, cache=TRUE}
# Find correlation between PC's of gene expression with covariates.
# The input is the CQN-normalised matrix, so the data label says so (it
# previously read "voom-normalized", which did not match the data passed in).
preAdjustedSigCovars = runPCAandPlotCorrelations(CQN.GENE_EXPRESSION$E, 
                                                 COVARIATES,
                                                 'NULL design(cqn-normalized)', 
                                                 isKeyPlot=TRUE, 
                                                 MIN_PVE_PCT_PC = 1)
```

Significant covariates to adjust at FDR 0.1 are `r preAdjustedSigCovars$significantCovars`
```{r preAdjustedSigCovars.NULL, fig.width=20, fig.height=12}
# Print the plot stored in the second element of the PCA/covariate results
preAdjustedSigCovars$PC_res[[2]]$plotData
```

### Model Identification

#### Phase I (clinical, ancestry and sample specific technical variables)
```{r phase1}
# Terms that are part of every candidate model: diagnosis as a fixed effect
# and a random intercept per individual
base_covars <- "Dx"
random_effect <- "Individual_ID"

# Base formula: fixed effects joined by " + ", followed by the random effect
fixed_terms <- glue::glue_collapse(base_covars, sep = " + ")
base_formula <- glue::glue("~ {fixed_terms} + (1|{random_effect})")

# Forward stepwise selection over clinical, ancestry and sample-level
# technical variables
phase1 <- mvBIC::mvForwardStepwise(
  exprObj = CQN.GENE_EXPRESSION$E,
  baseFormula = base_formula,
  data = COVARIATES,
  variables = array(c(
    "(1|Reported_Gender)",
    "scale(PMI)",
    "scale(ageOfDeath)",
    "scale(RIN)",
    "scale(RIN2)",
    "scale(EV.1)",
    "scale(EV.2)",
    "scale(EV.3)",
    "scale(EV.4)",
    "scale(EV.5)"
  ))
)
phase1
```

#### Phase II (batch effects)
```{r phase2}
# Continue from the formula selected in phase I and test the batch
# variables as random effects
phase2 <- mvBIC::mvForwardStepwise(
  exprObj = CQN.GENE_EXPRESSION$E,
  baseFormula = phase1$formula,
  data = COVARIATES,
  variables = array(c(
    "(1|LibraryBatch)",
    "(1|FlowcellBatch)"
  ))
)
phase2
```

#### Phase III (RNASeq alignment specific covariates)
```{r phase3}
# Continue from the formula selected in phase II and test the RNA-seq
# alignment metrics as scaled fixed effects
phase3 <- mvBIC::mvForwardStepwise(
  exprObj = CQN.GENE_EXPRESSION$E,
  baseFormula = phase2$formula,
  data = COVARIATES,
  variables = array(c(
    "scale(IntragenicRate)",
    "scale(IntronicRate)",
    "scale(IntergenicRate)",
    "scale(rRNARate)",
    "scale(TotalReads)",
    "scale(GenesDetected)",
    "scale(MappedReads)"
  ))
)
phase3
```

#### Variables included in the model 
Covariates were added as fixed and random effects iteratively in three phases if model improvement by BIC criteria was observed. Clinical, ancestry and sample-specific technical variables were tested for model improvement first. Covariates related to batch effects were next tested for model improvement and finally, RNASeq alignment-specific covariates.

The model for clinical, technical and batch variables selected after phase I, II and III is `r paste(phase3$formula)`

### Store files in synapse
```{r synapse.store, include=FALSE, eval=TRUE, cache=FALSE}
# Annotations shared by every file stored below. Before each
# synSetAnnotations() call the `dataSubType` element is overwritten with a
# file-specific value.
# NOTE(review): the list defines `dataSubtype` (lowercase t) while the
# per-file assignments below set `dataSubType` (capital T), so both keys end
# up in the annotations - confirm which key is intended.
# NOTE(review): `tissue` also names anterior cingulate cortex although this
# document filters the metadata to DLPFC - confirm.
all.annotations = list(
  dataType = 'geneExpression',
  dataSubtype = 'processed',
  assay	 = 'rnaSeq',
  
  tissue	= 'dorsolateral prefrontal cortex, anterior cingulate cortex', 
  study = 'CMC', 

  species = 'Human',
  consortium	= 'CMC',
   
  analysisType	= "data normalization",
  
  transcriptNormalizationMethod	= 'CQN',
  transcriptQuantificationMethod = 'featureCounts',
  referenceSet = 'GRCh38'
)

# Create (or fetch) the Synapse folder under parentId that receives the results
CODE <- Folder(name = "HBCC - DLPFC - DREAM - FEATURECOUNTS - GRCh38 - CQN", parentId = parentId)
CODE <- synStore(CODE)

# Store covariates
# The sample ids move from the row names into a SampleID column before
# writing. Each file is stored with provenance: the input Synapse ids
# (ALL_USED_IDs) and the permalink of this document (thisFile).
COVARIATES = rownameToFirstColumn(COVARIATES, 'SampleID')
write.table(COVARIATES, file = 'CMC_HBCC_DLPFC_Covariates.tsv', sep = '\t', row.names=F, quote=F)
COV_OBJ = synapser::File('CMC_HBCC_DLPFC_Covariates.tsv', name = 'Covariates', parentId = CODE$properties$id)
COV_OBJ = synStore(COV_OBJ, used = ALL_USED_IDs, activityName = activityName, 
                   executed = thisFile, activityDescription = activityDescription)
all.annotations$dataSubType = 'covariates'
synSetAnnotations(COV_OBJ, annotations = all.annotations)

# Store filtered counts
# Counts for the analysed genes and the samples kept after outlier removal
PROCESSED_COUNTS$filteredExprMatrix$counts %>%
  rownameToFirstColumn('ensembl_gene_id') %>%
  write.table(file = 'CMC_HBCC_DLPFC_Counts.tsv', sep = '\t', row.names=F, quote=F)
COUNT_OBJ = File('CMC_HBCC_DLPFC_Counts.tsv', name = 'Counts (filtered raw)', parentId = CODE$properties$id)
COUNT_OBJ = synStore(COUNT_OBJ, used = ALL_USED_IDs, activityName = activityName, 
                     executed = thisFile, activityDescription = activityDescription)
all.annotations$dataSubType = 'filteredCounts'
synSetAnnotations(COUNT_OBJ, annotations = all.annotations)

# Store logCPM
# This is the CQN expression matrix E (cqn's y plus its offset)
CQN.GENE_EXPRESSION$E %>%
  rownameToFirstColumn('ensembl_gene_id') %>%
  write.table(file = 'CMC_HBCC_DLPFC_logCPM.tsv', sep = '\t', row.names=F, quote=F)
LCOUNT_OBJ = File('CMC_HBCC_DLPFC_logCPM.tsv', name = 'Counts (filtered logCPM)', parentId = CODE$properties$id)
LCOUNT_OBJ = synStore(LCOUNT_OBJ, used = ALL_USED_IDs, activityName = activityName, 
                      executed = thisFile, activityDescription = activityDescription)
all.annotations$dataSubType = 'filteredLCPM'
synSetAnnotations(LCOUNT_OBJ, annotations = all.annotations)

# Store cqn offsets
CQN.GENE_EXPRESSION$offset %>%
  rownameToFirstColumn('ensembl_gene_id') %>%
  write.table(file = 'CMC_HBCC_DLPFC_offset.tsv', sep = '\t', row.names=F, quote=F)
OFFSET_OBJ = File('CMC_HBCC_DLPFC_offset.tsv', name = 'Gene length and GC content offset', parentId = CODE$properties$id)
OFFSET_OBJ = synStore(OFFSET_OBJ, used = ALL_USED_IDs, activityName = activityName, 
                      executed = thisFile, activityDescription = activityDescription)
all.annotations$dataSubType = 'offset'
synSetAnnotations(OFFSET_OBJ, annotations = all.annotations)
```

|  *Results*                                  |  *SynapseID*                     |
|  -----------------------------------------  |   ---------                      |
|  Covariates                                 |  `r COV_OBJ$properties$id`       |
|  Counts (raw)                               |  `r COUNT_OBJ$properties$id`     |
|  Counts (lcpm)                              |  `r LCOUNT_OBJ$properties$id`    |
|  Offset (for gene length and gc content)    |  `r OFFSET_OBJ$properties$id`    |

### Source Rmd
[Source markdown](`r thisFile`)