10_heatmap

---
title: "Heatmap"
author: "Florent Chuffart"
date: "`r Sys.Date()`"
output: 
  rmarkdown::html_document:
    toc: true
    toc_float: true
    toc_depth: 3
    number_sections: true
---

```{r}
# Identifier of the DESeq2 comparison analysed in this report; it is used
# below as the directory that holds the result tables and the exported lists.
question <- "04.3_deseq2_q3"
# Cut-off applied to -log10(adjusted p-value) when selecting genes.
lpval_thresh <- 2
# Cut-off applied to the log2 fold change when selecting genes.
l2fc_thresh <- 1
```

```{r, echo=FALSE, eval=TRUE}
# Default options applied to every subsequent chunk of the document.
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 12,
  fig.height = 6,
  eval = TRUE,
  echo = FALSE,
  results = "hide"
)
```


1. I selected up (red) and down (blue) genes according to `r question`
2. I drew the heatmap of the corresponding genes across conditions. Expression was standardized according to WT samples
3. I drew boxplots of pooled up and down genes across day and Atad2 status (KO in violet and WT in green)

We globally observed:  

1. more disorder in Atad2 KO samples
2. an early increase in up-regulated genes in Atad2 KO samples for d20 and d22
3. an early decrease in down-regulated genes in Atad2 KO samples  for d20 and d22


# Volcano plot

```{r}
# Cache read.table results so that re-running the chunk in the same session
# does not parse the same table again.
if (!exists("mread.table")) mread.table <- memoise::memoise(read.table)

# One row, two panels, square aspect ratio.
layout(matrix(1:2, 1), respect = TRUE)


up_genes <- list()
dw_genes <- list()
# `pattern` is a regular expression, not a shell glob: match the literal
# ".complete.txt" suffix.
table_file <- list.files(paste0(question, "/tables/"), "\\.complete\\.txt$", full.names = TRUE)
print(table_file)
if (length(table_file) != 1) {
  stop(
    "expected exactly one '.complete.txt' table in ", question, "/tables/, found ",
    length(table_file),
    call. = FALSE
  )
}
daresults <- mread.table(table_file, header = TRUE)
rownames(daresults) <- daresults$Id

# Per-gene plotting colour, black unless the gene is selected below.
col <- rep("black", nrow(daresults))
names(col) <- rownames(daresults)

# Genes with a usable adjusted p-value that pass the significance cut-off.
is_signif <- !is.na(daresults$padj) &
  !is.na(daresults$log2FoldChange) &
  -log10(daresults$padj) > lpval_thresh

# The fold-change cut-off is symmetric: above +l2fc_thresh for up-regulated
# genes, below -l2fc_thresh for down-regulated genes.
up_genes[[question]] <- rownames(daresults)[
  is_signif & daresults$log2FoldChange > 0 & daresults$log2FoldChange > l2fc_thresh
]
dw_genes[[question]] <- rownames(daresults)[
  is_signif & daresults$log2FoldChange < 0 & daresults$log2FoldChange < -l2fc_thresh
]
col[up_genes[[question]]] <- "red"
col[dw_genes[[question]]] <- "blue"
# Volcano plot: log2 fold change against -log10(adjusted p-value). All genes
# are drawn as dots; selected genes are overdrawn as coloured circles.
# plot(daresults$log2FoldChange, -log10(daresults$pvalue), main=substr(table_file, 31, 38), col=col, xlim=c(-7,7), ylim=c(0, 20), pch=".", xlab="log2FoldChange", ylab="raw p-value")
plot(
  daresults$log2FoldChange, -log10(daresults$padj),
  main = question, col = col, xlim = c(-7, 7), ylim = c(0, 20), pch = ".",
  xlab = "log2FoldChange",
  # The plotted quantity is the -log10 transform, so label it as such.
  ylab = "-log10(adjusted p-value)"
)
points(
  daresults[up_genes[[question]], ]$log2FoldChange,
  -log10(daresults[up_genes[[question]], ]$padj),
  col = "red"
)
points(
  daresults[dw_genes[[question]], ]$log2FoldChange,
  -log10(daresults[dw_genes[[question]], ]$padj),
  col = "blue"
)
# Legend entries carry the number of selected genes.
legend(
  "topleft",
  c(
    paste0("up_genes (", length(up_genes[[question]]), ")"),
    paste0("dw_genes (", length(dw_genes[[question]]), ")")
  ),
  pch = 16, col = c("red", "blue")
)

# Flatten the per-comparison selections into duplicate-free gene vectors.
pooled_up_genes <- unique(unlist(up_genes, use.names = FALSE))
pooled_dw_genes <- unique(unlist(dw_genes, use.names = FALSE))

# Selection sizes, per comparison and pooled (up first, then down).
vapply(up_genes, length, integer(1))
length(pooled_up_genes)
vapply(dw_genes, length, integer(1))
length(pooled_dw_genes)
```


```{r}
# Export the selected gene identifiers, one per line, without header or quotes.
# The output directory is the list key `k`, so each comparison gets its own
# pair of files instead of all iterations writing to the same paths.
for (k in names(up_genes)) {
  write.table(
    up_genes[[k]], file.path(k, "custom_up_genes.txt"),
    col.names = FALSE, row.names = FALSE, quote = FALSE
  )
  write.table(
    dw_genes[[k]], file.path(k, "custom_dw_genes.txt"),
    col.names = FALSE, row.names = FALSE, quote = FALSE
  )
}

```

# Heatmap


```{r fig.height=9, fig.width=9, eval=TRUE}
# Loading full data matrix
# `pattern` is a regular expression, not a shell glob; only the first matching
# table is used.
table_file <- list.files("04_deseq2/tables/", "\\.complete\\.txt$", full.names = TRUE)[1]
if (is.na(table_file)) {
  stop("no '.complete.txt' table found in 04_deseq2/tables/", call. = FALSE)
}
daresults <- mread.table(table_file, header = TRUE)
rownames(daresults) <- daresults$Id

# Down genes first, then up genes. Identifiers missing from this table are
# dropped, otherwise they would be returned as all-NA rows.
selected_genes <- intersect(c(pooled_dw_genes, pooled_up_genes), rownames(daresults))
# Expression columns are those whose name starts with "norm".
is_norm_col <- startsWith(colnames(daresults), "norm")
# data = log2(as.matrix(daresults[c(pooled_up_genes),substr(colnames(daresults), 1, 4) == "norm"])+1)
# data = log2(as.matrix(daresults[c(pooled_dw_genes),substr(colnames(daresults), 1, 4) == "norm"])+1)
data <- log2(as.matrix(daresults[selected_genes, is_norm_col, drop = FALSE]) + 1)
# Strip the first five characters of each column name (the "norm" prefix and
# the separator that follows it).
colnames(data) <- substr(colnames(data), 6, 1000)
head(data)

# Sort data matrix
# NOTE(review): assumes sample names carry the day in characters 1-3 and the
# line in characters 5-6 -- confirm against the table header.
day <- substr(colnames(data), 1, 3)
day
line <- substr(colnames(data), 5, 6)
# Levels are the observed lines in reverse order of first appearance.
line <- factor(line, levels = rev(unique(line)))
line
# data = data[,order(day,line)]
# Columns are grouped by line first, then by day within each line.
data <- data[, order(line, day), drop = FALSE]
# d = cbind(data.frame(day=day, line=line), t(norm_data))
# data = data[pooled_up_genes,]
head(data)

# params
# Normalization mode (see the "Normalization" step for the accepted values).
normalization <- "zscore_rows_on_wt"
nb_grp_row <- 4
nb_grp_col <- 4
# Title prefix of the heatmap.
main <- ""
# Column clustering mode; FALSE keeps the column order as it is.
hc_cols <- FALSE
# Row clustering mode. Only the final value is assigned here; the values
# TRUE and "eucl_dist" were previously assigned and immediately overwritten.
hc_rows <- "eucl_dist_on_wt"
# hc_rows="cor_dist_on_wt"
# normalization=FALSE
# Summary statistic used by the "ordered" modes.
ordering_func <- median
# Colour gradient of the heatmap, from low to high values.
colors <- c("cyan", "black", "red")


# Remove rows with no variation (needed to clustering rows according to cor)
# data = data[apply(data[,1:4], 1, function(l) {length(unique(l))})>1, ]

# Normalization
# colnames(data) = s$exp_grp[colnames(data),]$tissue_group_level1
if (normalization == "zscore_rows_on_wt") {
  # Reference columns are those whose name contains "wt".
  idx_zscore <- grep("wt", colnames(data))
  # idx_zscore = 1:4
  if (length(idx_zscore) < 2) {
    stop("z-scoring on WT needs at least two columns whose name contains 'wt'", call. = FALSE)
  }
  # Drop rows that are constant over the reference columns: their standard
  # deviation is zero and the division below would not be finite.
  has_variation <- apply(
    data[, idx_zscore, drop = FALSE], 1,
    function(l) length(unique(l))
  ) > 1
  # drop = FALSE keeps `data` a matrix even when a single row remains.
  data <- data[has_variation, , drop = FALSE]
  # Centre and scale every row with the mean and sd of its reference columns
  # (the per-row vectors recycle down the columns of the matrix).
  data <- data - apply(data[, idx_zscore, drop = FALSE], 1, mean)
  data <- data / apply(data[, idx_zscore, drop = FALSE], 1, sd)
} else if (normalization == "zscore_rows" || normalization == TRUE) {
  # Row-wise z-score over all columns.
  data <- data - apply(data, 1, mean)
  data <- data / apply(data, 1, sd)
} else if (normalization == "zscore_cols") {
  # Column-wise z-score, done on the transposed matrix.
  data <- t(data)
  data <- data - apply(data, 1, mean)
  data <- data / apply(data, 1, sd)
  data <- t(data)
} else if (normalization == "rank_cols") {
  # Replace the values of each column by their ranks.
  data <- apply(data, 2, rank)
} else if (normalization == "qqnorm_cols") {
  # Replace the values of each column by theoretical normal quantiles.
  data <- apply(data, 2, function(c) {
    qqnorm(c, plot.it = FALSE)$x
  })
}


# # Cut-off
# Other saturation bounds that were tried: +/-20, +/-4 and +/-2.
# data[data >  20] = 20
# data[data < -20] = -20
# data[data >  4] =  4
# data[data < -4] = -4
# data[data >  2] =  2
# data[data < -2] = -2
# Saturate the values at +/-3; missing values are left untouched.
data[which(data > 3)] <- 3
data[which(data < -3)] <- -3


# Sample (column) arrangement, driven by `hc_cols`:
#   FALSE     -> keep the current order, no dendrogram
#   "ordered" -> reorder columns by decreasing `ordering_func`, no dendrogram
#   "cor"     -> hierarchical clustering on 1 - Pearson correlation
#   otherwise -> hierarchical clustering on Euclidean distance
if (hc_cols == FALSE) {
  hc_col <- Colv <- NULL
} else if (hc_cols == "ordered") {
  tmp_d <- t(data)
  col_rank <- order(apply(tmp_d, 1, ordering_func, na.rm = TRUE), decreasing = TRUE)
  data <- data[, col_rank]
  hc_col <- Colv <- NULL
} else {
  tmp_d <- t(data)
  if (hc_cols == "cor") {
    # Samples containing missing values are left out of the correlation.
    tmp_d <- tmp_d[!apply(is.na(tmp_d), 1, any), ]
    d <- dist(1 - cor(t(tmp_d), method = "pe"))
  } else {
    d <- dist(tmp_d)
  }
  hc_col <- hclust(d, method = "complete")
  Colv <- as.dendrogram(hc_col)
}

# clustering features...
# Row arrangement, driven by `hc_rows`:
#   FALSE             -> keep the current order, no dendrogram
#   "ordered"         -> reorder rows by decreasing `ordering_func`
#   "eucl_dist"       -> Euclidean distance on all columns
#   "eucl_dist_on_wt" -> Euclidean distance on the "wt" columns only
#   "cor_dist_on_wt"  -> 1 - Pearson correlation on the "wt" columns only
#   otherwise         -> 1 - Pearson correlation on all columns
if (hc_rows != FALSE) {
  tmp_d <- data
  if (hc_rows == "ordered") {
    # ... ordered by median
    data <- data[order(apply(tmp_d, 1, ordering_func, na.rm = TRUE), decreasing = TRUE), , drop = FALSE]
    hc_row <- Rowv <- NULL
  } else {
    if (hc_rows %in% c("eucl_dist_on_wt", "cor_dist_on_wt")) {
      # Cluster on the columns whose name contains "wt" only.
      idx_clust <- grep("wt", colnames(data))
      tmp_d <- tmp_d[, idx_clust, drop = FALSE]
    }
    if (hc_rows %in% c("eucl_dist_on_wt", "eucl_dist")) {
      # ... based on eucl. dist.
      d <- dist(tmp_d)
    } else {
      # ... based on correlation
      # Rows with missing values cannot be correlated. Remove them from
      # `data` as well, so the dendrogram keeps one leaf per row of the
      # matrix that is plotted.
      is_complete <- !apply(is.na(tmp_d), 1, any)
      tmp_d <- tmp_d[is_complete, , drop = FALSE]
      data <- data[is_complete, , drop = FALSE]
      d <- dist(1 - cor(t(tmp_d), method = "pearson"))
    }
    hc_row <- hclust(d, method = "complete")
    Rowv <- as.dendrogram(hc_row)
  }
} else {
  hc_row <- Rowv <- NULL
}
# Row annotation: white by default, red for pooled up genes, blue for pooled
# down genes (blue is assigned last, so it wins for a gene found in both sets).
RowSideColors <- setNames(rep("white", nrow(data)), rownames(data))
up_rows <- intersect(names(RowSideColors), pooled_up_genes)
dw_rows <- intersect(names(RowSideColors), pooled_dw_genes)
RowSideColors[up_rows] <- "red"
RowSideColors[dw_rows] <- "blue"


# Column annotation: green when the sample name contains "wt", violet otherwise.
ColSideColors <- setNames(rep("violet", ncol(data)), colnames(data))
ColSideColors[grep("wt", colnames(data))] <- "green"


# Tell the heatmap which dendrograms to draw, depending on which of the row
# and column dendrograms were actually computed.
dendrogram <- if (is.null(Rowv)) {
  if (is.null(Colv)) "none" else "col"
} else {
  if (is.null(Colv)) "row" else "both"
}

# Diagnostics: matrix size, dendrogram mode and row dendrogram summary.
print(dim(data))
print(dendrogram)
print(Rowv)
print(Rowv)

# colors = c("green", "black", "red")
# colors = c("blue", "yellow", "red")
# colors = rev(RColorBrewer::brewer.pal(n=11, "RdYlBu"))
# Interpolate the chosen gradient into 20 colour steps.
cols <- colorRampPalette(colors)(20)
# Draw the heatmap; the return value is kept in `foo`.
foo <- gplots::heatmap.2(data,
  Rowv = Rowv,
  Colv = Colv,
  dendrogram = dendrogram,
  RowSideColors = RowSideColors,
  ColSideColors = ColSideColors,
  trace = "none",
  col = cols,
  main = paste0(main, " (", nrow(data), " features x ", ncol(data), " tissues)"),
  # Spelled out in full instead of relying on partial matching of `mar`.
  margins = c(10, 5),
  useRaster = TRUE,
  cex.axis = 0.5
)

```

# Boxplot

```{r eval = FALSE}
# One panel per gene set (up, down), side by side.
layout(matrix(1:2, 1), respect = TRUE)
pooled_genes <- list(
  up = pooled_up_genes,
  down = pooled_dw_genes
)
for (k in names(pooled_genes)) {
  # Only genes still present in the heatmap matrix can be plotted.
  genes <- intersect(pooled_genes[[k]], rownames(data))
  if (length(genes) == 0) {
    message("no ", k, " genes left in the data matrix, skipping boxplot")
    next
  }
  # Long format: one (sample, expr) row per gene and sample. drop = FALSE keeps
  # a matrix even when a single gene is selected.
  d <- apply(data[genes, , drop = FALSE], 1, function(l) {
    data.frame(sample = colnames(data), expr = l)
  })
  d <- do.call(rbind, d)
  # Derive day and line from each row's own sample name rather than recycling
  # colnames(data) over the stacked rows.
  # NOTE(review): assumes the day is in characters 1-3 and the line in
  # characters 5-6 of the sample name -- confirm.
  d$day <- substr(d$sample, 1, 3)
  d$line <- substr(d$sample, 5, 6)
  d$line <- factor(d$line, levels = unique(d$line))
  # NOTE(review): the two colours recycle over the line levels in order;
  # presumably the first level is wt (green) -- confirm.
  boxplot(
    expr ~ line + day, d,
    col = c("green", "violet"), las = 2, xlab = "",
    ylab = paste0("z-score of pooled ", k, " genes (a.u.)"),
    main = paste0(k, " genes")
  )
}

```


# Session Information

```{r, results="verbatim"}
# Report the R version, platform and attached/loaded packages for this run.
sessionInfo()
```