-
Notifications
You must be signed in to change notification settings - Fork 2
/
06_absolute_expression_quantiles.Rmd
115 lines (93 loc) · 2.84 KB
/
06_absolute_expression_quantiles.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
---
title: "Absolute expression quantile"
author: "Florent Chuffart"
date: "`r Sys.Date()`"
output:
rmarkdown::html_document:
toc: true
toc_float: true
toc_depth: 3
number_sections: true
---
```{r, echo=FALSE, eval=TRUE}
knitr::opts_chunk$set(collapse=TRUE, comment = "#>", fig.width=9, fig.height=6, eval=TRUE, echo=FALSE, results="hide")
source("config")
```
```{r define quantiles, results="verbatim"}
genes = readRDS(paste0("~/projects/genes/bed_", version, "_epimeddb.rds"))
genes = genes[genes$type %in% "protein-coding",]
dim(genes)
# Loading full data matrix
table_file = list.files("04_deseq2/tables/", "*.complete.txt", full.names=TRUE)[1]
d = read.table(table_file, header=TRUE)
rownames(d) = d$Id
keys = colnames(d)[grep("norm.", colnames(d), fixed=TRUE)]
d$mean.scru = log2((d[,keys[1]] + d[,keys[2]] + d[,keys[3]] + d[,keys[4]]) / 4 + 1)
d = d[
!is.na(d$mean.scru) &
d$Id %in% rownames(genes),
]
dim(d)
d_filtered = d[
d$mean.scru>0,
]
dim(d_filtered)
layout(matrix(1:2,1), respect=TRUE)
pairs(d_filtered[,keys[1:4]])
plot(density(d_filtered$mean.scru))
probs = c(0,.25,.5,.75,1)
# probs = c(0, .05, .25, .5, .75, .95, 1)
q = quantile(d_filtered$mean.scru, probs=probs)
abline(v=q, lty=2, col="grey")
d_filtered$q_expr = as.numeric(cut(d_filtered$mean.scru, q, include.lowest=TRUE))
head(d_filtered)
dim(d_filtered)
# d[,1:6s]
d$q_expr = 0
d[rownames(d_filtered),]$q_expr = d_filtered$q_expr
table(d$q_expr)
```
# export bed file
```{r export}
for (nb_genes in (0:floor(min(table(d$q_expr))/1000))*1000) {
for (i in sort(unique(d$q_expr))) {
prefix = paste0("abs_expr_q", i, "_", nb_genes)
if (nb_genes > 0) {
set.seed(1)
feat = genes[sample(as.character(d[d$Id %in% rownames(genes) & d$q_expr %in% i,]$Id), nb_genes),]
} else {
feat = genes[as.character(d[d$Id %in% rownames(genes) & d$q_expr %in% i,]$Id),]
}
feat[,1:6]
# feat = feat[!is.na(feat$tx_end),]
# feat = feat[nchar(feat$chrom_text) <=5,]
# print(dim(feat))
# table(feat$type, useNA="ifany")
# table(feat$chrom_text, useNA="ifany")
#
# # tss
# table(feat$strand, useNA="ifany")
# feat$tss = NA
# feat[feat$strand == "+",]$tss = feat[feat$strand == "+",2]
# feat[feat$strand == "-",]$tss = feat[feat$strand == "-",3]
# feat$tss_key = paste0(feat$chrom_text, "_", feat$tss)#, "_", feat$strand)
# sum(duplicated(feat$tss_key))
# feat = feat[!duplicated(feat$tss_key),]
# dim(feat)
#
# unique(feat[,1])
#
# feat[feat[,1]=="chrMT",1] = "chrM"
# unique(feat[,1])
#
#
# feat = feat[feat$type,]
regions_filenames = regions_filename = paste0(prefix, ".bed")
write.table(feat[,1:6], file=regions_filename, sep="\t", quote=FALSE,row.names=FALSE, col.names=FALSE)
}
}
```
# Session Information
```{r, results="verbatim"}
sessionInfo()
```