-
Notifications
You must be signed in to change notification settings - Fork 4
/
DEG_analysis.R
137 lines (112 loc) · 4.51 KB
/
DEG_analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
setwd(".")
library(DESeq2)
library(knitr)
library(arrayQualityMetrics)
library(devtools)
#install_github("vqv/ggbiplot")
data = read.csv('./data/severity_study/GSE147507_RawReadCounts_Human.tsv',sep = '\t',row.names = 1)
#data = t(data)
rownames(data)
# read metadata
target = read.csv('./data/severity_study/Metadata_3.csv')
# remove samples not in SARS-CoV-2
series_selected = unique(target[target$Type=='SARS-CoV-2','Series'])
target =target[target$Series %in% series_selected,]
rownames(target) = target$Sample
# remove phenotypes with less than 6 samples ( 3 replicates per case)
tab <- table(target$Series)
target =target[target$Series %in% names(tab)[tab>5],]
# remove sample with drug pertubagen
target = target[- grep("Rux", target$Sample),]
#change sample names in the count table such as in the metadata
colnames(data) = gsub('SARS.CoV.2','Infected',colnames(data))
colnames(data) = gsub('A549.ACE2','A549_ACE2',colnames(data))
target$Sample = gsub('SARS.CoV.2','Infected',target$Sample)
target$Sample = gsub('A549.ACE2','A549_ACE2',target$Sample)
target$Type = gsub('SARS-CoV-2','Infected',target$Type)
# select samples only in the metadata after filteration
data = data[,colnames(data) %in% as.character(target$Sample)]
data = data[,order(colnames(data))]
target = target[order(target$Sample),]
#define the condition
condition = target$Series
genotype = as.factor(target$Type)
meta = DataFrame(condition,genotype)
# Create quality report
#minimalSet <- ExpressionSet(assayData=as.matrix(data))
#arrayQualityMetrics(expressionset = minimalSet, outdir = "Quality_Report_GSE147507", force = TRUE, do.logtransform = FALSE)
# detect outliers via hierarchical clustering
#distmat = dist(t(data))
#hcldat = hclust(distmat, method="average")
#plot(hcldat)
#BiocManager::install('arrayQualityMetrics')
# Create quality report for Moran et al. dataset
library(arrayQualityMetrics)
minimalSet <- ExpressionSet(assayData=as.matrix(data))
arrayQualityMetrics(expressionset = minimalSet, outdir = "Quality_Report_Moran", force = TRUE, do.logtransform = FALSE)
#We filter out lowly expressed genes.
library(edgeR)
keep <- filterByExpr(data)
table(keep)
keep
data <- data[keep,]
#genotype + condition + genotype:condition
dds <- DESeqDataSetFromMatrix(countData = data,colData = meta,~condition +genotype:condition)
rld <- rlog(dds)
plotPCA(rld)
# also possible to perform custom transformation:
dds <- estimateSizeFactors(dds)
# shifted log of normalized counts
se <- SummarizedExperiment(log2(counts(dds, normalized=TRUE) + 1),
colData=colData(dds))
# the call to DESeqTransform() is needed to
# trigger our plotPCA method.
plotPCA( DESeqTransform( se ) )
boxplot(log10(counts(dds)+1))
dds <- estimateSizeFactors(dds)
boxplot(log10(counts(dds,normalized=TRUE)+1))
vsd <- vst(dds)
class(vsd)
assay(vsd)[1:3,1:3]
plotPCA(vsd, "condition")
## PCA
library("ggplot2")
pcaData <- plotPCA(vsd, intgroup = c( "genotype", "condition"), returnData = TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(x = PC1, y = PC2, color = genotype, shape = condition)) +
geom_point(size =3) +
xlab(paste0("PC1: ", percentVar[1], "% variance")) +
ylab(paste0("PC2: ", percentVar[2], "% variance")) +
coord_fixed()
#dds$genotype <- target$Tyoe
dds$condition
dds$genotype
dds$genotype = relevel( dds$genotype, "Mock")
#design(dds) <- ~ genotype + condition + genotype:conditionx
assay(dds)[1:10,]
dds <- DESeq(dds)
resultsNames(dds)
#res <- results(dds, contrast=c("condition","Series1","Series2"))
#res <- results(dds, contrast=c("genotype","Infected","Mock"))
res <- results(dds, name=c("Intercept"))
res <- subset(res, padj < 0.05)
res <- subset(res, abs(log2FoldChange) >1)
write.table(res,paste0('./data/severity_study/DEGS_Intercept.txt'))
#res <- results(dds, list(c("condition_Infected_vs_Mock","conditionSeries16.genotypeInfected")))
#c("condition_B_vs_A","genotypeIII.conditionB")
#res <- subset(res, padj < 0.05)
conditions_unq = unique(condition)
for (i in 1:length(conditions_unq)){
condition_ = conditions_unq[i]
res <- results(dds, name=c(paste0("condition",condition_,".genotypeInfected")))
res <- subset(res, padj < 0.05)
res <- subset(res, abs(log2FoldChange) >1)
write.table(res,paste0('./data/severity_study/DEGs/',condition_,'.txt'))
}
design(dds) <- ~ genotype + condition + genotype:condition
dds <- DESeq(dds)
res <- results(dds, contrast=c("genotype","Infected","Mock"))
res <- subset(res, padj < 0.05)
res <- subset(res, abs(log2FoldChange) >1)
res
write.table(res,paste0('./data/severity_study/DEGS_Inf_vs_Mock.txt'))