forked from Wendellab/Co-expressionNetwork
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Rcode.txt
420 lines (374 loc) · 18.3 KB
/
Rcode.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
############## Install WGCNA ####################
install.packages("WGCNA")
install.packages(c("dynamicTreeCut", "cluster", "flashClust", "Hmisc", "reshape", "foreach", "doParallel") )
source("http://bioconductor.org/biocLite.R")
biocLite("impute")
orgCodes = c("Hs", "Mm", "Rn", "Pf", "Sc", "Dm", "Bt", "Ce", "Cf", "Dr", "Gg");
orgExtensions = c(rep(".eg", 4), ".sgd", rep(".eg", 6));
packageNames = paste("org.", orgCodes, orgExtensions, ".db", sep="");
biocLite(c("GO.db", "KEGG.db", "topGO", packageNames, "hgu133a.db", "hgu95av2.db", "annotate", "hgu133plus2.db", "SNPlocs.Hsapiens.dbSNP.20100427", "minet", "OrderedList"))
workingDir="~/Dropbox/CoexpressionNetworkAnalysis/oilseedRPKM"
setwd(workingDir);
# Load the package
library(WGCNA);
library(RColorBrewer)
# The following setting is important, do not omit.
options(stringsAsFactors = FALSE);
enableWGCNAThreads();
############### Step 1. Basic data processing and cleaning ###############
# load expression data
data = read.table("All ADT RPKM.txt",header=TRUE, sep="\t");
# Take a quick look at what is in the data set:
dim(data); #37223 157
names(data);
names(data)<-gsub("assemblies.|.bam","",names(data))
# Make each row corresponds to a gene and column to a sample or auxiliary information.
datExpr0 = as.data.frame(t(data[, -1]));
names(datExpr0) = data[,1];
rownames(datExpr0);
# Check for genes and samples with too many missing values
gsg = goodSamplesGenes(datExpr0, verbose = 3);
#Excluding 5889 genes from the calculation due to too many missing samples or zero variance.
gsg$allOK
# If the last statement returns TRUE, all genes have passed the cuts. If not, we remove the offending genes and samples from the data:
if (!gsg$allOK)
{
# Optionally, print the gene and sample names that were removed:
if (sum(!gsg$goodGenes)>0)
printFlush(paste("Removing genes:", paste(names(datExpr0)[!gsg$goodGenes], collapse = ", ")));
if (sum(!gsg$goodSamples)>0)
printFlush(paste("Removing samples:", paste(rownames(datExpr0)[!gsg$goodSamples], collapse = ", ")));
# Remove the offending genes and samples from the data:
datExpr = datExpr0[gsg$goodSamples, gsg$goodGenes]
}
dim(datExpr0) # raw 156 37223
dim(datExpr) # now 156 31334
# Cluster the samples (in contrast to clustering genes that will come later) to see if there are any obvious outliers. We use the function flashClust that provides faster hierarchical clustering than the standard function hclust.
sampleTree = flashClust(dist(datExpr), method = "average");
# Plot the sample tree: Open a graphic output window of size 12 by 9 inches
# The user should change the dimensions if the window is too large or too small.
plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="", cex.lab = 1.5,
cex.axis = 1.5, cex.main = 2)
# read in sample condition and match the samples.
traitData = read.csv("SampleCondition.csv");
dim(traitData)
names(traitData)
# Form a data frame analogous to condition table.
dataSamples = rownames(datExpr);
traitRows = match(dataSamples, traitData$sample);
datTraits = traitData[traitRows, -1];
rownames(datTraits) = traitData[traitRows, 1];
collectGarbage();
# visualize how the clinical traits relate to the sample dendrogram.
# Re-cluster samples
sampleTree2 = flashClust(dist(datExpr), method = "average")
# Convert traits to a color representation
traitColors = data.frame(
genome=brewer.pal(nlevels(factor(datTraits$genome) ),"Set1") [as.numeric(factor(datTraits$genome) )],
dpa =brewer.pal(nlevels(factor(datTraits$dpa ) ),"Set2") [as.numeric(factor(datTraits$dpa ) )],
rep = "grey",
sub =brewer.pal(nlevels(factor(datTraits$sub ) ),"Set3") [as.numeric(factor(datTraits$sub ) )] )
# Plot the sample dendrogram and the colors underneath.
sizeGrWindow(12,9)
pdf(file = "sampleClustering.pdf", width = 12, height = 9);
par(cex = 0.6);
par(mar = c(0,4,2,0))
plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="", cex.lab = 1.5, cex.axis = 1.5, cex.main = 2)
plotDendroAndColors(sampleTree2, traitColors, groupLabels = names(datTraits), main = "Sample dendrogram and trait heatmap",cex.dendroLabels = 0.5, cex.colorLabels=1.5, colorHeight =0.1)
dev.off()
save(datExpr, datTraits, file = "oilseed-01-dataInput.RData")
############### Step 2. Choosing the soft-thresholding power: analysis of network topology ###############
# if start a new R seesion
"
workingDir=''
setwd(workingDir);
library(WGCNA);
library(RColorBrewer);
library(ggplot2);
options(stringsAsFactors = FALSE);
enableWGCNAThreads()
lnames = load(file = "oilseed-01-dataInput.RData");
lnames
"
# the current expression contains still too many genes to build network, I need to filter them more
# AND Genes with across-the-board low expressions tend to reflect noise and correlations based on counts that are mostly zero aren't really meaningful. These genes should be removed; for example, genes with read count < 10 in more than 90% of the samples. The actual thresholds should be based on experimental design, sequencing depth and sample counts.
table(apply(datExpr,2,mean)>=2) # average RPKM >=2 17829
table(apply(datExpr,2,median)>=2) # median RPKM >=2 16673 ** makes sense to me, save as datExpr1
table(apply(datExpr,2,max)>=2) # max RPKM >=2 28732
table(apply(datExpr,2,max)>10) # max RPKM >=10 16752 ** makes sense to me, save as datExpr2
table(apply(datExpr,2,min)>=2) # min RPKM >=2 5467
# Or remove genes with centain percentage of RPKM counts lower than a cutoff
table(apply(datExpr,2,filter<-function(x){length(x[x<=10] )/length(x)==1} ) ) # same as max RPKM>=10, 16752 should be kept, same as
table(apply(datExpr,2,filter<-function(x){length(x[x<=5] )/length(x)>0.95} ) ) # 13422 genes with over 95% counts <=5, 17912 should be kept
dim( datExpr1 <- datExpr[,apply(datExpr,2,median)>=2] ) #16673
dim( datExpr2 <- datExpr[,apply(datExpr,2,max)>10] ) #16752
dim( datExpr3 <- datExpr[,apply(datExpr,2,filter<-function(x){length(x[x<=5] )/length(x)>0.95} ) ==FALSE] ) #17912
save(datExpr1, datExpr2, datExpr3, datTraits, file = "oilseed-02-dataFilter.RData")
#### Repeat below for datExpr1, datExpr2, datExpr3, between repeats, rename sft folder
# work with individual genome
for (i in c("A2","D5","AD3","TM1","Yuc") )
{
# i="A2"
# Extract total read counts for each genome
subTraits <- datTraits[datTraits$genome == i & datTraits$sub=="T",]
subDat <- datExpr1[datTraits$genome == i & datTraits$sub=="T",]
# Choose a set of soft-thresholding powers
powers = c(c(1:10), seq(from = 12, to=40, by=2))
# Call the network topology analysis function
system.time( sft <- pickSoftThreshold(subDat, powerVector = powers, verbose = 5) )
# Plot the results:
pdf(file = paste("sft/SoftThreshold_",i,".pdf",sep=""), height=5,width=9)
sizeGrWindow(9, 5)
par(mfrow = c(1,2));
cex1 = 0.9;
# Scale-free topology fit index as a function of the soft-thresholding power
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2], xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n", main = paste("Scale independence"));
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
labels=powers,cex=cex1,col="red");
# this line corresponds to using an R^2 cut-off of h
abline(h=0.80, col="orange") # converge to 0.8 with 12 samples
abline(h=0.90, col="red") # converge to 0.9 with 24 samples
# Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[,1], sft$fitIndices[,5], xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
dev.off()
write.table(as.data.frame(sft$fitIndices), file= paste("sft/SoftThreshold_",i,".txt",sep=""), sep="\t")
}
# Inspect all the threshold tables, if the model fit converge to 0.8, manually inspect SFT.R.sq >=0.8
setwd("./sft/")
file_list<-list.files(pattern=".*txt")
rm(dataset)
for (file in file_list){
# if the merged dataset doesn't exist, create it
if (!exists("dataset")){
dataset <- read.table(file, header=TRUE, sep="\t")
}
# if the merged dataset does exist, append to it
else{
temp_dataset <-read.table(file, header=TRUE, sep="\t")
dataset<-rbind(dataset, temp_dataset)
rm(temp_dataset)
}
}
sfts<-dataset
sfts$genome<-rep(gsub(".*_|.txt","",file_list), each=25)
# plot all the curves together
pdf("allgenome.pdf")
ggplot(sfts, aes(x=Power, y=SFT.R.sq, color=genome)) + geom_point() +geom_line()+geom_hline(yintercept=0.8)
ggplot(sfts, aes(x=Power, y=mean.k., color=genome)) + geom_point() +geom_line()
dev.off()
###### End of repeat
# Inspect "allgenome.pdf", a bit hard to choose Power. I am going to test c(20, 25, 30) to see how the choice affect my network and results.
## The fit can be affected by the choice of filtering methods.
## datExpr2 and datExpr3 behave almost the same, better than datExpr1, so I flip a coin and chose datExpr3
############### Step 3. Network Construction ###############
# if start a new R seesion
"
workingDir=''
setwd(workingDir);
library(WGCNA);
library(RColorBrewer);
library(ggplot2);
options(stringsAsFactors = FALSE);
enableWGCNAThreads()
lnames = load(file = "oilseed-02-dataFilter.RData");
lnames
"
# work with individual genome, then work with different soft threshold
#for (i in c("A2","D5","AD3","TM1","Yuc") )
#for j in c(20, 25, 30, 40) )
##### START of manual construction, but gene sets over 5000 are very likely to kill R!!!!!
# Extract total read counts for each genome
i="A2"
subTraits <- datTraits[datTraits$genome == i & datTraits$sub=="T",]
subDat <- datExpr3[datTraits$genome == i & datTraits$sub=="T",]
# test different softpower
j= 20
softPower = j
# computes the network connectivity (Connectivity)
Connectivity= softConnectivity(subDat,power=softPower)
# Creating Scale Free Topology Plots (SUPPLEMENTARY FIGURE S1 in our article)
par(mfrow=c(1,1))
scaleFreePlot(Connectivity, truncated=T,main= paste("beta=",as.character(softPower)))
# We now calculate the adjacencies, using the soft thresholding power
adjacency = adjacency(subDat, power = softPower);
# Turn adjacency into topological overlap
TOM = TOMsimilarity(adjacency);
dissTOM = 1-TOM
# Call the hierarchical clustering function using TOM
geneTree = flashClust(as.dist(dissTOM), method = "average");
# Plot the resulting clustering tree (dendrogram)
sizeGrWindow(12,9)
plot(geneTree, xlab="", sub="", main = "Gene clustering on TOM-based dissimilarity",
labels = FALSE, hang = 0.04);
# We like large modules, so we set the minimum module size relatively high:
minModuleSize = 30;
# Module identification using dynamic tree cut:
dynamicMods = cutreeDynamic(dendro = geneTree, distM = dissTOM,
deepSplit = 2, pamRespectsDendro = FALSE,
minClusterSize = minModuleSize);
table(dynamicMods)
# Returned 22 modules from largest to smallest. Label 0 is reserved for unassigned genes.
# Convert numeric lables into colors
dynamicColors = labels2colors(dynamicMods)
table(dynamicColors)
# Plot the dendrogram and colors underneath
sizeGrWindow(8,6)
plotDendroAndColors(geneTree, dynamicColors, "Dynamic Tree Cut",
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05,
main = "Gene dendrogram and module colors")
# Merging of modules whose expression profiles are very similar
# Calculate eigengenes
MEList = moduleEigengenes(subDat, colors = dynamicColors) # MEList = moduleEigengenes(subDat[,bwnet$goodGenes], colors = dynamicColors)
MEs = MEList$eigengenes
# Calculate dissimilarity of module eigengenes
MEDiss = 1-cor(MEs);
# Cluster module eigengenes
METree = flashClust(as.dist(MEDiss), method = "average");
# Plot the result
sizeGrWindow(7, 6)
plot(METree, main = "Clustering of module eigengenes",xlab = "", sub = "")
# A height cut of 0.25, corresponding to correlation of 0.75, to merge these modules:
MEDissThres = 0.25
# Plot the cut line into the dendrogram
abline(h=MEDissThres, col = "red")
# Call an automatic merging function
merge = mergeCloseModules(subDat, dynamicColors, cutHeight = MEDissThres, verbose = 3)
# The merged module colors
mergedColors = merge$colors;
# Eigengenes of the new merged modules:
= merge$newMEs;
sizeGrWindow(12, 9)
#pdf(file = "Plots/geneDendro-3.pdf", wi = 9, he = 6)
plotDendroAndColors(geneTree, cbind(dynamicColors, mergedColors), c("Dynamic Tree Cut", "Merged dynamic"),dendroLabels = FALSE, hang = 0.03,addGuide = TRUE, guideHang = 0.05)
#dev.off()
In the subsequent analysis, we will use the merged module colors in mergedColors. We save the relevant variables for
use in subsequent parts of the tutorial:
# Rename to moduleColors
moduleColors = mergedColors
# Construct numerical labels corresponding to the colors
colorOrder = c("grey", standardColors(50));
moduleLabels = match(moduleColors, colorOrder)-1;
MEs = mergedMEs;
# Save module colors and labels for use in subsequent parts
save(MEs, moduleLabels, moduleColors, geneTree, file = ".....RData")
##### END of manual construction, celebrate if getting here without crashing R
##### Otherwise, time to do block-wise construction or run R in bash mode in server or somewhere powerful
#R --save < script.R >A2_out.txt
nohup R CMD BATCH buildnetwork.R &
#output as buildnetwork.Rout
############### Step 4. Module-level analysis ###############
# if start a new R seesion
"
workingDir=''
setwd(workingDir);
library(WGCNA);
library(RColorBrewer);
library(scatterplot3d);
library(ggplot2);
options(stringsAsFactors = FALSE);
enableWGCNAThreads()
"
i="A2"
subTraits <- datTraits[datTraits$genome == i & datTraits$sub=="T",]
subDat <- datExpr3[datTraits$genome == i & datTraits$sub=="T",]
load("A2_power20.RData")
Anet20<-net
load("A2_power25.RData")
Anet25<-net
load("A2_power30.RData")
Anet30<-net
gg<-Anet20$goodGenes #get the good genes
pdf("A2_modules_cluster.pdf")
plotDendroAndColors(Anet20$dendrograms[[1]], cbind(labels2colors(Anet20$colors[gg]),labels2colors(Anet25$colors[gg]),labels2colors(Anet30$colors[gg])),c("power=20","power=25","power=30"),dendroLabels = FALSE, hang = 0.03,addGuide = TRUE, guideHang = 0.05)
dev.off()
# Network topology and other concepts need to be explored more!!!!!!!!!!!!!!!!!!
# The following computes the network connectivity (Connectivity)
par(mfrow=c(2,2))
pdf("A2_network_connectivity.pdf")
for(j in c(20,25,30)){
k= softConnectivity(subDat,power=j)
# Creating Scale Free Topology Plots (SUPPLEMENTARY FIGURE S1 in our article)
scaleFreePlot(k, truncated=T,main= paste("beta=",j))
}
dev.off()
discretized.k = cut(k, 20)
dk = tapply(k, discretized.k, mean)
p.dk = as.vector(tapply(k, discretized.k, length)/length(k))
plot(log(dk, 10), log(p.dk, 10)) # power law, # linear
#### Eigengenes are the 1st principal component of modules in a given single dataset, which provide a summary profile for each module.
# Study the relationships among the found modules.
load('~/Downloads/A2_power20_TOM-block.1.RData')
dissTOM=as.matrix(1-TOM)
# test this with a subset of the TOM, otherwise it will take forever
select<-sample(dim(dissTOM)[1],500)
selectTOM = dissTOM[select, select];
selectTree = flashClust(as.dist(selectTOM), method = "average")
selectColors = labels2colors(Anet20$colors[Anet20$goodGenes])[select];
sizeGrWindow(9,9)
# Taking the dissimilarity to a power, say 10, makes the plot more informative by effectively changing
# the color palette; setting the diagonal to NA also improves the clarity of the plot
plotDiss = selectTOM^7;
diag(plotDiss) = NA;
TOMplot(plotDiss, selectTree, selectColors, main = "Network heatmap plot, selected genes")
# use classical multi-dimensional scaling plots
# for visualizing the network. Here we chose 3 scaling dimensions
# This also takes about 10 minutes...
cmd1=cmdscale(as.dist(selectTOM),4)
par(mfrow=c(2,3))
plot(cmd1[,c(1,2)], col= as.character(selectColors) )
plot(cmd1[,c(1,3)], col= as.character(selectColors) )
plot(cmd1[,c(1,4)], col= as.character(selectColors) )
plot(cmd1[,c(2,3)], col= as.character(selectColors) )
plot(cmd1[,c(2,4)], col= as.character(selectColors) )
plot(cmd1[,c(3,4)], col= as.character(selectColors) )
# 3D plot, oh yeah
par(mfrow=c(1,1))
scatterplot3d(cmd1[,1:3], color=as.character(colorh1), main="MDS plot",xlab="Scaling Dimension 1", ylab="Scaling Dimension 2", zlab="Scaling Dimension 3",cex.axis=1.5,angle=320)
#### Displaying module heatmap and the eigengene
sizeGrWindow(8,7);
pdf("A2_network20_module&expression.pdf")
for(i in 0:dim(Anet20$MEs)[2]) {
which.module=paste("ME",i,sep="")
module.color=labels2colors(i)
#heatmap
par(mfrow=c(2,1), mar=c(0.3, 5.5, 4, 2))
plotMat(t(scale(subDat[,Anet20$colors==i ]) ),
nrgcols=30,rlabels=T,rcols=module.color,
main=which.module, cex.main=2)
#barplot
par(mar=c(5, 4.2, 0, 0.7))
barplot(Anet20$MEs[,which.module], col=module.color, main="", cex.main=2,
ylab="eigengene expression",xlab="seed development (dpa)", names.arg=rep(c(10,20,30,40), each=3) )
}
dev.off()
#### Relate to eigengenes to external traits or sample conditions
# Define numbers of genes and samples
nGenes = ncol(subDat);
nSamples = nrow(subDat);
# Recalculate MEs with color labels
MEs0 = moduleEigengenes(subDat, labels2colors(Anet20$colors))$eigengenes
MEs = orderMEs(MEs0)
# calculate corelation between eigengenes and condition -dpa
moduleTraitCor = cor(MEs, subTraits$dpa, use = "p");
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples);
# graphical representation for corelation
sizeGrWindow(5,6)
# Will display correlations and their p-values
pdf("A2_network20_modules.pdf")
textMatrix = paste(signif(moduleTraitCor, 2), " (", signif(moduleTraitPvalue, 1), ")", sep = "");
dim(textMatrix) = dim(moduleTraitCor)
par(mar = c(6, 8.5, 3, 3));
# Display the correlation values within a heatmap plot
labeledHeatmap(Matrix = moduleTraitCor, xLabels = "dpa", yLabels = names(MEs),
ySymbols = names(MEs), colorLabels = TRUE, colors = blueWhiteRed(50),
textMatrix = as.matrix(textMatrix), setStdMargins = FALSE,
cex.text = 0.7,zlim = c(-1,1), main = paste("Module-development relationships"))
# Another way to visualize the correlations with eigegene dendrogram and adjacency heatmap, BUT the heatmap color bar is NOT RIGHT, should be (-1, 1)
MET = orderMEs(cbind(MEs, subTraits$dpa))
sizeGrWindow(5,7.5);
par(cex = 0.9)
plotEigengeneNetworks(MET, "", marDendro = c(0,4,1,2), marHeatmap = c(3,4,1,2), cex.lab = 0.8, xLabelsAngle= 90)
dev.off()