-
Notifications
You must be signed in to change notification settings - Fork 0
/
bootstrap_models.R
149 lines (123 loc) · 4.92 KB
/
bootstrap_models.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
# This script did bootrapping on select swi models, and was run an an EC2
library(synapseClient)
library(glmnet)
library(limma)
library(ROCR)
library(tidyverse)
library(doMC)
registerDoMC(cores=10)
setwd('/home/ubuntu/')
synapseLogin()
synapseCacheDir("./")
# list of mutations
swiMuts <- read.delim2("results-20170523-150114.csv",sep=",",as.is=TRUE)
# list of tumor samples
swiPats <- read.delim2("results-20170523-154238.csv",sep=",",as.is=TRUE)[,1]
# clinical data, patient by row
print("get synapse TCGA data")
clin <- read.delim2(synGet("syn4983466")@filePath,sep="\t",header=TRUE,as.is=TRUE)
# expression data, genes by row, tumor sample by col
expr <- read.delim2(synGet("syn4976369")@filePath, as.is=TRUE, sep="\t",header=TRUE,check.names=FALSE)
print("filtering data")
# remove gene names
tmp <- data.matrix(expr[,-1])
# logical vector for any rows with atleast 1 NA
mask <- apply(tmp, 1, function(x) any(is.na(x)))
# getting gene names from names column, filtering out any with NA's in row
genes <- gsub("(.*)\\|.*", "\\1", expr[,1])[!mask]
# getting entrez numbers from names column, filtering out any with NA's in row
entrez <- gsub(".*\\|(.*)", "\\1", expr[,1])[!mask]
# filtering out na rows
em <- tmp[!mask,]
rm(tmp)
rm(expr)
types <- as.numeric(gsub("TCGA-..-....-(..).*","\\1",colnames(em)))
# filtering expresison data for cols with type under 10
em <- em[, types < 10]
# shortening expr colnames
colnames(em) <- gsub("(TCGA-..-....-...).*", "\\1", colnames(em))
# unique genes from mutation list
swiGenes <- unique(swiMuts$Hugo_Symbol)
n <- length(swiGenes)
# matrix of zeros, patients, by complex genes
M <- matrix(0,nrow=length(swiPats),ncol=n, dimnames=list(swiPats, swiGenes))
# create a matrix of tumors associated with gene mutations
for(gene in swiGenes){
# for each complex gene, get the tumors associated with that gene
pats <- swiMuts$Tumor_SampleBarcode[swiMuts$Hugo_Symbol == gene]
M[swiPats %in% pats,gene] <- 1
}
# patients with tumors, clinical data, and expression data
commonPats <- intersect(intersect(clin$bcr_patient_barcode,
gsub("(TCGA-..-....).*","\\1", rownames(M))),
gsub("(TCGA-..-....).*","\\1", colnames(em)))
# index of clinical data rows to use
clin.idxs <- match(commonPats, clin$bcr_patient_barcode)
# index of mutation data rows to use
mut.idxs <- match(commonPats,gsub("(TCGA-..-....).*","\\1", rownames(M)))
# index of expression data cols to use
expr.idxs <- match(commonPats, gsub("(TCGA-..-....).*","\\1", colnames(em)))
M.m <- M[mut.idxs,]
clin.m <- clin[clin.idxs,]
expr.m <- em[, expr.idxs]
# for laptop
rm(em)
rm(clin)
#######################
####
# unique tumor types
tt <- unique(clin.m$acronym)
R <- lapply(tt, function(x){
# filter data for tumor type
mask <- clin.m$acronym == x
# rows of M.m that are of this tumor and have atleast one complex gene mutation
mut <- rowSums(M.m[mask,]) > 0
# expression data for this tumor
X <- expr.m[,mask]
# Is gene differentially expressed in tumors grouped on 0/ >0 mutations in complex genes
pvals <- eBayes(lmFit(X, model.matrix(~mut)))$p.value[,2]
# list n tumors, gene pvalues, n tumors with >0 complex gene mutations
list(N=sum(mask),pvals=pvals,Nmut=sum(mut))
})
pdf("./ExprSignal.pdf",width=10,height=30)
par(mfrow=c(8,4))
for(i in 1:32){
tmp <- R[[i]]
hist(tmp$pvals,main=paste(tt[i],tmp$Nmut,"of",tmp$N))
}
dev.off()
#####################
# ridge modeling
boostrap_samples <- function(X, mut){
print("bootstrapping")
samples <- colnames(X)[colnames(X) %in% names(mut)]
training_samples <- sample(x = samples, size = as.integer(.67 * ncol(X)), replace = T)
test_samples <- samples[!samples %in% training_samples]
training_exp <- t(X[,training_samples])
test_exp <- t(X[, test_samples])
training_mut <- mut[training_samples]
test_mut <- mut[test_samples]
cv.fit <- cv.glmnet(training_exp, training_mut, family="binomial",alpha = 0,type.measure="auc",parallel=TRUE)
yhat <- predict(cv.fit, test_exp, type="response", s = "lambda.min")
yhat_df <- yhat %>%
data.frame %>%
rownames_to_column("sample")
pred <- prediction(yhat, test_mut)
auc <- performance(pred,"auc")@y.values[[1]]
return(list(yhat_df, auc))
}
# pick tumor types with atleast 15 tumors and atleast 15% of total with one or more
# mutations in complex genes
tt_to_model <- tt[which(sapply(R, function(x) x$Nmut > 15 && x$Nmut / x$N > .15))]
boostrap_tumor <- function(tumorType, M.m, clin.m, expr.m){
print(paste("bootstrapping", tumorType))
mask <- clin.m$acronym == tumorType
# whether or not tumors have atleast one complex tumor
mut <- rowSums(M.m[mask,]) > 0
X <- expr.m[,mask]
result_list <- rerun(30, boostrap_samples(X, mut))
return(result_list)
}
results <- map(tt_to_model, boostrap_tumor, M.m, clin.m, expr.m)
names(results) <- tt_to_model
saveRDS(results, file = 'bootstrap_results.RDS')