-
Notifications
You must be signed in to change notification settings - Fork 3
/
geneExpressionFromGEO.r
215 lines (162 loc) · 10.3 KB
/
geneExpressionFromGEO.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
#' Function that returns numeric values with 2 decimal numbers.
#'
#' @param x input numeric value with N decimal numbers.
#' @return a numeric value with 2 decimal numbers.
#' @examples
#' aaa <- dec_two(8.31232)
dec_two <- function(x) {
return (format(round(x, 2), nsmall = 2));
}
#' Function that reads in a URL to check and verifies if it exists (function taken from https://stackoverflow.com/a/12195574 )
#'
#' @param url the URL of a webpage
#' @return the output of a webpage verification check
#' @examples y <- readUrl("http://stat.ethz.ch/R-manual/R-devel/library/base/html/connections.html")
readUrl <- function(url) {
out <- tryCatch(
{
# Just to highlight: if you want to use more than one
# R expression in the "try" part then you'll have to
# use curly brackets.
# 'tryCatch()' will return the last evaluated expression
# in case the "try" part was completed successfully
readLines(con=url, warn=FALSE)
# The return value of `readLines()` is the actual value
# that will be returned in case there is no condition
# (e.g. warning or error).
# You don't need to state the return value via `return()` as code
# in the "try" part is not wrapped inside a function (unlike that
# for the condition handlers for warnings and error below)
},
error=function(cond) {
message(paste("URL does not seem to exist:", url))
message("Here's the original error message:")
message(cond)
# Choose a return value in case of error
return("EMPTY_STRING")
},
warning=function(cond) {
message(paste("URL caused a warning:", url))
message(cond)
# Choose a return value in case of warning
return("EMPTY_STRING")
},
finally={
# NOTE:
# Here goes everything that should be executed at the end,
# regardless of success or error.
# If you want more than one expression to be executed, then you
# need to wrap them in curly brackets ({...}); otherwise you could
# just have written 'finally=<expression>'
message(paste("Processed URL:", url))
}
)
return(out)
}
#' Function that reads in the GEO code of a dataset, and returns the gene expression dataframe.
#'
#' @param datasetGeoCode the GEO code of a dataset.
#' @param retrieveGeneSymbols a boolean flag stating if the function should retrieve the gene symbols or not.
#' @param verbose a boolean flag stating if helping messages should be printed or not
#' @return a gene expression dataset.
#' @examples
#' geneExpressionDF1 <- getGeneExpressionFromGEO("GSE3268", FALSE, FALSE)
getGeneExpressionFromGEO <- function(datasetGeoCode, retrieveGeneSymbols, verbose = FALSE)
{
GSE_code <- datasetGeoCode
# check URL
checked_html_text <- "EMPTY_STRING"
checked_html_text <- xml2::read_html("https://ftp.ncbi.nlm.nih.gov/geo/series/")
checked_html_text_url <- "EMPTY_STRING"
url_to_check <- paste0("https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=", datasetGeoCode)
GSE_code_for_url <- GSE_code
GSE_code_for_url <- substr(GSE_code_for_url,1,nchar(GSE_code_for_url)-3)
GSE_code_for_url <- paste0(GSE_code_for_url, "nnn")
complete_url <- paste0("https://ftp.ncbi.nlm.nih.gov/geo/series/", GSE_code_for_url, "/", GSE_code)
checked_html_text_url <- lapply(complete_url, readUrl)
#
if(all(checked_html_text == "EMPTY_STRING")) {
cat("The web url https://ftp.ncbi.nlm.nih.gov/geo/series/ is unavailable right now. Please try again later. The function will stop here\n")
return(NULL)
} else if(all(checked_html_text_url == "EMPTY_STRING" | is.null(checked_html_text_url[[1]]) )) {
cat("The web url ", complete_url," is unavailable right now (Error 404 webpage not found). The GEO code might be wrong. The function will stop here\n", sep="")
return(NULL)
} else {
gset <- GEOquery::getGEO(GSE_code, GSEMatrix =TRUE, getGPL=FALSE)
thisGEOplatform <- toString((gset)[[1]]@annotation)
if (length(gset) > 1) idx <- grep(thisGEOplatform, attr(gset, "names")) else idx <- 1
gset <- gset[[idx]]
gene_expression <- as.data.frame(Biobase::exprs(gset))
if(retrieveGeneSymbols == TRUE) {
gene_expression$GeneSymbol <- ""
# we retrieve the platform details
platform_ann <- annotate::readGEOAnn(GEOAccNum = thisGEOplatform)
platform_ann_df <- as.data.frame(platform_ann, stringsAsFactors=FALSE)
if (verbose == TRUE) {
print("sort(names(platform_ann_df))")
print(sort(names(platform_ann_df)))
}
# "gene_assignment
platformsWithGene_assignmentField <- c("GPL11532", "GPL23126", "GPL6244")
# "Gene Symbol"
platformsWithGeneSpaceSymbolField <- c("GPL80", "GPL8300", "GPL80", "GPL96", "GPL570", "GPL571")
# "gene_symbol"
platformsWithGene_SymbolField <- c("GPL20115")
# "symbol"
platformsWithSymbolField <- c("GPL1293", "GPL6102", "GPL6104", "GPL6883", "GPL6884")
# "GENE_SYMBOL
platformsWith_GENE_SYMBOL_Field <- c("GPL13497", "GPL14550", "GPL17077", "GPL6480")
# if symbol
if(thisGEOplatform %in% c(platformsWithGeneSpaceSymbolField, platformsWithGene_SymbolField, platformsWithSymbolField, platformsWith_GENE_SYMBOL_Field) ) {
emptyGeneSymbol <- ""
FIRST_GENE_EXPRESSION_INDEX <- 2
if (verbose == TRUE) cat("\n[start] loop for the association of the gene symbols to the probeset ID's: completed \n", sep="")
# we start from 2 because 1 is the label
for(k in FIRST_GENE_EXPRESSION_INDEX:nrow(gene_expression)) {
currentCompletionPerc <- k*100 / nrow(gene_expression)
kForPrint <- 1000
if (verbose == TRUE) if ((k %% kForPrint)==0) { cat(dec_two(currentCompletionPerc), "% ", sep="") }
if(thisGEOplatform %in% platformsWithGeneSpaceSymbolField) thisSymbol <- platform_ann_df[platform_ann_df$ID==rownames(gene_expression[k,]),]$"Gene Symbol"
if(thisGEOplatform %in% platformsWithGene_SymbolField) thisSymbol <- platform_ann_df[platform_ann_df$ID==rownames(gene_expression[k,]),]$"gene_symbol"
if(thisGEOplatform %in% platformsWithSymbolField) thisSymbol <- platform_ann_df[platform_ann_df$ID==rownames(gene_expression[k,]),]$"symbol"
if(thisGEOplatform %in% platformsWith_GENE_SYMBOL_Field) thisSymbol <- platform_ann_df[platform_ann_df$ID==rownames(gene_expression[k,]),]$"GENE_SYMBOL"
gene_expression[k,]$GeneSymbol <- thisSymbol
}
if (verbose == TRUE) cat("\n [end] loop for the association of the gene symbols to the probeset ID's \n ", sep="")
} else if(thisGEOplatform %in% platformsWithGene_assignmentField) { # if assignment
FIRST_GENE_EXPRESSION_INDEX <- 2
SIZE_SPLIT_STRING <- 2
GENE_SYMBOL_INDEX <- 2
if (verbose == TRUE) cat("\n[start] loop for the association of the gene symbols to the probeset ID's: completed ", sep="")
# we start from 2 because 1 is the label
for(k in FIRST_GENE_EXPRESSION_INDEX:nrow(gene_expression)) {
currentCompletionPerc <- k*100 / nrow(gene_expression)
kForPrint <- 1000
if (verbose == TRUE) if ((k %% kForPrint)==0) { cat(dec_two(currentCompletionPerc), "% ", sep="") }
thisAssignment <- NULL
thisProbesetID <- NULL
thisProbesetID <- rownames(gene_expression[k,])
# cat("this probeset ID: ", thisProbesetID, "\t", sep="" )
thisAssignment <- platform_ann_df[platform_ann_df$ID==thisProbesetID, ]$"gene_assignment"
split_string <- strsplit(thisAssignment, "//")
if (length(split_string[[1]]) >= SIZE_SPLIT_STRING) {
thisGeneSymbol_temp <- NULL
thisGeneSymbol <- NULL
thisGeneSymbol_temp <- split_string[[1]][GENE_SYMBOL_INDEX]
thisGeneSymbol <- gsub(" ", "", thisGeneSymbol_temp, fixed = TRUE)
# cat("\t this gene symbol: ", thisGeneSymbol, "\n", sep="")
gene_expression[k,]$GeneSymbol <- thisGeneSymbol
} else {
gene_expression[k,]$GeneSymbol <- thisAssignment
}
}
if (verbose == TRUE) cat("\n [end] loop for the association of the gene symbols to the probeset ID's ", sep="")
} else {
if (verbose == TRUE) cat("\n\n[Impossible to perform gene symbol retrieval]\n")
if (verbose == TRUE) cat("We're sorry but the indicated platform (", thisGEOplatform, ") is not among the platforms included in this function.\nThe gene symbol retrieval cannot be performed.\nPlease visit the https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=", thisGEOplatform, " website for more information about this platform.\n\n", sep="")
gene_expression$GeneSymbol <- NULL
}
}
return(gene_expression)
}
}