-
Notifications
You must be signed in to change notification settings - Fork 0
/
intersect_script_for_rakergui.r
50 lines (31 loc) · 2.2 KB
/
intersect_script_for_rakergui.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
###### Example how to intersect the data using R sctipt
#In this example we are merging multiple (13) cancer data files together for the comparisons
###### Path for the Directory
setwd("./")
#### keep all the files in one foler
dat_files <- list.files("./", pattern='*.txt')
### you can trim the names of files
trim_names <- lapply(basename(dat_files), gsub, pattern = "-Full_table.*", replacement = "")
#### load files and assign into dataFrames
library(data.table)
l <- lapply( dat_files, read.table, sep = '\t' ) ## loading files into l
for (i in 1:length(l)) {
d <- as.data.frame(l[[i]]) #d is temp variable
##extracting specified coulmns from the files ## these col names should be same in all the files
names(d) <- c("Gene_symbol","Gene_ID","Log2_FC","adj_P") ##col with gene/probe_id is must; either Log2_FC or pvalue/padj can be used for this analysis
assign(trim_names[[i]], d) #Assigning names back
}
Common_genes <- Reduce(intersect, list(ACC$Gene_symbol, BRCA$Gene_symbol, BLCA$Gene_symbol,HNSC$Gene_symbol,KIRC$Gene_symbol,LAML$Gene_symbol,LGG$Gene_symbol, LIHC$Gene_symbol, LUAD$Gene_symbol, LUSC$Gene_symbol,PAAD$Gene_symbol, SKCM$Gene_symbol,THCA$Gene_symbol))
## extract logFC for common genes for each files and merge
for (i in 1:length(l)) {
d <- as.data.frame(l[[i]]) #d is temp variable
names(d) <- c("Gene_symbol","Gene_ID","Median_tumor","Median_normal","Log2_FC","adj_P")
d <- d[!duplicated(d[ , c("Gene_symbol")]),]
d <- d[d$Gene_symbol %in% Common_genes,c("Gene_symbol","Log2_FC")]
names(d) <- c("Gene_symbol",paste(trim_names[[i]],"Log2FC", sep = '_'))
#assign(trim_names[[i]], d)
assign(paste('d',i,sep = ''), d)
}
#final_data <- Reduce(function(x, y) merge(x, y,by='Gene_symbol', all=TRUE), list(ACC$Gene_symbol, BRCA$Gene_symbol, BLCA$Gene_symbol,HNSC$Gene_symbol,KIRC$Gene_symbol,LAML$Gene_symbol,LGG$Gene_symbol, LIHC$Gene_symbol, LUAD$Gene_symbol, LUSC$Gene_symbol,PAAD$Gene_symbol, SKCM$Gene_symbol,THCA$Gene_symbol))
final_data <- Reduce(function(x, y) merge(x, y,by='Gene_symbol', all=TRUE), list(d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,d11,d12,d13))
write.table(final_data,"ALL_data.tsv", sep = '\t', row.names = F)