# visualize_ds.R
# NOTE: the workspace is intentionally not cleared with rm(list = ls()).
# Every object used below is assigned in this script before it is read, and
# wiping the caller's environment would silently destroy unrelated work when
# the script is source()d into an existing session.

# Install (when missing) and attach several R packages.
# Adapted from https://gist.github.com/smithdanielle/9913897
#
# Args:
#   pkg: character vector of package names.
#
# Returns:
#   A named logical vector with one element per entry of `pkg`; TRUE where
#   the package could be attached. An empty `pkg` gives an empty logical
#   vector (sapply() would have returned an empty list instead).
check.packages <- function(pkg) {
  missing.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
  if (length(missing.pkg) > 0) {
    install.packages(missing.pkg, repos = "http://cran.us.r-project.org",
                     dependencies = TRUE)
  }
  # require() is used on purpose: its TRUE/FALSE result is the return value.
  vapply(pkg, require, logical(1), character.only = TRUE)
}
# Packages this script depends on. ggdark is listed because the dark-themed
# plot near the end of the script attaches it with library(ggdark); without
# it here a fresh machine would install everything else and then fail there.
packages <- c("tidyverse", "reshape2", "ggbeeswarm", "viridis", "ggdark")
check.packages(packages)
library(tidyverse)

# Iteration count: used below to build the accuracy file name
# (n.iters - 1) and to scale the per-subset counts in the first plot.
n.iters <- 100

# Hex colour palette; entries 6 and 10 feed the manual colour scale of the
# holdout-accuracy plot further down.
cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442",
                "#0072B2", "#D55E00", "#CC79A7", "#c5679b", "#be548f")
# Read the accuracy table whose file name is indexed by n.iters - 1, and
# name its first column "dataidx" so it can be used as a key below.
accu <- read_tsv(
  paste0("RNASeq/accuracies_ds/100_100_", n.iters - 1, ".tsv")
)
names(accu)[1] <- "dataidx"

# Long format: one row per (dataidx, remaining column) pair.
accu.melt <- reshape2::melt(accu, id.vars = "dataidx")
# Reduce pipeline script paths to short labels: drop the directory, a leading
# "RNASeq_", any "score_" and the trailing ".py" extension.
#
# Args:
#   paths: character vector of file paths.
# Returns:
#   Character vector of the same length as `paths`.
pipeline.short.names <- function(paths) {
  gsub("^RNASeq_|score_|\\.py$", "", basename(paths))
}

# Extract the `sel_subset` value from the DatasetSelector line of one
# exported pipeline script.
#
# Args:
#   path: path to a single pipeline .py file.
# Returns:
#   The subset id as a character scalar (surrounding whitespace removed).
# Raises:
#   An error when the file does not contain exactly one line matching
#   " DatasetSelector".
read.selected.subset <- function(path) {
  # read.delim() treats the first line as a header, so only the remaining
  # lines are searched.
  pipe <- read.delim(path, stringsAsFactors = FALSE)
  hit <- grep(" DatasetSelector", pipe[, 1])
  if (length(hit) != 1) {
    stop("Expected exactly one DatasetSelector line in ", path,
         ", found ", length(hit), call. = FALSE)
  }
  trimws(gsub(
    "\\, subset_list=module23.csv)\\,| DatasetSelector\\(sel_subset=",
    "", pipe[hit, 1]
  ))
}

# `pattern` is a regular expression, not a glob: match a literal ".py" suffix.
filenames <- list.files("RNASeq/pipelines_ds", pattern = "\\.py$",
                        full.names = TRUE)
files.short <- pipeline.short.names(filenames)

# One row per pipeline file, keyed by its short label. Building the frame in
# one step keeps the row names and the extracted ids aligned.
selected.sub <- data.frame(
  selectedSubsetID = vapply(filenames, read.selected.subset, character(1),
                            USE.NAMES = FALSE),
  row.names = files.short,
  stringsAsFactors = FALSE
)
# Numeric dataset index: the row label with every "MDD" removed.
selected.sub$dataidx <- as.numeric(
  gsub("MDD", "", rownames(selected.sub))
)

# Join each dataset's selected subset with its accuracies.
accu.subset <- merge(selected.sub, accu, by = "dataidx")
accu.subset$subidx <- as.numeric(accu.subset$selectedSubsetID)

# Mean testing and training-CV accuracy per selected subset.
accu.subset.sum <- summarise(
  group_by(accu.subset, subidx),
  avg.test = mean(`Testing Accuracy`),
  avg.train.CV = mean(`Training CV Accuracy`)
)

# Display label "DGM-<subidx + 1>", with factor levels in numeric order of
# the subset index.
accu.subset$subname <- factor(
  paste0("DGM-", accu.subset$subidx + 1),
  levels = paste0("DGM-", sort(unique(accu.subset$subidx)) + 1)
)
# write_csv(accu.subset, "RNASeq/accuracyDF.csv")

# Rows whose subset index is 12, best testing accuracy first.
accu12 <- accu.subset[accu.subset$subidx == 12, ]
accu12 <- accu12[order(accu12[["Testing Accuracy"]], decreasing = TRUE), ]
# Testing accuracy per selected subset: a grey box plot with the individual
# points overlaid as a beeswarm, coloured by subset.
q <- ggplot(accu.subset,
            aes(x = subname, y = `Testing Accuracy`, color = subname)) +
  geom_boxplot(color = "grey40") +
  # Text label at y = 0.77: the number of points in the subset divided by
  # n.iters. `fun` replaces the deprecated `fun.y` argument.
  stat_summary(
    fun.data = function(x) c(y = 0.77, label = round(length(x) / n.iters, 2)),
    geom = "text", fun = NULL,
    position = position_dodge(width = 0.75)
  ) +
  ggbeeswarm::geom_beeswarm(priority = "random", cex = 1.8, size = 1,
                            alpha = 0.8) +
  theme_bw() +
  viridis::scale_color_viridis(discrete = TRUE) +
  labs(x = "Subset ID", y = "Testing Accuracy") +
  # "none" is the supported way to drop a legend; FALSE is deprecated.
  guides(fill = "none", colour = "none")
q
# ggsave(q, filename = paste0("real_", n.iters, ".svg"), width = 5, height = 4, units = "in")
# Long table holding the training-CV and testing accuracy of every dataset,
# keyed by subset label and dataset index.
accu.sub.melt <- reshape2::melt(
  accu.subset[c("Training CV Accuracy", "Testing Accuracy",
                "subname", "dataidx")],
  id.vars = c("subname", "dataidx")
)

# Points for both accuracies, joined by one line per dataset and coloured
# by subset.
ggplot(accu.sub.melt,
       aes(x = variable, y = value, group = subname, color = subname)) +
  geom_point() +
  geom_line(aes(group = dataidx)) +
  viridis::scale_color_viridis(discrete = TRUE) +
  theme_bw() +
  labs(x = "", y = "Accuracy", color = "Subset") +
  theme(legend.position = c(0.15, 0.28))
# Flags for the holdout-accuracy plot: which subsets get a box plot and
# which get the second highlight colour.
# NOTE(review): the annotation below says boxes are drawn for subsets with
# more than three data points, but the subsets are hard-coded here — confirm
# the list still matches the data when the inputs change.
accu.subset$box <- accu.subset$subname %in% c("DGM-5", "DGM-13")
accu.subset$col <- accu.subset$subname %in% c("DGM-3", "DGM-5", "DGM-17")

# Holdout accuracy per subset: beeswarm of all points, box plots only for
# the flagged subsets, and the point count printed at y = 0.77.
q <- ggplot(accu.subset,
            aes(x = subname, y = `Testing Accuracy`, color = col)) +
  stat_summary(
    fun.data = function(x) c(y = 0.77, label = length(x)),
    geom = "text", fun = NULL,
    position = position_dodge(width = 0.75)
  ) +
  geom_boxplot(
    data = accu.subset[accu.subset$box, ],
    aes(x = subname, y = `Testing Accuracy`), color = "grey70"
  ) +
  ggbeeswarm::geom_beeswarm(priority = "random", cex = 1.6, size = 1.8,
                            alpha = 0.8, stroke = 0) +
  theme_bw() +
  annotate("text", x = 4.2, y = 0.45, size = 2.7, fontface = 'italic',
           label = "* Boxplots are drawn for subsets with more than three data points") +
  # viridis::scale_color_viridis(discrete = T, option = "E") +
  scale_color_manual(values = cbbPalette[c(6, 10)]) +
  scale_y_continuous(labels = scales::percent, name = "Holdout accuracy") +
  labs(x = NULL) +
  # "none" is the supported way to drop a legend; FALSE is deprecated.
  guides(fill = "none", colour = "none")
q
# ggsave(q, filename = paste0("real_", n.iters, ".svg"), width = 5, height = 3.5, units = "in")
# ggsave(q, filename = paste0("RNASeq/real_", n.iters, ".pdf"), width = 5, height = 3.5, units = "in")
# Dark-background variant of the plot currently stored in `q`.
library(ggdark)

# Start from ggdark's dark grey theme ...
q_dark <- q + dark_theme_gray()

# ... then override individual elements: near-black plot background, no
# panel fill, faint grid lines (none on the major x axis), no ticks, and a
# transparent legend placed inside the panel.
q_dark <- q_dark +
  theme(
    plot.background = element_rect(fill = "#111111"),
    panel.background = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.major = element_line(color = "grey30", size = 0.2),
    panel.grid.minor = element_line(color = "grey30", size = 0.2),
    axis.ticks = element_blank(),
    legend.background = element_blank(),
    legend.key = element_blank(),
    legend.position = c(0.815, 0.27)
  )
# ggsave(q_dark, filename = paste0("dark_real_", n.iters, ".svg"), height = 3, width = 5)