-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpixelpicking_validation.R
112 lines (101 loc) · 4.47 KB
/
pixelpicking_validation.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# Setup ----
library(tidyverse)
dataset_version <- "FT2040"
# dataset_version <- "MS3000"
# dataset_version <- "CultureData"
# dataset_version <- "Pttime"
output_folder <- paste0("made_data_", dataset_version, "/")
features_extracted <- read_csv(paste0(output_folder, "features_extracted.csv"))
classified_feats <- read_csv(paste0(output_folder, "classified_feats.csv")) %>%
select(feature, feat_class)
quickclass_feats <- read_csv(paste0(output_folder, "quickclass_feats.csv")) %>%
select(feature=feat_id, quick_class=feat_class)
# features_extracted %>%
# left_join(quickclass_feats, by=c(feat_id="feature")) %>%
# ggplot() +
# geom_point(aes(x=med_cor, y=mean_pw, color=quick_class))
# Check performance against classified dataset ----
left_join(classified_feats, quickclass_feats) %>%
with(table(feat_class, quick_class))
# Model comparison
# Model keeps blowing up with all features, reducing to a subset of "best"
full_model <- features_extracted %>%
select(-feat_class) %>%
left_join(classified_feats, by=c(feat_id="feature")) %>%
select(-feat_id) %>%
filter(feat_class%in%c("Good", "Bad")) %>%
mutate(feat_class=ifelse(feat_class=="Bad", 0, 1)) %>%
glm(feat_class~., family=binomial, data=.)
pred_probs <- features_extracted %>%
mutate(pred_prob=predict(full_model, newdata=., type="response"))
pred_probs %>%
# filter(feat_class%in%c("Good", "Bad")) %>%
ggplot() +
geom_histogram(aes(x=pred_prob, fill=feat_class), bins=50)
pred_probs %>%
mutate(pred_class=ifelse(pred_prob>0.5, "Good", "Bad")) %>%
with(table(pred_class, feat_class))
quick_model <- features_extracted %>%
select(-feat_class) %>%
left_join(quickclass_feats, by=c(feat_id="feature")) %>%
select(-feat_id) %>%
filter(quick_class%in%c("Good", "Bad")) %>%
mutate(quick_class=ifelse(quick_class=="Bad", 0, 1)) %>%
glm(quick_class~., family=binomial, data=.)
features_extracted %>%
left_join(quickclass_feats, by=c(feat_id="feature")) %>%
mutate(pred_prob=predict(quick_model, newdata=., type="response")) %>%
ggplot() +
geom_histogram(aes(x=pred_prob, fill=feat_class), bins=50)
features_extracted %>%
left_join(quickclass_feats, by=c(feat_id="feature")) %>%
mutate(pred_prob=predict(quick_model, newdata=., type="response")) %>%
mutate(pred_class=ifelse(pred_prob>0.5, "Good", "Bad")) %>%
with(table(pred_class, feat_class))
# Best features only below - dodging model explosion this way
full_model <- features_extracted %>%
select(feat_class, med_cor, med_SNR, sd_rt) %>%
filter(feat_class%in%c("Good", "Bad")) %>%
mutate(feat_class=ifelse(feat_class=="Bad", 0, 1)) %>%
glm(feat_class~., family=binomial, data=.)
pred_probs <- features_extracted %>%
mutate(pred_prob=predict(full_model, newdata=., type="response"))
pred_probs %>%
filter(feat_class%in%c("Good", "Bad")) %>%
ggplot() +
geom_histogram(aes(x=pred_prob, fill=feat_class), bins=50)
pred_probs %>%
mutate(pred_class=ifelse(pred_prob>0.5, "Good", "Bad")) %>%
with(table(pred_class, feat_class))
quick_model <- features_extracted %>%
select(-feat_class) %>%
left_join(quickclass_feats, by=c(feat_id="feature")) %>%
select(c("med_cor", "med_SNR", "sd_rt",
# "mean_mz", "sd_ppm", "mean_rt", "mean_pw", "sd_pw",
# "log_mean_height", "log_sd_height","sn", "f", "scale",
# "lmin", "feat_npeaks", "n_found", "samps_found", "stans_found",
# "med_missed_scans", "smp_to_blk", "smp_to_std",
# "shape_cor", "area_cor",
"quick_class")) %>%
filter(quick_class%in%c("Good", "Bad")) %>%
mutate(quick_class=ifelse(quick_class=="Bad", 0, 1)) %>%
glm(quick_class~., family=binomial, data=.)
pred_probs <- features_extracted %>%
mutate(pred_prob=predict(quick_model, newdata=., type="response")) %>%
left_join(quickclass_feats, by=c(feat_id="feature"))
pred_probs %>%
filter(feat_class%in%c("Good", "Bad")) %>%
ggplot() +
geom_histogram(aes(x=pred_prob, fill=feat_class), bins=50)
pred_probs %>%
mutate(pred_class=ifelse(pred_prob>0.5, "Good", "Bad")) %>%
with(table(pred_class, feat_class))
list(manual=broom::tidy(full_model), quick=broom::tidy(quick_model)) %>%
bind_rows(.id="label_type") %>%
ggplot(aes(x=term, color=label_type)) +
geom_point(aes(y=estimate), position = position_dodge(width = 0.5)) +
geom_errorbar(aes(ymin=estimate-2*std.error, ymax=estimate+2*std.error),
position = position_dodge(width = 0.5)) +
facet_wrap(~term, ncol=5, scales = "free") +
theme(axis.text.x = element_blank()) +
geom_hline(yintercept = 0, color="black")