forked from iandurbach/ml-for-ecology
-
Notifications
You must be signed in to change notification settings - Fork 0
/
classification-trees.R
103 lines (82 loc) · 3.07 KB
/
classification-trees.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#### Classification trees
# - repeats the regression tree analysis for a binary classification problem
library(tree)
# read the saved workspace; the rest of the script expects it to provide a
# data frame called `aloe_pa` with a `present` outcome column
load("data/aloe.RData")
head(aloe_pa)

# ensure the outcome is a factor so tree() fits a classification tree
aloe_pa$present <- factor(aloe_pa$present)

# make training, validation, test datasets (60/20/20 split)
# shuffle rows; the fixed seed keeps the split reproducible.
# seq_len() rather than 1:nrow() so an empty data frame stays empty instead
# of being indexed with c(1, 0)
set.seed(123)
aloe_pa <- aloe_pa[sample(seq_len(nrow(aloe_pa))), ]

# number of rows in each set; the test set takes whatever is left after
# rounding, so the three counts always add up to nrow(aloe_pa)
ntrain <- round(0.6 * nrow(aloe_pa))
nvalid <- round(0.2 * nrow(aloe_pa))
ntest <- nrow(aloe_pa) - ntrain - nvalid

# label rows by position in the shuffled data: 1 = train, 2 = valid, 3 = test
aloe_pa$train_id <- c(rep(1, ntrain), rep(2, nvalid), rep(3, ntest))
#### Default tree: fit on train + validation rows, keep the test rows aside
# grow the tree on every row that is not in the test set (train_id != 3),
# using deviance as the splitting criterion
tree_aloe_pa <- tree(
  formula = present ~ Latitude + Longitude + MAP + MAT,
  data = subset(aloe_pa, train_id != 3),
  split = "deviance"
)

# draw the tree, then add the split labels
plot(tree_aloe_pa)
text(tree_aloe_pa, cex = 0.9)

# training accuracy: predicted classes for the rows the tree was fitted to,
# cross-tabulated against the observed classes stored in the fitted object
pred_aloe_pa <- predict(tree_aloe_pa, type = "class")
predtrain <- table(tree_aloe_pa$y, pred_aloe_pa)
predtrain
sum(diag(predtrain)) / sum(predtrain)  # share of rows on the diagonal

#? is this good? a useful baseline is the share of the most common class,
#  i.e. the accuracy of always predicting that class
max(table(tree_aloe_pa$y)) / length(tree_aloe_pa$y)
#? or use "Cohen's kappa"
# Cohen's kappa from a table of observed x predicted classes: agreement
# between the two, corrected for the agreement expected by chance.
#   acc_table: square table/matrix of counts, observed classes in rows and
#              predicted classes in columns, classes in the same order
#   returns:   (p0 - pe) / (1 - pe), where p0 is the proportion on the
#              diagonal and pe is the agreement expected from the row and
#              column proportions alone; NaN when pe is exactly 1
cohens_kappa <- function(acc_table) {
  # a non-square table would recycle p * q and misalign the diagonal,
  # giving a meaningless value instead of an error
  stopifnot(is.matrix(acc_table), nrow(acc_table) == ncol(acc_table))
  n <- sum(acc_table)
  p <- rowSums(acc_table) / n  # observed class proportions
  q <- colSums(acc_table) / n  # predicted class proportions
  p0 <- sum(diag(acc_table)) / n
  pe <- sum(p * q)
  (p0 - pe) / (1 - pe)
}
# training kappa for the tree fitted above
cohens_kappa(predtrain)

# now evaluate the same tree on the held-out test rows (train_id == 3)
pred_aloe_pa <- predict(
  tree_aloe_pa,
  newdata = subset(aloe_pa, train_id == 3),
  type = "class"
)
observed <- aloe_pa$present[aloe_pa$train_id == 3]

# cross-tabulate observed (rows) against predicted (columns) classes
predtest <- table(observed, pred_aloe_pa)
predtest
sum(diag(predtest)) / sum(predtest)      # test accuracy
max(table(observed)) / length(observed)  # share of the most common class
cohens_kappa(predtest)                   # test kappa
## Repeat the whole analysis with a deliberately overfitted tree
# same formula and data as before, but with the control settings
# mincut = 1, minsize = 2 and mindev = 0 passed through to tree()
tree_aloe_pa <- tree(
  formula = present ~ Latitude + Longitude + MAP + MAT,
  data = subset(aloe_pa, train_id != 3),
  split = "deviance",
  mincut = 1,
  minsize = 2,
  mindev = 0
)

# draw the tree, then add the split labels
plot(tree_aloe_pa)
text(tree_aloe_pa, cex = 0.9)

# training accuracy, baseline and kappa on the rows used for fitting
pred_aloe_pa <- predict(tree_aloe_pa, type = "class")
predtrain <- table(tree_aloe_pa$y, pred_aloe_pa)
predtrain
sum(diag(predtrain)) / sum(predtrain)                # training accuracy
max(table(tree_aloe_pa$y)) / length(tree_aloe_pa$y)  # most common class
cohens_kappa(predtrain)                              # training kappa

# the same three measures on the held-out test rows (train_id == 3)
pred_aloe_pa <- predict(
  tree_aloe_pa,
  newdata = subset(aloe_pa, train_id == 3),
  type = "class"
)
observed <- aloe_pa$present[aloe_pa$train_id == 3]
predtest <- table(observed, pred_aloe_pa)
predtest
sum(diag(predtest)) / sum(predtest)      # test accuracy
max(table(observed)) / length(observed)  # most common class
cohens_kappa(predtest)                   # test kappa

# author's note: training accuracy >> test accuracy => over-fitting, but the
# test results are still better than those of the previous tree!