# decision_tree.r
library(reshape2)
library(caret)
library(rpart.plot)
library(class)
library(randomForest)  # backend used by caret for method = "rf"
# Read deletion information
df = read.table("deletion.tsv.gz", header=T)
print(summary(df))
print(dim(df))
unknown = df[is.na(df$status), ]  # rows with no status label
print(dim(unknown))
# Keep ID as unique rowname
rownames(df) = df$id
df = df[,!colnames(df) %in% c("chr_start_end","id")]
unknown = df[is.na(df$status),] # unlabeled data
df = df[!is.na(df$status),] # labeled data
df$status = factor(df$status)
print("Labeled data")
print(table(df$status))
print(dim(df))
# Split training and testing
train = createDataPartition(y = df$status, p=0.8, list=F)
training = df[train,]
print("Training dimensions")
print(dim(training))
print("Training status")
print(table(training$status))
testing = df[-train,]
print("Testing dimensions")
print(dim(testing))
print("Testing status")
print(table(testing$status))
# Repeated 10-fold cross-validation (10 repeats)
trctrl = trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 10,
  search = "grid")
# Grid of different values of mtry (number of variables randomly sampled at each split)
tunegrid <- expand.grid(.mtry = c(1:14))
# Build depth one decision tree
# fit = train(status ~ ., data=training, method="rpart",
# parms = list(split="information"),
# trControl=trctrl, tuneLength=10,
# control = list(maxdepth = 1))
# Tune mtry of the random forest with the grid search defined by trctrl and tunegrid
fit2 = train(status ~ ., data = training, method = "rf",
             metric = "Accuracy",
             tuneGrid = tunegrid,
             trControl = trctrl)
# print(fit2)
# Save the best cross-validated accuracy and the corresponding mtry
best_tune_mtry = data.frame(
accuracy = max(fit2$results$Accuracy),
mtry = fit2$bestTune$mtry
)
# Fix mtry to the value with the best accuracy
tunegrid <- expand.grid(.mtry = best_tune_mtry$mtry)
# Find the best nodesize for the random forest
nodesize_accuracy = numeric(20)  # cross-validated accuracy for each candidate nodesize
for (node in 1:20) {
  tmp_maxnode = train(status ~ ., data = training, method = "rf",
                      metric = "Accuracy",
                      tuneGrid = tunegrid,
                      nodesize = node,
                      trControl = trctrl,
                      ntree = 800)
  nodesize_accuracy[node] = tmp_maxnode$results$Accuracy
}
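# Optional sketch (not part of the original tuning flow): plot CV accuracy
# against nodesize with base graphics to see how sensitive the model is to it.
# plot(1:20, nodesize_accuracy, type = "b", xlab = "nodesize", ylab = "CV accuracy")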
# Save the nodesize that gives the best accuracy
best_tune_nodesize = data.frame(
  accuracy = max(nodesize_accuracy),
  nodesize = which.max(nodesize_accuracy)
)
# Set nodesize to the saved best value
nodesize = best_tune_nodesize$nodesize
# Find the best number of trees (ntree) for the random forest
maxtrees_tree = numeric(36)      # candidate tree counts; seq(250, 2000, 50) has 36 values
maxtrees_accuracy = numeric(36)  # cross-validated accuracy for each tree count
index = 1
for (ntree in seq(250, 2000, 50)) {
  tmp_maxntree = train(status ~ ., data = training, method = "rf",
                       metric = "Accuracy",
                       tuneGrid = tunegrid,
                       nodesize = nodesize,
                       trControl = trctrl,
                       ntree = ntree)
  maxtrees_tree[index] = ntree
  maxtrees_accuracy[index] = tmp_maxntree$results$Accuracy
  index = index + 1
}
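# Optional sketch (not part of the original tuning flow): visualize CV accuracy
# against the number of trees with base graphics.
# plot(maxtrees_tree, maxtrees_accuracy, type = "b", xlab = "ntree", ylab = "CV accuracy")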
# maxtrees_tree
# maxtrees_accuracy
# Save the number of trees that gives the best accuracy
best_tune_ntree = data.frame(
  accuracy = max(maxtrees_accuracy),
  ntree = maxtrees_tree[which.max(maxtrees_accuracy)]
)
# Set ntree to the saved best value
ntree = best_tune_ntree$ntree
# Train the final model with the tuned mtry, nodesize and ntree
fit_rf = train(status ~ ., data = training, method = "rf",
               metric = "Accuracy",
               tuneGrid = tunegrid,
               nodesize = nodesize,
               trControl = trctrl,
               ntree = ntree)
# prp(fit$finalModel, tweak=1.2)
# Visualize the final random forest (error rates vs. number of trees)
plot(fit_rf$finalModel)
# Evaluate on the held-out test set
# pred = predict(fit, newdata=testing)
# print(caret::confusionMatrix(pred, testing$status, positive="1"))
pred_rf = predict(fit_rf, newdata = testing)
print(caret::confusionMatrix(pred_rf, testing$status, positive="1"))
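# Optional sketch: caret's variable importance for the fitted forest, to see
# which features drive the predictions (uses randomForest's default importance).
# print(caret::varImp(fit_rf))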
# Apply to unknown data
# pred = predict(fit, newdata=unknown)
print("Outcome prediction of unlabeled data")
# print(table(pred))
# unknown$status = pred
pred_rf = predict(fit_rf, newdata = unknown)
print(table(pred_rf))
unknown$status = pred_rf
# Combine the labeled data with the newly predicted rows and write them to a file
df = rbind(df, unknown)
df$id = rownames(df)
write.table(df, "predictions.tsv", quote=F, row.names=F, col.names=T, sep="\t")
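# Optional: persist the tuned model for later reuse (file name is illustrative).
# saveRDS(fit_rf, "fit_rf.rds")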