-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmain.R
139 lines (121 loc) · 5.05 KB
/
main.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
# Author & date: Navin K Ipe. July 2018
# License: MIT
# Decision tree for Cardiotocographic data
#if(!is.null(dev.list())) dev.off()#clear old plots
cat("\f")  # clear console (form-feed trick; works in RStudio)
#rm(list = setdiff(ls(), lsf.str()))#remove all objects except functions
#rm(list=ls())#remove all objects loaded into R
#rm(list=lsf.str())#remove all functions but not variables
#rm(list=ls(all=TRUE))#clear all variables
library(readr)
library(dplyr)
library(party)       # ctree()
library(rpart)
library(rpart.plot)
library(ROCR)
library(partykit)    # loaded after party, so partykit::ctree/ctree_control mask party's
set.seed(50)
fileName <- "CTG.csv"
# Row/column bounds of the raw sheet (stC/enC/aC are unused below but kept
# for reference against the original CSV layout).
stR <- 2; enR <- 2126; stC <- 7; enC <- 28; aC <- 40
message("Loading data from: ", fileName)
dat <- read.csv(fileName, header = TRUE, sep = ",")
# Column legend (UCI Cardiotocography data set):
#LB - FHR baseline (beats per minute)
#AC - # of accelerations per second
#FM - # of fetal movements per second
#NSP - fetal state class code (N=normal; S=suspect; P=pathologic)
#dat <- select(dat,LB,AC,FM,UC,ASTV,MSTV,ALTV,MLTV,DL,DS,DP,DR,Width,Min,Max,Nmax,Nzeros,Mode,Mean,Median,Variance,Tendency,NSP)#choose relevant columns
dat <- select(dat, LB, AC, FM, NSP)  # choose relevant columns
dat <- dat[stR:enR, ]                # choose relevant rows
cat("Loaded data dimensions:", dim(dat), "\n")
# ctree() needs a factor response for classification, so derive NSPF from NSP.
dat$NSPF <- as.factor(dat$NSP)
str(dat)
# Split the data into training (80%) and validation (20%) sets.
set.seed(1)
# sample() labels each row 1 or 2 with the given probabilities; rows labelled
# 1 become the training set and rows labelled 2 the validation set.
pd <- sample(2, nrow(dat), replace = TRUE, prob = c(0.8, 0.2))
train    <- dat[pd == 1, ]
validate <- dat[pd == 2, ]
# View the structure of the train and validate data
str(train)
str(validate)
# ctree() builds a conditional-inference classification tree.
# For illustration only a few predictors are used; the full formula is kept
# below for reference.
#tree=ctree(NSPF~LB+AC+FM+UC+ASTV+MSTV+ALTV+MLTV+DL+DS+DP+DR+Width+Min+Max+Nmax+Nzeros+Mode+Mean+Median+Variance+Tendency, data=train)
tree <- ctree(NSPF ~ LB + AC + FM, data = train)
tree
plot(tree)
# Prune the tree via the control option (to reduce the branches):
#   mincriterion=0.99 -> split only at 99% confidence that the variable is significant
#   minsplit=500      -> a node is split only when it holds at least 500 samples,
#                        which restricts the growth of the tree
#tree=ctree(NSPF~LB+AC+FM+UC+DL+DS+DP+ASTV+MSTV+ALTV+MLTV, data=train, controls=ctree_control(mincriterion=0.99, minsplit=1800))
tree <- ctree(NSPF ~ LB + AC + FM, data = train,
              control = ctree_control(mincriterion = 0.99, minsplit = 500))
tree
plot(tree, type = "simple")
plot(tree)
# Predicted class probabilities on the validation data
predict(tree, validate, type = "prob")
# Predicted NSP class labels on the validation data
predict(tree, validate)
########################################
# Misclassification error
########################################
# Confusion matrix on the training data: rows are the PREDICTED classes,
# columns are the ACTUAL NSPF classes. Diagonal entries are correct
# predictions; off-diagonal entries are misclassifications.
tab <- table(predict(tree), train$NSPF)
print(tab)
# Fraction of misclassified samples in the training data.
# (cat() returns NULL invisibly, so wrapping it in print() would emit a
# spurious "NULL" line; call cat() directly instead.)
percentErrTraining <- 1 - sum(diag(tab)) / sum(tab)
cat("percentErrTraining:", percentErrTraining, "\n")
# Misclassification error on the validation data
testpred <- predict(tree, newdata = validate)
tab <- table(testpred, validate$NSPF)
print(tab)
# Fraction of misclassified samples in the validation data
percentErrValidation <- 1 - sum(diag(tab)) / sum(tab)
cat("percentErrValidation:", percentErrValidation, "\n")
# fitTree <- rpart(LB~., data = dat, method="class")
# summary(fitTree)
# predicted = predict(fitTree, dat)
# #rpart.plot(fitTree)
# #----------- ROC curve example
# outlook = c('rain', 'overcast', 'rain', 'sunny', 'rain',
# 'rain', 'sunny', 'overcast', 'overcast', 'overcast',
# 'sunny', 'sunny', 'rain', 'rain', 'overcast',
# 'sunny', 'overcast', 'overcast', 'sunny', 'sunny',
# 'sunny', 'overcast')
# humidity = c(79, 74, 80, 60, 65, 79, 60, 74, 77, 80,
# 71, 70, 80, 65, 70, 56, 80, 70, 56, 70,
# 71, 77)
# windy = c(T, T, F, T, F, T, T, T, T, F, T, F, F, F, T, T, F, T, T, F, T, T)
# play = c(F, F, T, F, T, F, F, T, T, T, F, F, T, T, T, T, T, T, F, T, F, T)
#
# game = data.frame(outlook, humidity, windy, play)
# game$score = NA
#
# attach(game)
# game$score[outlook == 'sunny' & humidity <= 70] = 5/8
# game$score[outlook == 'sunny' & humidity > 70] = 1 - 3/4
# game$score[outlook == 'overcast'] = 4/5
# game$score[outlook == 'rain' & windy == T] = 1 - 2/2
# game$score[outlook == 'rain' & windy == F] = 3/3
# detach(game)
#
# game$predict = game$score >= 0.5
# game$correct = game$predict == game$play
#
# library(ROCR)
#
# pred = prediction(game$score, game$play)
# roc = performance(pred, measure="tpr", x.measure="fpr")
# plot(roc, main="ROC", col="orange", lwd=2)
# lines(x=c(0, 1), y=c(0, 1), col="red", lwd=2)
#
# auc = performance(pred, 'auc')
# slot(auc, 'y.values')