#=====================================================================
# LOGISTIC REGRESSION CLASSIFICATION
#=====================================================================
# Read in the data and inspect its structure and class balance
projo = read.csv("projo.csv", header = TRUE)
str(projo)
table(projo$SATISFIED)
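# A quick baseline to beat (a sketch; assumes SATISFIED is a binary
# outcome with no missing values): always predicting the majority class
# gives the accuracy that any useful model must exceed.
baselineAccuracy = max(table(projo$SATISFIED)) / nrow(projo)
baselineAccuracy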
# Install (if needed) and load the caTools package
if (!requireNamespace("caTools", quietly = TRUE)) install.packages("caTools")
library(caTools)
# Randomly split data
set.seed(88)
split = sample.split(projo$SATISFIED, SplitRatio = 0.75)
split
# Create training and testing sets
projoTrain = subset(projo, split == TRUE)
projoTest = subset(projo, split == FALSE)
nrow(projoTrain)
nrow(projoTest)
# Logistic Regression Model
ProjoLog = glm(SATISFIED ~ ., data = projoTrain, family = binomial)
summary(ProjoLog)
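# Optional: exponentiating the coefficients gives odds ratios, which are
# often easier to interpret than the raw log-odds reported by summary().
exp(coef(ProjoLog))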
# Prediction on the training set
predictTrain = predict(ProjoLog, type = "response")
summary(predictTrain)
# Average predicted probability for each actual outcome
tapply(predictTrain, projoTrain$SATISFIED, mean)
# Thresholding
# We can convert the probabilities to predictions using a threshold
# value, t. If the predicted probability of satisfaction is greater than
# t, we predict SATISFIED; otherwise we predict NOT SATISFIED.
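# For example, with t = 0.5 the fitted probabilities become class
# predictions like this (a sketch; 1 = satisfied, 0 = not satisfied):
predictedClass = as.integer(predictTrain > 0.5)
table(predictedClass)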
# Confusion matrix for threshold of 0.5
table(projoTrain$SATISFIED, predictTrain > 0.5)
# Sensitivity = true positives / (true positives + false negatives)
# Specificity = true negatives / (true negatives + false positives)
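# Computing both rates from the confusion matrix (a sketch; assumes
# SATISFIED is coded 0/1, so the positives sit in row "1"):
confMat = table(projoTrain$SATISFIED, predictTrain > 0.5)
sensitivity = confMat["1", "TRUE"] / sum(confMat["1", ])
specificity = confMat["0", "FALSE"] / sum(confMat["0", ])
c(sensitivity = sensitivity, specificity = specificity)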
# Confusion matrix for threshold of 0.7
table(projoTrain$SATISFIED, predictTrain > 0.7)
# Confusion matrix for threshold of 0.2
table(projoTrain$SATISFIED, predictTrain > 0.2)
# We see that increasing the threshold value decreases the model's
# sensitivity and increases its specificity, while decreasing the
# threshold does the reverse. So how do we choose the optimal threshold?
# Picking a good threshold value is often challenging. A Receiver
# Operating Characteristic (ROC) curve can help us decide which
# threshold value is best.
# Install (if needed) and load the ROCR package
if (!requireNamespace("ROCR", quietly = TRUE)) install.packages("ROCR")
library(ROCR)
ROCRpred = prediction(predictTrain, projoTrain$SATISFIED)
# We now use the performance function, which defines what we'd like to
# plot on the x- and y-axes of our ROC curve.
# Performance function
ROCRperf = performance(ROCRpred, "tpr", "fpr")
#Now, we just need to plot the output of the performance function.
# Plot ROC curve
plot(ROCRperf)
# Add colors
plot(ROCRperf, colorize=TRUE)
# Add threshold labels
plot(ROCRperf, colorize=TRUE, print.cutoffs.at=seq(0,1,by=0.1), text.adj=c(-0.2,1.7))
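# The area under the ROC curve (AUC) summarizes performance across all
# thresholds: 0.5 is no better than chance and 1.0 is perfect.
auc = as.numeric(performance(ROCRpred, "auc")@y.values)
auc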
# Prediction on the test set
# In this particular example, we use a threshold value of 0.3 and obtain
# the following confusion matrix.
predictTest = predict(ProjoLog, type = "response", newdata = projoTest)
confTest = table(projoTest$SATISFIED, predictTest >= 0.3)
confTest
# Accuracy = sum(main diagonal values) / total sum
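# Computing the accuracy directly from the matrix above (a sketch;
# assumes both classes appear in the predictions so the table is square):
accuracy = sum(diag(confTest)) / sum(confTest)
accuracy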
# Conclusion
# The model can accurately identify students who are satisfied with the
# training: its test set accuracy is greater than that of our baseline
# model.