-
Notifications
You must be signed in to change notification settings - Fork 0
/
CYO R script.R
316 lines (214 loc) · 11 KB
/
CYO R script.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
# Name- Deeksha RV
# Country - India
# Project title - Prediction of Heart Failure using ML algorithms
# Downloaded dataset from kaggle
# https://www.kaggle.com/fedesoriano/heart-failure-prediction
#Download following packages
if(!require(tidyverse)) install.packages("tidyverse")
if(!require(dplyr)) install.packages("dplyr")
if(!require(readxl)) install.packages("readxl")
if(!require(knitr)) install.packages("knitr")
if(!require(caret)) install.packages("caret")
if(!require(gam)) install.packages("gam")
if(!require(MASS)) install.packages("MASS")
if(!require(RColorBrewer)) install.packages("RColorBrewer")
if(!require(kableExtra)) install.packages("kableExtra")
library(tidyverse)
library(dplyr)
library(readxl)
library(knitr)
library(caret)
library(gam)
library(MASS)
library(RColorBrewer)
library(kableExtra)
# import .csv file from working directory of laptop
heart_attack <- read.csv("C:/Users/hp/Documents/Capstone Projects/CYO project/heart attack prediction dataset.csv")
### Viewing the Dataset ###
# what does the dataset give us ?
View(heart_attack)
# Age - in years
# Sex - M for male, F for female
table(heart_attack$ChestPainType)
# types of chest pain :
# ASY - asymptomatic
# ATA - atypical angina
# NAP - Non anginal pain
# TA - typical angina
# RestingBP - systolic, in mm Hg, at time of admission in hospital
# cholesterol - serum cholestoral in mg/dl
# Fasting BS - fasting blood sugar > 120 mg/dl, (1 = true; 0 = false)
# restecg - resting electrocardiographic results
# Normal = normal
# ST = having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
# LVH = showing probable or definite left ventricular hypertrophy by Estes' criteria
# MaxHR - maximum heart rate achieved
# ExerciseAngina - exercise induced angina
# Y = yes, N = no
# Oldpeak - ST depression induced by exercise relative to rest
# STslope - the slope of the peak exercise ST segment
# Up = upsloping
# Flat = flat
# Down = downsloping
# HeartDisease
# 0 for absense, 1 for presence
# no missing values in dataset. checked using is.na() for all attributes
# add info about original dataset in cyo report. saved bookmark in google
### Modify the dataset ###
# change Yes and No to 1 and 0, respectively for values of ExerciseAngina variable
heart_attack$ExerciseAngina <- ifelse(heart_attack$ExerciseAngina == "Y",1,0)
View(heart_attack)
# change M=1 and F=0, respectively for values of Sex variable
heart_attack$Sex <- ifelse(heart_attack$Sex == "M",1,0)
View(heart_attack)
## change ASY=1,ATA=2,NAP=3,TA=4 for ChestPainType variable
# first change to factor, then to numeric
heart_attack$ChestPainType <- as.factor(heart_attack$ChestPainType)
heart_attack$ChestPainType <- as.numeric(heart_attack$ChestPainType)
## change RestingECG values to numeric values (LVH=1, Normal=2, ST=3)
heart_attack$RestingECG <- as.numeric(as.factor(heart_attack$RestingECG))
## change ST_Slope values : Down=1, Flat=2, Up=3
heart_attack$ST_Slope <- as.numeric(as.factor(heart_attack$ST_Slope))
### Exploratory Data Analysis ###
# dimensions of dataset
dim(heart_attack)
# 918 rows, 12 columns
# how many predictors for heart disease ?
# 11
# proportion of data of people who have heart disease
mean(heart_attack$HeartDisease == 1)
# comes to 0.5533769. means 55.3% of people in dataset had heart disease
#how many men and women?
sum(heart_attack$Sex==1)
sum(heart_attack$Sex==0)
#725 men and 193 women
# types of attributes after modifying dataset
sapply(heart_attack, class)
# dataset has numerics and integers only. Characters were changed to numerics
# snippet of data
first10 <- head(heart_attack,10)
knitr::kable(first10)
# summarise data
summary <- summary(heart_attack)
# distribution of people by age
Graph1 <- heart_attack %>% ggplot(aes(Age)) + geom_histogram(binwidth = 0.5, fill = "orange") + xlab("Distribution of Age") + ggtitle("Graph 1")
# types of chest pain in patients
Graph2 <- heart_attack %>% ggplot(aes(ChestPainType)) + geom_bar(fill = "magenta") + xlab("Chest Pain Type") + ggtitle("Graph 2")
# highest cases of ASY or Asymptomatic type chest pain is seen (1 = ASY as per modification)
# how does Fasting Blood Sugar vary with Age
Graph3 <- heart_attack %>% ggplot(aes(factor(FastingBS), Age, fill = factor(FastingBS))) + geom_boxplot(alpha = 0.3) + xlab("Fasting Blood Sugar") + ylab("Age in Years") + ggtitle("Graph 3") + scale_fill_discrete("Legend")
# inference - fasting blood sugar greater than 120 mg/dl is prevalent in ages 52-60 years, the avg being at 56 years
# spatial heatmap of all the features
# create matrix first. heatmap can only take matrix
matrix <- data.matrix(heart_attack)
heatmap(matrix, Rowv=NA, Colv=NA, col = heat.colors(256), scale="column")
# find relationship of all attributes with HeartDisease
# calculate pearson's correlation coefficient for each variable with Heartdisease
pearson <- heart_attack %>% summarise(Parameter = c("Age","Sex","ChestPainType","Resting BP","Cholesterol","Fasting BS","RestingECG","Max HR","Exercise Angina","Old peak","ST Slope"),
"correlation with heart" = c(cor(Age, HeartDisease, method = "pearson"),
cor(Sex,HeartDisease, method = "pearson"),
cor(ChestPainType, HeartDisease, method = "pearson"),
cor(RestingBP,HeartDisease, method = "pearson"),
cor(Cholesterol,HeartDisease,method = "pearson"),
cor(FastingBS,HeartDisease,method = "pearson"),
cor(RestingECG, HeartDisease, method = "pearson"),
cor(MaxHR,HeartDisease,method = "pearson"),
cor(ExerciseAngina, HeartDisease, method = "pearson"),
cor(Oldpeak,HeartDisease,method = "pearson"),
cor(ST_Slope, HeartDisease, method = "pearson")
))
View(pearson)
# calculate spearman's correlation coffecient for each numeric variable with Heartdisease
spearman <- heart_attack %>%
summarise(Parameter = c("Age","Sex","ChestPainType","Resting BP","Cholesterol","Fasting BS","Resting ECG","Max HR","Exercise Angina","Old peak","ST Slope"),
"correlation with heart disease" = c(cor(Age, HeartDisease, method = "spearman"),
cor(Sex, HeartDisease, method = "spearman"),
cor(ChestPainType, HeartDisease, method = "spearman"),
cor(RestingBP,HeartDisease, method = "spearman"),
cor(Cholesterol,HeartDisease,method = "spearman"),
cor(FastingBS,HeartDisease,method = "spearman"),
cor(RestingECG, HeartDisease, method = "spearman"),
cor(MaxHR,HeartDisease,method = "spearman"),
cor(ExerciseAngina,HeartDisease, method = "spearman"),
cor(Oldpeak,HeartDisease,method = "spearman"),
cor(ST_Slope, HeartDisease, method = "spearman")
))
View(spearman)
## Building the algorithm
# create vector y, having all values of HeartDisease
y <- heart_attack$HeartDisease
# create subset of heart attack dataset by removing HeartDisease variable
heart_attack_2 <- heart_attack %>% subset(select = -c(HeartDisease))
# scale the new dataset
scaled_heart_attack <- scale(heart_attack_2)
View(scaled_heart_attack)
# create test and train sets
test_index <- createDataPartition(y, times = 1, p = 0.25, list = FALSE)
# test set of heart_attack. HA stands for Heart Attack.
test_HA <- scaled_heart_attack[test_index,]
# test set of y (HeartDisease)
test_y <- y[test_index]
# train set of heart_attack. HA stands for Heart Attack.
train_HA <- scaled_heart_attack[-test_index,]
# train set of y (HeartDisease)
train_y <- y[-test_index]
### Models ###
# convert y, test_y and train_y to factors
y <- as.factor(y)
train_y <- as.factor(train_y)
test_y <- as.factor(test_y)
# fit a lda model
train_lda <- train(train_HA, train_y, method = "lda", family = "binomial")
lda_preds <- predict(train_lda, test_HA)
mean(lda_preds == test_y)
# accuracy is 0.8565217 or 85.6%
# fit a qda model
train_qda <- train(train_HA, train_y, method = "qda", family = "binomial")
qda_preds <- predict(train_qda, test_HA)
mean(qda_preds == test_y)
# accuracy is 0.8304348 or 83.04%
# fit a logistic regression model
train_glm <- train(train_HA, train_y, method = "glm", family = "binomial")
glm_preds <- predict(train_glm, test_HA)
mean(glm_preds == test_y)
# accuracy is 0.8652174 or 86.5%
# fit a loess model
train_loess <- train(train_HA, train_y, method = "gamLoess")
loess_preds <- predict(train_loess, test_HA)
mean(loess_preds == test_y)
# accuracy is 0.8695652 or 87%
# fit a knn model
set.seed(7, sample.kind = "Rounding")
tuning <- data.frame(k = seq(21, 41, 2))
train_knn <- train(train_HA, train_y,
method = "knn",
tuneGrid = tuning)
# find best k value
train_knn$bestTune
knn_preds <- predict(train_knn, test_HA)
mean(knn_preds == test_y)
# accuracy is 88.69%
# fit a rf model
set.seed(7, sample.kind = "Rounding")
tune_rf <- data.frame(mtry = c(5,7,9,13))
train_rf <- train(train_HA, train_y,
method = "rf",
tuneGrid = tune_rf,
importance = TRUE)
# get best mtry value
train_rf$bestTune
rf_preds <- predict(train_rf, test_HA)
mean(rf_preds == test_y)
# accuracy is 0.8826087 or 88.26%
# ensemble model
# first ensemble model has all models
ensemble <- cbind(lda = lda_preds == 1, qda = qda_preds == 1, glm = glm_preds ==1, loess = loess_preds == 1, knn = knn_preds == 1, rf = rf_preds == 1)
ensemble_preds <- ifelse(rowMeans(ensemble) > 0.5, 1, 0)
mean(ensemble_preds == test_y)
# accuracy is 86.95% or 87%
# second ensemble model uses only rf and knn models
ensemble_2 <- cbind(rf = rf_preds ==1, knn = knn_preds ==1)
ensemble_2_preds <- ifelse(rowMeans(ensemble_2)> 0.5, 1, 0)
mean(ensemble_2_preds== test_y)
# accuracy is exactly 90%; final accuracy of my algorithm.
#*****************************************************************************#