-
Notifications
You must be signed in to change notification settings - Fork 0
/
Project.R
240 lines (206 loc) · 9.22 KB
/
Project.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
data=read.csv("Walmart_date_seperate.csv")
head(data)
dim(data)[1]
hist(data$Weekly_Sales, main="", xlab="Weekly Sales")
boxplot(Weekly_Sales~Store, main="Boxplot for weekly sales versus stores",xlab="Store",ylab="Weekly Sales",data=data)
boxplot(Weekly_Sales~Month, main="Boxplot for weekly sales versus months",xlab="Months",ylab="Weekly Sales",data=data)
boxplot(Weekly_Sales~Year, main="Boxplot for weekly sales versus years",xlab="Years",ylab="Weekly Sales",data=data)
counts=table(data$Holiday_Flag,data$Month)
barplot(counts,main="Barplot for number of holidays in months",xlab="Months",ylab="Count of 0 or 1",)
plot(data$Temperature,data$Weekly_Sales,xlab="Temperature",ylab="Weekly Sales",main="Relation between weekly sales and temperature",cex=0.5)
abline(lm(Weekly_Sales~Temperature,data=data),col="red")
plot(data$Fuel_Price,data$Weekly_Sales,xlab="Fuel_Price",ylab="Weekly Sales",main="Relation between weekly sales and fuel price",cex=0.5)
abline(lm(Weekly_Sales~Fuel_Price,data=data),col="red")
plot(data$CPI,data$Weekly_Sales,xlab="CPI",ylab="Weekly Sales",main="Relation between weekly sales and CPI",cex=0.5)
abline(lm(Weekly_Sales~CPI,data=data),col="red")
#Test train split
set.seed(0)
sample_size=floor(0.8*nrow(data))
picked=sample(seq_len(nrow(data)),size=sample_size)
train=data[picked,]
# Converting the numerical cateogrical variables to predictors
train$Store = as.factor(train$Store)
train$Holiday_Flag = as.factor(train$Holiday_Flag)
train$Year = as.factor(train$Year)
train$Month = as.factor(train$Month)
train$Day = as.factor(train$Day)
train$Weekday = as.factor(train$Weekday)
test=data[-picked,]
test$Store = as.factor(test$Store)
test$Holiday_Flag = as.factor(test$Holiday_Flag)
test$Year = as.factor(test$Year)
test$Month = as.factor(test$Month)
test$Day = as.factor(test$Day)
test$Weekday = as.factor(test$Weekday)
# Fit the model
model1 <- lm(Weekly_Sales~., data=train)
summary(model1)
which(summary(model1)$coeff[,4]>0.05)
model1_resids=rstandard(model1)
model1_fits=model1$fitted
#Linearity assumption for model1
plot(model1_fits,model1_resids, main="Scatter plot of residuals and fitted values(model1)", xlab="Fitted Values", ylab="Residuals",cex=0.5)
plot(train$Temperature,model1_resids, main="Scatter plot of residuals and temperature", xlab="Temperature", ylab="Residuals",cex=0.5)
plot(train$Fuel_Price,model1_resids, main="Scatter plot of residuals and fuel price", xlab="Fuel Price", ylab="Residuals",cex=0.5)
plot(train$CPI,model1_resids, main="Scatter plot of residuals and CPI", xlab="CPI", ylab="Residuals",cex=0.5)
plot(train$Unemployment,model1_resids, main="Scatter plot of residuals and unemployment", xlab="Unemployment", ylab="Residuals", cex=0.5)
#Normality assumption for model1
hist(model1_resids, nclass=100)
library("car")
qqPlot(model1_resids, main="qq plot of model1 residuals")
qqline(model1_resids,col="red")
#outliers for model1
cook=cooks.distance(model1)
plot(cook,main="cook distance of model1",type="h")
#check multicollinearity for model1
vif(model1) #remove CPI
#clean data for testing and training submodel
clean_outlier_index=which(cook<=(4/dim(train)[1]))
clean_outlier_train=train[clean_outlier_index,]
clean_CPI_train=subset(train,select=-c(CPI))
clean_CPI_test=subset(test,select=-c(CPI))
clean_outlier_CPI_train=subset(clean_outlier_train,select=-c(CPI))
#model2(Delete CPI column)
model2 <- lm(Weekly_Sales~., data=clean_CPI_train)
summary(model2)
which(summary(model2)$coeff[,4]>0.05)
model2_resids=rstandard(model2)
model2_fits=model2$fitted
#model3(Delete outliers)
model3 <- lm(Weekly_Sales~., data=clean_outlier_train)
summary(model3)
which(summary(model3)$coeff[,4]>0.05)
model3_resids=rstandard(model3)
model3_fits=model3$fitted
#model4(Squared Weekly_Sales)
model4 <- lm(sqrt(Weekly_Sales)~., data=train)
summary(model4)
which(summary(model4)$coeff[,4]>0.05)
model4_resids=rstandard(model4)
model4_fits=model4$fitted
#model5(Delete CPI column and squared Weekly_Sales)
model5 <- lm(sqrt(Weekly_Sales)~., data=clean_CPI_train)
summary(model5)
which(summary(model5)$coeff[,4]>0.05)
model5_resids=rstandard(model5)
model5_fits=model5$fitted
#model6(Delete outliers and squared Weekly_Sales)
model6 <- lm(sqrt(Weekly_Sales)~., data=clean_outlier_train)
summary(model6)
which(summary(model6)$coeff[,4]>0.05)
model6_resids=rstandard(model6)
model6_fits=model6$fitted
#model7(Delete outliers and delete CPI column)
model7 <- lm(Weekly_Sales~., data=clean_outlier_CPI_train)
summary(model7)
which(summary(model7)$coeff[,4]>0.05)
model7_resids=rstandard(model7)
model7_fits=model7$fitted
#model8(Delete outliers and delete CPI column and squared Weekly_Sales)
model8 <- lm(sqrt(Weekly_Sales)~., data=clean_outlier_CPI_train)
summary(model8)
which(summary(model8)$coeff[,4]>0.05)
model8_resids=rstandard(model8)
model8_fits=model8$fitted
#prediction1
pred1=predict(model1,test,interval='prediction')
test.pred1=pred1[,1]
test.lwr1=pred1[,2]
test.upr1=pred1[,3]
mean((test.pred1-test$Weekly_Sales)^2)#MSPE
sum((test.pred1-test$Weekly_Sales)^2)/sum((test$Weekly_Sales-mean(test$Weekly_Sales))^2)
#prediction2
pred2=predict(model2,clean_CPI_test,interval='prediction')
test.pred2=pred2[,1]
test.lwr2=pred2[,2]
test.upr2=pred2[,3]
mean((test.pred2-clean_CPI_test$Weekly_Sales)^2)#MSPE
sum((test.pred2-clean_CPI_test$Weekly_Sales)^2)/sum((clean_CPI_test$Weekly_Sales-mean(clean_CPI_test$Weekly_Sales))^2)
#prediction3
pred3=predict(model3,test,interval='prediction')
test.pred3=pred3[,1]
test.lwr3=pred3[,2]
test.upr3=pred3[,3]
mean((test.pred3-test$Weekly_Sales)^2)#MSPE
sum((test.pred3-test$Weekly_Sales)^2)/sum((test$Weekly_Sales-mean(test$Weekly_Sales))^2)
#prediction4
pred4=predict(model4,test,interval='prediction')
test.pred4=pred4[,1]^2
test.lwr4=pred4[,2]^2
test.upr4=pred4[,3]^2
mean((test.pred4-test$Weekly_Sales)^2)#MSPE
sum((test.pred4-test$Weekly_Sales)^2)/sum((test$Weekly_Sales-mean(test$Weekly_Sales))^2)
#prediction5
pred5=predict(model5,clean_CPI_test,interval='prediction')
test.pred5=pred5[,1]^2
test.lwr5=pred5[,2]^2
test.upr5=pred5[,3]^2
mean((test.pred5-clean_CPI_test$Weekly_Sales)^2)#MSPE
sum((test.pred5-clean_CPI_test$Weekly_Sales)^2)/sum((clean_CPI_test$Weekly_Sales-mean(clean_CPI_test$Weekly_Sales))^2)
#prediction6
pred6=predict(model6,test,interval='prediction')
test.pred6=pred6[,1]^2
test.lwr6=pred6[,2]^2
test.upr6=pred6[,3]^2
mean((test.pred6-test$Weekly_Sales)^2)#MSPE
sum((test.pred6-test$Weekly_Sales)^2)/sum((test$Weekly_Sales-mean(test$Weekly_Sales))^2)
#prediction7
pred7=predict(model7,clean_CPI_test,interval='prediction')
test.pred7=pred7[,1]
test.lwr7=pred7[,2]
test.upr7=pred7[,3]
mean((test.pred7-clean_CPI_test$Weekly_Sales)^2)#MSPE
sum((test.pred7-clean_CPI_test$Weekly_Sales)^2)/sum((clean_CPI_test$Weekly_Sales-mean(clean_CPI_test$Weekly_Sales))^2)
#prediction8
pred8=predict(model8,clean_CPI_test,interval='prediction')
test.pred8=pred8[,1]^2
test.lwr8=pred8[,2]^2
test.upr8=pred8[,3]^2
mean((test.pred8-clean_CPI_test$Weekly_Sales)^2)#MSPE
sum((test.pred8-clean_CPI_test$Weekly_Sales)^2)/sum((clean_CPI_test$Weekly_Sales-mean(clean_CPI_test$Weekly_Sales))^2)
#Assumptions for model2
plot(model2_fits,model2_resids, main="Scatter plot of residuals and fitted values(model2)", xlab="Fitted Values", ylab="Residuals",cex=0.5)
hist(model2_resids, nclass=100)
library("car")
qqPlot(model2_resids,main="qq plot of model2 residuals")
qqline(model2_resids,col="red")
#Assumptions for model3
plot(model3_fits,model3_resids, main="Scatter plot of residuals and fitted values(model3)", xlab="Fitted Values", ylab="Residuals",cex=0.5)
hist(model3_resids, nclass=100)
library("car")
qqPlot(model3_resids,main="qq plot of model3 residuals")
qqline(model3_resids,col="red")
#Assumptions for model4
plot(model4_fits,model4_resids, main="Scatter plot of residuals and fitted values(model4)", xlab="Fitted Values", ylab="Residuals",cex=0.5)
hist(model4_resids, nclass=100)
library("car")
qqPlot(model4_resids,main="qq plot of model4 residuals")
qqline(model4_resids,col="red")
#Assumptions for model5
plot(model5_fits,model5_resids, main="Scatter plot of residuals and fitted values(model5)", xlab="Fitted Values", ylab="Residuals",cex=0.5)
hist(model5_resids, nclass=100)
library("car")
qqPlot(model5_resids,main="qq plot of model5 residuals")
qqline(model5_resids,col="red")
#Assumptions for model6
plot(model6_fits,model6_resids, main="Scatter plot of residuals and fitted values(model6)", xlab="Fitted Values", ylab="Residuals",cex=0.5)
hist(model6_resids, nclass=100)
library("car")
qqPlot(model6_resids,main="qq plot of model6 residuals")
qqline(model6_resids,col="red")
#Assumptions for model7
plot(model7_fits,model7_resids, main="Scatter plot of residuals and fitted values(model7)", xlab="Fitted Values", ylab="Residuals",cex=0.5)
hist(model7_resids, nclass=100)
library("car")
qqPlot(model7_resids,main="qq plot of model7 residuals")
qqline(model7_resids,col="red")
#Assumptions for model8
plot(model8_fits,model8_resids, main="Scatter plot of residuals and fitted values(model8)", xlab="Fitted Values", ylab="Residuals",cex=0.5)
hist(model8_resids, nclass=100)
library("car")
qqPlot(model8_resids,main="qq plot of model8 residuals")
qqline(model8_resids,col="red")
#perform feature selection
full=lm(Weekly_Sales~.,data=clean_CPI_train)
minimum=lm(Weekly_Sales~Store+Holiday_Flag+Month,data=clean_CPI_train)
step(full, scope=list(lower=minimum, upper=full), direction="backward")