# -*- coding: utf-8 -*-
'''
Created on Sun Jul 29 12:19:43 2018
@author: takalyan
'''
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import re
from sklearn.cluster import MiniBatchKMeans
import pickle
from scipy.stats import skew
from scipy.special import boxcox1p
import ml_functions as dp
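# ml_functions (imported as dp) is a local helper module not shown in this file.
# Based on how it is called below, its helpers are assumed to behave as follows:
#   display_missing_colrows(df, axis)  - report missing data per column (0) or row (1)
#   plot_heatmap(df, vmax, vmin)       - correlation heatmap of all features
#   get_corrvals(df)                   - correlation values for feature pairs
#   plot_tb_heatmap(df, target, n, m)  - heatmap of the n 'largest'/'smallest'
#                                        correlations vs target; returns those columns
#   plot_all_hist(df)                  - histograms of all numerical features
#   remove_outliers_quant(df, cols, q) - drop rows above quantile q in the given columns
#   encode_df(df, 'label')             - label-encode categoricals; returns (encoder, df)
#   impute_missing_mean(df)            - fill remaining NaNs with column means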
#Import dataset
dataset_df = pd.read_csv('train.csv')
def plot_relplot(df, x, y):
    sns.relplot(x=x, y=y,
                data=df, aspect=1, height=5)
    plt.show()
##########################
#Exploratory Data Analysis
##########################
#Explore the data types
print(dataset_df.info())
#understand feature data types
print(dataset_df.dtypes.value_counts())
#preview the data
preview_df = dataset_df.head()
#Summary stats of features
describe_df = dataset_df.describe()
#List of all columns
col_list = list(dataset_df.columns)
#Display columns with missing data
dp.display_missing_colrows(dataset_df,0)
#Display rows with missing data
dp.display_missing_colrows(dataset_df,1)
#List of all categorical features
print(dataset_df.select_dtypes(include=['object']).columns)
##### Correlation Maps #########
#Plot correlation matrix
dp.plot_heatmap(dataset_df,vmax=0.8,vmin=-0.8)
#Add features with similarity to be removed later
cols_to_remove = ['GarageCars','GarageYrBlt']
#Get correlation values for feature pairs
corr_vals = dp.get_corrvals(dataset_df)
#saleprice correlation matrix
#define top/bottom n features we want to look at vs target variable
n = 10
target = 'SalePrice'
#Plot features with highest correlation to target
highcorr_cols = dp.plot_tb_heatmap(dataset_df,target,n,'largest')
for cols in highcorr_cols:
    plot_relplot(dataset_df, cols, target)
#Plot features with lowest correlation to target
lowcorr_cols = dp.plot_tb_heatmap(dataset_df,target,n,'smallest')
for cols in lowcorr_cols:
    plot_relplot(dataset_df, cols, target)
#### Visualizations ######
#Visualize target variable distribution
sns.distplot(dataset_df["SalePrice"])
#plot histo of all numerical values
dp.plot_all_hist(dataset_df)
#Pattern for anything that is an area feature
pattern = re.compile('.*(area|sf).*',re.I)
#Store all area columns/features here
area_cols = []
#Iterate through df columns and plot vs target variable
for col in col_list:
    if re.match(pattern, col):
        area_cols.append(col)
        plot_relplot(dataset_df, col, 'SalePrice')
#Pairplots
sns.pairplot(dataset_df, height=2, hue='OverallQual',
             vars=['LotArea',
                   'LotFrontage',
                   'YearBuilt',
                   'GrLivArea',
                   'SalePrice'])
#Printing quantiles
var = dataset_df.LotArea
print(var.quantile([0.5, 0.75, 0.90, 0.95, 0.99, 1]))
#Rel plots
sns.relplot(x='LotArea', y='SalePrice',
            col='YrSold', col_wrap=2,
            data=dataset_df, aspect=1, height=8)
#Cat plots
sns.catplot(x='OverallQual', y='SalePrice', data=dataset_df,
            kind='box', height=8)
#Check for skewness of features
numerical_feats = dataset_df.dtypes[dataset_df.dtypes != "object"].index
skewed_feats = dataset_df[numerical_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
skewness = pd.DataFrame({'Skew': skewed_feats})
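#Preview the most skewed features
print(skewness.head(10))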
#Visualize skewness
sns.histplot(dataset_df['PoolArea'], kde=True)
################
# Data Cleansing
################
#Copy dataframe
new_df = dataset_df.copy()
# Per the dataset description, a missing value in these columns means the feature
# is absent (e.g., no alley, no pool), so replace NaN with the string "None"
# to give the absence its own category
cols_to_mod = ['MSSubClass', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
               'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
               'PoolQC', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
               'BsmtFinType1', 'BsmtFinType2', 'MasVnrType']
#Replace the values with "None"
for col in cols_to_mod:
    new_df[col] = dataset_df[col].fillna("None")
# Per the dataset description, a missing value in these numeric columns also
# means the feature is absent, so replace NaN with 0
cols_to_mod = ['MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
               'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath']
#Fill in missing values with a zero
for col in cols_to_mod:
    new_df[col] = new_df[col].fillna(0)
##Boxcox transformation for skewed features
#Keep only features whose absolute skewness exceeds 0.75
skewness = skewness[abs(skewness['Skew']) > 0.75]
#Feature names are the index values; exclude the target so it stays on its original scale
skewed_features = skewness.index.drop(target, errors='ignore')
#Define lambda value for boxcox transform
lam = 0.15
#Transform skewed features
for col in skewed_features:
    new_df[col] = boxcox1p(new_df[col], lam)
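# For reference, boxcox1p(x, lam) applies the Box-Cox transform to (1 + x):
#   ((1 + x)**lam - 1) / lam  for lam != 0  (and log1p(x) when lam == 0),
# compressing long right tails while mapping x = 0 to 0.
# Illustrative sanity check of the formula on a single value:
assert np.isclose(boxcox1p(9.0, lam), ((1.0 + 9.0)**lam - 1.0) / lam)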
#Removing outliers for GrLivArea
plot_relplot(dataset_df,'GrLivArea','SalePrice')
new_df = new_df.drop(dataset_df[(dataset_df['GrLivArea']>4000) & (dataset_df['SalePrice'] < 300000)].index)
plot_relplot(new_df,'GrLivArea','SalePrice')
#Drop features with > 90% missing values in the training set
dp.display_missing_colrows(dataset_df,0)
cols_to_drop = ['Alley','MiscFeature','PoolQC']
new_df = new_df.drop(cols_to_drop, axis=1)
lowcorr_cols = ['YrSold', 'Id', 'MiscVal', 'BsmtHalfBath']
#Drop low correlation vars
new_df = new_df.drop(lowcorr_cols,axis=1)
#Drop redundant, duplicate features
new_df = new_df.drop(cols_to_remove,axis=1)
area_cols = ['LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
             'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'GarageArea',
             'WoodDeckSF', 'OpenPorchSF']
#Remove outliers using quantile values
col_list = area_cols
plot_relplot(new_df,'GrLivArea','SalePrice')
#Remove values > 99 percentile of the population for each column in the area list
new_df = dp.remove_outliers_quant(new_df,col_list,0.99)
plot_relplot(new_df,'GrLivArea','SalePrice')
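# dp.remove_outliers_quant lives in ml_functions; a minimal sketch of its assumed
# behavior (keep only rows at or below the q-th quantile of each listed column):
def _remove_outliers_quant_sketch(df, cols, q):
    for col in cols:
        df = df[df[col] <= df[col].quantile(q)]
    return df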
#Encode categorical variables
encoder, new_df = dp.encode_df(new_df,'label')
#impute missing
new_df = dp.impute_missing_mean(new_df)
#reset index since we have dropped some rows
new_df.reset_index(drop=True, inplace=True)
####################
#Feature Engineering
####################
#Create a 2D array required for Kmeans clustering
GrLivArea_array = np.array(new_df["GrLivArea"]).reshape(-1,1)
#Use Kmeans with 5 groups for clustering data
kmeans = MiniBatchKMeans(n_clusters=5, batch_size=32).fit(GrLivArea_array)
#Assign new feature based on cluster
new_df['area_cluster'] = kmeans.predict(GrLivArea_array)
#Plot the clusters
sns.relplot(x='GrLivArea', y='SalePrice', data=new_df,
            hue='area_cluster', style='area_cluster', height=8,
            palette='YlGnBu')
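#Inspect the learned cluster centers (break points on the GrLivArea scale used above)
print(np.sort(kmeans.cluster_centers_.ravel()))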
#Create new feature for overall living area
new_df['TotalSF'] = new_df['TotalBsmtSF'] + new_df['1stFlrSF'] + new_df['2ndFlrSF']
#Create new feature of area * (number of rooms + overall quality)
new_df['area_rooms'] = new_df['TotalSF'] * (new_df['TotRmsAbvGrd'] + new_df['FullBath'] + new_df['HalfBath'] + new_df['OverallQual'])
####################
# Save data
####################
#Save clean data to csv
#Random shuffle
new_df = new_df.sample(frac=1).reset_index(drop=True)
#Define train and test set
test_pct = 0.2
test_row_count = int(test_pct * new_df.shape[0])
train_row_count = new_df.shape[0] - test_row_count
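# Equivalent split via sklearn (illustrative alternative; shuffle=False because
# new_df was already shuffled above):
#   from sklearn.model_selection import train_test_split
#   train_df, test_df = train_test_split(new_df, test_size=test_pct, shuffle=False)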
#Create train and validation dataframe and dump to csv
train_df = new_df.loc[0:train_row_count-1,:]
test_df = new_df.loc[train_row_count:,:]
#Check train and test distributions
sns.distplot(train_df["SalePrice"])
sns.distplot(test_df["SalePrice"])
train_df.to_csv('new_train.csv',index=False)
test_df.to_csv('new_test.csv',index=False)
with open('cluster_model.pickle', 'wb') as f:
    pickle.dump(kmeans, f)
with open('encoder.pickle', 'wb') as f:
    pickle.dump(encoder, f)