-
Notifications
You must be signed in to change notification settings - Fork 5
/
exercise-Pipeline.py
160 lines (128 loc) · 5.77 KB
/
exercise-Pipeline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
# Set up code checking
import os
if not os.path.exists("../input/train.csv"):
os.symlink("../input/home-data-for-ml-course/train.csv", "../input/train.csv")
os.symlink("../input/home-data-for-ml-course/test.csv", "../input/test.csv")
from learntools.core import binder
binder.bind(globals())
from learntools.ml_intermediate.ex4 import *
print("Setup Complete")
# load the training and validation sets in X_train, X_valid, y_train, and y_valid. The test set is loaded in X_test.
import pandas as pd
from sklearn.model_selection import train_test_split
# Read the data
X_full = pd.read_csv('../input/train.csv', index_col='Id')
X_test_full = pd.read_csv('../input/test.csv', index_col='Id')
# Remove rows with missing target, separate target from predictors
X_full.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = X_full.SalePrice
X_full.drop(['SalePrice'], axis=1, inplace=True)
# Break off validation set from training data
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X_full, y,
train_size=0.8, test_size=0.2,
random_state=0)
# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
categorical_cols = [cname for cname in X_train_full.columns if
X_train_full[cname].nunique() < 10 and
X_train_full[cname].dtype == "object"]
# Select numerical columns
numerical_cols = [cname for cname in X_train_full.columns if
X_train_full[cname].dtype in ['int64', 'float64']]
# Keep selected columns only
my_cols = categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()
X_test = X_test_full[my_cols].copy()
X_train.head()
"""
MSZoning Street Alley LotShape LandContour Utilities LotConfig LandSlope Condition1 Condition2 ... GarageArea WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch ScreenPorch PoolArea MiscVal MoSold YrSold
Id
619 RL Pave NaN Reg Lvl AllPub Inside Gtl Norm Norm ... 774 0 108 0 0 260 0 0 7 2007
871 RL Pave NaN Reg Lvl AllPub Inside Gtl PosN Norm ... 308 0 0 0 0 0 0 0 8 2009
93 RL Pave Grvl IR1 HLS AllPub Inside Gtl Norm Norm ... 432 0 0 44 0 0 0 0 8 2009
818 RL Pave NaN IR1 Lvl AllPub CulDSac Gtl Norm Norm ... 857 150 59 0 0 0 0 0 7 2008
303 RL Pave NaN IR1 Lvl AllPub Corner Gtl Norm Norm ... 843 468 81 0 0 0 0 0 1 2006
"""
# preprocess the data and train a model
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='constant')
# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='most_frequent')),
('onehot', OneHotEncoder(handle_unknown='ignore'))
])
# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
transformers=[
('num', numerical_transformer, numerical_cols),
('cat', categorical_transformer, categorical_cols)
])
# Define model
model = RandomForestRegressor(n_estimators=100, random_state=0)
# Bundle preprocessing and modeling code in a pipeline
clf = Pipeline(steps=[('preprocessor', preprocessor),
('model', model)
])
# Preprocessing of training data, fit model
clf.fit(X_train, y_train)
# Preprocessing of validation data, get predictions
preds = clf.predict(X_valid)
print('MAE:', mean_absolute_error(y_valid, preds))
# MAE: 17861.780102739725
# Step 1: Improve the performance
# define your own preprocessing steps and random forest model. Fill in values for the following variables:
# numerical_transformer
# categorical_transformer
# model
# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='mean') # Your code here
# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='most_frequent')),
('onehot', OneHotEncoder(handle_unknown='ignore'))
]) # Your code here
# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
transformers=[
('num', numerical_transformer, numerical_cols),
('cat', categorical_transformer, categorical_cols)
])
# Define model
model = RandomForestRegressor(n_estimators=100, random_state=0) # Your code here
# Check your answer
step_1.a.check()
# Part B
# need to have defined a pipeline in Part A that achieves lower MAE than the code above
# Bundle preprocessing and modeling code in a pipeline
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
('model', model)
])
# Preprocessing of training data, fit model
my_pipeline.fit(X_train, y_train)
# Preprocessing of validation data, get predictions
preds = my_pipeline.predict(X_valid)
# Evaluate the model
score = mean_absolute_error(y_valid, preds)
print('MAE:', score)
# Check your answer
step_1.b.check()
# MAE: 17648.417157534244
# Step 2: Generate test predictions
# use your trained model to generate predictions with the test data
# Preprocessing of test data, fit model
preds_test = my_pipeline.predict(X_test) # Your code here
# Check your answer
step_2.check()
# Run the next code cell without changes to save your results to a CSV file
# that can be submitted directly to the competition.
# Save test predictions to file
output = pd.DataFrame({'Id': X_test.index,
'SalePrice': preds_test})
output.to_csv('submission.csv', index=False)