-
Notifications
You must be signed in to change notification settings - Fork 5
/
exercise-MissingValuesRF.py
135 lines (99 loc) · 4.54 KB
/
exercise-MissingValuesRF.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
# Set up code checking
import os
if not os.path.exists("../input/train.csv"):
os.symlink("../input/home-data-for-ml-course/train.csv", "../input/train.csv")
os.symlink("../input/home-data-for-ml-course/test.csv", "../input/test.csv")
from learntools.core import binder
binder.bind(globals())
from learntools.ml_intermediate.ex2 import *
print("Setup Complete")
import pandas as pd
from sklearn.model_selection import train_test_split
# Read the data
X_full = pd.read_csv('../input/train.csv', index_col='Id')
X_test_full = pd.read_csv('../input/test.csv', index_col='Id')
# Remove rows with missing target, separate target from predictors
X_full.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = X_full.SalePrice
X_full.drop(['SalePrice'], axis=1, inplace=True)
# To keep things simple, we'll use only numerical predictors
X = X_full.select_dtypes(exclude=['object'])
X_test = X_test_full.select_dtypes(exclude=['object'])
# Break off validation set from training data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
random_state=0)
X_train.head()
# Shape of training data (num_rows, num_columns)
print(X_train.shape)
# Number of missing values in each column of training data
missing_val_count_by_column = (X_train.isnull().sum())
print(missing_val_count_by_column[missing_val_count_by_column > 0])
# Fill in the line below: How many rows are in the training data?
num_rows = 1168
# Fill in the line below: How many columns in the training data
# have missing values?
num_cols_with_missing = 3
# Fill in the line below: How many missing entries are contained in
# all of the training data?
tot_missing = 276
# Check your answers
step_1.a.check()
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
# Function for comparing different approaches
def score_dataset(X_train, X_valid, y_train, y_valid):
model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(X_train, y_train)
preds = model.predict(X_valid)
return mean_absolute_error(y_valid, preds)
# Fill in the line below: get names of columns with missing values
cols_with_missing = [col for col in X_train.columns
if X_train[col].isnull().any()] # Your code here
# Fill in the lines below: drop columns in training and validation data
reduced_X_train = X_train.drop(cols_with_missing, axis=1)
reduced_X_valid = X_valid.drop(cols_with_missing, axis=1)
# Check your answers
step_2.check()
print("MAE (Drop columns with missing values):")
print(score_dataset(reduced_X_train, reduced_X_valid, y_train, y_valid))
# MAE (Drop columns with missing values):
# 17837.82570776256
from sklearn.impute import SimpleImputer
# Fill in the lines below: imputation
my_imputer = SimpleImputer()
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid)) # Your code here
# Fill in the lines below: imputation removed column names; put them back
imputed_X_train.columns = X_train.columns
imputed_X_valid.columns = X_valid.columns
# Check your answers
step_3.a.check()
print("MAE (Imputation):")
print(score_dataset(imputed_X_train, imputed_X_valid, y_train, y_valid))
# MAE (Imputation):
# 18062.894611872147
# Part A
# Use the next code cell to preprocess the training and validation data. Set the preprocessed DataFrames to final_X_train and final_X_valid. You can use any approach of your choosing here! in order for this step to be marked as correct, you need only ensure:
# the preprocessed DataFrames have the same number of columns,
# the preprocessed DataFrames have no missing values,
# final_X_train and y_train have the same number of rows, and
# final_X_valid and y_valid have the same number of rows.
# Preprocessed training and validation features
import numpy as np
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=2)
final_X_train = pd.DataFrame(imputer.fit_transform(X_train), columns = X_train.columns) #X_train.fillna(X_train.mean())
final_X_valid = pd.DataFrame(imputer.fit_transform(X_valid), columns = X_valid.columns) #X_valid.fillna(X_valid.mean())
# Check your answers
step_4.a.check()
# Check your answers
step_4.a.check()
# Define and fit model
model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(final_X_train, y_train)
# Get validation predictions and MAE
preds_valid = model.predict(final_X_valid)
print("MAE (Your approach):")
print(mean_absolute_error(y_valid, preds_valid))
# MAE (Your approach):
# 18031.884897260275