LAB5_assignment8.py
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot') # Look Pretty

def drawLine(model, X_test, y_test, title):
    # This convenience method will take care of plotting your
    # test observations, comparing them to the regression line,
    # and displaying the R2 coefficient
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(X_test, y_test, c='g', marker='o')
    ax.plot(X_test, model.predict(X_test), color='orange', linewidth=1, alpha=0.7)
    print("Est 2014 " + title + " Life Expectancy: ", model.predict([[2014]])[0])
    print("Est 2030 " + title + " Life Expectancy: ", model.predict([[2030]])[0])
    print("Est 2045 " + title + " Life Expectancy: ", model.predict([[2045]])[0])
    score = model.score(X_test, y_test)
    title += " R2: " + str(score)
    ax.set_title(title)
    plt.show()
#
# TODO: Load up the data here into a variable called 'X'.
# As usual, do a .describe and a print of your dataset and
# compare it to the dataset loaded in a text file or in a
# spreadsheet application
#
X = pd.read_csv('C:/Users/mckinns/Documents/GitHub/DAT210x/Module5/Datasets/life_expectancy.csv',
sep='\t', header=0)
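# As the TODO above asks, print a summary and the first few rows to
# sanity-check the load (output depends on the course's life_expectancy
# dataset, which isn't reproduced here):
print(X.describe())
print(X.head())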
#
# TODO: Create your linear regression model here and store it in a
# variable called 'model'. Don't actually train or do anything else
# with it yet:
#
from sklearn import linear_model
model = linear_model.LinearRegression()
#
# TODO: Slice out your data manually (e.g. don't use train_test_split,
# but actually do the indexing yourself. Set X_train to be year values
# LESS than 1986, and y_train to be corresponding WhiteMale age values.
#
# INFO You might also want to read the note about slicing on the bottom
# of this document before proceeding.
#
X_train = X.Year[X.Year < 1986]
y_train = X.WhiteMale[X.Year < 1986]
print(type(y_train), type(X_train))  # y_train and X_train are Series, as described below; need to convert X_train to a DataFrame
print(len(X_train), len(y_train))    # X_train and y_train are the same length
X_train = pd.DataFrame(X_train)
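# Note: a one-step alternative (a sketch of the slicing behavior described
# in the INFO block at the bottom): double-bracket indexing returns a
# DataFrame directly, so the explicit conversion above isn't needed:
#   X_train = X[X.Year < 1986][['Year']]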
#
# TODO: Train your model then pass it into drawLine with your training
# set and labels. You can title it "WhiteMale". drawLine will output
# to the console a 2014 extrapolation / approximation for what it
# believes the WhiteMale's life expectancy in the U.S. will be...
# given the pre-1986 data you trained it with. It'll also produce a
# 2030 and 2045 extrapolation.
#
model.fit(X_train, y_train)
drawLine(model, X_train, y_train, "WhiteMale")
#
# TODO: Print the actual 2014 WhiteMale life expectancy from your
# loaded dataset
#
print "Actual 2014 WhiteMale life expectancy from loaded dataset:",
X.WhiteMale[(X.Year == 2014)].values[0]
model.predict(2014)
#
# TODO: Repeat the process, but instead of for WhiteMale, this time
# select BlackFemale. Create a slice for BlackFemales, fit your
# model, and then call drawLine. Lastly, print out the actual 2014
# BlackFemale life expectancy
#
z_train = X.BlackFemale[X.Year < 1986]
z_model = linear_model.LinearRegression()
z_model.fit(X_train, z_train)
drawLine(z_model, X_train, z_train, "BlackFemale")
print("Actual 2014 BlackFemale life expectancy from loaded dataset:",
      X.BlackFemale[X.Year == 2014].values[0])
#
# TODO: Lastly, print out a correlation matrix for your entire
# dataset, and display a visualization of the correlation
# matrix, just as we described in the visualization section of
# the course
#
# .. your code here ..
print(X.corr())
plt.imshow(X.corr(), cmap=plt.cm.Blues, interpolation='nearest')
plt.colorbar()
tick_marks = np.arange(len(X.columns))
plt.xticks(tick_marks, X.columns, rotation='vertical')
plt.yticks(tick_marks, X.columns)
plt.show()
#
# INFO + HINT On Fitting, Scoring, and Predicting:
#
# Here's a hint to help you complete the assignment without pulling
# your hair out! When you use .fit(), .score(), and .predict() on
# your model, SciKit-Learn expects your training data to be in
# spreadsheet (2D Array-Like) form. This means you can't simply
# pass in a 1D Array (slice) and get away with it.
#
# To properly prep your data, you have to pass in a 2D Numpy Array,
# or a dataframe. But what happens if you really only want to pass
# in a single feature?
#
# If you slice your dataframe using df[['ColumnName']] syntax, the
# result that comes back is actually a *dataframe*. Go ahead and do
# a type() on it to check it out. Since it's already a dataframe,
# you're good -- no further changes needed.
#
# But if you slice your dataframe using the df.ColumnName syntax,
# OR if you call df['ColumnName'], the result that comes back is
# actually a series (1D Array)! This will cause SKLearn to bug out.
# So if you are slicing using either of those two techniques, before
# sending your training or testing data to .fit / .score, do a
# my_column = my_column.values.reshape(-1, 1). This will convert your 1D
# array of [n_samples] to a 2D array shaped like [n_samples, 1].
# A single feature, with many samples.
#
# If you did something like my_column = [my_column], that would produce
# an array in the shape of [1, n_samples], which is incorrect because
# SKLearn expects your data to be arranged as [n_samples, n_features].
# Keep in mind, all of the above only relates to your "X" or input
# data, and does not apply to your "y" or labels.
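#
# A minimal sketch illustrating the note above, reusing the already-loaded X
# (the 'Year' column name comes from the dataset loaded earlier in this script):
col_df = X[['Year']]                        # double-bracket slice -> DataFrame, shape (n, 1)
col_series = X['Year']                      # single-bracket slice -> Series, shape (n,)
col_2d = col_series.values.reshape(-1, 1)   # 1D values -> 2D array of [n_samples, 1]
print(type(col_df), col_df.shape)
print(type(col_series), col_series.shape)
print(col_2d.shape)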