-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathapp.py
327 lines (266 loc) · 9.75 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
"""The main module of the app.
Contains most of the functions governing the
different app modes.
"""
import os
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from torchvision.datasets import MNIST
from torchvision.transforms import ToTensor
from viz import mnist_like_viz, training_curves
from utils import poly, paths
import dl
def main():
    """Entry point of the Streamlit app.

    Draws the page title, loads the home data, and shows a select box in
    the sidebar. The page matching the selected mode is then rendered by
    one of `regression`, `sinus`, `mnist`, or `fashionmnist`; the
    "Show instructions" entry only prints a short hint.

    Returns
    -------
    None
    """
    st.title("Some data manipulations")
    # Loaded up front for every mode, not only for the regression page.
    home_data = get_data()

    # Sidebar label -> page renderer; insertion order is the menu order.
    # NOTE: a "Show the source code" mode is intentionally not offered.
    pages = {
        "Show instructions": lambda: st.write(
            "To continue select a mode in the selection box to the left."
        ),
        "Home data regression": lambda: regression(home_data),
        "Sinus regression": sinus,
        "Show MNIST": mnist,
        "Deep Learning": fashionmnist,
    }

    app_mode = st.sidebar.selectbox("Choose the app mode", list(pages))
    render = pages.get(app_mode)
    if render is not None:
        render()
# `st.cache` is deprecated in favour of `st.cache_data` (Streamlit >= 1.18).
# Prefer the new decorator when it exists and fall back to the legacy one so
# the app keeps working on older Streamlit installs.
_cache_data = getattr(st, "cache_data", None) or st.cache


@_cache_data
def get_data():
    """Load the home training data.

    The result is cached by Streamlit, so the CSV file is not re-read on
    every rerun of the script.

    Returns
    -------
    home_data: pd.DataFrame
        The home training data.

    Raises
    ------
    FileNotFoundError
        If `./home-data-for-ml-course/train.csv` does not exist relative
        to the working directory.

    Notes
    -----
    This is the dataset downloaded from
    https://www.kaggle.com/competitions/home-data-for-ml-course/data.
    """
    iowa_file_path = "./home-data-for-ml-course/train.csv"
    return pd.read_csv(iowa_file_path)
# def get_file_content_as_string(path):
# with open(path) as f:
# lines = f.read()
# return lines
def regression(home_data):
    """Fit tree-based regressors on the home data and show validation errors.

    The head of the relevant columns is displayed, then the user picks
    which covariates to keep. The data is split into a training and a
    validation set (`train_test_split` with `random_state=1`). A decision
    tree, a decision tree with `max_leaf_nodes=100`, and a random forest
    are fitted on the training set, and their validation mean absolute
    errors are displayed in a table indexed by method name.

    If the user deselects every covariate, a warning is shown and no model
    is fitted (the estimators cannot be fitted on zero features).

    Parameters
    ----------
    home_data: pd.DataFrame
        The home training data. It can be any DataFrame except it needs
        the columns `SalePrice`, `LotArea`, `YearBuilt`, `1stFlrSF`,
        `2ndFlrSF`, `FullBath`, `BedroomAbvGr`, and `TotRmsAbvGrd`.

    Returns
    -------
    None
    """
    features = [
        "LotArea",
        "YearBuilt",
        "1stFlrSF",
        "2ndFlrSF",
        "FullBath",
        "BedroomAbvGr",
        "TotRmsAbvGrd",
    ]

    st.text(
        "This is the head of the dataframe of Iowa house prices with many covariates"
    )
    st.write(home_data[["SalePrice"] + features].head())

    # Sorted copy: keeps the column order independent of the click order
    # without mutating the list handed back by the widget.
    covariates = sorted(
        st.multiselect(
            "Select covariates to keep for regression:", features, features
        )
    )
    if not covariates:
        # Fitting on a zero-column frame fails inside scikit-learn; tell the
        # user what to do instead of showing a traceback.
        st.warning("Select at least one covariate to fit the models.")
        return

    # Target and design matrix.
    y = home_data.SalePrice
    X = home_data[covariates]

    # Split into validation and training data.
    train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

    # (label shown in the table, estimator) — fitted and scored in this order.
    models = [
        ("DecisionTreeRegressor", DecisionTreeRegressor(random_state=1)),
        (
            "DecisionTreeRegressor with max leaf nodes",
            DecisionTreeRegressor(max_leaf_nodes=100, random_state=1),
        ),
        ("RandomForestRegressor", RandomForestRegressor(random_state=1)),
    ]

    dict_val_maes = {"method": [], "Val MAE": []}
    for label, model in models:
        model.fit(train_X, train_y)
        val_predictions = model.predict(val_X)
        dict_val_maes["method"].append(label)
        # Documented argument order is (y_true, y_pred); the value is the
        # same either way because the absolute error is symmetric.
        dict_val_maes["Val MAE"].append(
            mean_absolute_error(val_y, val_predictions)
        )

    st.write(pd.DataFrame(dict_val_maes).set_index("method"))
    st.text("(Test what happens when removing TotRmsAbvGrd)")
def sinus():
    """A simple example of regression on the sinus function on the interval [0,5].

    80 abscissas are drawn in [0, 5) with a fixed seed, sorted, and mapped
    through the sinus function. Every `noise`-th point (chosen with the
    first slider) is then perturbed with uniform noise in [-1.5, 1.5).
    The second slider sets the order passed to `poly` for the polynomial
    regression. With the checkbox the user also decides whether two
    regression trees (`max_depth=2` and `max_depth=5`) are fitted in
    addition to the polynomial regression. The fitted models are plotted
    along with the noisy training data.

    Returns
    -------
    None
    """
    noise = st.slider("Noise volume", 1, 10, 5, format="1 of each %d point(s)")
    # Order of the polynom for the linear regression with polynom
    order = st.slider(
        "Choose the order of the polynom for the polynomial regression", 2, 20, 3
    )
    trees = st.checkbox("Show decision trees", True)

    # Create a random dataset (fixed seed, so reruns show the same points).
    rng = np.random.RandomState(1)
    # axis=0 is required: on an (80, 1) array the default axis=-1 sorts each
    # one-element row, which leaves the samples unsorted.
    X = np.sort(5 * rng.rand(80, 1), axis=0)
    y = np.sin(X).ravel()
    # Perturb one point out of every `noise`.
    y[::noise] += 3 * (0.5 - rng.rand(y[::noise].size))

    # Dense grid on which the fitted models are evaluated.
    X_test = np.arange(0.0, 5.0, 0.01)[:, np.newaxis]

    # Fit and predict; each entry is (predictions, line colour, legend label).
    curves = []
    if trees:
        for depth, color, label in (
            (2, "cornflowerblue", "max_depth=2"),
            (5, "yellowgreen", "max_depth=5"),
        ):
            tree = DecisionTreeRegressor(max_depth=depth, random_state=1)
            tree.fit(X, y)
            curves.append((tree.predict(X_test), color, label))

    # Polynomial regression = linear regression on the `poly` features.
    linear = LinearRegression()
    linear.fit(poly(X, order=order), y)
    curves.append((linear.predict(poly(X_test, order=order)), "red", "polynom"))

    # Plot on an explicit figure/axes rather than pyplot's implicit global
    # state, and always close the figure so that reruns of the script do not
    # accumulate open figures.
    fig, ax = plt.subplots()
    try:
        ax.scatter(X, y, s=20, edgecolor="black", c="darkorange", label="data")
        for y_pred, color, label in curves:
            ax.plot(X_test, y_pred, color=color, label=label, linewidth=2)
        ax.set_xlabel("data")
        ax.set_ylabel("target")
        if trees:
            ax.set_title("Decision Trees and Polynomial Regression")
        else:
            ax.set_title("Polynomial Regression")
        ax.set_xlim(-0.2, 5.2)
        ax.set_ylim(-2.7, 2.7)
        ax.legend()
        st.pyplot(fig)
    finally:
        plt.close(fig)
def mnist():
    """Show samples of the MNIST training set.

    The training split is loaded from (and, when absent, downloaded to)
    the `data` directory with images converted by `ToTensor`, then passed
    to `mnist_like_viz` together with the ten digit labels 0-9. Which
    images are picked and how they are drawn is decided by
    `mnist_like_viz`.

    Returns
    -------
    None
    """
    digit_labels = [digit for digit in range(10)]
    dataset = MNIST(root="data", train=True, download=True, transform=ToTensor())
    mnist_like_viz(dataset, digit_labels)
def fashionmnist():
    """Train a simple MLP on FashionMNIST and display its training metrics.

    Three sliders let the user choose the number of hidden layers, the
    dropout rate, and the number of training epochs. The model is obtained
    from `dl.get_and_train_model`; as stated in the note shown to the user,
    the epoch setting only matters when no model is already saved for the
    chosen configuration. A button deletes the saved weights and metrics
    files of the current configuration (locations given by `paths`) so the
    model is trained again.

    The training curves are then displayed with `training_curves`, and
    images of the test dataset are shown through `mnist_like_viz`, which
    also receives the class names and the model.

    Returns
    -------
    None

    Notes
    -----
    Inspired by https://pytorch.org/tutorials/beginner/basics/quickstart_tutorial.html.
    """
    st.header("A simple deep learning model applied on the FashionMNIST dataset")
    hidden_layers = st.slider("Choose the number of hidden layers", 1, 5, 2)
    dropout_rate = st.slider("Choose the dropout rate", 0.0, 0.9, 0.0, 0.1)
    epochs = st.slider("Choose the number of epochs to train", 1, 1000, 50)
    st.write(
        "Note that the epoch parameter is only relevant for training a new model, so if there is no already saved model for this config"
    )

    if st.button("Delete saved model and train again"):
        path_weights, path_metrics = paths(hidden_layers, dropout_rate)
        # Remove each file on its own: with a single shared `try`, a missing
        # weights file would skip the deletion of the metrics file and leave
        # stale metrics behind.
        for saved_file in (path_weights, path_metrics):
            try:
                os.remove(saved_file)
            except FileNotFoundError:
                # Nothing saved at this location; deleting is best-effort.
                pass

    # Batch size 64; the train dataset itself is not needed here.
    train_dataloader, test_dataloader, _, test_data = dl.get_FashionMNIST_datasets(
        64, only_loader=False
    )
    model = dl.get_and_train_model(
        train_dataloader,
        test_dataloader,
        hidden_layers=hidden_layers,
        dropout_rate=dropout_rate,
        epochs=epochs,
        mode="st",
    )

    # Human-readable names of the ten FashionMNIST labels, indexed by label.
    classes = [
        "T-shirt/top",
        "Trouser",
        "Pullover",
        "Dress",
        "Coat",
        "Sandal",
        "Shirt",
        "Sneaker",
        "Bag",
        "Ankle boot",
    ]
    training_curves(model, "st")
    mnist_like_viz(test_data, classes, model)
# Build the page only when the file is executed as a script (for instance
# through `streamlit run app.py`), not when it is imported as a module.
if __name__ == "__main__":
    main()