def init():
    """Load the persisted regression model into the module-level ``model``.

    Resolves the path of ``sklearn_regression_model.pkl`` through
    ``Model.get_model_path`` and deserialises it with ``joblib.load``.

    NOTE(review): ``Model`` is presumably ``azureml.core.model.Model`` and this
    function is presumably called once by the hosting service before any
    ``run`` call -- confirm against the notebook's import cell.
    """
    # Without this declaration ``model`` would be local to init() and run()
    # would fail with NameError on its first prediction.
    global model
    model_path = Model.get_model_path(model_name="sklearn_regression_model.pkl")
    model = joblib.load(model_path)


def run(raw_data, request_headers):
    """Score one JSON request with the model loaded by ``init``.

    Args:
        raw_data: JSON text holding an object with a ``"data"`` key; its value
            is converted to a numpy array and passed to ``model.predict``.
        request_headers: Accepted for interface compatibility; not used here.

    Returns:
        dict: ``{"result": [...]}`` where the list is ``predict``'s output
        converted with ``tolist()`` so it is JSON-serialisable.

    Raises:
        NameError: if ``init()`` has not been called first.
    """
    data = json.loads(raw_data)["data"]
    data = numpy.array(data)
    result = model.predict(data)

    return {"result": result.tolist()}
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
agesexbmibps1s2s3s4s5s6Y
count4.420000e+024.420000e+024.420000e+024.420000e+024.420000e+024.420000e+024.420000e+024.420000e+024.420000e+024.420000e+02442.000000
mean-3.634285e-161.308343e-16-8.045349e-161.281655e-16-8.835316e-171.327024e-16-4.574646e-163.777301e-16-3.830854e-16-3.412882e-16152.133484
std4.761905e-024.761905e-024.761905e-024.761905e-024.761905e-024.761905e-024.761905e-024.761905e-024.761905e-024.761905e-0277.093005
min-1.072256e-01-4.464164e-02-9.027530e-02-1.123996e-01-1.267807e-01-1.156131e-01-1.023071e-01-7.639450e-02-1.260974e-01-1.377672e-0125.000000
25%-3.729927e-02-4.464164e-02-3.422907e-02-3.665645e-02-3.424784e-02-3.035840e-02-3.511716e-02-3.949338e-02-3.324879e-02-3.317903e-0287.000000
50%5.383060e-03-4.464164e-02-7.283766e-03-5.670611e-03-4.320866e-03-3.819065e-03-6.584468e-03-2.592262e-03-1.947634e-03-1.077698e-03140.500000
75%3.807591e-025.068012e-023.124802e-023.564384e-022.835801e-022.984439e-022.931150e-023.430886e-023.243323e-022.791705e-02211.500000
max1.107267e-015.068012e-021.705552e-011.320442e-011.539137e-011.987880e-011.811791e-011.852344e-011.335990e-011.356118e-01346.000000
\n", - "
" - ], - "text/plain": [ - " age sex bmi bp s1 \\\n", - "count 4.420000e+02 4.420000e+02 4.420000e+02 4.420000e+02 4.420000e+02 \n", - "mean -3.634285e-16 1.308343e-16 -8.045349e-16 1.281655e-16 -8.835316e-17 \n", - "std 4.761905e-02 4.761905e-02 4.761905e-02 4.761905e-02 4.761905e-02 \n", - "min -1.072256e-01 -4.464164e-02 -9.027530e-02 -1.123996e-01 -1.267807e-01 \n", - "25% -3.729927e-02 -4.464164e-02 -3.422907e-02 -3.665645e-02 -3.424784e-02 \n", - "50% 5.383060e-03 -4.464164e-02 -7.283766e-03 -5.670611e-03 -4.320866e-03 \n", - "75% 3.807591e-02 5.068012e-02 3.124802e-02 3.564384e-02 2.835801e-02 \n", - "max 1.107267e-01 5.068012e-02 1.705552e-01 1.320442e-01 1.539137e-01 \n", - "\n", - " s2 s3 s4 s5 s6 \\\n", - "count 4.420000e+02 4.420000e+02 4.420000e+02 4.420000e+02 4.420000e+02 \n", - "mean 1.327024e-16 -4.574646e-16 3.777301e-16 -3.830854e-16 -3.412882e-16 \n", - "std 4.761905e-02 4.761905e-02 4.761905e-02 4.761905e-02 4.761905e-02 \n", - "min -1.156131e-01 -1.023071e-01 -7.639450e-02 -1.260974e-01 -1.377672e-01 \n", - "25% -3.035840e-02 -3.511716e-02 -3.949338e-02 -3.324879e-02 -3.317903e-02 \n", - "50% -3.819065e-03 -6.584468e-03 -2.592262e-03 -1.947634e-03 -1.077698e-03 \n", - "75% 2.984439e-02 2.931150e-02 3.430886e-02 3.243323e-02 2.791705e-02 \n", - "max 1.987880e-01 1.811791e-01 1.852344e-01 1.335990e-01 1.356118e-01 \n", - "\n", - " Y \n", - "count 442.000000 \n", - "mean 152.133484 \n", - "std 77.093005 \n", - "min 25.000000 \n", - "25% 87.000000 \n", - "50% 140.500000 \n", - "75% 211.500000 \n", - "max 346.000000 " - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# All data in a single dataframe\n", - "df.describe()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Split Data into Training and Validation Sets" - ] - }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ + "# Split the 
def split_data(df):
    """Split the dataframe into train and test data.

    Args:
        df: DataFrame containing a ``'Y'`` column (used as the target); every
            other column is used as a feature.

    Returns:
        dict: ``{"train": {"X": ..., "y": ...}, "test": {"X": ..., "y": ...}}``
        holding the arrays returned by ``train_test_split``.
    """
    X = df.drop('Y', axis=1).values
    y = df['Y'].values

    # Fixed random_state keeps the 80/20 split reproducible across runs.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0)
    data = {"train": {"X": X_train, "y": y_train},
            "test": {"X": X_test, "y": y_test}}
    return data


def train_model(data, args):
    """Train the model, return the model.

    Args:
        data: Dict in the layout produced by ``split_data``; only
            ``data["train"]`` is read.
        args: Keyword arguments forwarded unchanged to ``Ridge``.

    Returns:
        The ``Ridge`` instance after ``fit`` on the training split.
    """
    reg_model = Ridge(**args)
    reg_model.fit(data["train"]["X"], data["train"]["y"])
    return reg_model


def get_model_metrics(reg_model, data):
    """Evaluate the metrics for the model.

    Args:
        reg_model: Fitted estimator exposing ``predict``.
        data: Dict in the layout produced by ``split_data``; only
            ``data["test"]`` is read.

    Returns:
        dict: ``{"mse": <mean squared error on the test split>}``.
    """
    preds = reg_model.predict(data["test"]["X"])
    # Take the targets from ``data``: ``y_test`` is a local of split_data and
    # does not exist in this scope on a fresh kernel.
    mse = mean_squared_error(data["test"]["y"], preds)
    metrics = {"mse": mse}
    return metrics
def main():
    """Run the experiment end to end: load, split, train, validate, save.

    Builds a DataFrame from ``load_diabetes()`` with the target stored in a
    ``'Y'`` column, then delegates to ``split_data``, ``train_model`` and
    ``get_model_metrics`` and finally writes the fitted model to
    ``sklearn_regression_model.pkl`` in the working directory.
    """
    # Load data
    sample_data = load_diabetes()

    df = pd.DataFrame(
        data=sample_data.data,
        columns=sample_data.feature_names)
    df['Y'] = sample_data.target

    # Split Data into Training and Validation Sets
    data = split_data(df)

    # Train Model on Training Set
    args = {
        "alpha": 0.5
    }

    # Go through train_model so the estimator is actually fitted; constructing
    # Ridge alone would leave an untrained model to validate and save.
    reg_model = train_model(data, args)

    # Validate Model on Validation Set
    metrics = get_model_metrics(reg_model, data)
    # Surface the result; otherwise the computed metrics are silently dropped.
    print(metrics)

    # Save Model
    model_name = "sklearn_regression_model.pkl"

    joblib.dump(value=reg_model, filename=model_name)


# The save step lives inside main(); a second module-level dump would reference
# a model variable that does not exist on a fresh kernel. Invoke main() instead
# (the guard is true inside a notebook kernel and when run as a script).
if __name__ == "__main__":
    main()