From be46f16f9eb95cb9a9246ef8e5bb9b45293ccc09 Mon Sep 17 00:00:00 2001 From: Jeremy Gresham Date: Tue, 2 Jun 2015 14:04:53 -0400 Subject: [PATCH] done --- .gitignore | 5 +- Spambase.ipynb | 125 +++++++++++++++++++++++++++++++++++++++++++++++ requirements.txt | 2 + 3 files changed, 131 insertions(+), 1 deletion(-) create mode 100644 Spambase.ipynb diff --git a/.gitignore b/.gitignore index f00dbf2..df67b4b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,8 @@ # Created by https://www.gitignore.io - +*.data +.DS_Store +.envrc +.direnv ### IPythonNotebook ### # Temporary data .ipynb_checkpoints/ diff --git a/Spambase.ipynb b/Spambase.ipynb new file mode 100644 index 0000000..89ea99e --- /dev/null +++ b/Spambase.ipynb @@ -0,0 +1,125 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 70, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sklearn import preprocessing, cross_validation\n", + "from sklearn.naive_bayes import MultinomialNB" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "spambase = pd.read_csv('./spambase.data', header=None)\n", + "spam2 = spambase.iloc[:, 0:54].join(spambase.iloc[:, 57]) # drop the integer values\n", + "spam3 = spambase.iloc[:, 0:47].join(spambase.iloc[:, 57]) # drop the single character values\n", + "\n", + "series = spambase.iloc[48:54].mean()\n", + "spam4 = spambase.iloc[:, 0:47]\n", + "spam4[47] = spambase.iloc[48:54].sum(1)\n", + "spam4 = spam4.join(spambase.iloc[:, 57])\n", + "#last is taking the sum over the single character value columns" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def spam_classify(spam, perc=.4, Ys=57):\n", + " X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(spam.iloc[:, 0:-2], \\\n", + " spam.loc[:, 57], test_size=perc, \\\n", + " random_state = 144)\n", + " \n", + " nb = MultinomialNB()\n", + " nb.fit(X_train, Y_train)\n", + " set = nb.predict(spam.iloc[:, 0:-2])\n", + " final = sum(set)\n", + " score = nb.score(X_test, Y_test)\n", + " return round(score, 3), final, len(set) - final" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "test score, spam, non-spam\n", + "(0.81200000000000006, 2018, 2583)\n", + "(0.86699999999999999, 2239, 2362)\n", + "(0.86599999999999999, 2230, 2371)\n", + "(0.86599999999999999, 2230, 2371)\n" + ] + } + ], + "source": [ + "print('test score, spam, non-spam')\n", + "print(spam_classify(spambase))\n", + "print(spam_classify(spam2))\n", + "print(spam_classify(spam3))\n", + "print(spam_classify(spam4))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "original values were\n", + " - spam: 1813\n", + " - non-spam: 2788\n", + " \n", + "# Overfitting achieved" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.4.3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/requirements.txt b/requirements.txt index 473a3b2..0a08f6f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,7 @@ +ipython[notebook] scikit-learn scipy pandas numpy matplotlib +seaborn