tiyd-python-2015-05 · IHautaI · Jun 2, 2015
diff --git a/.gitignore b/.gitignore
@@ -1,5 +1,8 @@
 # Created by https://www.gitignore.io
-
+*.data
+.DS_Store
+.envrc
+.direnv
 ### IPythonNotebook ###
 # Temporary data
 .ipynb_checkpoints/

diff --git a/Spambase.ipynb b/Spambase.ipynb
@@ -0,0 +1,125 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 70,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "from sklearn import preprocessing, cross_validation\n",
+    "from sklearn.naive_bayes import MultinomialNB"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 86,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "spambase = pd.read_csv('./spambase.data', header=None)\n",
+    "spam2 = spambase.iloc[:, 0:54].join(spambase.iloc[:, 57])  # drop the integer values\n",
+    "spam3 = spambase.iloc[:, 0:47].join(spambase.iloc[:, 57])  # drop the single character values\n",
+    "\n",
+    "series = spambase.iloc[48:54].mean()\n",
+    "spam4 = spambase.iloc[:, 0:47]\n",
+    "spam4[47] = spambase.iloc[48:54].sum(1)\n",
+    "spam4 = spam4.join(spambase.iloc[:, 57])\n",
+    "#last is taking the sum over the single character value columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 97,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "def spam_classify(spam, perc=.4, Ys=57):\n",
+    "    X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(spam.iloc[:, 0:-2], \\\n",
+    "                                           spam.loc[:, 57], test_size=perc, \\\n",
+    "                                           random_state = 144)\n",
+    "    \n",
+    "    nb = MultinomialNB()\n",
+    "    nb.fit(X_train, Y_train)\n",
+    "    set = nb.predict(spam.iloc[:, 0:-2])\n",
+    "    final = sum(set)\n",
+    "    score = nb.score(X_test, Y_test)\n",
+    "    return round(score, 3), final, len(set) - final"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 98,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "test score, spam, non-spam\n",
+      "(0.81200000000000006, 2018, 2583)\n",
+      "(0.86699999999999999, 2239, 2362)\n",
+      "(0.86599999999999999, 2230, 2371)\n",
+      "(0.86599999999999999, 2230, 2371)\n"
+     ]
+    }
+   ],
+   "source": [
+    "print('test score, spam, non-spam')\n",
+    "print(spam_classify(spambase))\n",
+    "print(spam_classify(spam2))\n",
+    "print(spam_classify(spam3))\n",
+    "print(spam_classify(spam4))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "original values were\n",
+    "   - spam: 1813\n",
+    "   - non-spam: 2788\n",
+    "   \n",
+    "# Overfitting achieved"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.4.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/requirements.txt b/requirements.txt
@@ -1,5 +1,7 @@
+ipython[notebook]
 scikit-learn
 scipy
 pandas
 numpy
 matplotlib
+seaborn