From 5ab45bd1a7e0789532e7272d287f8e5d0c54f1c7 Mon Sep 17 00:00:00 2001 From: Alan R Date: Tue, 2 Jun 2015 13:56:07 -0400 Subject: [PATCH] predicts if email is spam, have training and test scores --- .envrc | 1 + .gitignore | 4 + Spam_Filter.ipynb | 488 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 493 insertions(+) create mode 100644 .envrc create mode 100644 Spam_Filter.ipynb diff --git a/.envrc b/.envrc new file mode 100644 index 0000000..94840b3 --- /dev/null +++ b/.envrc @@ -0,0 +1 @@ +layout python3 diff --git a/.gitignore b/.gitignore index f00dbf2..9673df7 100644 --- a/.gitignore +++ b/.gitignore @@ -65,3 +65,7 @@ docs/_build/ # PyBuilder target/ +How Much is* +.direnv/ +.DS_store +spambase/ diff --git a/Spam_Filter.ipynb b/Spam_Filter.ipynb new file mode 100644 index 0000000..e637d13 --- /dev/null +++ b/Spam_Filter.ipynb @@ -0,0 +1,488 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 107, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from sklearn import linear_model\n", + "from sklearn.cross_validation import train_test_split\n", + "from sklearn.naive_bayes import MultinomialNB" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##Reading and Processing Data" + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "data_file = \"spambase/spambase.data\"\n", + "names_file = \"spambase/spambase.names\"" + ] + }, + { + "cell_type": "code", + "execution_count": 109, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "names = [] # establishing the column headers, some manual work required\n", + "with open(names_file) as f:\n", + " l = f.readline()\n", + " for l in f:\n", + " if len(l) > 0:\n", + " if l[:4] == \"word\" or l[:4] == \"char\":\n", + " l2 = l[10:l.index(\":\")]\n", + " names.append(l2)\n", + 
"names.append(\"capital_avg\")\n", + "names.append(\"capital_longest\")\n", + "names.append(\"capital_total\")\n", + "names.append(\"spam\")" + ] + }, + { + "cell_type": "code", + "execution_count": 110, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "58" + ] + }, + "execution_count": 110, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(names) # had to get this to match the number of columns from the data" + ] + }, + { + "cell_type": "code", + "execution_count": 111, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "texts = pd.read_csv(data_file, names=names) # read the data file" + ] + }, + { + "cell_type": "code", + "execution_count": 112, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "4601" + ] + }, + "execution_count": 112, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(texts) # total number of records" + ] + }, + { + "cell_type": "code", + "execution_count": 113, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "1840" + ] + }, + "execution_count": 113, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "int(0.4*len(texts)) # we expect this many records in the test " + ] + }, + { + "cell_type": "code", + "execution_count": 114, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
makeaddressall3douroverremoveinternetordermail...;([!$#capital_avgcapital_longestcapital_totalspam
00.000.640.6400.320.000.000.000.000.00...0.000.00000.7780.0000.0003.756612781
10.210.280.5000.140.280.210.070.000.94...0.000.13200.3720.1800.0485.11410110281
20.060.000.7101.230.190.190.120.640.25...0.010.14300.2760.1840.0109.82148522591
30.000.000.0000.630.000.310.630.310.63...0.000.13700.1370.0000.0003.537401911
40.000.000.0000.630.000.310.630.310.63...0.000.13500.1350.0000.0003.537401911
\n", + "

5 rows × 58 columns

\n", + "
" + ], + "text/plain": [ + " make address all 3d our over remove internet order mail ... \\\n", + "0 0.00 0.64 0.64 0 0.32 0.00 0.00 0.00 0.00 0.00 ... \n", + "1 0.21 0.28 0.50 0 0.14 0.28 0.21 0.07 0.00 0.94 ... \n", + "2 0.06 0.00 0.71 0 1.23 0.19 0.19 0.12 0.64 0.25 ... \n", + "3 0.00 0.00 0.00 0 0.63 0.00 0.31 0.63 0.31 0.63 ... \n", + "4 0.00 0.00 0.00 0 0.63 0.00 0.31 0.63 0.31 0.63 ... \n", + "\n", + " ; ( [ ! $ # capital_avg capital_longest \\\n", + "0 0.00 0.000 0 0.778 0.000 0.000 3.756 61 \n", + "1 0.00 0.132 0 0.372 0.180 0.048 5.114 101 \n", + "2 0.01 0.143 0 0.276 0.184 0.010 9.821 485 \n", + "3 0.00 0.137 0 0.137 0.000 0.000 3.537 40 \n", + "4 0.00 0.135 0 0.135 0.000 0.000 3.537 40 \n", + "\n", + " capital_total spam \n", + "0 278 1 \n", + "1 1028 1 \n", + "2 2259 1 \n", + "3 191 1 \n", + "4 191 1 \n", + "\n", + "[5 rows x 58 columns]" + ] + }, + "execution_count": 114, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "texts.head() # columns and data look coherent" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Splitting into training/test sets" + ] + }, + { + "cell_type": "code", + "execution_count": 115, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "txt_array = np.asarray(texts)\n", + "a = txt_array[:,:-1]\n", + "b = txt_array[:,-1]" + ] + }, + { + "cell_type": "code", + "execution_count": 116, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "train_data, test_data, train_scores, test_scores = train_test_split(a, b, test_size=0.4, random_state=42)" + ] + }, + { + "cell_type": "code", + "execution_count": 119, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "1841" + ] + }, + "execution_count": 119, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(test_scores) # this is the actual number of records in the test set " + ] + }, + { + "cell_type": "markdown", + 
"metadata": {}, + "source": [ + "## Applying Naive Bayes Classification\n", + "This approach comes from the sklearn package. It works like the prior linear regression tools, but is more effective for this type of data set." + ] + }, + { + "cell_type": "code", + "execution_count": 117, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)" + ] + }, + "execution_count": 117, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "classifier = MultinomialNB()\n", + "classifier.fit(train_data, train_scores)" + ] + }, + { + "cell_type": "code", + "execution_count": 118, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Training correlation score: 0.782608695652\n", + " Training correlation score: 0.781097229766\n" + ] + } + ], + "source": [ + "train_r2 = classifier.score(train_data, train_scores)\n", + "test_r2 = classifier.score(test_data, test_scores)\n", + "print(\" Training correlation score: \"+str(train_r2))\n", + "print(\" Training correlation score: \"+str(test_r2))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "These scores match our expectations. Both printed values are mean classification accuracy as returned by `classifier.score`; the first line is the training-set score and the second line, despite its label, is the test-set score. The model is decent at predicting whether an email is spam within its training set, and it is a little bit worse at predicting if an email in the test set is spam." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.4.3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}