diff --git a/Naive_Bayes_text_classification/Naive_Bayes_text_classification.ipynb b/Naive_Bayes_text_classification/Naive_Bayes_text_classification.ipynb
new file mode 100644
index 0000000..87f1e7a
--- /dev/null
+++ b/Naive_Bayes_text_classification/Naive_Bayes_text_classification.ipynb
@@ -0,0 +1,762 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 126,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[nltk_data] Downloading package stopwords to /home/eugene/nltk_data...\n",
+ "[nltk_data] Package stopwords is already up-to-date!\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "True"
+ ]
+ },
+ "execution_count": 126,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "from collections import Counter\n",
+ "import re\n",
+ "from utils import load_dataset, split_dataset, split_dataset_data_frame\n",
+ "import nltk\n",
+ "nltk.download('stopwords')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 127,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "corpus shape: (1118, 2)\n",
+ "train shape: (895, 1)\n",
+ "test shape: (223, 1)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Upload datasets\n",
+ "train_messages, train_labels, test_messages, test_labels, corpus = load_dataset()\n",
+ "print('corpus shape:', corpus.shape)\n",
+ "print('train shape:', train_messages.shape)\n",
+ "print('test shape:', test_labels.shape)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Text Data Cleaning and Preprocessing"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 128,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def tokenize_and_normalize(text, remove_small_words = True, leave_only_letters = True, to_lower_case = True):\n",
+ " \"\"\"\n",
+ " Converting a sentence into list of words. Normalize text.\n",
+ " \n",
+ " Argument:\n",
+ " text -- a sentence that should be tokenized and normalized\n",
+ " to_lower_case -- reduced all words to lowercase. Default value is True\n",
+ " leave_only_letters -- remove all irrelevant characters (any non-letter characters). Default value is True\n",
+ " remove_small_words -- remove all small words (less than 3 characters). Default value is True\n",
+ " \n",
+ " Returns:\n",
+ " words -- list of words\n",
+ "\n",
+ " \"\"\"\n",
+ " if to_lower_case:\n",
+ " text=text.lower()\n",
+ " pattern = r'[A-Z,a-z]' if leave_only_letters else r'\\S' \n",
+ " pattern += r'{3,}' if remove_small_words else r'{1,}' \n",
+ " words=re.findall(pattern,text)\n",
+ " return words"
+ ]
+ },
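+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "A quick illustration of the normalization flags (a sketch on a made-up sentence, not taken from the dataset):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sample = 'NLTK is GREAT for text pre-processing!'\n",
+ "print(tokenize_and_normalize(sample)) # ['nltk', 'great', 'for', 'text', 'pre', 'processing']\n",
+ "print(tokenize_and_normalize(sample, remove_small_words=False)) # also keeps the short word 'is'\n",
+ "print(tokenize_and_normalize(sample, leave_only_letters=False)) # splits on whitespace only, keeping 'pre-processing!' intact"
+ ]
+ },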
+ {
+ "cell_type": "code",
+ "execution_count": 129,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "number of words in row string: 481\n",
+ "number of words in normalized string: 372\n"
+ ]
+ }
+ ],
+ "source": [
+ "print('number of words in row string: ', len(train_messages[3, 0].split()))\n",
+ "words = tokenize_and_normalize(train_messages[3, 0])\n",
+ "print('number of words in normalized string: ', len(words))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Remove stopwords"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 130,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def remove_stopwords(row_words):\n",
+ " \"\"\"\n",
+ " Remove stopwords from list of words.\n",
+ " \n",
+ " Argument:\n",
+ " row_words -- a list of words that contains stopwords that should be removed\n",
+ " \n",
+ " Returns:\n",
+ " words -- list of words\n",
+ "\n",
+ " \"\"\"\n",
+ " \n",
+ " clean_words = row_words.copy()\n",
+ " \n",
+ " stopwords = nltk.corpus.stopwords.words('english')\n",
+ " stopwords = tokenize_and_normalize(' '.join(stopwords))\n",
+ " stopwords = list(set(stopwords))\n",
+ " \n",
+ " clean_words = [x for x in clean_words if x not in stopwords]\n",
+ " \n",
+ " return clean_words"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 131,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "number of words in string before remove stopwords: 372\n",
+ "number of words in string after stopwords have been removed: 358\n"
+ ]
+ }
+ ],
+ "source": [
+ "print('number of words in string before remove stopwords:', len(words))\n",
+ "words = remove_stopwords(words)\n",
+ "print('number of words in string after stopwords have been removed:', len(words))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Create the NaiveBayes class"
+ ]
+ },
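+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "For orientation, the class below scores a text with the standard Naive Bayes rule\n",
+ "\n",
+ "$$P(class \\mid text) \\propto P(class) \\prod_{i} P(token_i \\mid class),$$\n",
+ "\n",
+ "where the priors and the per-class token likelihoods are estimated from an add-one-smoothed frequency table built from the training part of the corpus."
+ ]
+ },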
+ {
+ "cell_type": "code",
+ "execution_count": 132,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "class NaiveBayes:\n",
+ " def __init__(self):\n",
+ " self.corpus = pd.DataFrame({'messages' : [], 'labels' : []})\n",
+ " self.classes = np.empty(0)\n",
+ " self.tokens = []\n",
+ " self.frequency_table = pd.DataFrame({'col' : []})\n",
+ " self.likelihoods_of_tokens = []\n",
+ " self.likelihoods_of_classes = []\n",
+ " self.likelihoods_of_tokens_for_each_classes = []\n",
+ " \n",
+ " def fit(self, corpus, withLogarithmTrick = False):\n",
+ " self.corpus = corpus.copy(deep=True)\n",
+ " self.classes = self.corpus['labels'].unique()\n",
+ " train_corpus, validation_corpus = self.split_corpus(self.corpus)\n",
+ " \n",
+ " self.frequency_table, self.tokens = self.__to_frequency_table(train_corpus)\n",
+ " self.likelihoods_of_tokens = self.__calc_likelihoods_of_tokens()\n",
+ " self.likelihoods_of_classes = self.__calc_likelihoods_of_classes()\n",
+ " self.likelihoods_of_tokens_for_each_classes = self.__calc_likelihoods_of_tokens_for_each_classes()\n",
+ " \n",
+ " # Calculate accuracy \n",
+ " messages = validation_corpus['messages'].values.tolist()\n",
+ " predicted_corpus = self.predict(messages, withLogarithmTrick)\n",
+ " predicted = predicted_corpus.loc[:,'predicted classes'].values\n",
+ " real = validation_corpus.loc[:,'labels'].values\n",
+ " accuracy = self.calc_accuracy(predicted, real)\n",
+ "\n",
+ " return accuracy\n",
+ " \n",
+ " def predictOne(self, text, withLogarithmTrick = False):\n",
+ " likelihoods_of_classes = self.__calc_likelihoods_of_classes_for_each_tokens(tokenize_and_normalize(text), withLogarithmTrick)\n",
+ "\n",
+ " predicted_class = likelihoods_of_classes.idxmax(axis=0)\n",
+ " \n",
+ " return (text, predicted_class.values[0], likelihoods_of_classes)\n",
+ "\n",
+ " \n",
+ " def predict(self, corpus, withLogarithmTrick = False):\n",
+ " x = np.array(corpus).reshape(len(corpus), 1)\n",
+ " y = np.repeat(-1, len(corpus)).reshape(len(corpus), 1)\n",
+ " data = np.append(x, y, axis=1)\n",
+ " \n",
+ " corpus_with_predicted_classes = pd.DataFrame(data=data,\n",
+ " index=np.arange(len(corpus)),\n",
+ " columns=[\"messages\", \"predicted classes\"])\n",
+ " \n",
+ " for idx, text in enumerate(corpus):\n",
+ " prediction = self.predictOne(corpus_with_predicted_classes.at[idx, \"messages\"], withLogarithmTrick)\n",
+ " corpus_with_predicted_classes.at[idx, \"predicted classes\"] = prediction[1]\n",
+ "\n",
+ " return corpus_with_predicted_classes\n",
+ " \n",
+ " def __to_frequency_table(self, input_corpus):\n",
+ " corpus = input_corpus.copy(deep=True)\n",
+ " tokens = tokenize_and_normalize(' '.join(corpus['messages'].values.tolist()))\n",
+ " tokens = list(set(tokens))\n",
+ " classes = corpus['labels'].unique()\n",
+ " \n",
+ " frequency_table = pd.DataFrame(1,\n",
+ " index=classes,\n",
+ " columns=tokens)\n",
+ " \n",
+ " for clss in classes:\n",
+ " class_corpus = corpus[corpus['labels'] == clss]\n",
+ " all_class_tokens = tokenize_and_normalize(' '.join(class_corpus['messages'].values.tolist()))\n",
+ " for token in all_class_tokens:\n",
+ " frequency_table.at[clss, token] += 1\n",
+ " \n",
+ " return frequency_table, tokens\n",
+ " \n",
+ " def __calc_likelihoods_of_classes(self):\n",
+ " sum_of_frequencies_of_tokens_by_classes = self.frequency_table.sum(axis=1)\n",
+ " sum_of_frequencies_of_tokens_at_all = sum_of_frequencies_of_tokens_by_classes.sum(axis=0)\n",
+ " likelihoods_of_classes = sum_of_frequencies_of_tokens_by_classes / sum_of_frequencies_of_tokens_at_all\n",
+ " return likelihoods_of_classes\n",
+ " \n",
+ " def __calc_likelihoods_of_tokens(self):\n",
+ " sum_of_frequencies_of_tokens_by_tokens = self.frequency_table.sum(axis=0)\n",
+ " sum_of_frequencies_of_tokens_at_all = sum_of_frequencies_of_tokens_by_tokens.sum(axis=0)\n",
+ " likelihoods_of_tokens = sum_of_frequencies_of_tokens_by_tokens / sum_of_frequencies_of_tokens_at_all\n",
+ " return likelihoods_of_tokens\n",
+ " \n",
+ " def __calc_likelihoods_of_tokens_for_each_classes(self):\n",
+ " sum_of_frequencies_of_classes_by_classes = self.frequency_table.sum(axis=1)\n",
+ " likelihoods_of_tokens_for_each_classes = self.frequency_table.loc[:,:] \\\n",
+ " .div(sum_of_frequencies_of_classes_by_classes, axis=0)\n",
+ " return likelihoods_of_tokens_for_each_classes\n",
+ " \n",
+ " def __calc_likelihoods_of_classes_for_each_tokens(self, tokens, withLogarithmTrick = False):\n",
+ " likelihoods_of_classes = pd.DataFrame(data=np.zeros((len(self.classes), 1), dtype=int),\n",
+ " index=self.classes,\n",
+ " columns=[\"likelihood\"])\n",
+ " \n",
+ " for clss in self.classes:\n",
+ " if withLogarithmTrick:\n",
+ " tokens_likelihoods = self.likelihoods_of_tokens_for_each_classes.loc[clss][tokens]\n",
+ " tokens_likelihoods_with_logarithm_trick = np.log(self.likelihoods_of_classes[clss]) + \\\n",
+ " np.log(tokens_likelihoods[tokens_likelihoods > 0]).sum()\n",
+ " likelihoods_of_classes.loc[clss] = tokens_likelihoods_with_logarithm_trick \n",
+ " else:\n",
+ " tokens_likelihoods = self.likelihoods_of_tokens_for_each_classes.loc[clss][tokens]\n",
+ " tokens_likelihoods_prod = tokens_likelihoods[tokens_likelihoods > 0].prod()\n",
+ " likelihoods_of_classes.loc[clss] = self.likelihoods_of_classes[clss] * tokens_likelihoods_prod\n",
+ "\n",
+ "\n",
+ " return likelihoods_of_classes\n",
+ " \n",
+ " \n",
+ " def split_corpus(self, corpus):\n",
+ " train_corpus, test_corpus = split_dataset_data_frame(corpus)\n",
+ " return train_corpus, test_corpus\n",
+ " \n",
+ " def calc_accuracy(self, X, Y):\n",
+ " \"\"\"\n",
+ " Calculate the model accuracy. Predicted labels vs true ones.\n",
+ "\n",
+ " Argument:\n",
+ " X -- a numpy array (labels) of shape (num_samples, 1 - a label). Usually, it's a matrix of predicted labels.\n",
+ " Y -- a numpy array (labels) of shape (num_samples, 1 - a label). Usually, it's a matrix of real labels.\n",
+ "\n",
+ " Returns:\n",
+ " accuracy -- a classification accuracy\n",
+ " \"\"\"\n",
+ " accuracy = (np.copy(X) == np.copy(Y)).mean()\n",
+ " return accuracy"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Validation"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Prepare test corpus"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 133,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " messages | \n",
+ " labels | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Chinese Beijing Chinese | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " Chinese Chinese Shanghai | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " Chinese Macao | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " Tokyo Japan Chinese | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " messages labels\n",
+ "0 Chinese Beijing Chinese 0\n",
+ "1 Chinese Chinese Shanghai 0\n",
+ "2 Chinese Macao 0\n",
+ "3 Tokyo Japan Chinese 1"
+ ]
+ },
+ "execution_count": 133,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data = np.array([[\"Chinese Beijing Chinese\",\"0\"],\n",
+ " [\"Chinese Chinese Shanghai\",\"0\"], \n",
+ " [\"Chinese Macao\",\"0\"],\n",
+ " [\"Tokyo Japan Chinese\",\"1\"]])\n",
+ " \n",
+ "data = pd.DataFrame(data=data[0:,0:],\n",
+ " columns=[\"messages\",\"labels\"])\n",
+ "\n",
+ "data"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Train and validation a NaiveBayes model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 134,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "train accuracy: 1.0\n",
+ "classify text \"Chinese Chinese Chinese Tokyo Japan\"\n",
+ "predicted class: 1\n",
+ "pobability of classes: {'0': 0.00035555555555555574, '1': 0.00043402777777777775}\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Create instance of NaiveBayes class\n",
+ "nb = NaiveBayes()\n",
+ "\n",
+ "# Train our model\n",
+ "# Tips: inside fit method it would be nice to split input data into train / test (80/20) sets and return model’ accuracy, e.g.:\n",
+ "accuracy = nb.fit(data, False) # return accuracy \n",
+ "print('train accuracy:', accuracy)\n",
+ "\n",
+ "# Try to predict class of text\n",
+ "prediction = nb.predictOne(\"Chinese Chinese Chinese Tokyo Japan\", False)\n",
+ "print('classify text \"{}\"'.format(prediction[0]))\n",
+ "print('predicted class:', prediction[1])\n",
+ "print('pobability of classes:', dict(prediction[2][\"likelihood\"]))\n",
+ "# Must return[ ('Chinese Chinese Chinese Tokyo Japan', '0')]\n",
+ "# pobability {'1': 0.00013548070246744226, '0': 0.00030121377997263036}\n",
+ "# or log {'1': -7.906681345001262, '0': -7.10769031284391}"
+ ]
+ },
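+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "*Hand check (a sketch assuming the textbook multinomial model with add-one smoothing, which the expected values in the comment follow):* the priors are $P(0)=3/4$ and $P(1)=1/4$, the vocabulary has 6 tokens, class 0 contains 8 tokens and class 1 contains 3, so for the test text $d$:\n",
+ "\n",
+ "$$P(0 \\mid d) \\propto \\tfrac{3}{4} \\cdot \\left(\\tfrac{5+1}{8+6}\\right)^{3} \\cdot \\tfrac{0+1}{8+6} \\cdot \\tfrac{0+1}{8+6} \\approx 0.000301$$\n",
+ "\n",
+ "$$P(1 \\mid d) \\propto \\tfrac{1}{4} \\cdot \\left(\\tfrac{1+1}{3+6}\\right)^{3} \\cdot \\tfrac{1+1}{3+6} \\cdot \\tfrac{1+1}{3+6} \\approx 0.000135$$\n",
+ "\n",
+ "This implementation trains on an internal 80/20 split of the toy corpus and smooths over its own frequency table, hence the different numbers above and the flipped prediction."
+ ]
+ },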
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Spam recognition with NaiveBayes model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 135,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "train accuracy: 0.797752808988764\n"
+ ]
+ }
+ ],
+ "source": [
+ "train_corpus, test_corpus = nb.split_corpus(corpus)\n",
+ "\n",
+ "train_accuracy = nb.fit(train_corpus, False)\n",
+ "print('train accuracy:', train_accuracy)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 136,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "list_of_messages = test_corpus['messages'].values.tolist()\n",
+ "predicted_corpus = nb.predict(list_of_messages, False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 137,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "test accuracy: 0.7847533632286996\n"
+ ]
+ }
+ ],
+ "source": [
+ "predicted = predicted_corpus.loc[:,'predicted classes'].values\n",
+ "real = test_corpus.loc[:,'labels'].values\n",
+ "test_accuracy = nb.calc_accuracy(predicted, real)\n",
+ "print('test accuracy:', test_accuracy)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Result table for Simple NaiveBayes model**: \n",
+ "\n",
+ "\n",
+ "\n",
+ " \n",
+ " | \n",
+ " Train | \n",
+ " Test | \n",
+ "
\n",
+ " \n",
+ " Accuracy | \n",
+ " 79.8% | \n",
+ " 78.5% | \n",
+ "
\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Logarithm trick\n",
+ "\n",
+ "*Note:* If we have a lot of words in document, we will have zero value for P(text|class), because Python float limitation. We can use natural logarithm trick and change formula for P(text|class) into:\n",
+ "log(P(class|text))=log(P(class))+log(P(word_1|class))+…+log(P(word_n|class))"
+ ]
+ },
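+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "A minimal sketch of the underflow (the values are illustrative, not taken from the corpus): multiplying many small per-word likelihoods hits the float64 floor, while summing their logarithms stays finite."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "per_word_likelihood = 1e-4\n",
+ "n_words = 100\n",
+ "likelihoods = np.repeat(per_word_likelihood, n_words)\n",
+ "print('product:', np.prod(likelihoods)) # 1e-400 underflows to 0.0\n",
+ "print('sum of logs:', np.log(likelihoods).sum()) # about -921.03, easily representable"
+ ]
+ },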
+ {
+ "cell_type": "code",
+ "execution_count": 138,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "train accuracy with logarithm trick: 0.9550561797752809\n"
+ ]
+ }
+ ],
+ "source": [
+ "train_accuracy_with_log_trick = nb.fit(train_corpus, True)\n",
+ "print('train accuracy with logarithm trick:', train_accuracy_with_log_trick)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 139,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "list_of_messages = test_corpus['messages'].values.tolist()\n",
+ "predicted_corpus = nb.predict(list_of_messages, True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 140,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "test accuracy with logarithm trick: 0.9282511210762332\n"
+ ]
+ }
+ ],
+ "source": [
+ "predicted = predicted_corpus.loc[:,'predicted classes'].values\n",
+ "real = test_corpus.loc[:,'labels'].values\n",
+ "test_accuracy_with_log_trick = nb.calc_accuracy(predicted, real)\n",
+ "print('test accuracy with logarithm trick:', test_accuracy_with_log_trick)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Result table for NaiveBayes model with logarithn trick**: \n",
+ "\n",
+ "\n",
+ "\n",
+ " \n",
+ " | \n",
+ " Train | \n",
+ " Test | \n",
+ "
\n",
+ " \n",
+ " Accuracy | \n",
+ " 95.5% | \n",
+ " 92.8% | \n",
+ "
\n",
+ "
"
+ ]
+ },
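+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### NaiveBayes with TF-IDF weights\n",
+ "\n",
+ "The variant below scores a class by multiplying its prior with TF-IDF weights instead of raw token likelihoods: for each token $t$ of the input text, $\\mathrm{tf}(t) = \\mathrm{count}(t) / |tokens|$ and $\\mathrm{idf}(t) = \\log\\big(N_{class} / (1 + \\mathrm{df}_{class}(t))\\big)$, where $N_{class}$ is the number of documents in the class and $\\mathrm{df}_{class}(t)$ is how many of them contain $t$."
+ ]
+ },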
+ {
+ "cell_type": "code",
+ "execution_count": 141,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "class NaiveBayesTfIdf(NaiveBayes):\n",
+ "\n",
+ " def predict(self, corpus, withLogarithmTrick = False):\n",
+ " x = np.array(corpus).reshape(len(corpus), 1)\n",
+ " y = np.repeat(-1, len(corpus)).reshape(len(corpus), 1)\n",
+ " data = np.append(x, y, axis=1)\n",
+ " \n",
+ " corpus_with_predicted_classes = pd.DataFrame(data=data,\n",
+ " index=np.arange(len(corpus)),\n",
+ " columns=[\"messages\", \"predicted classes\"])\n",
+ " \n",
+ " for idx, text in enumerate(corpus):\n",
+ " prediction = self.predictOne(corpus_with_predicted_classes.at[idx, \"messages\"], withLogarithmTrick)\n",
+ " corpus_with_predicted_classes.at[idx, \"predicted classes\"] = prediction[1]\n",
+ "\n",
+ " return corpus_with_predicted_classes\n",
+ " \n",
+ " def predictOne(self, text, withLogarithmTrick = False):\n",
+ " likelihoods_of_classes = pd.DataFrame(data=np.zeros((len(self.classes), 1), dtype=int),\n",
+ " index=self.classes,\n",
+ " columns=[\"likelihood\"])\n",
+ " \n",
+ " for clss in self.classes:\n",
+ " tokens = tokenize_and_normalize(text)\n",
+ " all_docs_in_class = self.corpus[self.corpus['labels'] == clss] \n",
+ " class_prior = all_docs_in_class.shape[0] / self.corpus.shape[0]\n",
+ " probability = class_prior\n",
+ " \n",
+ " for token in tokens:\n",
+ " freq_w = tokens.count(token) / len(tokens)\n",
+ " count_text_token = 1\n",
+ " corpus_docs = tokenize_and_normalize(' '.join(all_docs_in_class['messages'].values.tolist()))\n",
+ "\n",
+ "\n",
+ "\n",
+ " for it in range(self.corpus.shape[0]):\n",
+ " count_text_token += token in corpus_docs\n",
+ "\n",
+ " IDF = np.log(all_docs_in_class.shape[0] / count_text_token)\n",
+ " probability *= (freq_w * IDF)\n",
+ " \n",
+ " likelihoods_of_classes.loc[clss] = probability\n",
+ " \n",
+ " predicted_class = likelihoods_of_classes.idxmax(axis=0)\n",
+ "\n",
+ "\n",
+ " return (text, predicted_class.values[0], likelihoods_of_classes)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 143,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "train accuracy with TF-IDF: 0.5714285714285714\n"
+ ]
+ }
+ ],
+ "source": [
+ "nbTfIdf = NaiveBayesTfIdf()\n",
+ "train_corpus, test_corpus = nb.split_corpus(corpus)\n",
+ "train_corpus, test_corpus = nb.split_corpus(test_corpus)\n",
+ "# train_corpus, test_corpus = nb.split_corpus(test_corpus)\n",
+ "\n",
+ "train_accuracy_tf_idf = nbTfIdf.fit(train_corpus)\n",
+ "print('train accuracy with TF-IDF:', train_accuracy_tf_idf)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 144,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "list_of_messages = test_corpus['messages'].values.tolist()\n",
+ "predicted_corpus_tf_idf = nbTfIdf.predict(list_of_messages, False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 146,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "test accuracy with TF-IDF: 0.4318181818181818\n"
+ ]
+ }
+ ],
+ "source": [
+ "predicted_tf_idf = predicted_corpus_tf_idf.loc[:,'predicted classes'].values\n",
+ "real = test_corpus.loc[:,'labels'].values\n",
+ "test_accuracy_tf_idf = nbTfIdf.calc_accuracy(predicted_tf_idf, real)\n",
+ "print('test accuracy with TF-IDF:', test_accuracy_tf_idf)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Result table for NaiveBayes model with TF-IDF**: \n",
+ "\n",
+ "\n",
+ "\n",
+ " \n",
+ " | \n",
+ " Train | \n",
+ " Test | \n",
+ "
\n",
+ " \n",
+ " Accuracy | \n",
+ " 57,1% | \n",
+ " 43.2% | \n",
+ "
\n",
+ "
"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.1"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/Naive_Bayes_text_classification/data.csv b/Naive_Bayes_text_classification/dataset/data.csv
similarity index 100%
rename from Naive_Bayes_text_classification/data.csv
rename to Naive_Bayes_text_classification/dataset/data.csv
diff --git a/Naive_Bayes_text_classification/utils.py b/Naive_Bayes_text_classification/utils.py
new file mode 100644
index 0000000..c3aca83
--- /dev/null
+++ b/Naive_Bayes_text_classification/utils.py
@@ -0,0 +1,54 @@
+import pandas as pd
+import numpy as np
+from sklearn.utils import shuffle as sklearn_shuffle
+
+
+def load_dataset():
+ data = pd.read_csv('dataset/data.csv', sep=',', header=None)
+ data.columns = ['messages', 'labels']
+ return (*split_dataset(data), data)
+
+def split_dataset(data):
+ # delegate the stratified 80/20 split, then unpack the columns into numpy arrays
+ train_datasets, test_datasets = split_dataset_data_frame(data)
+
+ train_messages = np.array(train_datasets['messages']).reshape(train_datasets.shape[0], 1)
+ train_labels = np.array(train_datasets['labels']).reshape(train_datasets.shape[0], 1)
+ test_messages = np.array(test_datasets['messages']).reshape(test_datasets.shape[0], 1)
+ test_labels = np.array(test_datasets['labels']).reshape(test_datasets.shape[0], 1)
+
+ return train_messages, train_labels, test_messages, test_labels
+
+def split_dataset_data_frame(data):
+ # stratified split: hold out ~20% of each label's rows as the test set, then shuffle both parts
+ dataset = data.copy(deep=True)
+ labels = dataset['labels'].unique()
+ test_datasets = pd.DataFrame()
+ train_datasets = pd.DataFrame()
+
+ for label in labels:
+ ds_with_label = dataset[dataset['labels'] == label]
+ num_test = int(ds_with_label.shape[0]*0.2)
+ num_test = 1 if num_test < 1 and ds_with_label.shape[0] > 1 else num_test
+ ds_test = ds_with_label[:num_test]
+ ds_train = ds_with_label[num_test:]
+ test_datasets = pd.concat((test_datasets, ds_test))
+ train_datasets = pd.concat((train_datasets, ds_train))
+
+ test_datasets = sklearn_shuffle(test_datasets, random_state=0)
+ train_datasets = sklearn_shuffle(train_datasets, random_state=0)
+
+ return train_datasets, test_datasets