diff --git a/Naive_Bayes_text_classification/Naive_Bayes_text_classification.ipynb b/Naive_Bayes_text_classification/Naive_Bayes_text_classification.ipynb
new file mode 100644
index 0000000..87f1e7a
--- /dev/null
+++ b/Naive_Bayes_text_classification/Naive_Bayes_text_classification.ipynb
@@ -0,0 +1,762 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 126,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[nltk_data] Downloading package stopwords to /home/eugene/nltk_data...\n",
+ "[nltk_data] Package stopwords is already up-to-date!\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "True"
+ ]
+ },
+ "execution_count": 126,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "from collections import Counter\n",
+ "import re\n",
+ "from utils import load_dataset, split_dataset, split_dataset_data_frame\n",
+ "import nltk\n",
+ "nltk.download('stopwords')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 127,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "corpus shape: (1118, 2)\n",
+ "train shape: (895, 1)\n",
+ "test shape: (223, 1)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Load the datasets\n",
+ "train_messages, train_labels, test_messages, test_labels, corpus = load_dataset()\n",
+ "print('corpus shape:', corpus.shape)\n",
+ "print('train shape:', train_messages.shape)\n",
+ "print('test shape:', test_messages.shape)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Text Data Cleaning and Preprocessing"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 128,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def tokenize_and_normalize(text, remove_small_words = True, leave_only_letters = True, to_lower_case = True):\n",
+ "    \"\"\"\n",
+ "    Convert a sentence into a list of words and normalize the text.\n",
+ "    \n",
+ "    Arguments:\n",
+ "    text -- the sentence to tokenize and normalize\n",
+ "    remove_small_words -- remove all short words (fewer than 3 characters). Default value is True\n",
+ "    leave_only_letters -- remove all irrelevant characters (any non-letter characters). Default value is True\n",
+ "    to_lower_case -- reduce all words to lowercase. Default value is True\n",
+ "    \n",
+ "    Returns:\n",
+ "    words -- list of words\n",
+ "\n",
+ "    \"\"\"\n",
+ "    if to_lower_case:\n",
+ "        text = text.lower()\n",
+ "    # keep alphabetic runs only; require at least 3 characters when remove_small_words is set\n",
+ "    pattern = r'[A-Za-z]' if leave_only_letters else r'\\S'\n",
+ "    pattern += r'{3,}' if remove_small_words else r'{1,}'\n",
+ "    words = re.findall(pattern, text)\n",
+ "    return words"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 129,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "number of words in raw string: 481\n",
+ "number of words in normalized string: 372\n"
+ ]
+ }
+ ],
+ "source": [
+ "print('number of words in raw string:', len(train_messages[3, 0].split()))\n",
+ "words = tokenize_and_normalize(train_messages[3, 0])\n",
+ "print('number of words in normalized string:', len(words))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Remove stopwords"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 130,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def remove_stopwords(raw_words):\n",
+ "    \"\"\"\n",
+ "    Remove stopwords from a list of words.\n",
+ "    \n",
+ "    Argument:\n",
+ "    raw_words -- a list of words from which stopwords should be removed\n",
+ "    \n",
+ "    Returns:\n",
+ "    words -- list of words without stopwords\n",
+ "\n",
+ "    \"\"\"\n",
+ "    \n",
+ "    clean_words = raw_words.copy()\n",
+ "    \n",
+ "    # normalize the stopword list the same way as the messages so the comparison matches\n",
+ "    stopwords = nltk.corpus.stopwords.words('english')\n",
+ "    stopwords = tokenize_and_normalize(' '.join(stopwords))\n",
+ "    stopwords = list(set(stopwords))\n",
+ "    \n",
+ "    clean_words = [x for x in clean_words if x not in stopwords]\n",
+ "    \n",
+ "    return clean_words"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 131,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "number of words in string before removing stopwords: 372\n",
+ "number of words in string after stopwords have been removed: 358\n"
+ ]
+ }
+ ],
+ "source": [
+ "print('number of words in string before removing stopwords:', len(words))\n",
+ "words = remove_stopwords(words)\n",
+ "print('number of words in string after stopwords have been removed:', len(words))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Create the NaiveBayes class"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 132,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "class NaiveBayes:\n",
+ "    def __init__(self):\n",
+ "        self.corpus = pd.DataFrame({'messages' : [], 'labels' : []})\n",
+ "        self.classes = np.empty(0)\n",
+ "        self.tokens = []\n",
+ "        self.frequency_table = pd.DataFrame({'col' : []})\n",
+ "        self.likelihoods_of_tokens = []\n",
+ "        self.likelihoods_of_classes = []\n",
+ "        self.likelihoods_of_tokens_for_each_classes = []\n",
+ "    \n",
+ "    def fit(self, corpus, withLogarithmTrick = False):\n",
+ "        self.corpus = corpus.copy(deep=True)\n",
+ "        self.classes = self.corpus['labels'].unique()\n",
+ "        train_corpus, validation_corpus = self.split_corpus(self.corpus)\n",
+ "        \n",
+ "        self.frequency_table, self.tokens = self.__to_frequency_table(train_corpus)\n",
+ "        self.likelihoods_of_tokens = self.__calc_likelihoods_of_tokens()\n",
+ "        self.likelihoods_of_classes = self.__calc_likelihoods_of_classes()\n",
+ "        self.likelihoods_of_tokens_for_each_classes = self.__calc_likelihoods_of_tokens_for_each_classes()\n",
+ "        \n",
+ "        # Estimate accuracy on the held-out validation split\n",
+ "        messages = validation_corpus['messages'].values.tolist()\n",
+ "        predicted_corpus = self.predict(messages, withLogarithmTrick)\n",
+ "        predicted = 
predicted_corpus.loc[:,'predicted classes'].values\n", + " real = validation_corpus.loc[:,'labels'].values\n", + " accuracy = self.calc_accuracy(predicted, real)\n", + "\n", + " return accuracy\n", + " \n", + " def predictOne(self, text, withLogarithmTrick = False):\n", + " likelihoods_of_classes = self.__calc_likelihoods_of_classes_for_each_tokens(tokenize_and_normalize(text), withLogarithmTrick)\n", + "\n", + " predicted_class = likelihoods_of_classes.idxmax(axis=0)\n", + " \n", + " return (text, predicted_class.values[0], likelihoods_of_classes)\n", + "\n", + " \n", + " def predict(self, corpus, withLogarithmTrick = False):\n", + " x = np.array(corpus).reshape(len(corpus), 1)\n", + " y = np.repeat(-1, len(corpus)).reshape(len(corpus), 1)\n", + " data = np.append(x, y, axis=1)\n", + " \n", + " corpus_with_predicted_classes = pd.DataFrame(data=data,\n", + " index=np.arange(len(corpus)),\n", + " columns=[\"messages\", \"predicted classes\"])\n", + " \n", + " for idx, text in enumerate(corpus):\n", + " prediction = self.predictOne(corpus_with_predicted_classes.at[idx, \"messages\"], withLogarithmTrick)\n", + " corpus_with_predicted_classes.at[idx, \"predicted classes\"] = prediction[1]\n", + "\n", + " return corpus_with_predicted_classes\n", + " \n", + " def __to_frequency_table(self, input_corpus):\n", + " corpus = input_corpus.copy(deep=True)\n", + " tokens = tokenize_and_normalize(' '.join(corpus['messages'].values.tolist()))\n", + " tokens = list(set(tokens))\n", + " classes = corpus['labels'].unique()\n", + " \n", + " frequency_table = pd.DataFrame(1,\n", + " index=classes,\n", + " columns=tokens)\n", + " \n", + " for clss in classes:\n", + " class_corpus = corpus[corpus['labels'] == clss]\n", + " all_class_tokens = tokenize_and_normalize(' '.join(class_corpus['messages'].values.tolist()))\n", + " for token in all_class_tokens:\n", + " frequency_table.at[clss, token] += 1\n", + " \n", + " return frequency_table, tokens\n", + " \n", + " def __calc_likelihoods_of_classes(self):\n", + " sum_of_frequencies_of_tokens_by_classes = self.frequency_table.sum(axis=1)\n", + " sum_of_frequencies_of_tokens_at_all = sum_of_frequencies_of_tokens_by_classes.sum(axis=0)\n", + " likelihoods_of_classes = sum_of_frequencies_of_tokens_by_classes / sum_of_frequencies_of_tokens_at_all\n", + " return likelihoods_of_classes\n", + " \n", + " def __calc_likelihoods_of_tokens(self):\n", + " sum_of_frequencies_of_tokens_by_tokens = self.frequency_table.sum(axis=0)\n", + " sum_of_frequencies_of_tokens_at_all = sum_of_frequencies_of_tokens_by_tokens.sum(axis=0)\n", + " likelihoods_of_tokens = sum_of_frequencies_of_tokens_by_tokens / sum_of_frequencies_of_tokens_at_all\n", + " return likelihoods_of_tokens\n", + " \n", + " def __calc_likelihoods_of_tokens_for_each_classes(self):\n", + " sum_of_frequencies_of_classes_by_classes = self.frequency_table.sum(axis=1)\n", + " likelihoods_of_tokens_for_each_classes = self.frequency_table.loc[:,:] \\\n", + " .div(sum_of_frequencies_of_classes_by_classes, axis=0)\n", + " return likelihoods_of_tokens_for_each_classes\n", + " \n", + " def __calc_likelihoods_of_classes_for_each_tokens(self, tokens, withLogarithmTrick = False):\n", + " likelihoods_of_classes = pd.DataFrame(data=np.zeros((len(self.classes), 1), dtype=int),\n", + " index=self.classes,\n", + " columns=[\"likelihood\"])\n", + " \n", + " for clss in self.classes:\n", + " if withLogarithmTrick:\n", + " tokens_likelihoods = self.likelihoods_of_tokens_for_each_classes.loc[clss][tokens]\n", + " 
tokens_likelihoods_with_logarithm_trick = np.log(self.likelihoods_of_classes[clss]) + \\\n",
+ "                    np.log(tokens_likelihoods[tokens_likelihoods > 0]).sum()\n",
+ "                # unseen tokens come back as NaN and are filtered out by the > 0 mask\n",
+ "                likelihoods_of_classes.loc[clss] = tokens_likelihoods_with_logarithm_trick\n",
+ "            else:\n",
+ "                tokens_likelihoods = self.likelihoods_of_tokens_for_each_classes.loc[clss][tokens]\n",
+ "                tokens_likelihoods_prod = tokens_likelihoods[tokens_likelihoods > 0].prod()\n",
+ "                likelihoods_of_classes.loc[clss] = self.likelihoods_of_classes[clss] * tokens_likelihoods_prod\n",
+ "\n",
+ "        return likelihoods_of_classes\n",
+ "    \n",
+ "    def split_corpus(self, corpus):\n",
+ "        train_corpus, test_corpus = split_dataset_data_frame(corpus)\n",
+ "        return train_corpus, test_corpus\n",
+ "    \n",
+ "    def calc_accuracy(self, X, Y):\n",
+ "        \"\"\"\n",
+ "        Calculate the model accuracy: predicted labels vs. true ones.\n",
+ "\n",
+ "        Arguments:\n",
+ "        X -- numpy array of predicted labels\n",
+ "        Y -- numpy array of true labels, same shape as X\n",
+ "\n",
+ "        Returns:\n",
+ "        accuracy -- the fraction of matching labels\n",
+ "        \"\"\"\n",
+ "        accuracy = (X == Y).mean()\n",
+ "        return accuracy"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Validation"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Prepare test corpus"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 133,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "                   messages labels\n",
+ "0   Chinese Beijing Chinese      0\n",
+ "1  Chinese Chinese Shanghai      0\n",
+ "2             Chinese Macao      0\n",
+ "3       Tokyo Japan Chinese      1"
+ ]
+ },
+ "execution_count": 133,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data = np.array([[\"Chinese Beijing Chinese\",\"0\"],\n",
+ "                 [\"Chinese Chinese Shanghai\",\"0\"],\n",
+ "                 [\"Chinese Macao\",\"0\"],\n",
+ "                 [\"Tokyo Japan Chinese\",\"1\"]])\n",
+ "\n",
+ "data = pd.DataFrame(data=data,\n",
+ "                    columns=[\"messages\",\"labels\"])\n",
+ "\n",
+ "data"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Train and validate a NaiveBayes model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 134,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "validation accuracy: 1.0\n",
+ "classify text \"Chinese Chinese Chinese Tokyo Japan\"\n",
+ "predicted class: 1\n",
+ "probability of classes: {'0': 0.00035555555555555574, '1': 0.00043402777777777775}\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Create an instance of the NaiveBayes class\n",
+ "nb = NaiveBayes()\n",
+ "\n",
+ "# Train the model\n",
+ "# Tip: fit splits the input data into train / validation (80/20) sets and returns the validation accuracy:\n",
+ "accuracy = nb.fit(data, False)\n",
+ "print('validation accuracy:', accuracy)\n",
+ "\n",
+ "# Try to predict the class of a text\n",
+ "prediction = nb.predictOne(\"Chinese Chinese Chinese Tokyo Japan\", False)\n",
+ "print('classify text \"{}\"'.format(prediction[0]))\n",
+ "print('predicted class:', prediction[1])\n",
+ "print('probability of classes:', dict(prediction[2][\"likelihood\"]))\n",
+ "# Must return [('Chinese Chinese Chinese Tokyo Japan', '0')]\n",
+ "# probability {'1': 0.00013548070246744226, '0': 0.00030121377997263036}\n",
+ "# or log {'1': -7.906681345001262, '0': -7.10769031284391}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Spam recognition with the NaiveBayes model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 135,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "validation accuracy: 0.797752808988764\n"
+ ]
+ }
+ ],
+ "source": [
+ "train_corpus, test_corpus = nb.split_corpus(corpus)\n",
+ "\n",
+ "validation_accuracy = nb.fit(train_corpus, False)\n",
+ "print('validation accuracy:', validation_accuracy)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 136,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "list_of_messages = test_corpus['messages'].values.tolist()\n",
+ "predicted_corpus = nb.predict(list_of_messages, False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 137,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "test accuracy: 0.7847533632286996\n"
+ ]
+ }
+ ],
+ "source": [
+ "predicted = predicted_corpus.loc[:,'predicted classes'].values\n",
+ "real = test_corpus.loc[:,'labels'].values\n",
+ "test_accuracy = nb.calc_accuracy(predicted, real)\n",
+ "print('test accuracy:', test_accuracy)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Result table for the simple NaiveBayes model**:\n",
+ "\n",
+ "<table>\n",
+ "  <tr><th></th><th>Validation</th><th>Test</th></tr>\n",
+ "  <tr><td>Accuracy</td><td>79.8%</td><td>78.5%</td></tr>\n",
+ "</table>"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Logarithm trick\n",
+ "\n",
+ "*Note:* when a document contains many words, the product for P(text|class) underflows to zero because of the limits of Python floats. The natural-logarithm trick replaces the product with a sum, which is all we need to compare classes:\n",
+ "\n",
+ "log(P(class|text)) ∝ log(P(class)) + log(P(word_1|class)) + … + log(P(word_n|class))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 138,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "validation accuracy with logarithm trick: 0.9550561797752809\n"
+ ]
+ }
+ ],
+ "source": [
+ "validation_accuracy_with_log_trick = nb.fit(train_corpus, True)\n",
+ "print('validation accuracy with logarithm trick:', validation_accuracy_with_log_trick)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 139,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "list_of_messages = test_corpus['messages'].values.tolist()\n",
+ "predicted_corpus = nb.predict(list_of_messages, True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 140,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "test accuracy with logarithm trick: 0.9282511210762332\n"
+ ]
+ }
+ ],
+ "source": [
+ "predicted = predicted_corpus.loc[:,'predicted classes'].values\n",
+ "real = test_corpus.loc[:,'labels'].values\n",
+ "test_accuracy_with_log_trick = nb.calc_accuracy(predicted, real)\n",
+ "print('test accuracy with logarithm trick:', test_accuracy_with_log_trick)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Result table for the NaiveBayes model with the logarithm trick**:\n",
+ "\n",
+ "<table>\n",
+ "  <tr><th></th><th>Validation</th><th>Test</th></tr>\n",
+ "  <tr><td>Accuracy</td><td>95.5%</td><td>92.8%</td></tr>\n",
+ "</table>"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 141,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "class NaiveBayesTfIdf(NaiveBayes):\n",
+ "    # predict is inherited from NaiveBayes; only the per-text scoring changes\n",
+ "\n",
+ "    def predictOne(self, text, withLogarithmTrick = False):\n",
+ "        likelihoods_of_classes = pd.DataFrame(data=np.zeros((len(self.classes), 1)),\n",
+ "                                              index=self.classes,\n",
+ "                                              columns=[\"likelihood\"])\n",
+ "        \n",
+ "        tokens = tokenize_and_normalize(text)\n",
+ "        \n",
+ "        for clss in self.classes:\n",
+ "            all_docs_in_class = self.corpus[self.corpus['labels'] == clss]\n",
+ "            class_prior = all_docs_in_class.shape[0] / self.corpus.shape[0]\n",
+ "            # tokenize every document of the class once, for document-frequency counts\n",
+ "            docs_tokens = [tokenize_and_normalize(m) for m in all_docs_in_class['messages'].values.tolist()]\n",
+ "            probability = class_prior\n",
+ "            \n",
+ "            for token in tokens:\n",
+ "                # term frequency of the token within the classified text\n",
+ "                freq_w = tokens.count(token) / len(tokens)\n",
+ "                # number of class documents containing the token (+1 to avoid log(0))\n",
+ "                count_text_token = 1 + sum(token in doc for doc in docs_tokens)\n",
+ "                IDF = np.log(all_docs_in_class.shape[0] / count_text_token)\n",
+ "                probability *= (freq_w * IDF)\n",
+ "            \n",
+ "            likelihoods_of_classes.loc[clss] = probability\n",
+ "        \n",
+ "        predicted_class = likelihoods_of_classes.idxmax(axis=0)\n",
+ "\n",
+ "        return (text, predicted_class.values[0], likelihoods_of_classes)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 143,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "validation accuracy with TF-IDF: 0.5714285714285714\n"
+ ]
+ }
+ ],
+ "source": [
+ "nbTfIdf = NaiveBayesTfIdf()\n",
+ "# split twice to keep only a small subset of the corpus (the prediction loop above is slow)\n",
+ "train_corpus, test_corpus = nb.split_corpus(corpus)\n",
+ "train_corpus, test_corpus = nb.split_corpus(test_corpus)\n",
+ "\n",
+ "validation_accuracy_tf_idf = nbTfIdf.fit(train_corpus)\n",
+ "print('validation accuracy with TF-IDF:', validation_accuracy_tf_idf)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 144,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "list_of_messages = test_corpus['messages'].values.tolist()\n",
+ "predicted_corpus_tf_idf = nbTfIdf.predict(list_of_messages, False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 146,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "test accuracy with TF-IDF: 0.4318181818181818\n"
+ ]
+ }
+ ],
+ "source": [
+ "predicted_tf_idf = predicted_corpus_tf_idf.loc[:,'predicted classes'].values\n",
+ "real = test_corpus.loc[:,'labels'].values\n",
+ "test_accuracy_tf_idf = nbTfIdf.calc_accuracy(predicted_tf_idf, real)\n",
+ "print('test accuracy with TF-IDF:', test_accuracy_tf_idf)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Result table for the NaiveBayes model with TF-IDF**:\n",
+ "\n",
+ "<table>\n",
+ "  <tr><th></th><th>Validation</th><th>Test</th></tr>\n",
+ "  <tr><td>Accuracy</td><td>57.1%</td><td>43.2%</td></tr>\n",
+ "</table>"
+ ]
+ },
" \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Train Test
Accuracy 57,1% 43.2%
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Naive_Bayes_text_classification/data.csv b/Naive_Bayes_text_classification/dataset/data.csv similarity index 100% rename from Naive_Bayes_text_classification/data.csv rename to Naive_Bayes_text_classification/dataset/data.csv diff --git a/Naive_Bayes_text_classification/utils.py b/Naive_Bayes_text_classification/utils.py new file mode 100644 index 0000000..c3aca83 --- /dev/null +++ b/Naive_Bayes_text_classification/utils.py @@ -0,0 +1,54 @@ +import pandas as pd +import numpy as np +from sklearn.utils import shuffle as sklearn_shuffle + + +def load_dataset(): + data = pd.read_csv('dataset/data.csv',sep=',', header=None) + data.columns=['messages', 'labels'] + return (*split_dataset(data), data) + +def split_dataset(data): + dataset = data.copy(deep=True) + labels = dataset['labels'].unique() + test_datasets = pd.DataFrame() + train_datasets = pd.DataFrame() + + for label in labels: + ds_with_label = dataset[dataset['labels'] == label] + num_test = int(ds_with_label.shape[0]*0.2) + num_test = 1 if num_test < 1 and ds_with_label.shape[0] > 1 else num_test + ds_test = ds_with_label[:num_test] + ds_train = ds_with_label[num_test:] + test_datasets=pd.concat((test_datasets, ds_test)) + train_datasets=pd.concat((train_datasets, ds_train)) + + test_datasets = sklearn_shuffle(test_datasets, random_state=0) + train_datasets = sklearn_shuffle(train_datasets, random_state=0) + + train_messages = np.array(train_datasets['messages']).reshape(train_datasets.shape[0], 1) + train_labels = np.array(train_datasets['labels']).reshape(train_datasets.shape[0], 1) + test_messages = np.array(test_datasets['messages']).reshape(test_datasets.shape[0], 1) + test_labels = np.array(test_datasets['labels']).reshape(test_datasets.shape[0], 1) + + return train_messages, train_labels, test_messages, test_labels + +def split_dataset_data_frame(data): + dataset = data.copy(deep=True) + labels = dataset['labels'].unique() + test_datasets = pd.DataFrame() + train_datasets = pd.DataFrame() + + for label in labels: + ds_with_label = dataset[dataset['labels'] == label] + num_test = int(ds_with_label.shape[0]*0.2) + num_test = 1 if num_test < 1 and ds_with_label.shape[0] > 1 else num_test + ds_test = ds_with_label[:num_test] + ds_train = ds_with_label[num_test:] + test_datasets=pd.concat((test_datasets, ds_test)) + train_datasets=pd.concat((train_datasets, ds_train)) + + test_datasets = sklearn_shuffle(test_datasets, random_state=0) + train_datasets = sklearn_shuffle(train_datasets, random_state=0) + + return train_datasets, test_datasets