diff --git a/Naive_Bayes_text_classification/Naive_Bayes_text_classification.ipynb b/Naive_Bayes_text_classification/Naive_Bayes_text_classification.ipynb
new file mode 100644
index 0000000..87f1e7a
--- /dev/null
+++ b/Naive_Bayes_text_classification/Naive_Bayes_text_classification.ipynb
@@ -0,0 +1,762 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 126,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[nltk_data] Downloading package stopwords to /home/eugene/nltk_data...\n",
+ "[nltk_data] Package stopwords is already up-to-date!\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "True"
+ ]
+ },
+ "execution_count": 126,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "from collections import Counter\n",
+ "import re\n",
+ "from utils import load_dataset, split_dataset, split_dataset_data_frame\n",
+ "import nltk\n",
+ "nltk.download('stopwords')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 127,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "corpus shape: (1118, 2)\n",
+ "train shape: (895, 1)\n",
+ "test shape: (223, 1)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Load the datasets\n",
+ "train_messages, train_labels, test_messages, test_labels, corpus = load_dataset()\n",
+ "print('corpus shape:', corpus.shape)\n",
+ "print('train shape:', train_messages.shape)\n",
+ "print('test shape:', test_messages.shape)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Text Data Cleaning and Preprocessing"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 128,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def tokenize_and_normalize(text, remove_small_words = True, leave_only_letters = True, to_lower_case = True):\n",
+ "    \"\"\"\n",
+ "    Convert a sentence into a list of words and normalize the text.\n",
+ "    \n",
+ "    Arguments:\n",
+ "    text -- the sentence to tokenize and normalize\n",
+ "    remove_small_words -- remove all short words (fewer than 3 characters). Default value is True\n",
+ "    leave_only_letters -- remove all irrelevant characters (any non-letter characters). Default value is True\n",
+ "    to_lower_case -- reduce all words to lowercase. Default value is True\n",
+ "    \n",
+ "    Returns:\n",
+ "    words -- list of words\n",
+ "\n",
+ "    \"\"\"\n",
+ "    if to_lower_case:\n",
+ "        text = text.lower()\n",
+ "    # keep alphabetic runs only; require at least 3 characters when remove_small_words is set\n",
+ "    pattern = r'[A-Za-z]' if leave_only_letters else r'\\S'\n",
+ "    pattern += r'{3,}' if remove_small_words else r'{1,}'\n",
+ "    words = re.findall(pattern, text)\n",
+ "    return words"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 129,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "number of words in raw string: 481\n",
+ "number of words in normalized string: 372\n"
+ ]
+ }
+ ],
+ "source": [
+ "print('number of words in raw string:', len(train_messages[3, 0].split()))\n",
+ "words = tokenize_and_normalize(train_messages[3, 0])\n",
+ "print('number of words in normalized string:', len(words))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Remove stopwords"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 130,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def remove_stopwords(raw_words):\n",
+ "    \"\"\"\n",
+ "    Remove stopwords from a list of words.\n",
+ "    \n",
+ "    Argument:\n",
+ "    raw_words -- a list of words from which stopwords should be removed\n",
+ "    \n",
+ "    Returns:\n",
+ "    words -- list of words without stopwords\n",
+ "\n",
+ "    \"\"\"\n",
+ "    \n",
+ "    clean_words = raw_words.copy()\n",
+ "    \n",
+ "    # normalize the stopword list the same way as the messages so the comparison matches\n",
+ "    stopwords = nltk.corpus.stopwords.words('english')\n",
+ "    stopwords = tokenize_and_normalize(' '.join(stopwords))\n",
+ "    stopwords = list(set(stopwords))\n",
+ "    \n",
+ "    clean_words = [x for x in clean_words if x not in stopwords]\n",
+ "    \n",
+ "    return clean_words"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 131,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "number of words in string before removing stopwords: 372\n",
+ "number of words in string after stopwords have been removed: 358\n"
+ ]
+ }
+ ],
+ "source": [
+ "print('number of words in string before removing stopwords:', len(words))\n",
+ "words = remove_stopwords(words)\n",
+ "print('number of words in string after stopwords have been removed:', len(words))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Create the NaiveBayes class"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 132,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "class NaiveBayes:\n",
+ "    def __init__(self):\n",
+ "        self.corpus = pd.DataFrame({'messages' : [], 'labels' : []})\n",
+ "        self.classes = np.empty(0)\n",
+ "        self.tokens = []\n",
+ "        self.frequency_table = pd.DataFrame({'col' : []})\n",
+ "        self.likelihoods_of_tokens = []\n",
+ "        self.likelihoods_of_classes = []\n",
+ "        self.likelihoods_of_tokens_for_each_classes = []\n",
+ "    \n",
+ "    def fit(self, corpus, withLogarithmTrick = False):\n",
+ "        self.corpus = corpus.copy(deep=True)\n",
+ "        self.classes = self.corpus['labels'].unique()\n",
+ "        train_corpus, validation_corpus = self.split_corpus(self.corpus)\n",
+ "        \n",
+ "        self.frequency_table, self.tokens = self.__to_frequency_table(train_corpus)\n",
+ "        self.likelihoods_of_tokens = self.__calc_likelihoods_of_tokens()\n",
+ "        self.likelihoods_of_classes = self.__calc_likelihoods_of_classes()\n",
+ "        self.likelihoods_of_tokens_for_each_classes = self.__calc_likelihoods_of_tokens_for_each_classes()\n",
+ "        \n",
+ "        # Estimate accuracy on the held-out validation split\n",
+ "        messages = validation_corpus['messages'].values.tolist()\n",
+ "        predicted_corpus = self.predict(messages, withLogarithmTrick)\n",
+ "        predicted = 
predicted_corpus.loc[:,'predicted classes'].values\n", + " real = validation_corpus.loc[:,'labels'].values\n", + " accuracy = self.calc_accuracy(predicted, real)\n", + "\n", + " return accuracy\n", + " \n", + " def predictOne(self, text, withLogarithmTrick = False):\n", + " likelihoods_of_classes = self.__calc_likelihoods_of_classes_for_each_tokens(tokenize_and_normalize(text), withLogarithmTrick)\n", + "\n", + " predicted_class = likelihoods_of_classes.idxmax(axis=0)\n", + " \n", + " return (text, predicted_class.values[0], likelihoods_of_classes)\n", + "\n", + " \n", + " def predict(self, corpus, withLogarithmTrick = False):\n", + " x = np.array(corpus).reshape(len(corpus), 1)\n", + " y = np.repeat(-1, len(corpus)).reshape(len(corpus), 1)\n", + " data = np.append(x, y, axis=1)\n", + " \n", + " corpus_with_predicted_classes = pd.DataFrame(data=data,\n", + " index=np.arange(len(corpus)),\n", + " columns=[\"messages\", \"predicted classes\"])\n", + " \n", + " for idx, text in enumerate(corpus):\n", + " prediction = self.predictOne(corpus_with_predicted_classes.at[idx, \"messages\"], withLogarithmTrick)\n", + " corpus_with_predicted_classes.at[idx, \"predicted classes\"] = prediction[1]\n", + "\n", + " return corpus_with_predicted_classes\n", + " \n", + " def __to_frequency_table(self, input_corpus):\n", + " corpus = input_corpus.copy(deep=True)\n", + " tokens = tokenize_and_normalize(' '.join(corpus['messages'].values.tolist()))\n", + " tokens = list(set(tokens))\n", + " classes = corpus['labels'].unique()\n", + " \n", + " frequency_table = pd.DataFrame(1,\n", + " index=classes,\n", + " columns=tokens)\n", + " \n", + " for clss in classes:\n", + " class_corpus = corpus[corpus['labels'] == clss]\n", + " all_class_tokens = tokenize_and_normalize(' '.join(class_corpus['messages'].values.tolist()))\n", + " for token in all_class_tokens:\n", + " frequency_table.at[clss, token] += 1\n", + " \n", + " return frequency_table, tokens\n", + " \n", + " def __calc_likelihoods_of_classes(self):\n", + " sum_of_frequencies_of_tokens_by_classes = self.frequency_table.sum(axis=1)\n", + " sum_of_frequencies_of_tokens_at_all = sum_of_frequencies_of_tokens_by_classes.sum(axis=0)\n", + " likelihoods_of_classes = sum_of_frequencies_of_tokens_by_classes / sum_of_frequencies_of_tokens_at_all\n", + " return likelihoods_of_classes\n", + " \n", + " def __calc_likelihoods_of_tokens(self):\n", + " sum_of_frequencies_of_tokens_by_tokens = self.frequency_table.sum(axis=0)\n", + " sum_of_frequencies_of_tokens_at_all = sum_of_frequencies_of_tokens_by_tokens.sum(axis=0)\n", + " likelihoods_of_tokens = sum_of_frequencies_of_tokens_by_tokens / sum_of_frequencies_of_tokens_at_all\n", + " return likelihoods_of_tokens\n", + " \n", + " def __calc_likelihoods_of_tokens_for_each_classes(self):\n", + " sum_of_frequencies_of_classes_by_classes = self.frequency_table.sum(axis=1)\n", + " likelihoods_of_tokens_for_each_classes = self.frequency_table.loc[:,:] \\\n", + " .div(sum_of_frequencies_of_classes_by_classes, axis=0)\n", + " return likelihoods_of_tokens_for_each_classes\n", + " \n", + " def __calc_likelihoods_of_classes_for_each_tokens(self, tokens, withLogarithmTrick = False):\n", + " likelihoods_of_classes = pd.DataFrame(data=np.zeros((len(self.classes), 1), dtype=int),\n", + " index=self.classes,\n", + " columns=[\"likelihood\"])\n", + " \n", + " for clss in self.classes:\n", + " if withLogarithmTrick:\n", + " tokens_likelihoods = self.likelihoods_of_tokens_for_each_classes.loc[clss][tokens]\n", + " 
tokens_likelihoods_with_logarithm_trick = np.log(self.likelihoods_of_classes[clss]) + \\\n",
+ "                    np.log(tokens_likelihoods[tokens_likelihoods > 0]).sum()\n",
+ "                # unseen tokens come back as NaN and are filtered out by the > 0 mask\n",
+ "                likelihoods_of_classes.loc[clss] = tokens_likelihoods_with_logarithm_trick\n",
+ "            else:\n",
+ "                tokens_likelihoods = self.likelihoods_of_tokens_for_each_classes.loc[clss][tokens]\n",
+ "                tokens_likelihoods_prod = tokens_likelihoods[tokens_likelihoods > 0].prod()\n",
+ "                likelihoods_of_classes.loc[clss] = self.likelihoods_of_classes[clss] * tokens_likelihoods_prod\n",
+ "\n",
+ "        return likelihoods_of_classes\n",
+ "    \n",
+ "    def split_corpus(self, corpus):\n",
+ "        train_corpus, test_corpus = split_dataset_data_frame(corpus)\n",
+ "        return train_corpus, test_corpus\n",
+ "    \n",
+ "    def calc_accuracy(self, X, Y):\n",
+ "        \"\"\"\n",
+ "        Calculate the model accuracy: predicted labels vs. true ones.\n",
+ "\n",
+ "        Arguments:\n",
+ "        X -- numpy array of predicted labels\n",
+ "        Y -- numpy array of true labels, same shape as X\n",
+ "\n",
+ "        Returns:\n",
+ "        accuracy -- the fraction of matching labels\n",
+ "        \"\"\"\n",
+ "        accuracy = (X == Y).mean()\n",
+ "        return accuracy"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Validation"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Prepare test corpus"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 133,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "                   messages labels\n",
+ "0   Chinese Beijing Chinese      0\n",
+ "1  Chinese Chinese Shanghai      0\n",
+ "2             Chinese Macao      0\n",
+ "3       Tokyo Japan Chinese      1"
+ ]
+ },
+ "execution_count": 133,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data = np.array([[\"Chinese Beijing Chinese\",\"0\"],\n",
+ "                 [\"Chinese Chinese Shanghai\",\"0\"],\n",
+ "                 [\"Chinese Macao\",\"0\"],\n",
+ "                 [\"Tokyo Japan Chinese\",\"1\"]])\n",
+ "\n",
+ "data = pd.DataFrame(data=data,\n",
+ "                    columns=[\"messages\",\"labels\"])\n",
+ "\n",
+ "data"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Train and validate a NaiveBayes model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 134,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "validation accuracy: 1.0\n",
+ "classify text \"Chinese Chinese Chinese Tokyo Japan\"\n",
+ "predicted class: 1\n",
+ "probability of classes: {'0': 0.00035555555555555574, '1': 0.00043402777777777775}\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Create an instance of the NaiveBayes class\n",
+ "nb = NaiveBayes()\n",
+ "\n",
+ "# Train the model\n",
+ "# Tip: fit splits the input data into train / validation (80/20) sets and returns the validation accuracy:\n",
+ "accuracy = nb.fit(data, False)\n",
+ "print('validation accuracy:', accuracy)\n",
+ "\n",
+ "# Try to predict the class of a text\n",
+ "prediction = nb.predictOne(\"Chinese Chinese Chinese Tokyo Japan\", False)\n",
+ "print('classify text \"{}\"'.format(prediction[0]))\n",
+ "print('predicted class:', prediction[1])\n",
+ "print('probability of classes:', dict(prediction[2][\"likelihood\"]))\n",
+ "# Must return [('Chinese Chinese Chinese Tokyo Japan', '0')]\n",
+ "# probability {'1': 0.00013548070246744226, '0': 0.00030121377997263036}\n",
+ "# or log {'1': -7.906681345001262, '0': -7.10769031284391}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Spam recognition with the NaiveBayes model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 135,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "validation accuracy: 0.797752808988764\n"
+ ]
+ }
+ ],
+ "source": [
+ "train_corpus, test_corpus = nb.split_corpus(corpus)\n",
+ "\n",
+ "validation_accuracy = nb.fit(train_corpus, False)\n",
+ "print('validation accuracy:', validation_accuracy)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 136,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "list_of_messages = test_corpus['messages'].values.tolist()\n",
+ "predicted_corpus = nb.predict(list_of_messages, False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 137,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "test accuracy: 0.7847533632286996\n"
+ ]
+ }
+ ],
+ "source": [
+ "predicted = predicted_corpus.loc[:,'predicted classes'].values\n",
+ "real = test_corpus.loc[:,'labels'].values\n",
+ "test_accuracy = nb.calc_accuracy(predicted, real)\n",
+ "print('test accuracy:', test_accuracy)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Result table for the simple NaiveBayes model**:\n",
+ "\n",
+ "<table>\n",
+ "  <tr><th></th><th>Validation</th><th>Test</th></tr>\n",
+ "  <tr><td>Accuracy</td><td>79.8%</td><td>78.5%</td></tr>\n",
+ "</table>"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Logarithm trick\n",
+ "\n",
+ "*Note:* when a document contains many words, the product for P(text|class) underflows to zero because of the limits of Python floats. The natural-logarithm trick replaces the product with a sum, which is all we need to compare classes:\n",
+ "\n",
+ "log(P(class|text)) ∝ log(P(class)) + log(P(word_1|class)) + … + log(P(word_n|class))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 138,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "validation accuracy with logarithm trick: 0.9550561797752809\n"
+ ]
+ }
+ ],
+ "source": [
+ "validation_accuracy_with_log_trick = nb.fit(train_corpus, True)\n",
+ "print('validation accuracy with logarithm trick:', validation_accuracy_with_log_trick)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 139,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "list_of_messages = test_corpus['messages'].values.tolist()\n",
+ "predicted_corpus = nb.predict(list_of_messages, True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 140,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "test accuracy with logarithm trick: 0.9282511210762332\n"
+ ]
+ }
+ ],
+ "source": [
+ "predicted = predicted_corpus.loc[:,'predicted classes'].values\n",
+ "real = test_corpus.loc[:,'labels'].values\n",
+ "test_accuracy_with_log_trick = nb.calc_accuracy(predicted, real)\n",
+ "print('test accuracy with logarithm trick:', test_accuracy_with_log_trick)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Result table for the NaiveBayes model with the logarithm trick**:\n",
+ "\n",
+ "<table>\n",
+ "  <tr><th></th><th>Validation</th><th>Test</th></tr>\n",
+ "  <tr><td>Accuracy</td><td>95.5%</td><td>92.8%</td></tr>\n",
+ "</table>"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 141,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "class NaiveBayesTfIdf(NaiveBayes):\n",
+ "    # predict is inherited from NaiveBayes; only the per-text scoring changes\n",
+ "\n",
+ "    def predictOne(self, text, withLogarithmTrick = False):\n",
+ "        likelihoods_of_classes = pd.DataFrame(data=np.zeros((len(self.classes), 1)),\n",
+ "                                              index=self.classes,\n",
+ "                                              columns=[\"likelihood\"])\n",
+ "        \n",
+ "        tokens = tokenize_and_normalize(text)\n",
+ "        \n",
+ "        for clss in self.classes:\n",
+ "            all_docs_in_class = self.corpus[self.corpus['labels'] == clss]\n",
+ "            class_prior = all_docs_in_class.shape[0] / self.corpus.shape[0]\n",
+ "            # tokenize every document of the class once, for document-frequency counts\n",
+ "            docs_tokens = [tokenize_and_normalize(m) for m in all_docs_in_class['messages'].values.tolist()]\n",
+ "            probability = class_prior\n",
+ "            \n",
+ "            for token in tokens:\n",
+ "                # term frequency of the token within the classified text\n",
+ "                freq_w = tokens.count(token) / len(tokens)\n",
+ "                # number of class documents containing the token (+1 to avoid log(0))\n",
+ "                count_text_token = 1 + sum(token in doc for doc in docs_tokens)\n",
+ "                IDF = np.log(all_docs_in_class.shape[0] / count_text_token)\n",
+ "                probability *= (freq_w * IDF)\n",
+ "            \n",
+ "            likelihoods_of_classes.loc[clss] = probability\n",
+ "        \n",
+ "        predicted_class = likelihoods_of_classes.idxmax(axis=0)\n",
+ "\n",
+ "        return (text, predicted_class.values[0], likelihoods_of_classes)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 143,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "validation accuracy with TF-IDF: 0.5714285714285714\n"
+ ]
+ }
+ ],
+ "source": [
+ "nbTfIdf = NaiveBayesTfIdf()\n",
+ "# split twice to keep only a small subset of the corpus (the prediction loop above is slow)\n",
+ "train_corpus, test_corpus = nb.split_corpus(corpus)\n",
+ "train_corpus, test_corpus = nb.split_corpus(test_corpus)\n",
+ "\n",
+ "validation_accuracy_tf_idf = nbTfIdf.fit(train_corpus)\n",
+ "print('validation accuracy with TF-IDF:', validation_accuracy_tf_idf)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 144,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "list_of_messages = test_corpus['messages'].values.tolist()\n",
+ "predicted_corpus_tf_idf = nbTfIdf.predict(list_of_messages, False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 146,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "test accuracy with TF-IDF: 0.4318181818181818\n"
+ ]
+ }
+ ],
+ "source": [
+ "predicted_tf_idf = predicted_corpus_tf_idf.loc[:,'predicted classes'].values\n",
+ "real = test_corpus.loc[:,'labels'].values\n",
+ "test_accuracy_tf_idf = nbTfIdf.calc_accuracy(predicted_tf_idf, real)\n",
+ "print('test accuracy with TF-IDF:', test_accuracy_tf_idf)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Result table for the NaiveBayes model with TF-IDF**:\n",
+ "\n",
+ "<table>\n",
+ "  <tr><th></th><th>Validation</th><th>Test</th></tr>\n",
+ "  <tr><td>Accuracy</td><td>57.1%</td><td>43.2%</td></tr>\n",
+ "</table>"
+ ]
+ },
" \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Train Test
Accuracy 57,1% 43.2%
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Naive_Bayes_text_classification/data.csv b/Naive_Bayes_text_classification/dataset/data.csv similarity index 100% rename from Naive_Bayes_text_classification/data.csv rename to Naive_Bayes_text_classification/dataset/data.csv diff --git a/Naive_Bayes_text_classification/utils.py b/Naive_Bayes_text_classification/utils.py new file mode 100644 index 0000000..c3aca83 --- /dev/null +++ b/Naive_Bayes_text_classification/utils.py @@ -0,0 +1,54 @@ +import pandas as pd +import numpy as np +from sklearn.utils import shuffle as sklearn_shuffle + + +def load_dataset(): + data = pd.read_csv('dataset/data.csv',sep=',', header=None) + data.columns=['messages', 'labels'] + return (*split_dataset(data), data) + +def split_dataset(data): + dataset = data.copy(deep=True) + labels = dataset['labels'].unique() + test_datasets = pd.DataFrame() + train_datasets = pd.DataFrame() + + for label in labels: + ds_with_label = dataset[dataset['labels'] == label] + num_test = int(ds_with_label.shape[0]*0.2) + num_test = 1 if num_test < 1 and ds_with_label.shape[0] > 1 else num_test + ds_test = ds_with_label[:num_test] + ds_train = ds_with_label[num_test:] + test_datasets=pd.concat((test_datasets, ds_test)) + train_datasets=pd.concat((train_datasets, ds_train)) + + test_datasets = sklearn_shuffle(test_datasets, random_state=0) + train_datasets = sklearn_shuffle(train_datasets, random_state=0) + + train_messages = np.array(train_datasets['messages']).reshape(train_datasets.shape[0], 1) + train_labels = np.array(train_datasets['labels']).reshape(train_datasets.shape[0], 1) + test_messages = np.array(test_datasets['messages']).reshape(test_datasets.shape[0], 1) + test_labels = np.array(test_datasets['labels']).reshape(test_datasets.shape[0], 1) + + return train_messages, train_labels, test_messages, test_labels + +def split_dataset_data_frame(data): + dataset = data.copy(deep=True) + labels = dataset['labels'].unique() + test_datasets = pd.DataFrame() + train_datasets = pd.DataFrame() + + for label in labels: + ds_with_label = dataset[dataset['labels'] == label] + num_test = int(ds_with_label.shape[0]*0.2) + num_test = 1 if num_test < 1 and ds_with_label.shape[0] > 1 else num_test + ds_test = ds_with_label[:num_test] + ds_train = ds_with_label[num_test:] + test_datasets=pd.concat((test_datasets, ds_test)) + train_datasets=pd.concat((train_datasets, ds_train)) + + test_datasets = sklearn_shuffle(test_datasets, random_state=0) + train_datasets = sklearn_shuffle(train_datasets, random_state=0) + + return train_datasets, test_datasets