Research project for Hinglish Sentiment Analysis

okaditya84 · Nov 30, 2023 · 5dc2869 · 5dc2869
1 parent 199f160
commit 5dc2869
Show file tree

Hide file tree

Showing 29 changed files with 62,671 additions and 0 deletions.
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -0,0 +1,6 @@
+{
+    "[python]": {
+        "editor.defaultFormatter": "ms-python.black-formatter"
+    },
+    "python.formatting.provider": "none"
+}
diff --git a/Dataset making/Labelling.ipynb b/Dataset making/Labelling.ipynb
diff --git a/Dataset making/data.csv b/Dataset making/data.csv
diff --git a/Dataset making/same_sentiment_rows.csv b/Dataset making/same_sentiment_rows.csv
diff --git a/Dataset making/test.csv b/Dataset making/test.csv
diff --git a/Dataset making/test.ipynb b/Dataset making/test.ipynb
@@ -0,0 +1,213 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "data=pd.read_csv(\"train.csv\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df=pd.DataFrame(data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers import pipeline\n",
+    "\n",
+    "sentiment_analysis = pipeline('sentiment-analysis', model='cardiffnlp/twitter-roberta-base-sentiment-latest')\n",
+    "\n",
+    "sentiments = []\n",
+    "for text in data['english']:\n",
+    "    result = sentiment_analysis(text)\n",
+    "    sentiments.append(result[0]['label'])\n",
+    "\n",
+    "data['sentiment'] = sentiments\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sentiments = []\n",
+    "for text in data['english']:\n",
+    "    result = sentiment_analysis(text)\n",
+    "    sentiments.append(result[0]['label'])\n",
+    "\n",
+    "data['sentiment'] = sentiments\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "# read the csv files\n",
+    "train_df = pd.read_csv('train.csv')\n",
+    "test_df = pd.read_csv('test.csv')\n",
+    "val_df = pd.read_csv('valid.csv')\n",
+    "\n",
+    "# concatenate the dataframes\n",
+    "data_df = pd.concat([train_df, test_df, val_df], ignore_index=True)\n",
+    "\n",
+    "# write the merged dataframe to a new csv file\n",
+    "data_df.to_csv('data.csv', index=False)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers import pipeline\n",
+    "\n",
+    "sentiment_analysis = pipeline('sentiment-analysis', model='cardiffnlp/twitter-roberta-base-sentiment-latest')\n",
+    "\n",
+    "sentiments = []\n",
+    "for text in data['English']:\n",
+    "    result = sentiment_analysis(text)\n",
+    "    sentiments.append(result[0]['label'])\n",
+    "\n",
+    "data.rename(columns={'sentiment': 'twitter_roberta'}, inplace=True)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# import required libraries\n",
+    "import pandas as pd\n",
+    "from transformers import pipeline\n",
+    "\n",
+    "# read the data.csv file\n",
+    "data_df = pd.read_csv('data.csv')\n",
+    "\n",
+    "# create a sentiment analysis pipeline using the beto model\n",
+    "sentiment_analysis = pipeline('sentiment-analysis', model='finiteautomata/beto-sentiment-analysis')\n",
+    "\n",
+    "# predict the sentiment for each text in the english column\n",
+    "sentiments = []\n",
+    "for text in data_df['English']:\n",
+    "    result = sentiment_analysis(text)\n",
+    "    sentiments.append(result[0]['label'])\n",
+    "\n",
+    "# add the predicted sentiments to the dataframe\n",
+    "data_df['beto_sentiment'] = sentiments\n",
+    "\n",
+    "# write the updated dataframe to a new csv file\n",
+    "data_df.to_csv('data_with_beto_sentiment.csv', index=False)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install vaderSentiment\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# import the required libraries\n",
+    "from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer\n",
+    "\n",
+    "# create an instance of the SentimentIntensityAnalyzer class\n",
+    "analyzer = SentimentIntensityAnalyzer()\n",
+    "\n",
+    "# predict the sentiment for each text in the english column\n",
+    "sentiments = []\n",
+    "for text in data_df['English']:\n",
+    "    result = analyzer.polarity_scores(text)\n",
+    "    compound_score = result['compound']\n",
+    "    if compound_score > 0.05:\n",
+    "        sentiments.append('positive')\n",
+    "    elif compound_score < -0.05:\n",
+    "        sentiments.append('negative')\n",
+    "    else:\n",
+    "        sentiments.append('neutral')\n",
+    "\n",
+    "# add the predicted sentiments to the dataframe\n",
+    "data_df['vader_sentiment'] = sentiments\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "import pandas as pd\n",
+    "\n",
+    "    # read the data.csv file\n",
+    "data_df = pd.read_csv('data.csv')\n",
+    "\n",
+    "# create a new dataframe with only the required columns\n",
+    "new_df = data_df[['English', 'twitter_roberta', 'beto_sentiment', 'vader_sentiment']]\n",
+    "\n",
+    "new_df = new_df[new_df['twitter_roberta'] == new_df['beto_sentiment']]\n",
+    "new_df = new_df[new_df['twitter_roberta'] == new_df['vader_sentiment']]\n",
+    "\n",
+    "    # write the updated dataframe to a new csv file\n",
+    "new_df.to_csv('same_sentiment_rows.csv', index=False)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#read data.csv and print the first 5 rows\n",
+    "data_df = pd.read_csv('data.csv')\n",
+    "print(data_df.head())"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}