added arabic graph files + topic assignment notebook

yasmeenhany · Jun 6, 2019 · 0fc95f6 · 0fc95f6
1 parent 17766a1
commit 0fc95f6
Show file tree

Hide file tree

Showing 27 changed files with 98,910 additions and 0 deletions.
diff --git a/.ipynb_checkpoints/arabic_topic_extraction-checkpoint.ipynb b/.ipynb_checkpoints/arabic_topic_extraction-checkpoint.ipynb
@@ -0,0 +1,323 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import arabic_reshaper"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# The following 8 cells define the keyword lists for each topic (t1 to t8)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "t= [\"ﻧﻴﻜﻰ ﻫﺎﻳﻠﻰ\",\"ﺍﻷﻣﺮﻳﻜﻲ\",\"ﺍﻷﻣﻴﺮﻛﻲ\",\"ﺍﻻﻣﺮﻳﻜﻲ\",\"أمريكان\",\"امريكان\",\"أمريكي\",\"امريكي\",\"امريكا\", \"أمريكا\", \"ترامب\", \"ترمب\", \"تدخل\", \"جاريد\" , \"ايفانكا\",\"دونالد\",\"ﺍﻷﻣﺮﻳﻜﻴﺔ\", \"ﺍﻟﻮﻻﻳﺎﺕ ﺍﻟﻤﺘﺤﺪﺓ\"]\n",
+    "# Build the America keyword list the same way as the other topics: reshape every\n",
+    "# keyword in t so it is compared in the same form as the reshaped tweet text.\n",
+    "# Previously t was never used and t1 only contained \"trump\".\n",
+    "t1 = [arabic_reshaper.reshape(keyword) for keyword in t]\n",
+    "# Keep the Latin-script keyword that the list originally held.\n",
+    "t1.append(\"trump\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "14\n"
+     ]
+    }
+   ],
+   "source": [
+    "t= [\"الحرة\",\"حرة\",\"ترجع\",\"أبدية\",\"خالدة\",\"ﻠﻦ ﺗﺴﻘﻂ ﺍﻟﻘﺪﺱ\",\"القدس لنا\",\"العودة\",\"العوده\",\"سنعود\",\"ستعود\",\"راجعين\",\"حق\",\"ستعود\"]\n",
+    "# Reshape each keyword of this topic and collect the results in t2.\n",
+    "t2 = [arabic_reshaper.reshape(keyword) for keyword in t]\n",
+    "print(len(t2))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "39\n"
+     ]
+    }
+   ],
+   "source": [
+    "t= [\"كعرب\",\"سعود\",\"ﺧﺎﺩﻡ ﺍﻟﺤﺮﻣﻴﻦ\",\"ﺍﻟﺴﻴﺴﻲ\",\"أردوغان\",\"اردوغان\",\"اسطنبول\",\"ﺇﺳﻄﻨﺒﻮﻝ\",\"ﺇﺳﻄﻨﺒﻮﻝ\",\"ﺍﻟﻌﺮﺑﻴﻪ\",\"عربي\",\"عربيه\",\"ﻋﺮﺑﻴﺔ\",\"سوري\",\"ﺍﻟﻜﻮﻳﺖ\",\"ﺍﻟﺴﻌﻮﺩﻳﻪ\",\" ﺳﻌﻮﺩﻱ\",\"جزائر\",\"ﺟﺰﺍﺋﺮﻳﺔ\",\"إيران\",\"ايران\",\"تركيا\",\"ﺍﻟﻌﺮﺑﻴﺔ\",\"القاهرة\",\"مصر\",\"طبعوا\",\"التطبيع\",\"سلمان\",\"محمد بن\",\"العربي\",\"القطري\",\"اليمن\",\"قطر\",\"السعودية\",\"حكام\",\"الشعوب العربية\",\"العرب\", \"السعودي\",\"المملكة\"]\n",
+    "# Reshape each keyword of this topic and collect the results in t3.\n",
+    "t3 = list(map(arabic_reshaper.reshape, t))\n",
+    "print(len(t3))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "t= [\"بالحجارة\",\"الحجارة\",\"الرصاص\",\"الحجر\",\"الجهاد\",\"غزه\",\"غزة\",\"إطلاق\",\"العنف\",\"اطلاق\",\"ﺍﻃﻼﻕ ﻧﺎﺭ\",\"احتشاد\",\"ﻣﺴﻴﺮﺓ\",\"الشهداء\",\"جنود\",\"جرائم\",\"ﺟﺮﻳﻤﺔ\",\"غزة\",\"غارات\",\"هدم\",\"قتل\",\"ﻣﻘﺎﺗﻼﺕ\",\"ﺇﻧﺬﺍﺭ\",\"مجزرة\",\"مصابين\",\"قصف\",\"تقصف\",\"صاروخية\",\"دماء\",\"قذيفة\",\"ﺻﻔﺎﺭﺍﺕ ﺍﻹﻧﺬﺍﺭ\",\"قوات\",\"المقاومة\",\"صواريخ\",\"مجاهدين\",\"قصف\",\"موت\",\"جرحى\",\"إصابه\",\"اصابه\",\"إصابة\",\"اصابة\",\"مقتل\", \"جرحى\", \"قتلى\",\"قتلي\",\"اشتباك\",\"اشتباكات\",\"مصاب\", \"شهيد\", \"شهداء\", \"القصف\", \"الغارات\",\"المجاهدين\", \"عسكر\", \"صافرات\", \"تظاهر\"]\n",
+    "# Reshape each keyword of this topic and collect the results in t4.\n",
+    "t4 = [arabic_reshaper.reshape(keyword) for keyword in t]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "14\n"
+     ]
+    }
+   ],
+   "source": [
+    "t= [\"ثالث الحرمين\",\"الوكيل\",\"ونعم\",\"حسبي\",\"لله\",\"المسلمون\",\"المسلمين\",\"مسلمون\",\"رمضان\",\"رب\",\"اللهم\", \"الله\", \"ربنا\", \"مسلمين\"]\n",
+    "# Reshape each keyword like every other topic does. The tweet text is reshaped\n",
+    "# before matching, so keywords left in their raw form never match it\n",
+    "# (the recorded run found 0 tweets for this topic).\n",
+    "t5 = [arabic_reshaper.reshape(keyword) for keyword in t]\n",
+    "print(len(t5))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "t= [\"سيدي\",\"ﺍﻻﻓﻄﺎﺭ\",\"سنفطر\",\"إعلان_زين\",\"اعلان\",\"زين\",\"إعلان\"]\n",
+    "# Reshape each keyword of this topic and collect the results in t6.\n",
+    "t6 = list(map(arabic_reshaper.reshape, t))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "t= [\"المتصهين\",\"ﺍﻟﻤﺘﺼﻬﻴﻨﻴﻦ\",\"ﺑﺎﻟﺼﻬﺎﻳﻨﻪ\",\"ﺑﺎﻟﺼﻬﺎﻳﻨﻪ\",\"ﻟﺼﻬﺎﻳﻨﺔ\",\"ﺍﻟﺼﻬﻴﻮﻧﻰ\",\"ﺍﻟﺼﻬﻴﻮﻧﻴﺔ\",\"ﺍﻟﺼﻬﺎﻳﻨﺔ\",\"ﺍﻟﺼﻬﺎﻳﻨﻪ\",\"صهاينه\",\"ﺍﻟﺼﻬﻴﻮﻧﻲ\",\"صهيونية\",\"صهيوني\",\"صهاينة\"]\n",
+    "# Reshape each keyword of this topic and collect the results in t7.\n",
+    "t7 = [arabic_reshaper.reshape(keyword) for keyword in t]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "t = [\"الأطفال\",\"اطفالكم\",\"الاطفال\",\"أطفال\",\"اطفال\",\"طفل\"]\n",
+    "# Reshape each keyword of this topic and collect the results in t8.\n",
+    "t8 = list(map(arabic_reshaper.reshape, t))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# The following cell maps tweets to topics based on keyword presence"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0\n",
+      "41\n",
+      "310\n",
+      "231\n",
+      "0\n",
+      "124\n",
+      "49\n",
+      "16\n",
+      "536\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Load one slice of the dataset and tag each row with where it came from.\n",
+    "# NOTE(review): read_pickle can run arbitrary code, so only load trusted files.\n",
+    "df = pd.read_pickle(\"dataset/arabic/arabic_repliedto\")\n",
+    "df[\"type\"] = \"repliedto\"\n",
+    "#                       ----- uncomment the df we want to create mappings for\n",
+    "\n",
+    "# df = pd.read_pickle(\"dataset/arabic/arabic_replies\")\n",
+    "# df[\"type\"] = \"reply\"\n",
+    "# df = pd.read_pickle(\"dataset/arabic/arabic_tweets\")\n",
+    "# df[\"type\"] = \"tweet\"\n",
+    "\n",
+    "# the following block is run only for the first time, then is commented for rest of dfs (i.e: replies and tweets)\n",
+    "america_df = pd.DataFrame()\n",
+    "ret_df = pd.DataFrame()\n",
+    "arab_df = pd.DataFrame()\n",
+    "violence_df = pd.DataFrame()\n",
+    "religion_df = pd.DataFrame()\n",
+    "zein_df = pd.DataFrame()\n",
+    "zionism_df = pd.DataFrame()\n",
+    "children_df = pd.DataFrame()\n",
+    "other_df = pd.DataFrame()\n",
+    "\n",
+    "# Keyword list of every topic, in the order the counts are printed below.\n",
+    "topic_keywords = {\n",
+    "    \"america\": t1,\n",
+    "    \"ret\": t2,\n",
+    "    \"arab\": t3,\n",
+    "    \"violence\": t4,\n",
+    "    \"religion\": t5,\n",
+    "    \"zein\": t6,\n",
+    "    \"zionism\": t7,\n",
+    "    \"children\": t8,\n",
+    "}\n",
+    "\n",
+    "# Reshape every tweet once so it is compared in the same form as the keywords.\n",
+    "reshaped_text = df[\"Text\"].map(lambda raw_text: arabic_reshaper.reshape(str(raw_text)))\n",
+    "\n",
+    "# One boolean column per topic: True when the tweet contains at least one keyword.\n",
+    "# any() records a tweet once per topic even when several keywords match, so the\n",
+    "# lists and frames no longer hold one duplicate per extra matching keyword.\n",
+    "topic_hits = pd.DataFrame(\n",
+    "    {\n",
+    "        topic: [any(keyword in text for keyword in keywords) for text in reshaped_text]\n",
+    "        for topic, keywords in topic_keywords.items()\n",
+    "    },\n",
+    "    index=df.index,\n",
+    "    dtype=bool,\n",
+    ")\n",
+    "# Tweets that matched no topic at all go to the catch-all bucket.\n",
+    "topic_hits[\"other\"] = ~topic_hits.any(axis=1)\n",
+    "\n",
+    "\n",
+    "def collect_topic(topic, existing_rows):\n",
+    "    \"\"\"Return the matched texts of `topic` and `existing_rows` extended with its rows.\n",
+    "\n",
+    "    The rows are added with a single concat instead of DataFrame.append inside the\n",
+    "    row loop, which copied the whole frame on every match and no longer exists in\n",
+    "    pandas 2.x.\n",
+    "    \"\"\"\n",
+    "    # Positional mask, so selection does not depend on the index labels of df.\n",
+    "    hit = topic_hits[topic].values\n",
+    "    matched_texts = reshaped_text[hit].tolist()\n",
+    "    extended_rows = pd.concat([existing_rows, df[hit]], ignore_index=True, sort=False)\n",
+    "    return matched_texts, extended_rows\n",
+    "\n",
+    "\n",
+    "america, america_df = collect_topic(\"america\", america_df)\n",
+    "ret, ret_df = collect_topic(\"ret\", ret_df)\n",
+    "arab, arab_df = collect_topic(\"arab\", arab_df)\n",
+    "violence, violence_df = collect_topic(\"violence\", violence_df)\n",
+    "religion, religion_df = collect_topic(\"religion\", religion_df)\n",
+    "zein, zein_df = collect_topic(\"zein\", zein_df)\n",
+    "zionism, zionism_df = collect_topic(\"zionism\", zionism_df)\n",
+    "children, children_df = collect_topic(\"children\", children_df)\n",
+    "other, other_df = collect_topic(\"other\", other_df)\n",
+    "\n",
+    "# Number of tweets assigned to each topic for the slice loaded above.\n",
+    "for topic_texts in (america, ret, arab, violence, religion, zein, zionism, children, other):\n",
+    "    print(len(topic_texts))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Remove the row labelled 0 from other_df.\n",
+    "# NOTE(review): the reason for dropping this row is not visible here - confirm it is\n",
+    "# still wanted. drop(0) raises KeyError when other_df has no row labelled 0.\n",
+    "other_df = other_df.drop(0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Write each topic frame to its own pickle file, in the same order as before.\n",
+    "topic_outputs = [\n",
+    "    (america_df, \"arabic_topic_dfs/america_df\"),\n",
+    "    (ret_df, \"arabic_topic_dfs/return_df\"),\n",
+    "    (arab_df, \"arabic_topic_dfs/arab_df\"),\n",
+    "    (violence_df, \"arabic_topic_dfs/violence_df\"),\n",
+    "    (religion_df, \"arabic_topic_dfs/religion_df\"),\n",
+    "    (zein_df, \"arabic_topic_dfs/zein_df\"),\n",
+    "    (zionism_df, \"arabic_topic_dfs/zionism_df\"),\n",
+    "    (children_df, \"arabic_topic_dfs/children_df\"),\n",
+    "    (other_df, \"arabic_topic_dfs/other_df\"),\n",
+    "]\n",
+    "for topic_frame, pickle_path in topic_outputs:\n",
+    "    topic_frame.to_pickle(pickle_path)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}