-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
added arabic graph files + topic assignment notebook
- Loading branch information
1 parent
17766a1
commit 0fc95f6
Showing
27 changed files
with
98,910 additions
and
0 deletions.
There are no files selected for viewing
323 changes: 323 additions & 0 deletions
323
.ipynb_checkpoints/arabic_topic_extraction-checkpoint.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,323 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import pandas as pd\n", | ||
"import arabic_reshaper" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"# The following 9 cells include the topic keyword arrays" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 12, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"t= [\"ﻧﻴﻜﻰ ﻫﺎﻳﻠﻰ\",\"ﺍﻷﻣﺮﻳﻜﻲ\",\"ﺍﻷﻣﻴﺮﻛﻲ\",\"ﺍﻻﻣﺮﻳﻜﻲ\",\"أمريكان\",\"امريكان\",\"أمريكي\",\"امريكي\",\"امريكا\", \"أمريكا\", \"ترامب\", \"ترمب\", \"تدخل\", \"جاريد\" , \"ايفانكا\",\"دونالد\",\"ﺍﻷﻣﺮﻳﻜﻴﺔ\", \"ﺍﻟﻮﻻﻳﺎﺕ ﺍﻟﻤﺘﺤﺪﺓ\"]\n", | ||
"t1 = []\n", | ||
"t1.append(\"trump\")\n", | ||
"# print(len(t))\n", | ||
"# for i in t:\n", | ||
"# print(i)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 11, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"14\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"t= [\"الحرة\",\"حرة\",\"ترجع\",\"أبدية\",\"خالدة\",\"ﻠﻦ ﺗﺴﻘﻂ ﺍﻟﻘﺪﺱ\",\"القدس لنا\",\"العودة\",\"العوده\",\"سنعود\",\"ستعود\",\"راجعين\",\"حق\",\"ستعود\"]\n", | ||
"t2 = []\n", | ||
"for word in t:\n", | ||
" word = arabic_reshaper.reshape(word)\n", | ||
" t2.append(word)\n", | ||
"print(len(t2))\n", | ||
"# for i in t:\n", | ||
"# print(i)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 10, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"39\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"t= [\"كعرب\",\"سعود\",\"ﺧﺎﺩﻡ ﺍﻟﺤﺮﻣﻴﻦ\",\"ﺍﻟﺴﻴﺴﻲ\",\"أردوغان\",\"اردوغان\",\"اسطنبول\",\"ﺇﺳﻄﻨﺒﻮﻝ\",\"ﺇﺳﻄﻨﺒﻮﻝ\",\"ﺍﻟﻌﺮﺑﻴﻪ\",\"عربي\",\"عربيه\",\"ﻋﺮﺑﻴﺔ\",\"سوري\",\"ﺍﻟﻜﻮﻳﺖ\",\"ﺍﻟﺴﻌﻮﺩﻳﻪ\",\" ﺳﻌﻮﺩﻱ\",\"جزائر\",\"ﺟﺰﺍﺋﺮﻳﺔ\",\"إيران\",\"ايران\",\"تركيا\",\"ﺍﻟﻌﺮﺑﻴﺔ\",\"القاهرة\",\"مصر\",\"طبعوا\",\"التطبيع\",\"سلمان\",\"محمد بن\",\"العربي\",\"القطري\",\"اليمن\",\"قطر\",\"السعودية\",\"حكام\",\"الشعوب العربية\",\"العرب\", \"السعودي\",\"المملكة\"]\n", | ||
"t3 = []\n", | ||
"for word in t:\n", | ||
" word = arabic_reshaper.reshape(word)\n", | ||
" t3.append(word)\n", | ||
"print(len(t3))\n", | ||
"# for i in t:\n", | ||
"# print(i)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 7, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"t= [\"بالحجارة\",\"الحجارة\",\"الرصاص\",\"الحجر\",\"الجهاد\",\"غزه\",\"غزة\",\"إطلاق\",\"العنف\",\"اطلاق\",\"ﺍﻃﻼﻕ ﻧﺎﺭ\",\"احتشاد\",\"ﻣﺴﻴﺮﺓ\",\"الشهداء\",\"جنود\",\"جرائم\",\"ﺟﺮﻳﻤﺔ\",\"غزة\",\"غارات\",\"هدم\",\"قتل\",\"ﻣﻘﺎﺗﻼﺕ\",\"ﺇﻧﺬﺍﺭ\",\"مجزرة\",\"مصابين\",\"قصف\",\"تقصف\",\"صاروخية\",\"دماء\",\"قذيفة\",\"ﺻﻔﺎﺭﺍﺕ ﺍﻹﻧﺬﺍﺭ\",\"قوات\",\"المقاومة\",\"صواريخ\",\"مجاهدين\",\"قصف\",\"موت\",\"جرحى\",\"إصابه\",\"اصابه\",\"إصابة\",\"اصابة\",\"مقتل\", \"جرحى\", \"قتلى\",\"قتلي\",\"اشتباك\",\"اشتباكات\",\"مصاب\", \"شهيد\", \"شهداء\", \"القصف\", \"الغارات\",\"المجاهدين\", \"عسكر\", \"صافرات\", \"تظاهر\"]\n", | ||
"t4 = []\n", | ||
"for word in t:\n", | ||
" word = arabic_reshaper.reshape(word)\n", | ||
" t4.append(word)\n", | ||
"# print(len(t4))\n", | ||
"# for i in t:\n", | ||
"# print(i + \",\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 6, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"14\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"t= [\"ثالث الحرمين\",\"الوكيل\",\"ونعم\",\"حسبي\",\"لله\",\"المسلمون\",\"المسلمين\",\"مسلمون\",\"رمضان\",\"رب\",\"اللهم\", \"الله\", \"ربنا\", \"مسلمين\"]\n", | ||
"t5 = []\n", | ||
"for word in t:\n", | ||
"# word = arabic_reshaper.reshape(word)\n", | ||
" t5.append(word)\n", | ||
"print(len(t5))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 5, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"t= [\"سيدي\",\"ﺍﻻﻓﻄﺎﺭ\",\"سنفطر\",\"إعلان_زين\",\"اعلان\",\"زين\",\"إعلان\"]\n", | ||
"t6 = []\n", | ||
"for word in t:\n", | ||
" word = arabic_reshaper.reshape(word)\n", | ||
" t6.append(word)\n", | ||
"# print(len(t6))\n", | ||
"# for i in t:\n", | ||
"# print(i)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"t= [\"المتصهين\",\"ﺍﻟﻤﺘﺼﻬﻴﻨﻴﻦ\",\"ﺑﺎﻟﺼﻬﺎﻳﻨﻪ\",\"ﺑﺎﻟﺼﻬﺎﻳﻨﻪ\",\"ﻟﺼﻬﺎﻳﻨﺔ\",\"ﺍﻟﺼﻬﻴﻮﻧﻰ\",\"ﺍﻟﺼﻬﻴﻮﻧﻴﺔ\",\"ﺍﻟﺼﻬﺎﻳﻨﺔ\",\"ﺍﻟﺼﻬﺎﻳﻨﻪ\",\"صهاينه\",\"ﺍﻟﺼﻬﻴﻮﻧﻲ\",\"صهيونية\",\"صهيوني\",\"صهاينة\"]\n", | ||
"t7 = []\n", | ||
"for word in t:\n", | ||
" word = arabic_reshaper.reshape(word)\n", | ||
" t7.append(word)\n", | ||
"# print(len(t7))\n", | ||
"# for i in t:\n", | ||
"# print(i + \",\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"t = [\"الأطفال\",\"اطفالكم\",\"الاطفال\",\"أطفال\",\"اطفال\",\"طفل\"]\n", | ||
"t8 = []\n", | ||
"for word in t:\n", | ||
" word = arabic_reshaper.reshape(word)\n", | ||
" t8.append(word)\n", | ||
"# print(len(t8))\n", | ||
"# for i in t:\n", | ||
"# print(i + \",\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"# The following cell maps tweets to topics based on keyword presence" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 13, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"0\n", | ||
"41\n", | ||
"310\n", | ||
"231\n", | ||
"0\n", | ||
"124\n", | ||
"49\n", | ||
"16\n", | ||
"536\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"df = pd.read_pickle(\"dataset/arabic/arabic_repliedto\")\n", | ||
"df[\"type\"] = \"repliedto\"\n", | ||
"# ----- uncomment the df we want to create mappings for\n", | ||
"\n", | ||
"# df = pd.read_pickle(\"dataset/arabic/arabic_replies\")\n", | ||
"# df[\"type\"] = \"reply\"\n", | ||
"# df = pd.read_pickle(\"dataset/arabic/arabic_tweets\")\n", | ||
"# df[\"type\"] = \"tweet\"\n", | ||
"america = []\n", | ||
"ret = []\n", | ||
"arab = []\n", | ||
"violence = []\n", | ||
"religion = []\n", | ||
"zein = []\n", | ||
"zionism = []\n", | ||
"children = []\n", | ||
"other = []\n", | ||
"# the following block is run only for the first time, then is commented for rest of dfs (i.e: replies and tweets)\n", | ||
"america_df = pd.DataFrame()\n", | ||
"ret_df = pd.DataFrame()\n", | ||
"arab_df = pd.DataFrame()\n", | ||
"violence_df = pd.DataFrame()\n", | ||
"religion_df = pd.DataFrame()\n", | ||
"zein_df = pd.DataFrame()\n", | ||
"zionism_df = pd.DataFrame()\n", | ||
"children_df = pd.DataFrame()\n", | ||
"other_df = pd.DataFrame()\n", | ||
"\n", | ||
"for i, row in df.head(len(df)).iterrows():\n", | ||
"# text = str(row[\"Text\"][2:len(row[\"Text\"]) - 2])\n", | ||
" text = arabic_reshaper.reshape(str(row[\"Text\"]))\n", | ||
" for j in range(len(t1)):\n", | ||
" if(t1[j] in text ):\n", | ||
" america.append(text)\n", | ||
" america_df = america_df.append(pd.Series(row), ignore_index = True)\n", | ||
" for k in range(len(t2)):\n", | ||
" if(t2[k] in text ):\n", | ||
" ret.append(text)\n", | ||
" ret_df = ret_df.append(pd.Series(row), ignore_index = True)\n", | ||
" for l in range(len(t3)):\n", | ||
" if(t3[l] in text ):\n", | ||
" arab.append(text)\n", | ||
" arab_df = arab_df.append(pd.Series(row), ignore_index = True)\n", | ||
" for m in range(len(t4)): \n", | ||
" if(t4[m] in text ):\n", | ||
" violence.append(text)\n", | ||
" violence_df = violence_df.append(pd.Series(row), ignore_index = True)\n", | ||
" for n in range(len(t5)):\n", | ||
" if(t5[n] in text ):\n", | ||
" religion.append(text)\n", | ||
" religion_df = religion_df.append(pd.Series(row), ignore_index = True)\n", | ||
" for o in range(len(t6)):\n", | ||
" if(t6[o] in text ):\n", | ||
" zein.append(text)\n", | ||
" zein_df = zein_df.append(pd.Series(row), ignore_index = True)\n", | ||
" for p in range(len(t7)):\n", | ||
" if(t7[p] in text ):\n", | ||
" zionism.append(text)\n", | ||
" zionism_df = zionism_df.append(pd.Series(row), ignore_index = True)\n", | ||
" for q in range(len(t8)):\n", | ||
" if(t8[q] in text ):\n", | ||
" children.append(text)\n", | ||
" children_df = children_df.append(pd.Series(row), ignore_index = True)\n", | ||
" if text not in america and text not in ret and text not in arab and text not in violence and text not in religion and text not in zein and text not in zionism and text not in children:\n", | ||
" other.append(text)\n", | ||
" other_df = other_df.append(pd.Series(row), ignore_index = True)\n", | ||
" \n", | ||
"print(len(america))\n", | ||
"print(len(ret))\n", | ||
"print(len(arab))\n", | ||
"print(len(violence))\n", | ||
"print(len(religion))\n", | ||
"print(len(zein))\n", | ||
"print(len(zionism))\n", | ||
"print(len(children))\n", | ||
"print(len(other))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"other_df = other_df.drop(0)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"america_df.to_pickle(\"arabic_topic_dfs/america_df\")\n", | ||
"ret_df.to_pickle(\"arabic_topic_dfs/return_df\")\n", | ||
"arab_df.to_pickle(\"arabic_topic_dfs/arab_df\")\n", | ||
"violence_df.to_pickle(\"arabic_topic_dfs/violence_df\")\n", | ||
"religion_df.to_pickle(\"arabic_topic_dfs/religion_df\")\n", | ||
"zein_df.to_pickle(\"arabic_topic_dfs/zein_df\")\n", | ||
"zionism_df.to_pickle(\"arabic_topic_dfs/zionism_df\")\n", | ||
"children_df.to_pickle(\"arabic_topic_dfs/children_df\")\n", | ||
"other_df.to_pickle(\"arabic_topic_dfs/other_df\")" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.7.1" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
Oops, something went wrong.