From 6509529ed52f1ffd0a009109ba9cd671e874f7e1 Mon Sep 17 00:00:00 2001 From: Fahim Dalvi Date: Wed, 6 Sep 2023 15:31:01 +0300 Subject: [PATCH] Reorganize assets and unify naming scheme (#191) A new naming scheme roughly corresponding to `language_code/task_category/task/dataset_model_nshot.py` is introduced in this commit. Model/Dataset/Task names have also been standardized, along with removal of some duplicate assets. * Reorganize assets and unify naming scheme * Remove duplicate GPT4 propaganda asset * Remove duplicate GPT3.5 harmfulness detection asset * Remove duplicate GPT3.5 claim detection asset, and unify naming scheme for others * Add missing Lemmatization assets for BLOOMZ and GPT4 * Rename gender assets to remove redundant 'Gender' prefix * Rename CT22/CT23/Propaganda datasets * Fix language codes and format code * Fix HateSpeech and Offensive dataset names * Fix Parsing citations and dataset name * Fix ArSAS dataset name * Fix ADI dataset name * Fix default label type in Propaganda task * Fix incorrect Dataset in Harmful GPT4 asset * Rename datasets, assets and tasks for stance and factuality. 
* Fix Lemmatization dataset name and citation * Format code * Fix Diacritization dataset name and citation * Add Dialectal Diacritization dataset and asset * Add GPT4 diacritization assets * Split segmentation assets across correct datasets * Split POS assets across correct datasets and add proper citations * Merged similar tasks and updated assets * Format code * Rename STS assets to match convention --------- Co-authored-by: maramhasanain --- .../MT/AraBench_ar2en_BLOOMZ_ZeroShot.py} | 0 .../MT/AraBench_ar2en_GPT35_ZeroShot.py} | 0 .../MT/AraBench_ar2en_GPT4_ZeroShot.py} | 0 .../QA/ARCD_BLOOMZ_ZeroShot.py | 0 .../QA/ARCD_GPT35_ZeroShot.py} | 0 .../QA/ARCD_GPT4_FewShot.py} | 0 .../QA/ARCD_GPT4_ZeroShot.py} | 0 .../QA/MLQA_BLOOMZ_ZeroShot.py | 0 .../QA/MLQA_GPT35_ZeroShot.py} | 0 .../QA/MLQA_GPT4_FewShot.py} | 0 .../QA/MLQA_GPT4_ZeroShot.py} | 0 .../QA/TyDiQA_BLOOMZ_ZeroShot.py | 0 .../QA/TyDiQA_GPT35_ZeroShot.py} | 0 .../QA/TyDiQA_GPT4_FewShot.py} | 0 .../QA/TydiQA_GPT4_ZeroShot.py} | 0 .../QA/XQuAD_BLOOMZ_ZeroShot.py | 0 .../QA/XQuAD_GPT35_ZeroShot.py} | 0 .../QA/XQuAD_GPT4_ZeroShot.py} | 0 .../QA/XQuaD_GPT4_FewShot.py} | 0 .../gender/ArabGend_BLOOMZ_ZeroShot.py} | 0 .../gender/ArabGend_GPT35_ZeroShot.py} | 0 .../gender/ArabGend_GPT4_ZeroShot.py} | 0 .../gender/ArapTweet_BLOOMZ_ZeroShot.py} | 0 .../gender/ArapTweet_GPT35_ZeroShot.py} | 0 .../gender/ArapTweet_GPT4_FewShot.py} | 0 .../gender/ArapTweet_GPT4_ZeroShot.py} | 0 .../location/Location_BLOOMZ_ZeroShot.py | 0 .../location/Location_GPT35_ZeroShot.py} | 0 .../location/Location_GPT4_FewShot.py} | 0 .../location/Location_GPT4_ZeroShot.py} | 0 .../name_info/NameInfo_BLOOMZ_ZeroShot.py | 0 .../name_info/NameInfo_GPT35_ZeroShot.py} | 0 .../name_info/NameInfo_GPT4_FewShot.py} | 0 .../name_info/NameInfo_GPT4_ZeroShot.py} | 0 .../Adult_BLOOMZ_ZeroShot.py | 0 .../Adult_GPT35_ZeroShot.py} | 0 .../Adult_GPT4_FewShot.py} | 0 .../Adult_GPT4_ZeroShot.py} | 0 .../CT22Attentionworthy_BLOOMZ_ZeroShot.py} | 4 +- 
.../CT22Attentionworthy_GPT35_ZeroShot.py} | 6 +- .../CT22Attentionworthy_GPT4_FewShot.py} | 9 +- .../CT22Attentionworthy_GPT4_ZeroShot.py} | 9 +- .../CT22Checkworthiness_BLOOMZ_ZeroShot.py} | 4 +- .../CT22Checkworthiness_GPT35_ZeroShot.py} | 4 +- .../CT22Checkworthiness_GPT4_FewShot.py} | 4 +- .../CT22Checkworthiness_GPT4_ZeroShot.py} | 4 +- .../CT22Claim_BLOOMZ_ZeroShot.py} | 4 +- .../CT22Claim_GPT35_ZeroShot.py} | 4 +- .../CT22Claim_GPT4_FewShot.py} | 4 +- .../CT22Claim_GPT4_ZeroShot.py} | 9 +- .../COVID19Factuality_BLOOMZ_ZeroShot.py} | 8 +- .../COVID19Factuality_GPT4_FewShot.py} | 13 +- .../COVID19Factuality_GPT4_ZeroShot.py} | 13 +- .../Khouja20Factuality_BLOOMZ_ZeroShot.py} | 13 +- .../Khouja20Factuality_GPT35_ZeroShot.py} | 10 +- .../Khouja20Factuality_GPT4_FewShot.py} | 8 +- .../Khouja20Factuality_GPT4_ZeroShot.py} | 8 +- .../UnifiedFCFactuality_BLOOMZ_ZeroShot.py} | 13 +- .../UnifiedFCFactuality_GPT4_FewShot.py} | 8 +- .../UnifiedFCFactuality_GPT4_ZeroShot.py} | 13 +- .../CT22Harmful_BLOOMZ_ZeroShot.py} | 4 +- .../CT22Harmful_GPT35_ZeroShot.py} | 4 +- .../CT22Harmful_GPT4_FewShot.py} | 4 +- .../CT22Harmful_GPT4_ZeroShot.py} | 4 +- .../OSACT4SubtaskB_GPT35_ZeroShot.py} | 4 +- .../OSACT4SubtaskB_GPT4_FewShot.py} | 4 +- .../OSACT4SubtaskA_BLOOMZ_ZeroShot.py} | 6 +- .../OSACT4SubtaskA_GPT35_ZeroShot.py} | 6 +- .../OSACT4SubtaskA_GPT4_FewShot.py} | 4 +- .../OSACT4SubtaskA_GPT4_ZeroShot.py} | 4 +- .../propaganda/WANLP22T3_BLOOMZ_ZeroShot.py} | 8 +- .../propaganda/WANLP22T3_GPT35_ZeroShot.py} | 8 +- .../propaganda/WANLP22T3_GPT4_FewShot.py} | 8 +- .../propaganda/WANLP22T3_GPT4_ZeroShot.py} | 8 +- .../spam/Spam_BLOOMZ_ZeroShot.py | 0 .../spam/Spam_GPT35_ZeroShot.py} | 0 .../spam/Spam_GPT4_ZeroShot.py} | 0 .../CT23Subjectivity_BLOOMZ_ZeroShot.py} | 4 +- .../CT23Subjectivity_GPT35_ZeroShot.py} | 4 +- .../CT23Subjectivity_GPT4_FewShot.py} | 4 +- .../CT23Subjectivity_GPT4_ZeroShot.py} | 4 +- .../ASND_BLOOMZ_ZeroShot.py} | 7 +- .../ASND_GPT35_ZeroShot.py} | 9 +- 
.../news_categorization/ASND_GPT4_FewShot.py} | 8 +- .../ASND_GPT4_ZeroShot.py} | 8 +- .../Akhbarona_BLOOMZ_ZeroShot.py} | 4 +- .../Akhbarona_GPT35_ZeroShot.py} | 4 +- .../Akhbarona_GPT4_FewShot.py} | 4 +- .../Akhbarona_GPT4_ZeroShot.py} | 4 +- .../AlArabiya_BLOOMZ_ZeroShot.py} | 4 +- .../AlArabiya_GPT35_ZeroShot.py} | 4 +- .../AlArabiya_GPT4_FewShot.py} | 4 +- .../AlArabiya_GPT4_ZeroShot.py} | 4 +- .../AlKhaleej_BLOOMZ_ZeroShot.py} | 4 +- .../AlKhaleej_GPT35_ZeroShot.py} | 4 +- .../AlKhaleej_GPT4_FewShot.py} | 4 +- .../AlKhaleej_GPT4_ZeroShot.py} | 4 +- .../semantics/NLI}/XNLI_BLOOMZ_ZeroShot.py | 0 .../semantics/NLI/XNLI_GPT35_ZeroShot.py} | 0 .../semantics/NLI/XNLI_GPT4_FewShot.py} | 0 .../semantics/NLI/XNLI_GPT4_ZeroShot.py} | 0 .../semantics}/STS/Q2QSim_BLOOMZ_ZeroShot.py | 0 .../semantics/STS/Q2QSim_GPT35_ZeroShot.py} | 0 .../semantics}/STS/Q2QSim_GPT4_FewShot.py | 0 .../semantics}/STS/Q2QSim_GPT4_ZeroShot.py | 0 .../STS/SemEval17T1STS_BLOOMZ_ZeroShot.py} | 8 +- .../STS/SemEval17T1STS_GPT35_ZeroShot.py} | 8 +- .../STS/SemEval17T1STS_GPT4_FewShot.py} | 8 +- .../STS/SemEval17T1STS_GPT4_ZeroShot.py} | 8 +- .../STS/SemEval17T2STS_BLOOMZ_ZeroShot.py} | 8 +- .../STS/SemEval17T2STS_GPT35_ZeroShot.py} | 11 +- .../STS/SemEval17T2STS_GPT4_FewShot.py} | 8 +- .../STS/SemEval17T2STS_GPT4_ZeroShot.py} | 8 +- .../emotion/Emotion_BLOOMZ_ZeroShot.py | 0 .../emotion/Emotion_GPT35_ZeroShot.py} | 0 .../emotion/Emotion_GPT4_FewShot.py} | 0 .../emotion/Emotion_GPT4_ZeroShot.py} | 0 .../sarcasm/ArSarcasm2_GPT3_ZeroShot.py} | 0 .../sarcasm/ArSarcasm2_GPT4_FewShot.py | 0 .../sarcasm/ArSarcasm2_GPT4_ZeroShot.py} | 0 .../sarcasm/ArSarcasm_BLOOMZ_ZeroShot.py} | 0 .../sarcasm/ArSarcasm_GPT3_ZeroShot.py} | 0 .../sarcasm/ArSarcasm_GPT4_FewShot.py} | 0 .../sarcasm/ArSarcasm_GPT4_ZeroShot.py} | 0 .../sentiment/ArSAS_BLOOMZ_ZeroShot.py} | 4 +- .../sentiment/ArSAS_GPT35_ZeroShot.py} | 4 +- .../sentiment/ArSAS_GPT4_ZeroShot.py} | 4 +- .../Khouja20Stance_BLOOMZ_ZeroShot.py} | 8 +- 
.../Khouja20Stance_GPT35_ZeroShot.py} | 10 +- .../Khouja20Stance_GPT4_FewShot.py} | 9 +- .../Khouja20Stance_GPT4_ZeroShot.py} | 9 +- .../UnifiedFCStance_BLOOMZ_ZeroShot.py} | 8 +- .../UnifiedFCStance_GPT35_ZeroShot.py} | 10 +- .../UnifiedFCStance_GPT4_FewShot.py} | 8 +- .../UnifiedFCStance_GPT4_ZeroShot.py} | 8 +- .../NER/ANERcorp_GPT35_ZeroShot.py} | 0 .../NER/ANERcorp_GPT4_FewShot.py} | 0 .../NER/ANERcorp_GPT4_ZeroShot.py} | 0 .../NER/Aqmar_GPT35_ZeroShot.py} | 0 .../NER/Aqmar_GPT4_FewShot.py} | 0 .../NER/Aqmar_GPT4_ZeroShot.py} | 0 .../NER/MGBWords_GPT35_ZeroShot.py} | 0 .../NER/MGBWords_GPT4_ZeroShot.py} | 0 .../QCRIDialectalArabicPOS_GPT4_ZeroShot.py} | 8 +- .../QCRIDialectalArabic_GPT35_ZeroShot.py} | 8 +- .../POS/QCRIDialectalArabic_GPT4_FewShot.py} | 12 +- .../POS/WikiNews_GPT35_ZeroShot.py | 155 +++++++++++++++ .../POS/WikiNews_GPT4_FewShot.py | 176 ++++++++++++++++++ .../POS/WikiNews_GPT4_ZeroShot.py | 159 ++++++++++++++++ .../POS/XGLUE_GPT35_ZeroShot.py | 155 +++++++++++++++ .../POS/XGLUE_GPT4_FewShot.py | 176 ++++++++++++++++++ .../POS/XGLUE_GPT4_ZeroShot.py | 159 ++++++++++++++++ .../BibleMaghrebi_GPT35_ZeroShot.py | 55 ++++++ .../BibleMaghrebi_GPT4_ZeroShot.py | 59 ++++++ .../WikiNews_GPT35_ZeroShot.py} | 4 +- .../diacritization/WikiNews_GPT4_ZeroShot.py | 47 +++++ .../ADI_BLOOMZ_ZeroShot.py} | 4 +- .../ADI_GPT35_ZeroShot.py} | 4 +- .../ADI_GPT4_FewShot.py} | 4 +- .../ADI_GPT4_ZeroShot.py} | 4 +- .../QADI_GPT35_ZeroShot.py} | 0 .../QADI_GPT4_ZeroShot.py} | 0 .../lemmatization/WikiNews_BLOOMZ_ZeroShot.py | 51 +++++ .../lemmatization/WikiNews_GPT35_ZeroShot.py} | 4 +- .../lemmatization/WikiNews_GPT4_ZeroShot.py | 54 ++++++ .../parsing/PADT_GPT35_ZeroShot.py} | 4 +- .../parsing/PADT_GPT4_ZeroShot.py} | 4 +- .../QCRIDialectalArabic_GPT35_ZeroShot.py} | 11 +- .../QCRIDialectalArabic_GPT4_ZeroShot.py} | 6 +- .../segmentation/WikiNews_GPT35_ZeroShot.py | 58 ++++++ .../segmentation/WikiNews_GPT4_ZeroShot.py | 60 ++++++ assets/benchmark_v1/QA/.keep | 2 - 
assets/benchmark_v1/STS/.keep | 2 - .../benchmark_v1/dialect_identification/.keep | 2 - .../COVClaimDetect_CGPT35_ZeroShot.py | 65 ------- .../COVHarmfulDetect_CGPT35_ZeroShot.py | 60 ------ .../PropMultilabel_GPT4_ZeroShot.py | 148 --------------- assets/benchmark_v1/semantics/.keep | 2 - .../sentiment_emotion_others/.keep | 2 - .../CT22Checkworthiness_BLOOMZ_ZeroShot.py} | 4 +- .../CT22Checkworthiness_GPT4_FewShot.py} | 4 +- .../CT22Checkworthiness_GPT4_ZeroShot.py} | 4 +- .../BanglaSentiment_BLOOMZ_ZeroShot.py | 0 .../sentiment/BanglaSentiment_GPT4_FewShot.py | 0 .../BanglaSentiment_GPT4_ZeroShot.py | 0 .../SemEval23T3Propaganda_BLOOMZ_ZeroShot.py} | 8 +- .../SemEval23T3Propaganda_GPT4_FewShot.py} | 8 +- .../SemEval23T3Propaganda_GPT4_ZeroShot.py} | 8 +- .../CT22Checkworthiness_BLOOMZ_ZeroShot.py} | 4 +- .../CT22Checkworthiness_GPT4_FewShot.py} | 4 +- .../CT22Checkworthiness_GPT4_ZeroShot.py} | 4 +- .../SemEval23T3Propaganda_BLOOMZ_ZeroShot.py} | 8 +- .../SemEval23T3Propaganda_GPT4_FewShot.py} | 8 +- .../SemEval23T3Propaganda_GPT4_ZeroShot.py} | 8 +- .../CT22Checkworthiness_BLOOMZ_ZeroShot.py} | 4 +- .../CT22Checkworthiness_GPT4_FewShot.py} | 4 +- .../CT22Checkworthiness_GPT4_ZeroShot.py} | 4 +- .../SemEval23T3Propaganda_BLOOMZ_ZeroShot.py} | 8 +- .../SemEval23T3Propaganda_GPT4_FewShot.py} | 8 +- .../SemEval23T3Propaganda_GPT4_ZeroShot.py} | 8 +- .../SemEval23T3Propaganda_BLOOMZ_ZeroShot.py} | 8 +- .../SemEval23T3Propaganda_GPT4_FewShot.py} | 8 +- .../SemEval23T3Propaganda_GPT4_ZeroShot.py} | 8 +- .../CT22Checkworthiness_BLOOMZ_ZeroShot.py} | 4 +- .../CT22Checkworthiness_GPT4_FewShot.py} | 4 +- .../CT22Checkworthiness_GPT4_ZeroShot.py} | 4 +- .../SemEval23T3Propaganda_BLOOMZ_ZeroShot.py} | 8 +- .../SemEval23T3Propaganda_GPT4_FewShot.py} | 8 +- .../SemEval23T3Propaganda_GPT4_ZeroShot.py} | 8 +- .../SemEval23T3Propaganda_BLOOMZ_ZeroShot.py} | 8 +- .../SemEval23T3Propaganda_GPT4_FewShot.py} | 8 +- .../SemEval23T3Propaganda_GPT4_ZeroShot.py} | 8 +- 
.../CT22Checkworthiness_BLOOMZ_ZeroShot.py} | 4 +- .../CT22Checkworthiness_GPT4_FewShot.py} | 4 +- .../CT22Checkworthiness_GPT4_ZeroShot.py} | 4 +- llmebench/datasets/{DialectADI.py => ADI.py} | 4 +- .../datasets/{ArSASSentiment.py => ArSAS.py} | 4 +- llmebench/datasets/ArabicDiacritization.py | 50 ----- llmebench/datasets/ArabicPOS.py | 50 ----- llmebench/datasets/ArabicParsing.py | 63 ------- .../datasets/BibleMaghrebiDiacritization.py | 47 +++++ ...tualityCOVID19.py => COVID19Factuality.py} | 4 +- ...entionworthy.py => CT22Attentionworthy.py} | 6 +- ...ckworthiness.py => CT22Checkworthiness.py} | 8 +- llmebench/datasets/{Claim.py => CT22Claim.py} | 8 +- .../datasets/{Harmful.py => CT22Harmful.py} | 8 +- .../{Subjectivity.py => CT23Subjectivity.py} | 4 +- ...alityKhouja20.py => Khouja20Factuality.py} | 6 +- .../{StanceKhouja20.py => Khouja20Stance.py} | 4 +- .../{Offensive.py => OSACT4SubtaskA.py} | 4 +- .../{HateSpeech.py => OSACT4SubtaskB.py} | 4 +- llmebench/datasets/PADT.py | 71 +++++++ llmebench/datasets/QCRIDialectalArabicPOS.py | 46 +++++ ....py => QCRIDialectalArabicSegmentation.py} | 4 +- ...ArSemEval17Track1.py => SemEval17T1STS.py} | 4 +- ...ArSemEval17Track2.py => SemEval17T2STS.py} | 4 +- ...aSemEval23.py => SemEval23T3Propaganda.py} | 7 +- ...ityUnifiedFC.py => UnifiedFCFactuality.py} | 4 +- ...{StanceUnifiedFC.py => UnifiedFCStance.py} | 7 +- .../{Propaganda.py => WANLP22T3Propaganda.py} | 13 +- llmebench/datasets/WikiNewsDiacritization.py | 47 +++++ ...matization.py => WikiNewsLemmatization.py} | 19 +- llmebench/datasets/WikiNewsPOS.py | 39 ++++ llmebench/datasets/WikiNewsSegmentation.py | 40 ++++ llmebench/datasets/XGLUEPOS.py | 39 ++++ llmebench/datasets/__init__.py | 50 ++--- .../{FactualityUnifiedFC.py => Factuality.py} | 13 +- llmebench/tasks/FactualityCOVID19.py | 27 --- llmebench/tasks/FactualityKhouja20.py | 15 -- ...elSemEval23.py => MultilabelPropaganda.py} | 22 ++- llmebench/tasks/NewsCatAkhbarona.py | 34 ---- 
llmebench/tasks/NewsCatAlArabiya.py | 34 ---- llmebench/tasks/NewsCatAlKhaleej.py | 34 ---- .../{NewsCatASND.py => NewsCategorization.py} | 4 +- llmebench/tasks/PropagandaMultilabel.py | 29 --- llmebench/tasks/{STSTrack1.py => STS.py} | 8 +- llmebench/tasks/STSTrack2.py | 19 -- .../tasks/{StanceKhouja20.py => Stance.py} | 4 +- llmebench/tasks/StanceUnifiedFC.py | 16 -- llmebench/tasks/__init__.py | 18 +- tests/datasets/test_metadata.py | 7 +- 261 files changed, 2175 insertions(+), 1198 deletions(-) rename assets/{benchmark_v1/MT/AraBench_Ara2Eng_BLOOMZ_ZeroShot.py => ar/MT/AraBench_ar2en_BLOOMZ_ZeroShot.py} (100%) rename assets/{benchmark_v1/MT/AraBench_Ara2Eng_ChatGPT_ZeroShot.py => ar/MT/AraBench_ar2en_GPT35_ZeroShot.py} (100%) rename assets/{benchmark_v1/MT/AraBench_Ara2Eng_ChatGPT4_ZeroShot.py => ar/MT/AraBench_ar2en_GPT4_ZeroShot.py} (100%) rename assets/{benchmark_v1 => ar}/QA/ARCD_BLOOMZ_ZeroShot.py (100%) rename assets/{benchmark_v1/QA/ARCD_ChatGPT_ZeroShot.py => ar/QA/ARCD_GPT35_ZeroShot.py} (100%) rename assets/{benchmark_v1/QA/ARCD_GPTChatCompletion_FewShot.py => ar/QA/ARCD_GPT4_FewShot.py} (100%) rename assets/{benchmark_v1/QA/ARCD_GPTChatCompletion_ZeroShot.py => ar/QA/ARCD_GPT4_ZeroShot.py} (100%) rename assets/{benchmark_v1 => ar}/QA/MLQA_BLOOMZ_ZeroShot.py (100%) rename assets/{benchmark_v1/QA/MLQA_ChatGPT_ZeroShot.py => ar/QA/MLQA_GPT35_ZeroShot.py} (100%) rename assets/{benchmark_v1/QA/MLQA_GPTChatCompletion_FewShot.py => ar/QA/MLQA_GPT4_FewShot.py} (100%) rename assets/{benchmark_v1/QA/MLQA_GPTChatCompletion_ZeroShot.py => ar/QA/MLQA_GPT4_ZeroShot.py} (100%) rename assets/{benchmark_v1 => ar}/QA/TyDiQA_BLOOMZ_ZeroShot.py (100%) rename assets/{benchmark_v1/QA/TyDiQA_ChatGPT_ZeroShot.py => ar/QA/TyDiQA_GPT35_ZeroShot.py} (100%) rename assets/{benchmark_v1/QA/TyDiQA_GPTChatCompletion_FewShot.py => ar/QA/TyDiQA_GPT4_FewShot.py} (100%) rename assets/{benchmark_v1/QA/TydiQA_GPTChatCompletion_ZeroShot.py => ar/QA/TydiQA_GPT4_ZeroShot.py} (100%) rename 
assets/{benchmark_v1 => ar}/QA/XQuAD_BLOOMZ_ZeroShot.py (100%) rename assets/{benchmark_v1/QA/XQuAD_ChatGPT_ZeroShot.py => ar/QA/XQuAD_GPT35_ZeroShot.py} (100%) rename assets/{benchmark_v1/QA/XQuAD_GPTChatCompletion_ZeroShot.py => ar/QA/XQuAD_GPT4_ZeroShot.py} (100%) rename assets/{benchmark_v1/QA/XQuaD_GPTChatCompletion_FewShot.py => ar/QA/XQuaD_GPT4_FewShot.py} (100%) rename assets/{benchmark_v1/demography/gender/GenderArabGend_BLOOMZ_ZeroShot.py => ar/demographic_attributes/gender/ArabGend_BLOOMZ_ZeroShot.py} (100%) rename assets/{benchmark_v1/demography/gender/GenderArabGend_ChatGPT_ZeroShot.py => ar/demographic_attributes/gender/ArabGend_GPT35_ZeroShot.py} (100%) rename assets/{benchmark_v1/demography/gender/GenderArabGend_GPTChatCompletion_ZeroShot.py => ar/demographic_attributes/gender/ArabGend_GPT4_ZeroShot.py} (100%) rename assets/{benchmark_v1/demography/gender/GenderArapTweet_BLOOMZ_ZeroShot.py => ar/demographic_attributes/gender/ArapTweet_BLOOMZ_ZeroShot.py} (100%) rename assets/{benchmark_v1/demography/gender/GenderArapTweet_ChatGPT_ZeroShot.py => ar/demographic_attributes/gender/ArapTweet_GPT35_ZeroShot.py} (100%) rename assets/{benchmark_v1/demography/gender/GenderArapTweet_GPTChatCompletion_FewShot.py => ar/demographic_attributes/gender/ArapTweet_GPT4_FewShot.py} (100%) rename assets/{benchmark_v1/demography/gender/GenderArapTweet_GPTChatCompletion_ZeroShot.py => ar/demographic_attributes/gender/ArapTweet_GPT4_ZeroShot.py} (100%) rename assets/{benchmark_v1/demography => ar/demographic_attributes}/location/Location_BLOOMZ_ZeroShot.py (100%) rename assets/{benchmark_v1/demography/location/Location_ChatGPT_ZeroShot.py => ar/demographic_attributes/location/Location_GPT35_ZeroShot.py} (100%) rename assets/{benchmark_v1/demography/location/Location_GPTChatCompletion_FewShot.py => ar/demographic_attributes/location/Location_GPT4_FewShot.py} (100%) rename assets/{benchmark_v1/demography/location/Location_GPTChatCompletion_ZeroShot.py => 
ar/demographic_attributes/location/Location_GPT4_ZeroShot.py} (100%) rename assets/{benchmark_v1/demography => ar/demographic_attributes}/name_info/NameInfo_BLOOMZ_ZeroShot.py (100%) rename assets/{benchmark_v1/demography/name_info/NameInfo_ChatGPT_ZeroShot.py => ar/demographic_attributes/name_info/NameInfo_GPT35_ZeroShot.py} (100%) rename assets/{benchmark_v1/demography/name_info/NameInfo_GPTChatCompletion_FewShot.py => ar/demographic_attributes/name_info/NameInfo_GPT4_FewShot.py} (100%) rename assets/{benchmark_v1/demography/name_info/NameInfo_GPTChatCompletion_ZeroShot.py => ar/demographic_attributes/name_info/NameInfo_GPT4_ZeroShot.py} (100%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content => ar/factuality_disinformation_harmful_content/adult_content_detection}/Adult_BLOOMZ_ZeroShot.py (100%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/Adult_ChatGPT_ZeroShot.py => ar/factuality_disinformation_harmful_content/adult_content_detection/Adult_GPT35_ZeroShot.py} (100%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/Adult_GPTChatCompletion_FewShot.py => ar/factuality_disinformation_harmful_content/adult_content_detection/Adult_GPT4_FewShot.py} (100%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/Adult_GPTChatCompletion_ZeroShot.py => ar/factuality_disinformation_harmful_content/adult_content_detection/Adult_GPT4_ZeroShot.py} (100%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/Attentionworthy_BLOOMZ_ZeroShot.py => ar/factuality_disinformation_harmful_content/attentionworthy/CT22Attentionworthy_BLOOMZ_ZeroShot.py} (93%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/Attentionworthy_ChatGPT_ZeroShot.py => ar/factuality_disinformation_harmful_content/attentionworthy/CT22Attentionworthy_GPT35_ZeroShot.py} (93%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/Attentionworthy_GPTChatCompletion_Fewshot.py => 
ar/factuality_disinformation_harmful_content/attentionworthy/CT22Attentionworthy_GPT4_FewShot.py} (96%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/Attentionworthy_GPTChatCompletion_ZeroShot.py => ar/factuality_disinformation_harmful_content/attentionworthy/CT22Attentionworthy_GPT4_ZeroShot.py} (94%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_ZeroShot.py => ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_BLOOMZ_ZeroShot.py} (92%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_ChatGPT_ZeroShot.py => ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT35_ZeroShot.py} (93%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_FewShot.py => ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot.py} (96%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ZeroShot.py => ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot.py} (95%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/COVClaimDetect_BLOOMZ_ZeroShot.py => ar/factuality_disinformation_harmful_content/claim_detection/CT22Claim_BLOOMZ_ZeroShot.py} (93%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/ClaimDetectCOVID19_CGPT35_ZeroShot.py => ar/factuality_disinformation_harmful_content/claim_detection/CT22Claim_GPT35_ZeroShot.py} (94%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/COVClaimDetect_GPT4_FewShot.py => ar/factuality_disinformation_harmful_content/claim_detection/CT22Claim_GPT4_FewShot.py} (96%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/ClaimDetectCOVID19_GPTChatCompletion_ZeroShot.py => 
ar/factuality_disinformation_harmful_content/claim_detection/CT22Claim_GPT4_ZeroShot.py} (93%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/FactualityCOVID19_BLOOMZ_ZeroShot.py => ar/factuality_disinformation_harmful_content/factuality/COVID19Factuality_BLOOMZ_ZeroShot.py} (88%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/FactualityCOVID19_GPTChatCompletion_FewShot.py => ar/factuality_disinformation_harmful_content/factuality/COVID19Factuality_GPT4_FewShot.py} (91%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/FactualityCOVID19_GPTChatCompletion_ZeroShot.py => ar/factuality_disinformation_harmful_content/factuality/COVID19Factuality_GPT4_ZeroShot.py} (88%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/FactualityKhouja20_BLOOMZ_ZeroShot.py => ar/factuality_disinformation_harmful_content/factuality/Khouja20Factuality_BLOOMZ_ZeroShot.py} (86%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/FactualityKhouja20_ChatGPT_ZeroShot.py => ar/factuality_disinformation_harmful_content/factuality/Khouja20Factuality_GPT35_ZeroShot.py} (84%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/FactualityKhouja20_GPT4_FewShot.py => ar/factuality_disinformation_harmful_content/factuality/Khouja20Factuality_GPT4_FewShot.py} (92%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/FactualityKhouja20_GPT4_ZeroShot.py => ar/factuality_disinformation_harmful_content/factuality/Khouja20Factuality_GPT4_ZeroShot.py} (89%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/FactualityUnifiedFC_BLOOMZ_ZeroShot.py => ar/factuality_disinformation_harmful_content/factuality/UnifiedFCFactuality_BLOOMZ_ZeroShot.py} (85%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/FactualityUnifiedFC_GPT4_FewShot.py => ar/factuality_disinformation_harmful_content/factuality/UnifiedFCFactuality_GPT4_FewShot.py} 
(92%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/FactualityUnifiedFC_GPTChatCompletion_ZeroShot.py => ar/factuality_disinformation_harmful_content/factuality/UnifiedFCFactuality_GPT4_ZeroShot.py} (88%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/COVHarmfulDetect_BLOOMZ_ZeroShot.py => ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_BLOOMZ_ZeroShot.py} (94%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/HarmfulDetectCOVID19_CGPT35_ZeroShot.py => ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT35_ZeroShot.py} (94%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/COVHarmfulDetect_GPT4_FewShot.py => ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT4_FewShot.py} (96%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/HarmfulDetectCOVID19_GPTChatCompletion_ZeroShot.py => ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT4_ZeroShot.py} (96%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/HateSpeech_ChatGPT_ZeroShot.py => ar/factuality_disinformation_harmful_content/hate_speech/OSACT4SubtaskB_GPT35_ZeroShot.py} (93%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/HateSpeech_GPTChatCompletion_FewShot.py => ar/factuality_disinformation_harmful_content/hate_speech/OSACT4SubtaskB_GPT4_FewShot.py} (96%) rename assets/{benchmark_v1/sentiment/offensive/Offensive_BLOOMZ_ZeroShot.py => ar/factuality_disinformation_harmful_content/offensive_language/OSACT4SubtaskA_BLOOMZ_ZeroShot.py} (79%) rename assets/{benchmark_v1/sentiment/offensive/Offensive_ChatGPT_ZeroShot.py => ar/factuality_disinformation_harmful_content/offensive_language/OSACT4SubtaskA_GPT35_ZeroShot.py} (83%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/Offensive_GPTChatCompletion_FewShot.py => 
ar/factuality_disinformation_harmful_content/offensive_language/OSACT4SubtaskA_GPT4_FewShot.py} (96%) rename assets/{benchmark_v1/sentiment/offensive/Offensive_GPTChatCompletion_ZeroShot.py => ar/factuality_disinformation_harmful_content/offensive_language/OSACT4SubtaskA_GPT4_ZeroShot.py} (93%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_ZeroShot.py => ar/factuality_disinformation_harmful_content/propaganda/WANLP22T3_BLOOMZ_ZeroShot.py} (96%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_CGPT35_ZeroShot.py => ar/factuality_disinformation_harmful_content/propaganda/WANLP22T3_GPT35_ZeroShot.py} (96%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPTChatCompletion_FewShot.py => ar/factuality_disinformation_harmful_content/propaganda/WANLP22T3_GPT4_FewShot.py} (94%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPTChatCompletion_ZeroShot.py => ar/factuality_disinformation_harmful_content/propaganda/WANLP22T3_GPT4_ZeroShot.py} (96%) rename assets/{benchmark_v1/sentiment => ar/factuality_disinformation_harmful_content}/spam/Spam_BLOOMZ_ZeroShot.py (100%) rename assets/{benchmark_v1/sentiment/spam/Spam_ChatGPT_ZeroShot.py => ar/factuality_disinformation_harmful_content/spam/Spam_GPT35_ZeroShot.py} (100%) rename assets/{benchmark_v1/sentiment/spam/Spam_GPTChatCompletion_ZeroShot.py => ar/factuality_disinformation_harmful_content/spam/Spam_GPT4_ZeroShot.py} (100%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/Subjectivity_BLOOMZ_ZeroShot.py => ar/factuality_disinformation_harmful_content/subjectivity/CT23Subjectivity_BLOOMZ_ZeroShot.py} (91%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/Subjectivity_ChatGPT_ZeroShot.py => ar/factuality_disinformation_harmful_content/subjectivity/CT23Subjectivity_GPT35_ZeroShot.py} (93%) rename 
assets/{benchmark_v1/factuality_disinformation_harmful_content/Subjectivity_GPTChatCompletion_FewShot.py => ar/factuality_disinformation_harmful_content/subjectivity/CT23Subjectivity_GPT4_FewShot.py} (96%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/Subjectivity_GPTChatCompletion_ZeroShot.py => ar/factuality_disinformation_harmful_content/subjectivity/CT23Subjectivity_GPT4_ZeroShot.py} (94%) rename assets/{benchmark_v1/news_categorization/NewsCat_ASND_BLOOMZ_ZeroShot.py => ar/news_categorization/ASND_BLOOMZ_ZeroShot.py} (95%) rename assets/{benchmark_v1/news_categorization/NewsCat_ASND_ChatGPT_ZeroShot.py => ar/news_categorization/ASND_GPT35_ZeroShot.py} (92%) rename assets/{benchmark_v1/news_categorization/NewsCat_ASND_GPTChatCompletion_FewShot.py => ar/news_categorization/ASND_GPT4_FewShot.py} (96%) rename assets/{benchmark_v1/news_categorization/NewsCat_ASND_GPTChatCompletion_ZeroShot.py => ar/news_categorization/ASND_GPT4_ZeroShot.py} (95%) rename assets/{benchmark_v1/news_categorization/NewsCat_Akhbarona_BLOOMZ_ZeroShot.py => ar/news_categorization/Akhbarona_BLOOMZ_ZeroShot.py} (96%) rename assets/{benchmark_v1/news_categorization/NewsCat_Akhbarona_ChatGPT_ZeroShot.py => ar/news_categorization/Akhbarona_GPT35_ZeroShot.py} (96%) rename assets/{benchmark_v1/news_categorization/NewsCat_Akhbarona_GPTChatCompletion_FewShot.py => ar/news_categorization/Akhbarona_GPT4_FewShot.py} (97%) rename assets/{benchmark_v1/news_categorization/NewsCat_Akhbarona_GPTChatCompletion_ZeroShot.py => ar/news_categorization/Akhbarona_GPT4_ZeroShot.py} (96%) rename assets/{benchmark_v1/news_categorization/NewsCat_AlArabiya_BLOOMZ_ZeroShot.py => ar/news_categorization/AlArabiya_BLOOMZ_ZeroShot.py} (96%) rename assets/{benchmark_v1/news_categorization/NewsCat_AlArabiya_ChatGPT_ZeroShot.py => ar/news_categorization/AlArabiya_GPT35_ZeroShot.py} (95%) rename assets/{benchmark_v1/news_categorization/NewsCat_AlArabiya_GPTChatCompletion_FewShot.py => 
ar/news_categorization/AlArabiya_GPT4_FewShot.py} (97%) rename assets/{benchmark_v1/news_categorization/NewsCat_AlArabiya_GPTChatCompletion_ZeroShot.py => ar/news_categorization/AlArabiya_GPT4_ZeroShot.py} (96%) rename assets/{benchmark_v1/news_categorization/NewsCat_AlKhaleej_BLOOMZ_ZeroShot.py => ar/news_categorization/AlKhaleej_BLOOMZ_ZeroShot.py} (96%) rename assets/{benchmark_v1/news_categorization/NewsCat_AlKhaleej_ChatGPT_ZeroShot.py => ar/news_categorization/AlKhaleej_GPT35_ZeroShot.py} (95%) rename assets/{benchmark_v1/news_categorization/NewsCat_AlKhaleej_GPTChatCompletion_FewShot.py => ar/news_categorization/AlKhaleej_GPT4_FewShot.py} (97%) rename assets/{benchmark_v1/news_categorization/NewsCat_AlKhaleej_GPTChatCompletion_ZeroShot.py => ar/news_categorization/AlKhaleej_GPT4_ZeroShot.py} (96%) rename assets/{benchmark_v1/semantics => ar/semantics/NLI}/XNLI_BLOOMZ_ZeroShot.py (100%) rename assets/{benchmark_v1/semantics/XNLI_ChatGPT_ZeroShot.py => ar/semantics/NLI/XNLI_GPT35_ZeroShot.py} (100%) rename assets/{benchmark_v1/semantics/XNLI_CGPT4_FewShot.py => ar/semantics/NLI/XNLI_GPT4_FewShot.py} (100%) rename assets/{benchmark_v1/semantics/XNLI_CGPT4_ZeroShot.py => ar/semantics/NLI/XNLI_GPT4_ZeroShot.py} (100%) rename assets/{benchmark_v1 => ar/semantics}/STS/Q2QSim_BLOOMZ_ZeroShot.py (100%) rename assets/{benchmark_v1/STS/Q2QSim_ChatGPT_ZeroShot.py => ar/semantics/STS/Q2QSim_GPT35_ZeroShot.py} (100%) rename assets/{benchmark_v1 => ar/semantics}/STS/Q2QSim_GPT4_FewShot.py (100%) rename assets/{benchmark_v1 => ar/semantics}/STS/Q2QSim_GPT4_ZeroShot.py (100%) rename assets/{benchmark_v1/STS/STSTrack1_BLOOMZ_ZeroShot.py => ar/semantics/STS/SemEval17T1STS_BLOOMZ_ZeroShot.py} (88%) rename assets/{benchmark_v1/STS/STSTrack1_ChatGPT_ZeroShot.py => ar/semantics/STS/SemEval17T1STS_GPT35_ZeroShot.py} (90%) rename assets/{benchmark_v1/STS/STSTrack1_GPT4_FewShot.py => ar/semantics/STS/SemEval17T1STS_GPT4_FewShot.py} (94%) rename 
assets/{benchmark_v1/STS/STSTrack1_GPT4_ZeroShot.py => ar/semantics/STS/SemEval17T1STS_GPT4_ZeroShot.py} (90%) rename assets/{benchmark_v1/STS/STSTrack2_BLOOMZ_ZeroShot.py => ar/semantics/STS/SemEval17T2STS_BLOOMZ_ZeroShot.py} (88%) rename assets/{benchmark_v1/STS/STSTrack2_ChatGPT_ZeroShot.py => ar/semantics/STS/SemEval17T2STS_GPT35_ZeroShot.py} (87%) rename assets/{benchmark_v1/STS/STSTrack2_GPT4_FewShot.py => ar/semantics/STS/SemEval17T2STS_GPT4_FewShot.py} (94%) rename assets/{benchmark_v1/STS/STSTrack2_GPT4_ZeroShot.py => ar/semantics/STS/SemEval17T2STS_GPT4_ZeroShot.py} (90%) rename assets/{benchmark_v1/sentiment => ar/sentiment_emotion_others}/emotion/Emotion_BLOOMZ_ZeroShot.py (100%) rename assets/{benchmark_v1/sentiment/emotion/Emotion_ChatGPT_ZeroShot.py => ar/sentiment_emotion_others/emotion/Emotion_GPT35_ZeroShot.py} (100%) rename assets/{benchmark_v1/sentiment/emotion/Emotion_GPTChatCompletion_FewShot.py => ar/sentiment_emotion_others/emotion/Emotion_GPT4_FewShot.py} (100%) rename assets/{benchmark_v1/sentiment/emotion/Emotion_GPTChatCompletion_ZeroShot.py => ar/sentiment_emotion_others/emotion/Emotion_GPT4_ZeroShot.py} (100%) rename assets/{benchmark_v1/sarcasm/ArSarcasm2_GPT3_Zeroshot.py => ar/sentiment_emotion_others/sarcasm/ArSarcasm2_GPT3_ZeroShot.py} (100%) rename assets/{benchmark_v1 => ar/sentiment_emotion_others}/sarcasm/ArSarcasm2_GPT4_FewShot.py (100%) rename assets/{benchmark_v1/sarcasm/ArSarcasm2_GPT4_Zeroshot.py => ar/sentiment_emotion_others/sarcasm/ArSarcasm2_GPT4_ZeroShot.py} (100%) rename assets/{benchmark_v1/sarcasm/ArSarcasm_BLOOMZ_Zeroshot.py => ar/sentiment_emotion_others/sarcasm/ArSarcasm_BLOOMZ_ZeroShot.py} (100%) rename assets/{benchmark_v1/sarcasm/ArSarcasm_GPT3_Zeroshot.py => ar/sentiment_emotion_others/sarcasm/ArSarcasm_GPT3_ZeroShot.py} (100%) rename assets/{benchmark_v1/sarcasm/ArSarcasm_GPT4_Fewshot.py => ar/sentiment_emotion_others/sarcasm/ArSarcasm_GPT4_FewShot.py} (100%) rename 
assets/{benchmark_v1/sarcasm/ArSarcasm_GPT4_Zeroshot.py => ar/sentiment_emotion_others/sarcasm/ArSarcasm_GPT4_ZeroShot.py} (100%) rename assets/{benchmark_v1/sentiment/sentiment/ArSASSentiment_BLOOMZ_ZeroShot.py => ar/sentiment_emotion_others/sentiment/ArSAS_BLOOMZ_ZeroShot.py} (91%) rename assets/{benchmark_v1/sentiment/sentiment/ArSASSentiment_ChatGPT_ZeroShot.py => ar/sentiment_emotion_others/sentiment/ArSAS_GPT35_ZeroShot.py} (92%) rename assets/{benchmark_v1/sentiment/sentiment/ArSASSentiment_GPTChatCompletion_ZeroShot.py => ar/sentiment_emotion_others/sentiment/ArSAS_GPT4_ZeroShot.py} (93%) rename assets/{benchmark_v1/sentiment_emotion_others/StanceKhouja20_BLOOMZ_ZeroShot.py => ar/sentiment_emotion_others/stance_detection/Khouja20Stance_BLOOMZ_ZeroShot.py} (84%) rename assets/{benchmark_v1/sentiment_emotion_others/StanceKhouja20_ChatGPT_ZeroShot.py => ar/sentiment_emotion_others/stance_detection/Khouja20Stance_GPT35_ZeroShot.py} (83%) rename assets/{benchmark_v1/sentiment_emotion_others/StanceKhouja20_GPTChatCompletion_FewShot.py => ar/sentiment_emotion_others/stance_detection/Khouja20Stance_GPT4_FewShot.py} (93%) rename assets/{benchmark_v1/sentiment_emotion_others/StanceKhouja20_GPTChatCompletion_ZeroShot.py => ar/sentiment_emotion_others/stance_detection/Khouja20Stance_GPT4_ZeroShot.py} (90%) rename assets/{benchmark_v1/sentiment_emotion_others/StanceUnifiedFC_BLOOMZ_ZeroShot.py => ar/sentiment_emotion_others/stance_detection/UnifiedFCStance_BLOOMZ_ZeroShot.py} (87%) rename assets/{benchmark_v1/sentiment_emotion_others/StanceUnifiedFC_ChatGPT_ZeroShot.py => ar/sentiment_emotion_others/stance_detection/UnifiedFCStance_GPT35_ZeroShot.py} (87%) rename assets/{benchmark_v1/sentiment_emotion_others/StanceUnifiedFC_GPTChatCompletion_FewShot.py => ar/sentiment_emotion_others/stance_detection/UnifiedFCStance_GPT4_FewShot.py} (93%) rename assets/{benchmark_v1/sentiment_emotion_others/StanceUnifiedFC_GPTChatCompletion_ZeroShot.py => 
ar/sentiment_emotion_others/stance_detection/UnifiedFCStance_GPT4_ZeroShot.py} (91%) rename assets/{benchmark_v1/NER/NERANERcorp_ChatGPT_ZeroShot.py => ar/sequence_tagging_and_information_extraction/NER/ANERcorp_GPT35_ZeroShot.py} (100%) rename assets/{benchmark_v1/NER/NERANERcorp_GPTChatCompletion_FewShot.py => ar/sequence_tagging_and_information_extraction/NER/ANERcorp_GPT4_FewShot.py} (100%) rename assets/{benchmark_v1/NER/NERANERcorp_GPTChatCompletion_ZeroShot.py => ar/sequence_tagging_and_information_extraction/NER/ANERcorp_GPT4_ZeroShot.py} (100%) rename assets/{benchmark_v1/NER/NERAqmar_ChatGPT_ZeroShot.py => ar/sequence_tagging_and_information_extraction/NER/Aqmar_GPT35_ZeroShot.py} (100%) rename assets/{benchmark_v1/NER/NERAqmar_GPTChatCompletion_FewShot.py => ar/sequence_tagging_and_information_extraction/NER/Aqmar_GPT4_FewShot.py} (100%) rename assets/{benchmark_v1/NER/NERAqmar_GPTChatCompletion_ZeroShot.py => ar/sequence_tagging_and_information_extraction/NER/Aqmar_GPT4_ZeroShot.py} (100%) rename assets/{benchmark_v1/NER/MGBWords_ChatGPT_ZeroShot.py => ar/sequence_tagging_and_information_extraction/NER/MGBWords_GPT35_ZeroShot.py} (100%) rename assets/{benchmark_v1/NER/MGBWords_GPTChatCompletion_ZeroShot.py => ar/sequence_tagging_and_information_extraction/NER/MGBWords_GPT4_ZeroShot.py} (100%) rename assets/{benchmark_v1/sequence_tagging_ner_pos_etc/POS_GPT4_ZeroShot.py => ar/sequence_tagging_and_information_extraction/POS/QCRIDialectalArabicPOS_GPT4_ZeroShot.py} (94%) rename assets/{benchmark_v1/sequence_tagging_ner_pos_etc/POS_ChatGPT_ZeroShot.py => ar/sequence_tagging_and_information_extraction/POS/QCRIDialectalArabic_GPT35_ZeroShot.py} (94%) rename assets/{benchmark_v1/sequence_tagging_ner_pos_etc/POS_GPTChatCompletion_FewShot.py => ar/sequence_tagging_and_information_extraction/POS/QCRIDialectalArabic_GPT4_FewShot.py} (91%) create mode 100644 assets/ar/sequence_tagging_and_information_extraction/POS/WikiNews_GPT35_ZeroShot.py create mode 100644 
assets/ar/sequence_tagging_and_information_extraction/POS/WikiNews_GPT4_FewShot.py create mode 100644 assets/ar/sequence_tagging_and_information_extraction/POS/WikiNews_GPT4_ZeroShot.py create mode 100644 assets/ar/sequence_tagging_and_information_extraction/POS/XGLUE_GPT35_ZeroShot.py create mode 100644 assets/ar/sequence_tagging_and_information_extraction/POS/XGLUE_GPT4_FewShot.py create mode 100644 assets/ar/sequence_tagging_and_information_extraction/POS/XGLUE_GPT4_ZeroShot.py create mode 100644 assets/ar/sequence_tagging_and_information_extraction/diacritization/BibleMaghrebi_GPT35_ZeroShot.py create mode 100644 assets/ar/sequence_tagging_and_information_extraction/diacritization/BibleMaghrebi_GPT4_ZeroShot.py rename assets/{benchmark_v1/sequence_tagging_ner_pos_etc/diacritization_ChatGPT_ZeroShot.py => ar/sequence_tagging_and_information_extraction/diacritization/WikiNews_GPT35_ZeroShot.py} (90%) create mode 100644 assets/ar/sequence_tagging_and_information_extraction/diacritization/WikiNews_GPT4_ZeroShot.py rename assets/{benchmark_v1/sequence_tagging_ner_pos_etc/DialectADI_BLOOMZ_ZeroShot.py => ar/sequence_tagging_and_information_extraction/dialect_identification/ADI_BLOOMZ_ZeroShot.py} (96%) rename assets/{benchmark_v1/sequence_tagging_ner_pos_etc/DialectADI_ChatGPT_ZeroShot.py => ar/sequence_tagging_and_information_extraction/dialect_identification/ADI_GPT35_ZeroShot.py} (96%) rename assets/{benchmark_v1/sequence_tagging_ner_pos_etc/DialectADI_GPTChatCompletion_FewShot.py => ar/sequence_tagging_and_information_extraction/dialect_identification/ADI_GPT4_FewShot.py} (97%) rename assets/{benchmark_v1/sequence_tagging_ner_pos_etc/DialectADI_GPTChatCompletion_ZeroShot.py => ar/sequence_tagging_and_information_extraction/dialect_identification/ADI_GPT4_ZeroShot.py} (96%) rename assets/{benchmark_v1/sequence_tagging_ner_pos_etc/DialectID_QADI_ChatGPT_ZeroShot.py => ar/sequence_tagging_and_information_extraction/dialect_identification/QADI_GPT35_ZeroShot.py} 
(100%) rename assets/{benchmark_v1/sequence_tagging_ner_pos_etc/DialectID_QADI_GPTChatCompletion_ZeroShot.py => ar/sequence_tagging_and_information_extraction/dialect_identification/QADI_GPT4_ZeroShot.py} (100%) create mode 100644 assets/ar/sequence_tagging_and_information_extraction/lemmatization/WikiNews_BLOOMZ_ZeroShot.py rename assets/{benchmark_v1/sequence_tagging_ner_pos_etc/lemmatization/Lemmatization_ChatGPT_ZeroShot.py => ar/sequence_tagging_and_information_extraction/lemmatization/WikiNews_GPT35_ZeroShot.py} (92%) create mode 100644 assets/ar/sequence_tagging_and_information_extraction/lemmatization/WikiNews_GPT4_ZeroShot.py rename assets/{benchmark_v1/sequence_tagging_ner_pos_etc/parsing_ChatGPT_ZeroShot.py => ar/sequence_tagging_and_information_extraction/parsing/PADT_GPT35_ZeroShot.py} (94%) rename assets/{benchmark_v1/sequence_tagging_ner_pos_etc/parsing_GPT4_ZeroShot.py => ar/sequence_tagging_and_information_extraction/parsing/PADT_GPT4_ZeroShot.py} (94%) rename assets/{benchmark_v1/sequence_tagging_ner_pos_etc/segmentation_ChatGPT_ZeroShot.py => ar/sequence_tagging_and_information_extraction/segmentation/QCRIDialectalArabic_GPT35_ZeroShot.py} (85%) rename assets/{benchmark_v1/sequence_tagging_ner_pos_etc/segmentation_GPT4_ZeroShot.py => ar/sequence_tagging_and_information_extraction/segmentation/QCRIDialectalArabic_GPT4_ZeroShot.py} (92%) create mode 100644 assets/ar/sequence_tagging_and_information_extraction/segmentation/WikiNews_GPT35_ZeroShot.py create mode 100644 assets/ar/sequence_tagging_and_information_extraction/segmentation/WikiNews_GPT4_ZeroShot.py delete mode 100644 assets/benchmark_v1/QA/.keep delete mode 100644 assets/benchmark_v1/STS/.keep delete mode 100644 assets/benchmark_v1/dialect_identification/.keep delete mode 100644 assets/benchmark_v1/factuality_disinformation_harmful_content/COVClaimDetect_CGPT35_ZeroShot.py delete mode 100644 assets/benchmark_v1/factuality_disinformation_harmful_content/COVHarmfulDetect_CGPT35_ZeroShot.py 
delete mode 100644 assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_ZeroShot.py delete mode 100644 assets/benchmark_v1/semantics/.keep delete mode 100644 assets/benchmark_v1/sentiment_emotion_others/.keep rename assets/{benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_BGZeroShot.py => bg/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_BLOOMZ_ZeroShot.py} (92%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_BGFewShot.py => bg/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot.py} (96%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_BGZeroShot.py => bg/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot.py} (95%) rename assets/{benchmark_v1/sentiment => bn/sentiment_emotion_others}/sentiment/BanglaSentiment_BLOOMZ_ZeroShot.py (100%) rename assets/{benchmark_v1/sentiment => bn/sentiment_emotion_others}/sentiment/BanglaSentiment_GPT4_FewShot.py (100%) rename assets/{benchmark_v1/sentiment => bn/sentiment_emotion_others}/sentiment/BanglaSentiment_GPT4_ZeroShot.py (100%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_GEZeroShot.py => de/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_BLOOMZ_ZeroShot.py} (96%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_GEFewShot.py => de/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_FewShot.py} (97%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_GEZeroShot.py => de/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_ZeroShot.py} (97%) rename 
assets/{benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_ENZeroShot.py => en/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_BLOOMZ_ZeroShot.py} (92%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ENFewShot.py => en/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot.py} (96%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ENZeroShot.py => en/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot.py} (95%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_ENZeroShot.py => en/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_BLOOMZ_ZeroShot.py} (96%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_ENFewShot.py => en/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_FewShot.py} (97%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_ENZeroShot.py => en/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_ZeroShot.py} (97%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_ESZeroShot.py => es/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_BLOOMZ_ZeroShot.py} (91%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ESFewShot.py => es/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot.py} (96%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ESZeroShot.py => es/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot.py} (95%) rename 
assets/{benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_FRZeroShot.py => fr/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_BLOOMZ_ZeroShot.py} (96%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_FRFewShot.py => fr/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_FewShot.py} (97%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_FRZeroShot.py => fr/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_ZeroShot.py} (97%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_ITZeroShot.py => it/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_BLOOMZ_ZeroShot.py} (96%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_ITFewShot.py => it/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_FewShot.py} (97%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_ITZeroShot.py => it/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_ZeroShot.py} (97%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_NLZeroShot.py => nl/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_BLOOMZ_ZeroShot.py} (92%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_NLFewShot.py => nl/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot.py} (96%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_NLZeroShot.py => nl/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot.py} (95%) rename 
assets/{benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_POZeroShot.py => pl/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_BLOOMZ_ZeroShot.py} (96%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_POFewShot.py => pl/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_FewShot.py} (97%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_POZeroShot.py => pl/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_ZeroShot.py} (97%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_RUZeroShot.py => ru/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_BLOOMZ_ZeroShot.py} (96%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_RUFewShot.py => ru/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_FewShot.py} (97%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_RUZeroShot.py => ru/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_ZeroShot.py} (97%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_TRZeroShot.py => tr/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_BLOOMZ_ZeroShot.py} (91%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_TRFewShot.py => tr/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot.py} (96%) rename assets/{benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_TRZeroShot.py => tr/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot.py} (95%) rename llmebench/datasets/{DialectADI.py => ADI.py} (90%) rename 
llmebench/datasets/{ArSASSentiment.py => ArSAS.py} (89%) delete mode 100644 llmebench/datasets/ArabicDiacritization.py delete mode 100644 llmebench/datasets/ArabicPOS.py delete mode 100644 llmebench/datasets/ArabicParsing.py create mode 100644 llmebench/datasets/BibleMaghrebiDiacritization.py rename llmebench/datasets/{FactualityCOVID19.py => COVID19Factuality.py} (95%) rename llmebench/datasets/{Attentionworthy.py => CT22Attentionworthy.py} (90%) rename llmebench/datasets/{Checkworthiness.py => CT22Checkworthiness.py} (89%) rename llmebench/datasets/{Claim.py => CT22Claim.py} (90%) rename llmebench/datasets/{Harmful.py => CT22Harmful.py} (90%) rename llmebench/datasets/{Subjectivity.py => CT23Subjectivity.py} (93%) rename llmebench/datasets/{FactualityKhouja20.py => Khouja20Factuality.py} (90%) rename llmebench/datasets/{StanceKhouja20.py => Khouja20Stance.py} (92%) rename llmebench/datasets/{Offensive.py => OSACT4SubtaskA.py} (92%) rename llmebench/datasets/{HateSpeech.py => OSACT4SubtaskB.py} (93%) create mode 100644 llmebench/datasets/PADT.py create mode 100644 llmebench/datasets/QCRIDialectalArabicPOS.py rename llmebench/datasets/{ArabicSegmentation.py => QCRIDialectalArabicSegmentation.py} (92%) rename llmebench/datasets/{STSArSemEval17Track1.py => SemEval17T1STS.py} (94%) rename llmebench/datasets/{STSArSemEval17Track2.py => SemEval17T2STS.py} (94%) rename llmebench/datasets/{PropagandaSemEval23.py => SemEval23T3Propaganda.py} (95%) rename llmebench/datasets/{FactualityUnifiedFC.py => UnifiedFCFactuality.py} (95%) rename llmebench/datasets/{StanceUnifiedFC.py => UnifiedFCStance.py} (95%) rename llmebench/datasets/{Propaganda.py => WANLP22T3Propaganda.py} (76%) create mode 100644 llmebench/datasets/WikiNewsDiacritization.py rename llmebench/datasets/{Lemmatization.py => WikiNewsLemmatization.py} (61%) create mode 100644 llmebench/datasets/WikiNewsPOS.py create mode 100644 llmebench/datasets/WikiNewsSegmentation.py create mode 100644 
llmebench/datasets/XGLUEPOS.py rename llmebench/tasks/{FactualityUnifiedFC.py => Factuality.py} (65%) delete mode 100644 llmebench/tasks/FactualityCOVID19.py delete mode 100644 llmebench/tasks/FactualityKhouja20.py rename llmebench/tasks/{PropagandaMultilabelSemEval23.py => MultilabelPropaganda.py} (61%) delete mode 100644 llmebench/tasks/NewsCatAkhbarona.py delete mode 100644 llmebench/tasks/NewsCatAlArabiya.py delete mode 100644 llmebench/tasks/NewsCatAlKhaleej.py rename llmebench/tasks/{NewsCatASND.py => NewsCategorization.py} (90%) delete mode 100644 llmebench/tasks/PropagandaMultilabel.py rename llmebench/tasks/{STSTrack1.py => STS.py} (69%) delete mode 100644 llmebench/tasks/STSTrack2.py rename llmebench/tasks/{StanceKhouja20.py => Stance.py} (81%) delete mode 100644 llmebench/tasks/StanceUnifiedFC.py diff --git a/assets/benchmark_v1/MT/AraBench_Ara2Eng_BLOOMZ_ZeroShot.py b/assets/ar/MT/AraBench_ar2en_BLOOMZ_ZeroShot.py similarity index 100% rename from assets/benchmark_v1/MT/AraBench_Ara2Eng_BLOOMZ_ZeroShot.py rename to assets/ar/MT/AraBench_ar2en_BLOOMZ_ZeroShot.py diff --git a/assets/benchmark_v1/MT/AraBench_Ara2Eng_ChatGPT_ZeroShot.py b/assets/ar/MT/AraBench_ar2en_GPT35_ZeroShot.py similarity index 100% rename from assets/benchmark_v1/MT/AraBench_Ara2Eng_ChatGPT_ZeroShot.py rename to assets/ar/MT/AraBench_ar2en_GPT35_ZeroShot.py diff --git a/assets/benchmark_v1/MT/AraBench_Ara2Eng_ChatGPT4_ZeroShot.py b/assets/ar/MT/AraBench_ar2en_GPT4_ZeroShot.py similarity index 100% rename from assets/benchmark_v1/MT/AraBench_Ara2Eng_ChatGPT4_ZeroShot.py rename to assets/ar/MT/AraBench_ar2en_GPT4_ZeroShot.py diff --git a/assets/benchmark_v1/QA/ARCD_BLOOMZ_ZeroShot.py b/assets/ar/QA/ARCD_BLOOMZ_ZeroShot.py similarity index 100% rename from assets/benchmark_v1/QA/ARCD_BLOOMZ_ZeroShot.py rename to assets/ar/QA/ARCD_BLOOMZ_ZeroShot.py diff --git a/assets/benchmark_v1/QA/ARCD_ChatGPT_ZeroShot.py b/assets/ar/QA/ARCD_GPT35_ZeroShot.py similarity index 100% rename from 
assets/benchmark_v1/QA/ARCD_ChatGPT_ZeroShot.py rename to assets/ar/QA/ARCD_GPT35_ZeroShot.py diff --git a/assets/benchmark_v1/QA/ARCD_GPTChatCompletion_FewShot.py b/assets/ar/QA/ARCD_GPT4_FewShot.py similarity index 100% rename from assets/benchmark_v1/QA/ARCD_GPTChatCompletion_FewShot.py rename to assets/ar/QA/ARCD_GPT4_FewShot.py diff --git a/assets/benchmark_v1/QA/ARCD_GPTChatCompletion_ZeroShot.py b/assets/ar/QA/ARCD_GPT4_ZeroShot.py similarity index 100% rename from assets/benchmark_v1/QA/ARCD_GPTChatCompletion_ZeroShot.py rename to assets/ar/QA/ARCD_GPT4_ZeroShot.py diff --git a/assets/benchmark_v1/QA/MLQA_BLOOMZ_ZeroShot.py b/assets/ar/QA/MLQA_BLOOMZ_ZeroShot.py similarity index 100% rename from assets/benchmark_v1/QA/MLQA_BLOOMZ_ZeroShot.py rename to assets/ar/QA/MLQA_BLOOMZ_ZeroShot.py diff --git a/assets/benchmark_v1/QA/MLQA_ChatGPT_ZeroShot.py b/assets/ar/QA/MLQA_GPT35_ZeroShot.py similarity index 100% rename from assets/benchmark_v1/QA/MLQA_ChatGPT_ZeroShot.py rename to assets/ar/QA/MLQA_GPT35_ZeroShot.py diff --git a/assets/benchmark_v1/QA/MLQA_GPTChatCompletion_FewShot.py b/assets/ar/QA/MLQA_GPT4_FewShot.py similarity index 100% rename from assets/benchmark_v1/QA/MLQA_GPTChatCompletion_FewShot.py rename to assets/ar/QA/MLQA_GPT4_FewShot.py diff --git a/assets/benchmark_v1/QA/MLQA_GPTChatCompletion_ZeroShot.py b/assets/ar/QA/MLQA_GPT4_ZeroShot.py similarity index 100% rename from assets/benchmark_v1/QA/MLQA_GPTChatCompletion_ZeroShot.py rename to assets/ar/QA/MLQA_GPT4_ZeroShot.py diff --git a/assets/benchmark_v1/QA/TyDiQA_BLOOMZ_ZeroShot.py b/assets/ar/QA/TyDiQA_BLOOMZ_ZeroShot.py similarity index 100% rename from assets/benchmark_v1/QA/TyDiQA_BLOOMZ_ZeroShot.py rename to assets/ar/QA/TyDiQA_BLOOMZ_ZeroShot.py diff --git a/assets/benchmark_v1/QA/TyDiQA_ChatGPT_ZeroShot.py b/assets/ar/QA/TyDiQA_GPT35_ZeroShot.py similarity index 100% rename from assets/benchmark_v1/QA/TyDiQA_ChatGPT_ZeroShot.py rename to assets/ar/QA/TyDiQA_GPT35_ZeroShot.py diff 
--git a/assets/benchmark_v1/QA/TyDiQA_GPTChatCompletion_FewShot.py b/assets/ar/QA/TyDiQA_GPT4_FewShot.py similarity index 100% rename from assets/benchmark_v1/QA/TyDiQA_GPTChatCompletion_FewShot.py rename to assets/ar/QA/TyDiQA_GPT4_FewShot.py diff --git a/assets/benchmark_v1/QA/TydiQA_GPTChatCompletion_ZeroShot.py b/assets/ar/QA/TydiQA_GPT4_ZeroShot.py similarity index 100% rename from assets/benchmark_v1/QA/TydiQA_GPTChatCompletion_ZeroShot.py rename to assets/ar/QA/TydiQA_GPT4_ZeroShot.py diff --git a/assets/benchmark_v1/QA/XQuAD_BLOOMZ_ZeroShot.py b/assets/ar/QA/XQuAD_BLOOMZ_ZeroShot.py similarity index 100% rename from assets/benchmark_v1/QA/XQuAD_BLOOMZ_ZeroShot.py rename to assets/ar/QA/XQuAD_BLOOMZ_ZeroShot.py diff --git a/assets/benchmark_v1/QA/XQuAD_ChatGPT_ZeroShot.py b/assets/ar/QA/XQuAD_GPT35_ZeroShot.py similarity index 100% rename from assets/benchmark_v1/QA/XQuAD_ChatGPT_ZeroShot.py rename to assets/ar/QA/XQuAD_GPT35_ZeroShot.py diff --git a/assets/benchmark_v1/QA/XQuAD_GPTChatCompletion_ZeroShot.py b/assets/ar/QA/XQuAD_GPT4_ZeroShot.py similarity index 100% rename from assets/benchmark_v1/QA/XQuAD_GPTChatCompletion_ZeroShot.py rename to assets/ar/QA/XQuAD_GPT4_ZeroShot.py diff --git a/assets/benchmark_v1/QA/XQuaD_GPTChatCompletion_FewShot.py b/assets/ar/QA/XQuaD_GPT4_FewShot.py similarity index 100% rename from assets/benchmark_v1/QA/XQuaD_GPTChatCompletion_FewShot.py rename to assets/ar/QA/XQuaD_GPT4_FewShot.py diff --git a/assets/benchmark_v1/demography/gender/GenderArabGend_BLOOMZ_ZeroShot.py b/assets/ar/demographic_attributes/gender/ArabGend_BLOOMZ_ZeroShot.py similarity index 100% rename from assets/benchmark_v1/demography/gender/GenderArabGend_BLOOMZ_ZeroShot.py rename to assets/ar/demographic_attributes/gender/ArabGend_BLOOMZ_ZeroShot.py diff --git a/assets/benchmark_v1/demography/gender/GenderArabGend_ChatGPT_ZeroShot.py b/assets/ar/demographic_attributes/gender/ArabGend_GPT35_ZeroShot.py similarity index 100% rename from 
assets/benchmark_v1/demography/gender/GenderArabGend_ChatGPT_ZeroShot.py rename to assets/ar/demographic_attributes/gender/ArabGend_GPT35_ZeroShot.py diff --git a/assets/benchmark_v1/demography/gender/GenderArabGend_GPTChatCompletion_ZeroShot.py b/assets/ar/demographic_attributes/gender/ArabGend_GPT4_ZeroShot.py similarity index 100% rename from assets/benchmark_v1/demography/gender/GenderArabGend_GPTChatCompletion_ZeroShot.py rename to assets/ar/demographic_attributes/gender/ArabGend_GPT4_ZeroShot.py diff --git a/assets/benchmark_v1/demography/gender/GenderArapTweet_BLOOMZ_ZeroShot.py b/assets/ar/demographic_attributes/gender/ArapTweet_BLOOMZ_ZeroShot.py similarity index 100% rename from assets/benchmark_v1/demography/gender/GenderArapTweet_BLOOMZ_ZeroShot.py rename to assets/ar/demographic_attributes/gender/ArapTweet_BLOOMZ_ZeroShot.py diff --git a/assets/benchmark_v1/demography/gender/GenderArapTweet_ChatGPT_ZeroShot.py b/assets/ar/demographic_attributes/gender/ArapTweet_GPT35_ZeroShot.py similarity index 100% rename from assets/benchmark_v1/demography/gender/GenderArapTweet_ChatGPT_ZeroShot.py rename to assets/ar/demographic_attributes/gender/ArapTweet_GPT35_ZeroShot.py diff --git a/assets/benchmark_v1/demography/gender/GenderArapTweet_GPTChatCompletion_FewShot.py b/assets/ar/demographic_attributes/gender/ArapTweet_GPT4_FewShot.py similarity index 100% rename from assets/benchmark_v1/demography/gender/GenderArapTweet_GPTChatCompletion_FewShot.py rename to assets/ar/demographic_attributes/gender/ArapTweet_GPT4_FewShot.py diff --git a/assets/benchmark_v1/demography/gender/GenderArapTweet_GPTChatCompletion_ZeroShot.py b/assets/ar/demographic_attributes/gender/ArapTweet_GPT4_ZeroShot.py similarity index 100% rename from assets/benchmark_v1/demography/gender/GenderArapTweet_GPTChatCompletion_ZeroShot.py rename to assets/ar/demographic_attributes/gender/ArapTweet_GPT4_ZeroShot.py diff --git a/assets/benchmark_v1/demography/location/Location_BLOOMZ_ZeroShot.py 
b/assets/ar/demographic_attributes/location/Location_BLOOMZ_ZeroShot.py similarity index 100% rename from assets/benchmark_v1/demography/location/Location_BLOOMZ_ZeroShot.py rename to assets/ar/demographic_attributes/location/Location_BLOOMZ_ZeroShot.py diff --git a/assets/benchmark_v1/demography/location/Location_ChatGPT_ZeroShot.py b/assets/ar/demographic_attributes/location/Location_GPT35_ZeroShot.py similarity index 100% rename from assets/benchmark_v1/demography/location/Location_ChatGPT_ZeroShot.py rename to assets/ar/demographic_attributes/location/Location_GPT35_ZeroShot.py diff --git a/assets/benchmark_v1/demography/location/Location_GPTChatCompletion_FewShot.py b/assets/ar/demographic_attributes/location/Location_GPT4_FewShot.py similarity index 100% rename from assets/benchmark_v1/demography/location/Location_GPTChatCompletion_FewShot.py rename to assets/ar/demographic_attributes/location/Location_GPT4_FewShot.py diff --git a/assets/benchmark_v1/demography/location/Location_GPTChatCompletion_ZeroShot.py b/assets/ar/demographic_attributes/location/Location_GPT4_ZeroShot.py similarity index 100% rename from assets/benchmark_v1/demography/location/Location_GPTChatCompletion_ZeroShot.py rename to assets/ar/demographic_attributes/location/Location_GPT4_ZeroShot.py diff --git a/assets/benchmark_v1/demography/name_info/NameInfo_BLOOMZ_ZeroShot.py b/assets/ar/demographic_attributes/name_info/NameInfo_BLOOMZ_ZeroShot.py similarity index 100% rename from assets/benchmark_v1/demography/name_info/NameInfo_BLOOMZ_ZeroShot.py rename to assets/ar/demographic_attributes/name_info/NameInfo_BLOOMZ_ZeroShot.py diff --git a/assets/benchmark_v1/demography/name_info/NameInfo_ChatGPT_ZeroShot.py b/assets/ar/demographic_attributes/name_info/NameInfo_GPT35_ZeroShot.py similarity index 100% rename from assets/benchmark_v1/demography/name_info/NameInfo_ChatGPT_ZeroShot.py rename to assets/ar/demographic_attributes/name_info/NameInfo_GPT35_ZeroShot.py diff --git 
a/assets/benchmark_v1/demography/name_info/NameInfo_GPTChatCompletion_FewShot.py b/assets/ar/demographic_attributes/name_info/NameInfo_GPT4_FewShot.py similarity index 100% rename from assets/benchmark_v1/demography/name_info/NameInfo_GPTChatCompletion_FewShot.py rename to assets/ar/demographic_attributes/name_info/NameInfo_GPT4_FewShot.py diff --git a/assets/benchmark_v1/demography/name_info/NameInfo_GPTChatCompletion_ZeroShot.py b/assets/ar/demographic_attributes/name_info/NameInfo_GPT4_ZeroShot.py similarity index 100% rename from assets/benchmark_v1/demography/name_info/NameInfo_GPTChatCompletion_ZeroShot.py rename to assets/ar/demographic_attributes/name_info/NameInfo_GPT4_ZeroShot.py diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Adult_BLOOMZ_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/adult_content_detection/Adult_BLOOMZ_ZeroShot.py similarity index 100% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/Adult_BLOOMZ_ZeroShot.py rename to assets/ar/factuality_disinformation_harmful_content/adult_content_detection/Adult_BLOOMZ_ZeroShot.py diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Adult_ChatGPT_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/adult_content_detection/Adult_GPT35_ZeroShot.py similarity index 100% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/Adult_ChatGPT_ZeroShot.py rename to assets/ar/factuality_disinformation_harmful_content/adult_content_detection/Adult_GPT35_ZeroShot.py diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Adult_GPTChatCompletion_FewShot.py b/assets/ar/factuality_disinformation_harmful_content/adult_content_detection/Adult_GPT4_FewShot.py similarity index 100% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/Adult_GPTChatCompletion_FewShot.py rename to 
assets/ar/factuality_disinformation_harmful_content/adult_content_detection/Adult_GPT4_FewShot.py diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Adult_GPTChatCompletion_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/adult_content_detection/Adult_GPT4_ZeroShot.py similarity index 100% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/Adult_GPTChatCompletion_ZeroShot.py rename to assets/ar/factuality_disinformation_harmful_content/adult_content_detection/Adult_GPT4_ZeroShot.py diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Attentionworthy_BLOOMZ_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/attentionworthy/CT22Attentionworthy_BLOOMZ_ZeroShot.py similarity index 93% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/Attentionworthy_BLOOMZ_ZeroShot.py rename to assets/ar/factuality_disinformation_harmful_content/attentionworthy/CT22Attentionworthy_BLOOMZ_ZeroShot.py index 37f15dde..de9d63a1 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Attentionworthy_BLOOMZ_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/attentionworthy/CT22Attentionworthy_BLOOMZ_ZeroShot.py @@ -1,13 +1,13 @@ import os -from llmebench.datasets import AttentionworthyDataset +from llmebench.datasets import CT22AttentionworthyDataset from llmebench.models import BLOOMPetalModel from llmebench.tasks import AttentionworthyTask def config(): return { - "dataset": AttentionworthyDataset, + "dataset": CT22AttentionworthyDataset, "dataset_args": {}, "task": AttentionworthyTask, "task_args": {}, diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Attentionworthy_ChatGPT_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/attentionworthy/CT22Attentionworthy_GPT35_ZeroShot.py similarity index 93% rename from 
assets/benchmark_v1/factuality_disinformation_harmful_content/Attentionworthy_ChatGPT_ZeroShot.py rename to assets/ar/factuality_disinformation_harmful_content/attentionworthy/CT22Attentionworthy_GPT35_ZeroShot.py index 23d7f66c..2acfc1f3 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Attentionworthy_ChatGPT_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/attentionworthy/CT22Attentionworthy_GPT35_ZeroShot.py @@ -1,14 +1,14 @@ import os import re -from llmebench.datasets import AttentionworthyDataset -from llmebench.models import GPTModel, RandomGPTModel +from llmebench.datasets import CT22AttentionworthyDataset +from llmebench.models import GPTModel from llmebench.tasks import AttentionworthyTask def config(): return { - "dataset": AttentionworthyDataset, + "dataset": CT22AttentionworthyDataset, "dataset_args": {}, "task": AttentionworthyTask, "task_args": {}, diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Attentionworthy_GPTChatCompletion_Fewshot.py b/assets/ar/factuality_disinformation_harmful_content/attentionworthy/CT22Attentionworthy_GPT4_FewShot.py similarity index 96% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/Attentionworthy_GPTChatCompletion_Fewshot.py rename to assets/ar/factuality_disinformation_harmful_content/attentionworthy/CT22Attentionworthy_GPT4_FewShot.py index 92c7a32b..37707a08 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Attentionworthy_GPTChatCompletion_Fewshot.py +++ b/assets/ar/factuality_disinformation_harmful_content/attentionworthy/CT22Attentionworthy_GPT4_FewShot.py @@ -1,18 +1,13 @@ import os -import random -import re -from llmebench.datasets import AttentionworthyDataset +from llmebench.datasets import CT22AttentionworthyDataset from llmebench.models import GPTChatCompletionModel from llmebench.tasks import AttentionworthyTask -random.seed(1333) - - def config(): return { - "dataset": 
AttentionworthyDataset, + "dataset": CT22AttentionworthyDataset, "dataset_args": {}, "task": AttentionworthyTask, "task_args": {}, diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Attentionworthy_GPTChatCompletion_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/attentionworthy/CT22Attentionworthy_GPT4_ZeroShot.py similarity index 94% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/Attentionworthy_GPTChatCompletion_ZeroShot.py rename to assets/ar/factuality_disinformation_harmful_content/attentionworthy/CT22Attentionworthy_GPT4_ZeroShot.py index 69da7868..b988345b 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Attentionworthy_GPTChatCompletion_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/attentionworthy/CT22Attentionworthy_GPT4_ZeroShot.py @@ -1,18 +1,13 @@ import os -import random -import re -from llmebench.datasets import AttentionworthyDataset +from llmebench.datasets import CT22AttentionworthyDataset from llmebench.models import GPTChatCompletionModel from llmebench.tasks import AttentionworthyTask -random.seed(1333) - - def config(): return { - "dataset": AttentionworthyDataset, + "dataset": CT22AttentionworthyDataset, "dataset_args": {}, "task": AttentionworthyTask, "task_args": {}, diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_BLOOMZ_ZeroShot.py similarity index 92% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_ZeroShot.py rename to assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_BLOOMZ_ZeroShot.py index 7d7c512f..4b852302 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_ZeroShot.py +++ 
b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_BLOOMZ_ZeroShot.py @@ -1,13 +1,13 @@ import os -from llmebench.datasets import CheckworthinessDataset +from llmebench.datasets import CT22CheckworthinessDataset from llmebench.models import BLOOMPetalModel from llmebench.tasks import CheckworthinessTask def config(): return { - "dataset": CheckworthinessDataset, + "dataset": CT22CheckworthinessDataset, "dataset_args": {}, "task": CheckworthinessTask, "task_args": {}, diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_ChatGPT_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT35_ZeroShot.py similarity index 93% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_ChatGPT_ZeroShot.py rename to assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT35_ZeroShot.py index 5e60e8a5..ec0d9a82 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_ChatGPT_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT35_ZeroShot.py @@ -1,13 +1,13 @@ import os -from llmebench.datasets import CheckworthinessDataset +from llmebench.datasets import CT22CheckworthinessDataset from llmebench.models import GPTModel, RandomGPTModel from llmebench.tasks import CheckworthinessTask def config(): return { - "dataset": CheckworthinessDataset, + "dataset": CT22CheckworthinessDataset, "dataset_args": {}, "task": CheckworthinessTask, "task_args": {}, diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_FewShot.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot.py similarity index 96% rename from 
assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_FewShot.py rename to assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot.py index 444fb4de..85c628f8 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_FewShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot.py @@ -2,7 +2,7 @@ import random import re -from llmebench.datasets import CheckworthinessDataset +from llmebench.datasets import CT22CheckworthinessDataset from llmebench.models import GPTChatCompletionModel from llmebench.tasks import CheckworthinessTask @@ -12,7 +12,7 @@ def config(): return { - "dataset": CheckworthinessDataset, + "dataset": CT22CheckworthinessDataset, "dataset_args": {}, "task": CheckworthinessTask, "task_args": {}, diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot.py similarity index 95% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ZeroShot.py rename to assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot.py index d90cb0ce..60f3b7d5 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot.py @@ -2,7 +2,7 @@ import random import re -from llmebench.datasets import CheckworthinessDataset +from llmebench.datasets import CT22CheckworthinessDataset from llmebench.models import GPTChatCompletionModel from llmebench.tasks import CheckworthinessTask @@ -12,7 +12,7 @@ def config(): return { - "dataset": CheckworthinessDataset, + 
"dataset": CT22CheckworthinessDataset, "dataset_args": {}, "task": CheckworthinessTask, "task_args": {}, diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/COVClaimDetect_BLOOMZ_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/claim_detection/CT22Claim_BLOOMZ_ZeroShot.py similarity index 93% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/COVClaimDetect_BLOOMZ_ZeroShot.py rename to assets/ar/factuality_disinformation_harmful_content/claim_detection/CT22Claim_BLOOMZ_ZeroShot.py index 63be543d..4e171616 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/COVClaimDetect_BLOOMZ_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/claim_detection/CT22Claim_BLOOMZ_ZeroShot.py @@ -1,13 +1,13 @@ import os -from llmebench.datasets import CovidClaimDataset +from llmebench.datasets import CT22ClaimDataset from llmebench.models import BLOOMPetalModel from llmebench.tasks import ClaimDetectionTask def config(): return { - "dataset": CovidClaimDataset, + "dataset": CT22ClaimDataset, "dataset_args": {}, "task": ClaimDetectionTask, "task_args": {}, diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/ClaimDetectCOVID19_CGPT35_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/claim_detection/CT22Claim_GPT35_ZeroShot.py similarity index 94% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/ClaimDetectCOVID19_CGPT35_ZeroShot.py rename to assets/ar/factuality_disinformation_harmful_content/claim_detection/CT22Claim_GPT35_ZeroShot.py index f8e2ba13..64a027ea 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/ClaimDetectCOVID19_CGPT35_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/claim_detection/CT22Claim_GPT35_ZeroShot.py @@ -1,13 +1,13 @@ import os -from llmebench.datasets import CovidClaimDataset +from llmebench.datasets import CT22ClaimDataset from llmebench.models 
import GPTModel from llmebench.tasks import ClaimDetectionTask def config(): return { - "dataset": CovidClaimDataset, + "dataset": CT22ClaimDataset, "dataset_args": {}, "task": ClaimDetectionTask, "task_args": {}, diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/COVClaimDetect_GPT4_FewShot.py b/assets/ar/factuality_disinformation_harmful_content/claim_detection/CT22Claim_GPT4_FewShot.py similarity index 96% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/COVClaimDetect_GPT4_FewShot.py rename to assets/ar/factuality_disinformation_harmful_content/claim_detection/CT22Claim_GPT4_FewShot.py index 8ed99f9a..6059c977 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/COVClaimDetect_GPT4_FewShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/claim_detection/CT22Claim_GPT4_FewShot.py @@ -1,13 +1,13 @@ import os -from llmebench.datasets import CovidClaimDataset +from llmebench.datasets import CT22ClaimDataset from llmebench.models import GPTChatCompletionModel from llmebench.tasks import ClaimDetectionTask def config(): return { - "dataset": CovidClaimDataset, + "dataset": CT22ClaimDataset, "dataset_args": {}, "task": ClaimDetectionTask, "task_args": {}, diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/ClaimDetectCOVID19_GPTChatCompletion_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/claim_detection/CT22Claim_GPT4_ZeroShot.py similarity index 93% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/ClaimDetectCOVID19_GPTChatCompletion_ZeroShot.py rename to assets/ar/factuality_disinformation_harmful_content/claim_detection/CT22Claim_GPT4_ZeroShot.py index 1e9c3c73..9c2a3a18 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/ClaimDetectCOVID19_GPTChatCompletion_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/claim_detection/CT22Claim_GPT4_ZeroShot.py @@ -1,18 +1,13 @@ import 
os -import random -import re -from llmebench.datasets import CheckworthinessDataset +from llmebench.datasets import CT22ClaimDataset from llmebench.models import GPTChatCompletionModel -from llmebench.tasks import CheckworthinessTask +from llmebench.tasks import ClaimDetectionTask -random.seed(1333) - - def config(): return { - "dataset": CheckworthinessDataset, + "dataset": CT22ClaimDataset, "dataset_args": {}, - "task": CheckworthinessTask, + "task": ClaimDetectionTask, "task_args": {}, diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityCOVID19_BLOOMZ_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/factuality/COVID19Factuality_BLOOMZ_ZeroShot.py similarity index 88% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityCOVID19_BLOOMZ_ZeroShot.py rename to assets/ar/factuality_disinformation_harmful_content/factuality/COVID19Factuality_BLOOMZ_ZeroShot.py index fa4dc0dd..4b11e70d 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityCOVID19_BLOOMZ_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/factuality/COVID19Factuality_BLOOMZ_ZeroShot.py @@ -1,15 +1,15 @@ import os -from llmebench.datasets import FactualityCOVID19Dataset +from llmebench.datasets import COVID19FactualityDataset from llmebench.models import BLOOMPetalModel -from llmebench.tasks import FactualityCOVID19Task +from llmebench.tasks import FactualityTask def config(): return { - "dataset": FactualityCOVID19Dataset, + "dataset": COVID19FactualityDataset, "dataset_args": {}, - "task": FactualityCOVID19Task, + "task": FactualityTask, "task_args": {}, "model": BLOOMPetalModel, "model_args": { diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityCOVID19_GPTChatCompletion_FewShot.py b/assets/ar/factuality_disinformation_harmful_content/factuality/COVID19Factuality_GPT4_FewShot.py similarity index 91% rename from
assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityCOVID19_GPTChatCompletion_FewShot.py rename to assets/ar/factuality_disinformation_harmful_content/factuality/COVID19Factuality_GPT4_FewShot.py index e165251a..22701a62 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityCOVID19_GPTChatCompletion_FewShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/factuality/COVID19Factuality_GPT4_FewShot.py @@ -1,20 +1,15 @@ import os -import random -import re -from llmebench.datasets import FactualityCOVID19Dataset +from llmebench.datasets import COVID19FactualityDataset from llmebench.models import GPTChatCompletionModel -from llmebench.tasks import FactualityCOVID19Task - - -random.seed(1333) +from llmebench.tasks import FactualityTask def config(): return { - "dataset": FactualityCOVID19Dataset, + "dataset": COVID19FactualityDataset, "dataset_args": {}, - "task": FactualityCOVID19Task, + "task": FactualityTask, "task_args": {}, "model": GPTChatCompletionModel, "model_args": { diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityCOVID19_GPTChatCompletion_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/factuality/COVID19Factuality_GPT4_ZeroShot.py similarity index 88% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityCOVID19_GPTChatCompletion_ZeroShot.py rename to assets/ar/factuality_disinformation_harmful_content/factuality/COVID19Factuality_GPT4_ZeroShot.py index 6b74eb7e..8ecfe1f7 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityCOVID19_GPTChatCompletion_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/factuality/COVID19Factuality_GPT4_ZeroShot.py @@ -1,20 +1,15 @@ import os -import random -import re -from llmebench.datasets import FactualityCOVID19Dataset +from llmebench.datasets import COVID19FactualityDataset from llmebench.models import GPTChatCompletionModel 
-from llmebench.tasks import FactualityCOVID19Task - - -random.seed(1333) +from llmebench.tasks import FactualityTask def config(): return { - "dataset": FactualityCOVID19Dataset, + "dataset": COVID19FactualityDataset, "dataset_args": {}, - "task": FactualityCOVID19Task, + "task": FactualityTask, "task_args": {}, "model": GPTChatCompletionModel, "model_args": { diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityKhouja20_BLOOMZ_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/factuality/Khouja20Factuality_BLOOMZ_ZeroShot.py similarity index 86% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityKhouja20_BLOOMZ_ZeroShot.py rename to assets/ar/factuality_disinformation_harmful_content/factuality/Khouja20Factuality_BLOOMZ_ZeroShot.py index 7fd21752..1d3f260c 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityKhouja20_BLOOMZ_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/factuality/Khouja20Factuality_BLOOMZ_ZeroShot.py @@ -1,20 +1,15 @@ import os -import random -import re -from llmebench.datasets import FactualityKhouja20Dataset +from llmebench.datasets import Khouja20FactualityDataset from llmebench.models import BLOOMPetalModel -from llmebench.tasks import FactualityKhouja20Task - - -random.seed(1333) +from llmebench.tasks import FactualityTask def config(): return { - "dataset": FactualityKhouja20Dataset, + "dataset": Khouja20FactualityDataset, "dataset_args": {}, - "task": FactualityKhouja20Task, + "task": FactualityTask, "task_args": {}, "model": BLOOMPetalModel, "model_args": { diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityKhouja20_ChatGPT_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/factuality/Khouja20Factuality_GPT35_ZeroShot.py similarity index 84% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityKhouja20_ChatGPT_ZeroShot.py 
rename to assets/ar/factuality_disinformation_harmful_content/factuality/Khouja20Factuality_GPT35_ZeroShot.py index 1dceb259..c3c9630d 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityKhouja20_ChatGPT_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/factuality/Khouja20Factuality_GPT35_ZeroShot.py @@ -1,15 +1,15 @@ import os -from llmebench.datasets import FactualityKhouja20Dataset -from llmebench.models import GPTModel, RandomGPTModel -from llmebench.tasks import FactualityKhouja20Task +from llmebench.datasets import Khouja20FactualityDataset +from llmebench.models import GPTModel +from llmebench.tasks import FactualityTask def config(): return { - "dataset": FactualityKhouja20Dataset, + "dataset": Khouja20FactualityDataset, "dataset_args": {}, - "task": FactualityKhouja20Task, + "task": FactualityTask, "task_args": {}, "model": GPTModel, "model_args": { diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityKhouja20_GPT4_FewShot.py b/assets/ar/factuality_disinformation_harmful_content/factuality/Khouja20Factuality_GPT4_FewShot.py similarity index 92% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityKhouja20_GPT4_FewShot.py rename to assets/ar/factuality_disinformation_harmful_content/factuality/Khouja20Factuality_GPT4_FewShot.py index 82fb95ef..5b75a496 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityKhouja20_GPT4_FewShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/factuality/Khouja20Factuality_GPT4_FewShot.py @@ -1,15 +1,15 @@ import os -from llmebench.datasets import FactualityKhouja20Dataset +from llmebench.datasets import Khouja20FactualityDataset from llmebench.models import GPTChatCompletionModel -from llmebench.tasks import FactualityKhouja20Task +from llmebench.tasks import FactualityTask def config(): return { - "dataset": FactualityKhouja20Dataset, + "dataset": 
Khouja20FactualityDataset, "dataset_args": {}, - "task": FactualityKhouja20Task, + "task": FactualityTask, "task_args": {}, "model": GPTChatCompletionModel, "model_args": { diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityKhouja20_GPT4_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/factuality/Khouja20Factuality_GPT4_ZeroShot.py similarity index 89% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityKhouja20_GPT4_ZeroShot.py rename to assets/ar/factuality_disinformation_harmful_content/factuality/Khouja20Factuality_GPT4_ZeroShot.py index add924df..ba41c19d 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityKhouja20_GPT4_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/factuality/Khouja20Factuality_GPT4_ZeroShot.py @@ -1,15 +1,15 @@ import os -from llmebench.datasets import FactualityKhouja20Dataset +from llmebench.datasets import Khouja20FactualityDataset from llmebench.models import GPTChatCompletionModel -from llmebench.tasks import FactualityKhouja20Task +from llmebench.tasks import FactualityTask def config(): return { - "dataset": FactualityKhouja20Dataset, + "dataset": Khouja20FactualityDataset, "dataset_args": {}, - "task": FactualityKhouja20Task, + "task": FactualityTask, "task_args": {}, "model": GPTChatCompletionModel, "model_args": { diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityUnifiedFC_BLOOMZ_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/factuality/UnifiedFCFactuality_BLOOMZ_ZeroShot.py similarity index 85% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityUnifiedFC_BLOOMZ_ZeroShot.py rename to assets/ar/factuality_disinformation_harmful_content/factuality/UnifiedFCFactuality_BLOOMZ_ZeroShot.py index 47d99663..4df082dc 100644 --- 
a/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityUnifiedFC_BLOOMZ_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/factuality/UnifiedFCFactuality_BLOOMZ_ZeroShot.py @@ -1,20 +1,15 @@ import os -import random -import re -from llmebench.datasets import FactualityUnifiedFCDataset +from llmebench.datasets import UnifiedFCFactualityDataset from llmebench.models import BLOOMPetalModel -from llmebench.tasks import FactualityUnifiedFCTask - - -random.seed(1333) +from llmebench.tasks import FactualityTask def config(): return { - "dataset": FactualityUnifiedFCDataset, + "dataset": UnifiedFCFactualityDataset, "dataset_args": {}, - "task": FactualityUnifiedFCTask, + "task": FactualityTask, "task_args": {}, "model": BLOOMPetalModel, "model_args": { diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityUnifiedFC_GPT4_FewShot.py b/assets/ar/factuality_disinformation_harmful_content/factuality/UnifiedFCFactuality_GPT4_FewShot.py similarity index 92% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityUnifiedFC_GPT4_FewShot.py rename to assets/ar/factuality_disinformation_harmful_content/factuality/UnifiedFCFactuality_GPT4_FewShot.py index a85f1182..ea0c7228 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityUnifiedFC_GPT4_FewShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/factuality/UnifiedFCFactuality_GPT4_FewShot.py @@ -1,15 +1,15 @@ import os -from llmebench.datasets import FactualityUnifiedFCDataset +from llmebench.datasets import UnifiedFCFactualityDataset from llmebench.models import GPTChatCompletionModel -from llmebench.tasks import FactualityUnifiedFCTask +from llmebench.tasks import FactualityTask def config(): return { - "dataset": FactualityUnifiedFCDataset, + "dataset": UnifiedFCFactualityDataset, "dataset_args": {}, - "task": FactualityUnifiedFCTask, + "task": FactualityTask, "task_args": {}, "model": 
GPTChatCompletionModel, "model_args": { diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityUnifiedFC_GPTChatCompletion_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/factuality/UnifiedFCFactuality_GPT4_ZeroShot.py similarity index 88% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityUnifiedFC_GPTChatCompletion_ZeroShot.py rename to assets/ar/factuality_disinformation_harmful_content/factuality/UnifiedFCFactuality_GPT4_ZeroShot.py index 0c1d21c2..65e1b4fd 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityUnifiedFC_GPTChatCompletion_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/factuality/UnifiedFCFactuality_GPT4_ZeroShot.py @@ -1,20 +1,15 @@ import os -import random -import re -from llmebench.datasets import FactualityUnifiedFCDataset +from llmebench.datasets import UnifiedFCFactualityDataset from llmebench.models import GPTChatCompletionModel -from llmebench.tasks import FactualityUnifiedFCTask - - -random.seed(1333) +from llmebench.tasks import FactualityTask def config(): return { - "dataset": FactualityUnifiedFCDataset, + "dataset": UnifiedFCFactualityDataset, "dataset_args": {}, - "task": FactualityUnifiedFCTask, + "task": FactualityTask, "task_args": {}, "model": GPTChatCompletionModel, "model_args": { diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/COVHarmfulDetect_BLOOMZ_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_BLOOMZ_ZeroShot.py similarity index 94% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/COVHarmfulDetect_BLOOMZ_ZeroShot.py rename to assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_BLOOMZ_ZeroShot.py index 20cb07f7..ac1040ae 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/COVHarmfulDetect_BLOOMZ_ZeroShot.py +++ 
b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_BLOOMZ_ZeroShot.py @@ -1,13 +1,13 @@ import os -from llmebench.datasets import CovidHarmfulDataset +from llmebench.datasets import CT22HarmfulDataset from llmebench.models import BLOOMPetalModel from llmebench.tasks import HarmfulDetectionTask def config(): return { - "dataset": CovidHarmfulDataset, + "dataset": CT22HarmfulDataset, "dataset_args": {}, "task": HarmfulDetectionTask, "task_args": {}, diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/HarmfulDetectCOVID19_CGPT35_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT35_ZeroShot.py similarity index 94% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/HarmfulDetectCOVID19_CGPT35_ZeroShot.py rename to assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT35_ZeroShot.py index 8db951ae..b2f6696f 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/HarmfulDetectCOVID19_CGPT35_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT35_ZeroShot.py @@ -1,13 +1,13 @@ import os -from llmebench.datasets import CovidHarmfulDataset +from llmebench.datasets import CT22HarmfulDataset from llmebench.models import GPTModel from llmebench.tasks import HarmfulDetectionTask def config(): return { - "dataset": CovidHarmfulDataset, + "dataset": CT22HarmfulDataset, "dataset_args": {}, "task": HarmfulDetectionTask, "task_args": {}, diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/COVHarmfulDetect_GPT4_FewShot.py b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT4_FewShot.py similarity index 96% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/COVHarmfulDetect_GPT4_FewShot.py rename to 
assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT4_FewShot.py index fe497189..92cdce83 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/COVHarmfulDetect_GPT4_FewShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT4_FewShot.py @@ -1,13 +1,13 @@ import os -from llmebench.datasets import CovidHarmfulDataset +from llmebench.datasets import CT22HarmfulDataset from llmebench.models import GPTChatCompletionModel from llmebench.tasks import HarmfulDetectionTask def config(): return { - "dataset": CovidHarmfulDataset, + "dataset": CT22HarmfulDataset, "dataset_args": {}, "task": HarmfulDetectionTask, "task_args": {}, diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/HarmfulDetectCOVID19_GPTChatCompletion_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT4_ZeroShot.py similarity index 96% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/HarmfulDetectCOVID19_GPTChatCompletion_ZeroShot.py rename to assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT4_ZeroShot.py index 8b0813b0..5042116e 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/HarmfulDetectCOVID19_GPTChatCompletion_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT4_ZeroShot.py @@ -2,7 +2,7 @@ import random import re -from llmebench.datasets import CheckworthinessDataset +from llmebench.datasets import CT22HarmfulDataset from llmebench.models import GPTChatCompletionModel -from llmebench.tasks import CheckworthinessTask +from llmebench.tasks import HarmfulDetectionTask @@ -12,7 +12,7 @@ def config(): return { - "dataset": CheckworthinessDataset, + "dataset": CT22HarmfulDataset, "dataset_args": {}, - "task": CheckworthinessTask, + "task": HarmfulDetectionTask, "task_args": {}, diff --git
a/assets/benchmark_v1/factuality_disinformation_harmful_content/HateSpeech_ChatGPT_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/hate_speech/OSACT4SubtaskB_GPT35_ZeroShot.py similarity index 93% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/HateSpeech_ChatGPT_ZeroShot.py rename to assets/ar/factuality_disinformation_harmful_content/hate_speech/OSACT4SubtaskB_GPT35_ZeroShot.py index 2cdd6740..d8dad02b 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/HateSpeech_ChatGPT_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/hate_speech/OSACT4SubtaskB_GPT35_ZeroShot.py @@ -1,13 +1,13 @@ import os -from llmebench.datasets import HateSpeechDataset +from llmebench.datasets import OSACT4SubtaskBDataset from llmebench.models import GPTModel, RandomGPTModel from llmebench.tasks import HateSpeechTask def config(): return { - "dataset": HateSpeechDataset, + "dataset": OSACT4SubtaskBDataset, "dataset_args": {}, "task": HateSpeechTask, "task_args": {}, diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/HateSpeech_GPTChatCompletion_FewShot.py b/assets/ar/factuality_disinformation_harmful_content/hate_speech/OSACT4SubtaskB_GPT4_FewShot.py similarity index 96% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/HateSpeech_GPTChatCompletion_FewShot.py rename to assets/ar/factuality_disinformation_harmful_content/hate_speech/OSACT4SubtaskB_GPT4_FewShot.py index 28e883e8..2f6da19a 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/HateSpeech_GPTChatCompletion_FewShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/hate_speech/OSACT4SubtaskB_GPT4_FewShot.py @@ -1,13 +1,13 @@ import os -from llmebench.datasets import HateSpeechDataset +from llmebench.datasets import OSACT4SubtaskBDataset from llmebench.models import GPTChatCompletionModel from llmebench.tasks import HateSpeechTask def config(): return { - "dataset": 
HateSpeechDataset, + "dataset": OSACT4SubtaskBDataset, "dataset_args": {}, "task": HateSpeechTask, "task_args": {}, diff --git a/assets/benchmark_v1/sentiment/offensive/Offensive_BLOOMZ_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/offensive_language/OSACT4SubtaskA_BLOOMZ_ZeroShot.py similarity index 79% rename from assets/benchmark_v1/sentiment/offensive/Offensive_BLOOMZ_ZeroShot.py rename to assets/ar/factuality_disinformation_harmful_content/offensive_language/OSACT4SubtaskA_BLOOMZ_ZeroShot.py index 965bde7b..7d99a05a 100644 --- a/assets/benchmark_v1/sentiment/offensive/Offensive_BLOOMZ_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/offensive_language/OSACT4SubtaskA_BLOOMZ_ZeroShot.py @@ -1,13 +1,13 @@ import os -from llmebench.datasets import OffensiveDataset +from llmebench.datasets import OSACT4SubtaskADataset from llmebench.models import BLOOMPetalModel from llmebench.tasks import OffensiveTask def config(): return { - "dataset": OffensiveDataset, + "dataset": OSACT4SubtaskADataset, "dataset_args": {}, "task": OffensiveTask, "task_args": {}, @@ -18,7 +18,7 @@ def config(): "max_tries": 3, }, "general_args": { - "data_path": "data/sentiment_emotion_others/offensive_language/OSACT2020-sharedTask-test-tweets-labels.txt" + "data_path": "data/factuality_disinformation_harmful_content/offensive_language/OSACT2020-sharedTask-test-tweets-labels.txt" }, } diff --git a/assets/benchmark_v1/sentiment/offensive/Offensive_ChatGPT_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/offensive_language/OSACT4SubtaskA_GPT35_ZeroShot.py similarity index 83% rename from assets/benchmark_v1/sentiment/offensive/Offensive_ChatGPT_ZeroShot.py rename to assets/ar/factuality_disinformation_harmful_content/offensive_language/OSACT4SubtaskA_GPT35_ZeroShot.py index 8752749f..a9d3a793 100644 --- a/assets/benchmark_v1/sentiment/offensive/Offensive_ChatGPT_ZeroShot.py +++ 
b/assets/ar/factuality_disinformation_harmful_content/offensive_language/OSACT4SubtaskA_GPT35_ZeroShot.py @@ -1,13 +1,13 @@ import os -from llmebench.datasets import OffensiveDataset +from llmebench.datasets import OSACT4SubtaskADataset from llmebench.models import GPTModel from llmebench.tasks import OffensiveTask def config(): return { - "dataset": OffensiveDataset, + "dataset": OSACT4SubtaskADataset, "dataset_args": {}, "task": OffensiveTask, "task_args": {}, @@ -22,7 +22,7 @@ def config(): "max_tries": 3, }, "general_args": { - "data_path": "data/sentiment_emotion_others/offensive_language/OSACT2020-sharedTask-test-tweets-labels.txt" + "data_path": "data/factuality_disinformation_harmful_content/offensive_language/OSACT2020-sharedTask-test-tweets-labels.txt" }, } diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Offensive_GPTChatCompletion_FewShot.py b/assets/ar/factuality_disinformation_harmful_content/offensive_language/OSACT4SubtaskA_GPT4_FewShot.py similarity index 96% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/Offensive_GPTChatCompletion_FewShot.py rename to assets/ar/factuality_disinformation_harmful_content/offensive_language/OSACT4SubtaskA_GPT4_FewShot.py index 044abf2e..b0306167 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Offensive_GPTChatCompletion_FewShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/offensive_language/OSACT4SubtaskA_GPT4_FewShot.py @@ -1,13 +1,13 @@ import os -from llmebench.datasets import OffensiveDataset +from llmebench.datasets import OSACT4SubtaskADataset from llmebench.models import GPTChatCompletionModel from llmebench.tasks import OffensiveTask def config(): return { - "dataset": OffensiveDataset, + "dataset": OSACT4SubtaskADataset, "dataset_args": {}, "task": OffensiveTask, "task_args": {}, diff --git a/assets/benchmark_v1/sentiment/offensive/Offensive_GPTChatCompletion_ZeroShot.py 
b/assets/ar/factuality_disinformation_harmful_content/offensive_language/OSACT4SubtaskA_GPT4_ZeroShot.py similarity index 93% rename from assets/benchmark_v1/sentiment/offensive/Offensive_GPTChatCompletion_ZeroShot.py rename to assets/ar/factuality_disinformation_harmful_content/offensive_language/OSACT4SubtaskA_GPT4_ZeroShot.py index d29b2d92..5e869c5e 100644 --- a/assets/benchmark_v1/sentiment/offensive/Offensive_GPTChatCompletion_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/offensive_language/OSACT4SubtaskA_GPT4_ZeroShot.py @@ -1,13 +1,13 @@ import os -from llmebench.datasets import OffensiveDataset +from llmebench.datasets import OSACT4SubtaskADataset from llmebench.models import GPTChatCompletionModel from llmebench.tasks import OffensiveTask def config(): return { - "dataset": OffensiveDataset, + "dataset": OSACT4SubtaskADataset, "dataset_args": {}, "task": OffensiveTask, "task_args": {}, diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/propaganda/WANLP22T3_BLOOMZ_ZeroShot.py similarity index 96% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_ZeroShot.py rename to assets/ar/factuality_disinformation_harmful_content/propaganda/WANLP22T3_BLOOMZ_ZeroShot.py index 0247eaa6..302d1814 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/propaganda/WANLP22T3_BLOOMZ_ZeroShot.py @@ -2,9 +2,9 @@ import random import re -from llmebench.datasets import PropagandaTweetDataset +from llmebench.datasets import WANLP22T3PropagandaDataset from llmebench.models import BLOOMPetalModel -from llmebench.tasks import PropagandaMultilabelTask +from llmebench.tasks import MultilabelPropagandaTask random.seed(1333) @@ -12,11 +12,11 @@ def config(): return { - "dataset": 
PropagandaTweetDataset, + "dataset": WANLP22T3PropagandaDataset, "dataset_args": { "techniques_path": "data/factuality_disinformation_harmful_content/propaganda/classes.txt" }, - "task": PropagandaMultilabelTask, + "task": MultilabelPropagandaTask, "task_args": {}, "model": BLOOMPetalModel, "model_args": { diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_CGPT35_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/propaganda/WANLP22T3_GPT35_ZeroShot.py similarity index 96% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_CGPT35_ZeroShot.py rename to assets/ar/factuality_disinformation_harmful_content/propaganda/WANLP22T3_GPT35_ZeroShot.py index b9ad7f81..a9f861a8 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_CGPT35_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/propaganda/WANLP22T3_GPT35_ZeroShot.py @@ -2,18 +2,18 @@ import regex as re -from llmebench.datasets import PropagandaTweetDataset +from llmebench.datasets import WANLP22T3PropagandaDataset from llmebench.models import GPTModel -from llmebench.tasks import PropagandaMultilabelTask +from llmebench.tasks import MultilabelPropagandaTask def config(): return { - "dataset": PropagandaTweetDataset, + "dataset": WANLP22T3PropagandaDataset, "dataset_args": { "techniques_path": "data/factuality_disinformation_harmful_content/propaganda/classes.txt" }, - "task": PropagandaMultilabelTask, + "task": MultilabelPropagandaTask, "task_args": {}, "model": GPTModel, "model_args": { diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPTChatCompletion_FewShot.py b/assets/ar/factuality_disinformation_harmful_content/propaganda/WANLP22T3_GPT4_FewShot.py similarity index 94% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPTChatCompletion_FewShot.py rename to 
assets/ar/factuality_disinformation_harmful_content/propaganda/WANLP22T3_GPT4_FewShot.py index 0b1f913b..1dced0a3 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPTChatCompletion_FewShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/propaganda/WANLP22T3_GPT4_FewShot.py @@ -2,9 +2,9 @@ import random import re -from llmebench.datasets import PropagandaTweetDataset +from llmebench.datasets import WANLP22T3PropagandaDataset from llmebench.models import GPTChatCompletionModel -from llmebench.tasks import PropagandaMultilabelTask +from llmebench.tasks import MultilabelPropagandaTask random.seed(1333) @@ -12,11 +12,11 @@ def config(): return { - "dataset": PropagandaTweetDataset, + "dataset": WANLP22T3PropagandaDataset, "dataset_args": { "techniques_path": "data/factuality_disinformation_harmful_content/propaganda/classes.txt" }, - "task": PropagandaMultilabelTask, + "task": MultilabelPropagandaTask, "task_args": {}, "model": GPTChatCompletionModel, "model_args": { diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPTChatCompletion_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/propaganda/WANLP22T3_GPT4_ZeroShot.py similarity index 96% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPTChatCompletion_ZeroShot.py rename to assets/ar/factuality_disinformation_harmful_content/propaganda/WANLP22T3_GPT4_ZeroShot.py index 271385fe..c4162624 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPTChatCompletion_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/propaganda/WANLP22T3_GPT4_ZeroShot.py @@ -2,9 +2,9 @@ import random import re -from llmebench.datasets import PropagandaTweetDataset +from llmebench.datasets import WANLP22T3PropagandaDataset from llmebench.models import GPTChatCompletionModel -from llmebench.tasks import PropagandaMultilabelTask +from 
llmebench.tasks import MultilabelPropagandaTask random.seed(1333) @@ -12,11 +12,11 @@ def config(): return { - "dataset": PropagandaTweetDataset, + "dataset": WANLP22T3PropagandaDataset, "dataset_args": { "techniques_path": "data/factuality_disinformation_harmful_content/propaganda/classes.txt" }, - "task": PropagandaMultilabelTask, + "task": MultilabelPropagandaTask, "task_args": {}, "model": GPTChatCompletionModel, "model_args": { diff --git a/assets/benchmark_v1/sentiment/spam/Spam_BLOOMZ_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/spam/Spam_BLOOMZ_ZeroShot.py similarity index 100% rename from assets/benchmark_v1/sentiment/spam/Spam_BLOOMZ_ZeroShot.py rename to assets/ar/factuality_disinformation_harmful_content/spam/Spam_BLOOMZ_ZeroShot.py diff --git a/assets/benchmark_v1/sentiment/spam/Spam_ChatGPT_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/spam/Spam_GPT35_ZeroShot.py similarity index 100% rename from assets/benchmark_v1/sentiment/spam/Spam_ChatGPT_ZeroShot.py rename to assets/ar/factuality_disinformation_harmful_content/spam/Spam_GPT35_ZeroShot.py diff --git a/assets/benchmark_v1/sentiment/spam/Spam_GPTChatCompletion_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/spam/Spam_GPT4_ZeroShot.py similarity index 100% rename from assets/benchmark_v1/sentiment/spam/Spam_GPTChatCompletion_ZeroShot.py rename to assets/ar/factuality_disinformation_harmful_content/spam/Spam_GPT4_ZeroShot.py diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Subjectivity_BLOOMZ_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/subjectivity/CT23Subjectivity_BLOOMZ_ZeroShot.py similarity index 91% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/Subjectivity_BLOOMZ_ZeroShot.py rename to assets/ar/factuality_disinformation_harmful_content/subjectivity/CT23Subjectivity_BLOOMZ_ZeroShot.py index 63fa49b4..80cb5d4c 100644 --- 
a/assets/benchmark_v1/factuality_disinformation_harmful_content/Subjectivity_BLOOMZ_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/subjectivity/CT23Subjectivity_BLOOMZ_ZeroShot.py @@ -1,13 +1,13 @@ import os -from llmebench.datasets import SubjectivityDataset +from llmebench.datasets import CT23SubjectivityDataset from llmebench.models import BLOOMPetalModel from llmebench.tasks import SubjectivityTask def config(): return { - "dataset": SubjectivityDataset, + "dataset": CT23SubjectivityDataset, "dataset_args": {}, "task": SubjectivityTask, "task_args": {}, diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Subjectivity_ChatGPT_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/subjectivity/CT23Subjectivity_GPT35_ZeroShot.py similarity index 93% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/Subjectivity_ChatGPT_ZeroShot.py rename to assets/ar/factuality_disinformation_harmful_content/subjectivity/CT23Subjectivity_GPT35_ZeroShot.py index ac41046e..44a7d197 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Subjectivity_ChatGPT_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/subjectivity/CT23Subjectivity_GPT35_ZeroShot.py @@ -1,13 +1,13 @@ import os -from llmebench.datasets import SubjectivityDataset +from llmebench.datasets import CT23SubjectivityDataset from llmebench.models import GPTModel, RandomGPTModel from llmebench.tasks import SubjectivityTask def config(): return { - "dataset": SubjectivityDataset, + "dataset": CT23SubjectivityDataset, "dataset_args": {}, "task": SubjectivityTask, "task_args": {}, diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Subjectivity_GPTChatCompletion_FewShot.py b/assets/ar/factuality_disinformation_harmful_content/subjectivity/CT23Subjectivity_GPT4_FewShot.py similarity index 96% rename from 
assets/benchmark_v1/factuality_disinformation_harmful_content/Subjectivity_GPTChatCompletion_FewShot.py rename to assets/ar/factuality_disinformation_harmful_content/subjectivity/CT23Subjectivity_GPT4_FewShot.py index 93859b9f..2acfaea5 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Subjectivity_GPTChatCompletion_FewShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/subjectivity/CT23Subjectivity_GPT4_FewShot.py @@ -1,13 +1,13 @@ import os -from llmebench.datasets import SubjectivityDataset +from llmebench.datasets import CT23SubjectivityDataset from llmebench.models import GPTChatCompletionModel from llmebench.tasks import SubjectivityTask def config(): return { - "dataset": SubjectivityDataset, + "dataset": CT23SubjectivityDataset, "dataset_args": {}, "task": SubjectivityTask, "task_args": {}, diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Subjectivity_GPTChatCompletion_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/subjectivity/CT23Subjectivity_GPT4_ZeroShot.py similarity index 94% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/Subjectivity_GPTChatCompletion_ZeroShot.py rename to assets/ar/factuality_disinformation_harmful_content/subjectivity/CT23Subjectivity_GPT4_ZeroShot.py index 95c46f73..23454f6c 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Subjectivity_GPTChatCompletion_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/subjectivity/CT23Subjectivity_GPT4_ZeroShot.py @@ -2,7 +2,7 @@ import random import re -from llmebench.datasets import SubjectivityDataset +from llmebench.datasets import CT23SubjectivityDataset from llmebench.models import GPTChatCompletionModel from llmebench.tasks import SubjectivityTask @@ -12,7 +12,7 @@ def config(): return { - "dataset": SubjectivityDataset, + "dataset": CT23SubjectivityDataset, "dataset_args": {}, "task": SubjectivityTask, "task_args": {}, diff --git 
a/assets/benchmark_v1/news_categorization/NewsCat_ASND_BLOOMZ_ZeroShot.py b/assets/ar/news_categorization/ASND_BLOOMZ_ZeroShot.py similarity index 95% rename from assets/benchmark_v1/news_categorization/NewsCat_ASND_BLOOMZ_ZeroShot.py rename to assets/ar/news_categorization/ASND_BLOOMZ_ZeroShot.py index 48f34c4c..5a71249d 100644 --- a/assets/benchmark_v1/news_categorization/NewsCat_ASND_BLOOMZ_ZeroShot.py +++ b/assets/ar/news_categorization/ASND_BLOOMZ_ZeroShot.py @@ -1,18 +1,15 @@ import os -import random from llmebench.datasets import NewsCatASNDDataset from llmebench.models import BLOOMPetalModel -from llmebench.tasks import NewsCatASNDTask - -random.seed(1333) +from llmebench.tasks import NewsCategorizationTask def config(): return { "dataset": NewsCatASNDDataset, "dataset_args": {}, - "task": NewsCatASNDTask, + "task": NewsCategorizationTask, "task_args": {}, "model": BLOOMPetalModel, "model_args": { diff --git a/assets/benchmark_v1/news_categorization/NewsCat_ASND_ChatGPT_ZeroShot.py b/assets/ar/news_categorization/ASND_GPT35_ZeroShot.py similarity index 92% rename from assets/benchmark_v1/news_categorization/NewsCat_ASND_ChatGPT_ZeroShot.py rename to assets/ar/news_categorization/ASND_GPT35_ZeroShot.py index b08edeec..caa12067 100644 --- a/assets/benchmark_v1/news_categorization/NewsCat_ASND_ChatGPT_ZeroShot.py +++ b/assets/ar/news_categorization/ASND_GPT35_ZeroShot.py @@ -1,18 +1,15 @@ import os -import random from llmebench.datasets import NewsCatASNDDataset -from llmebench.models import GPTModel, RandomGPTModel -from llmebench.tasks import NewsCatASNDTask - -random.seed(1333) +from llmebench.models import GPTModel +from llmebench.tasks import NewsCategorizationTask def config(): return { "dataset": NewsCatASNDDataset, "dataset_args": {}, - "task": NewsCatASNDTask, + "task": NewsCategorizationTask, "task_args": {"test": "useless"}, "model": GPTModel, "model_args": { diff --git 
a/assets/benchmark_v1/news_categorization/NewsCat_ASND_GPTChatCompletion_FewShot.py b/assets/ar/news_categorization/ASND_GPT4_FewShot.py similarity index 96% rename from assets/benchmark_v1/news_categorization/NewsCat_ASND_GPTChatCompletion_FewShot.py rename to assets/ar/news_categorization/ASND_GPT4_FewShot.py index b1d0059e..cc1632f9 100644 --- a/assets/benchmark_v1/news_categorization/NewsCat_ASND_GPTChatCompletion_FewShot.py +++ b/assets/ar/news_categorization/ASND_GPT4_FewShot.py @@ -1,19 +1,15 @@ import os -import random from llmebench.datasets import NewsCatASNDDataset from llmebench.models import GPTChatCompletionModel -from llmebench.tasks import NewsCatASNDTask - - -random.seed(1333) +from llmebench.tasks import NewsCategorizationTask def config(): return { "dataset": NewsCatASNDDataset, "dataset_args": {}, - "task": NewsCatASNDTask, + "task": NewsCategorizationTask, "task_args": {}, "model": GPTChatCompletionModel, "model_args": { diff --git a/assets/benchmark_v1/news_categorization/NewsCat_ASND_GPTChatCompletion_ZeroShot.py b/assets/ar/news_categorization/ASND_GPT4_ZeroShot.py similarity index 95% rename from assets/benchmark_v1/news_categorization/NewsCat_ASND_GPTChatCompletion_ZeroShot.py rename to assets/ar/news_categorization/ASND_GPT4_ZeroShot.py index 67c0c625..147d3409 100644 --- a/assets/benchmark_v1/news_categorization/NewsCat_ASND_GPTChatCompletion_ZeroShot.py +++ b/assets/ar/news_categorization/ASND_GPT4_ZeroShot.py @@ -1,19 +1,15 @@ import os -import random from llmebench.datasets import NewsCatASNDDataset from llmebench.models import GPTChatCompletionModel -from llmebench.tasks import NewsCatASNDTask - - -random.seed(1333) +from llmebench.tasks import NewsCategorizationTask def config(): return { "dataset": NewsCatASNDDataset, "dataset_args": {}, - "task": NewsCatASNDTask, + "task": NewsCategorizationTask, "task_args": {}, "model": GPTChatCompletionModel, "model_args": { diff --git 
a/assets/benchmark_v1/news_categorization/NewsCat_Akhbarona_BLOOMZ_ZeroShot.py b/assets/ar/news_categorization/Akhbarona_BLOOMZ_ZeroShot.py similarity index 96% rename from assets/benchmark_v1/news_categorization/NewsCat_Akhbarona_BLOOMZ_ZeroShot.py rename to assets/ar/news_categorization/Akhbarona_BLOOMZ_ZeroShot.py index a23188ff..8740e0bc 100644 --- a/assets/benchmark_v1/news_categorization/NewsCat_Akhbarona_BLOOMZ_ZeroShot.py +++ b/assets/ar/news_categorization/Akhbarona_BLOOMZ_ZeroShot.py @@ -3,7 +3,7 @@ from llmebench.datasets import NewsCatAkhbaronaDataset from llmebench.models import BLOOMPetalModel -from llmebench.tasks import NewsCatAkhbaronaTask +from llmebench.tasks import NewsCategorizationTask random.seed(1333) @@ -12,7 +12,7 @@ def config(): return { "dataset": NewsCatAkhbaronaDataset, "dataset_args": {}, - "task": NewsCatAkhbaronaTask, + "task": NewsCategorizationTask, "task_args": {}, "model": BLOOMPetalModel, "model_args": { diff --git a/assets/benchmark_v1/news_categorization/NewsCat_Akhbarona_ChatGPT_ZeroShot.py b/assets/ar/news_categorization/Akhbarona_GPT35_ZeroShot.py similarity index 96% rename from assets/benchmark_v1/news_categorization/NewsCat_Akhbarona_ChatGPT_ZeroShot.py rename to assets/ar/news_categorization/Akhbarona_GPT35_ZeroShot.py index aa44e595..308291d6 100644 --- a/assets/benchmark_v1/news_categorization/NewsCat_Akhbarona_ChatGPT_ZeroShot.py +++ b/assets/ar/news_categorization/Akhbarona_GPT35_ZeroShot.py @@ -3,7 +3,7 @@ from llmebench.datasets import NewsCatAkhbaronaDataset from llmebench.models import GPTModel -from llmebench.tasks import NewsCatAkhbaronaTask +from llmebench.tasks import NewsCategorizationTask random.seed(1333) @@ -12,7 +12,7 @@ def config(): return { "dataset": NewsCatAkhbaronaDataset, "dataset_args": {}, - "task": NewsCatAkhbaronaTask, + "task": NewsCategorizationTask, "task_args": {}, "model": GPTModel, "model_args": { diff --git 
a/assets/benchmark_v1/news_categorization/NewsCat_Akhbarona_GPTChatCompletion_FewShot.py b/assets/ar/news_categorization/Akhbarona_GPT4_FewShot.py similarity index 97% rename from assets/benchmark_v1/news_categorization/NewsCat_Akhbarona_GPTChatCompletion_FewShot.py rename to assets/ar/news_categorization/Akhbarona_GPT4_FewShot.py index 698e26f6..4bdf5a67 100644 --- a/assets/benchmark_v1/news_categorization/NewsCat_Akhbarona_GPTChatCompletion_FewShot.py +++ b/assets/ar/news_categorization/Akhbarona_GPT4_FewShot.py @@ -3,7 +3,7 @@ from llmebench.datasets import NewsCatAkhbaronaDataset from llmebench.models import GPTChatCompletionModel -from llmebench.tasks import NewsCatAkhbaronaTask +from llmebench.tasks import NewsCategorizationTask random.seed(1333) @@ -13,7 +13,7 @@ def config(): return { "dataset": NewsCatAkhbaronaDataset, "dataset_args": {}, - "task": NewsCatAkhbaronaTask, + "task": NewsCategorizationTask, "task_args": {}, "model": GPTChatCompletionModel, "model_args": { diff --git a/assets/benchmark_v1/news_categorization/NewsCat_Akhbarona_GPTChatCompletion_ZeroShot.py b/assets/ar/news_categorization/Akhbarona_GPT4_ZeroShot.py similarity index 96% rename from assets/benchmark_v1/news_categorization/NewsCat_Akhbarona_GPTChatCompletion_ZeroShot.py rename to assets/ar/news_categorization/Akhbarona_GPT4_ZeroShot.py index c029118d..039c6d97 100644 --- a/assets/benchmark_v1/news_categorization/NewsCat_Akhbarona_GPTChatCompletion_ZeroShot.py +++ b/assets/ar/news_categorization/Akhbarona_GPT4_ZeroShot.py @@ -3,7 +3,7 @@ from llmebench.datasets import NewsCatAkhbaronaDataset from llmebench.models import GPTChatCompletionModel -from llmebench.tasks import NewsCatAkhbaronaTask +from llmebench.tasks import NewsCategorizationTask random.seed(1333) @@ -13,7 +13,7 @@ def config(): return { "dataset": NewsCatAkhbaronaDataset, "dataset_args": {}, - "task": NewsCatAkhbaronaTask, + "task": NewsCategorizationTask, "task_args": {}, "model": GPTChatCompletionModel, "model_args": 
{ diff --git a/assets/benchmark_v1/news_categorization/NewsCat_AlArabiya_BLOOMZ_ZeroShot.py b/assets/ar/news_categorization/AlArabiya_BLOOMZ_ZeroShot.py similarity index 96% rename from assets/benchmark_v1/news_categorization/NewsCat_AlArabiya_BLOOMZ_ZeroShot.py rename to assets/ar/news_categorization/AlArabiya_BLOOMZ_ZeroShot.py index 1ce28d74..38cc28fe 100644 --- a/assets/benchmark_v1/news_categorization/NewsCat_AlArabiya_BLOOMZ_ZeroShot.py +++ b/assets/ar/news_categorization/AlArabiya_BLOOMZ_ZeroShot.py @@ -3,7 +3,7 @@ from llmebench.datasets import NewsCatAlArabiyaDataset from llmebench.models import BLOOMPetalModel -from llmebench.tasks import NewsCatAlArabiyaTask +from llmebench.tasks import NewsCategorizationTask random.seed(1333) @@ -12,7 +12,7 @@ def config(): return { "dataset": NewsCatAlArabiyaDataset, "dataset_args": {}, - "task": NewsCatAlArabiyaTask, + "task": NewsCategorizationTask, "task_args": {}, "model": BLOOMPetalModel, "model_args": { diff --git a/assets/benchmark_v1/news_categorization/NewsCat_AlArabiya_ChatGPT_ZeroShot.py b/assets/ar/news_categorization/AlArabiya_GPT35_ZeroShot.py similarity index 95% rename from assets/benchmark_v1/news_categorization/NewsCat_AlArabiya_ChatGPT_ZeroShot.py rename to assets/ar/news_categorization/AlArabiya_GPT35_ZeroShot.py index 458761f2..ec28352a 100644 --- a/assets/benchmark_v1/news_categorization/NewsCat_AlArabiya_ChatGPT_ZeroShot.py +++ b/assets/ar/news_categorization/AlArabiya_GPT35_ZeroShot.py @@ -3,7 +3,7 @@ from llmebench.datasets import NewsCatAlArabiyaDataset from llmebench.models import GPTModel -from llmebench.tasks import NewsCatAlArabiyaTask +from llmebench.tasks import NewsCategorizationTask random.seed(1333) @@ -12,7 +12,7 @@ def config(): return { "dataset": NewsCatAlArabiyaDataset, "dataset_args": {}, - "task": NewsCatAlArabiyaTask, + "task": NewsCategorizationTask, "task_args": {}, "model": GPTModel, "model_args": { diff --git 
a/assets/benchmark_v1/news_categorization/NewsCat_AlArabiya_GPTChatCompletion_FewShot.py b/assets/ar/news_categorization/AlArabiya_GPT4_FewShot.py similarity index 97% rename from assets/benchmark_v1/news_categorization/NewsCat_AlArabiya_GPTChatCompletion_FewShot.py rename to assets/ar/news_categorization/AlArabiya_GPT4_FewShot.py index 6f407dd5..4b656c53 100644 --- a/assets/benchmark_v1/news_categorization/NewsCat_AlArabiya_GPTChatCompletion_FewShot.py +++ b/assets/ar/news_categorization/AlArabiya_GPT4_FewShot.py @@ -3,7 +3,7 @@ from llmebench.datasets import NewsCatAlArabiyaDataset from llmebench.models import GPTChatCompletionModel -from llmebench.tasks import NewsCatAlArabiyaTask +from llmebench.tasks import NewsCategorizationTask random.seed(1333) @@ -13,7 +13,7 @@ def config(): return { "dataset": NewsCatAlArabiyaDataset, "dataset_args": {}, - "task": NewsCatAlArabiyaTask, + "task": NewsCategorizationTask, "task_args": {}, "model": GPTChatCompletionModel, "model_args": { diff --git a/assets/benchmark_v1/news_categorization/NewsCat_AlArabiya_GPTChatCompletion_ZeroShot.py b/assets/ar/news_categorization/AlArabiya_GPT4_ZeroShot.py similarity index 96% rename from assets/benchmark_v1/news_categorization/NewsCat_AlArabiya_GPTChatCompletion_ZeroShot.py rename to assets/ar/news_categorization/AlArabiya_GPT4_ZeroShot.py index 333352ec..4f0dcf3f 100644 --- a/assets/benchmark_v1/news_categorization/NewsCat_AlArabiya_GPTChatCompletion_ZeroShot.py +++ b/assets/ar/news_categorization/AlArabiya_GPT4_ZeroShot.py @@ -3,7 +3,7 @@ from llmebench.datasets import NewsCatAlArabiyaDataset from llmebench.models import GPTChatCompletionModel -from llmebench.tasks import NewsCatAlArabiyaTask +from llmebench.tasks import NewsCategorizationTask random.seed(1333) @@ -13,7 +13,7 @@ def config(): return { "dataset": NewsCatAlArabiyaDataset, "dataset_args": {}, - "task": NewsCatAlArabiyaTask, + "task": NewsCategorizationTask, "task_args": {}, "model": GPTChatCompletionModel, "model_args": 
{ diff --git a/assets/benchmark_v1/news_categorization/NewsCat_AlKhaleej_BLOOMZ_ZeroShot.py b/assets/ar/news_categorization/AlKhaleej_BLOOMZ_ZeroShot.py similarity index 96% rename from assets/benchmark_v1/news_categorization/NewsCat_AlKhaleej_BLOOMZ_ZeroShot.py rename to assets/ar/news_categorization/AlKhaleej_BLOOMZ_ZeroShot.py index 03516c58..4bf90ef5 100644 --- a/assets/benchmark_v1/news_categorization/NewsCat_AlKhaleej_BLOOMZ_ZeroShot.py +++ b/assets/ar/news_categorization/AlKhaleej_BLOOMZ_ZeroShot.py @@ -3,7 +3,7 @@ from llmebench.datasets import NewsCatAlArabiyaDataset from llmebench.models import BLOOMPetalModel -from llmebench.tasks import NewsCatAlArabiyaTask +from llmebench.tasks import NewsCategorizationTask random.seed(1333) @@ -12,7 +12,7 @@ def config(): return { "dataset": NewsCatAlArabiyaDataset, "dataset_args": {}, - "task": NewsCatAlArabiyaTask, + "task": NewsCategorizationTask, "task_args": {}, "model": BLOOMPetalModel, "model_args": { diff --git a/assets/benchmark_v1/news_categorization/NewsCat_AlKhaleej_ChatGPT_ZeroShot.py b/assets/ar/news_categorization/AlKhaleej_GPT35_ZeroShot.py similarity index 95% rename from assets/benchmark_v1/news_categorization/NewsCat_AlKhaleej_ChatGPT_ZeroShot.py rename to assets/ar/news_categorization/AlKhaleej_GPT35_ZeroShot.py index 63ae2363..b56c6526 100644 --- a/assets/benchmark_v1/news_categorization/NewsCat_AlKhaleej_ChatGPT_ZeroShot.py +++ b/assets/ar/news_categorization/AlKhaleej_GPT35_ZeroShot.py @@ -3,7 +3,7 @@ from llmebench.datasets import NewsCatAlKhaleejDataset from llmebench.models import GPTModel -from llmebench.tasks import NewsCatAlKhaleejTask +from llmebench.tasks import NewsCategorizationTask random.seed(1333) @@ -12,7 +12,7 @@ def config(): return { "dataset": NewsCatAlKhaleejDataset, "dataset_args": {}, - "task": NewsCatAlKhaleejTask, + "task": NewsCategorizationTask, "task_args": {}, "model": GPTModel, "model_args": { diff --git 
a/assets/benchmark_v1/news_categorization/NewsCat_AlKhaleej_GPTChatCompletion_FewShot.py b/assets/ar/news_categorization/AlKhaleej_GPT4_FewShot.py similarity index 97% rename from assets/benchmark_v1/news_categorization/NewsCat_AlKhaleej_GPTChatCompletion_FewShot.py rename to assets/ar/news_categorization/AlKhaleej_GPT4_FewShot.py index c95f9f36..207af065 100644 --- a/assets/benchmark_v1/news_categorization/NewsCat_AlKhaleej_GPTChatCompletion_FewShot.py +++ b/assets/ar/news_categorization/AlKhaleej_GPT4_FewShot.py @@ -3,7 +3,7 @@ from llmebench.datasets import NewsCatAlKhaleejDataset from llmebench.models import GPTChatCompletionModel -from llmebench.tasks import NewsCatAlKhaleejTask +from llmebench.tasks import NewsCategorizationTask random.seed(1333) @@ -13,7 +13,7 @@ def config(): return { "dataset": NewsCatAlKhaleejDataset, "dataset_args": {}, - "task": NewsCatAlKhaleejTask, + "task": NewsCategorizationTask, "task_args": {}, "model": GPTChatCompletionModel, "model_args": { diff --git a/assets/benchmark_v1/news_categorization/NewsCat_AlKhaleej_GPTChatCompletion_ZeroShot.py b/assets/ar/news_categorization/AlKhaleej_GPT4_ZeroShot.py similarity index 96% rename from assets/benchmark_v1/news_categorization/NewsCat_AlKhaleej_GPTChatCompletion_ZeroShot.py rename to assets/ar/news_categorization/AlKhaleej_GPT4_ZeroShot.py index b7f37f16..bcb9952b 100644 --- a/assets/benchmark_v1/news_categorization/NewsCat_AlKhaleej_GPTChatCompletion_ZeroShot.py +++ b/assets/ar/news_categorization/AlKhaleej_GPT4_ZeroShot.py @@ -3,7 +3,7 @@ from llmebench.datasets import NewsCatAlKhaleejDataset from llmebench.models import GPTChatCompletionModel -from llmebench.tasks import NewsCatAlKhaleejTask +from llmebench.tasks import NewsCategorizationTask random.seed(1333) @@ -13,7 +13,7 @@ def config(): return { "dataset": NewsCatAlKhaleejDataset, "dataset_args": {}, - "task": NewsCatAlKhaleejTask, + "task": NewsCategorizationTask, "task_args": {}, "model": GPTChatCompletionModel, "model_args": 
{ diff --git a/assets/benchmark_v1/semantics/XNLI_BLOOMZ_ZeroShot.py b/assets/ar/semantics/NLI/XNLI_BLOOMZ_ZeroShot.py similarity index 100% rename from assets/benchmark_v1/semantics/XNLI_BLOOMZ_ZeroShot.py rename to assets/ar/semantics/NLI/XNLI_BLOOMZ_ZeroShot.py diff --git a/assets/benchmark_v1/semantics/XNLI_ChatGPT_ZeroShot.py b/assets/ar/semantics/NLI/XNLI_GPT35_ZeroShot.py similarity index 100% rename from assets/benchmark_v1/semantics/XNLI_ChatGPT_ZeroShot.py rename to assets/ar/semantics/NLI/XNLI_GPT35_ZeroShot.py diff --git a/assets/benchmark_v1/semantics/XNLI_CGPT4_FewShot.py b/assets/ar/semantics/NLI/XNLI_GPT4_FewShot.py similarity index 100% rename from assets/benchmark_v1/semantics/XNLI_CGPT4_FewShot.py rename to assets/ar/semantics/NLI/XNLI_GPT4_FewShot.py diff --git a/assets/benchmark_v1/semantics/XNLI_CGPT4_ZeroShot.py b/assets/ar/semantics/NLI/XNLI_GPT4_ZeroShot.py similarity index 100% rename from assets/benchmark_v1/semantics/XNLI_CGPT4_ZeroShot.py rename to assets/ar/semantics/NLI/XNLI_GPT4_ZeroShot.py diff --git a/assets/benchmark_v1/STS/Q2QSim_BLOOMZ_ZeroShot.py b/assets/ar/semantics/STS/Q2QSim_BLOOMZ_ZeroShot.py similarity index 100% rename from assets/benchmark_v1/STS/Q2QSim_BLOOMZ_ZeroShot.py rename to assets/ar/semantics/STS/Q2QSim_BLOOMZ_ZeroShot.py diff --git a/assets/benchmark_v1/STS/Q2QSim_ChatGPT_ZeroShot.py b/assets/ar/semantics/STS/Q2QSim_GPT35_ZeroShot.py similarity index 100% rename from assets/benchmark_v1/STS/Q2QSim_ChatGPT_ZeroShot.py rename to assets/ar/semantics/STS/Q2QSim_GPT35_ZeroShot.py diff --git a/assets/benchmark_v1/STS/Q2QSim_GPT4_FewShot.py b/assets/ar/semantics/STS/Q2QSim_GPT4_FewShot.py similarity index 100% rename from assets/benchmark_v1/STS/Q2QSim_GPT4_FewShot.py rename to assets/ar/semantics/STS/Q2QSim_GPT4_FewShot.py diff --git a/assets/benchmark_v1/STS/Q2QSim_GPT4_ZeroShot.py b/assets/ar/semantics/STS/Q2QSim_GPT4_ZeroShot.py similarity index 100% rename from assets/benchmark_v1/STS/Q2QSim_GPT4_ZeroShot.py 
rename to assets/ar/semantics/STS/Q2QSim_GPT4_ZeroShot.py diff --git a/assets/benchmark_v1/STS/STSTrack1_BLOOMZ_ZeroShot.py b/assets/ar/semantics/STS/SemEval17T1STS_BLOOMZ_ZeroShot.py similarity index 88% rename from assets/benchmark_v1/STS/STSTrack1_BLOOMZ_ZeroShot.py rename to assets/ar/semantics/STS/SemEval17T1STS_BLOOMZ_ZeroShot.py index 9fd822f1..0c96b74d 100644 --- a/assets/benchmark_v1/STS/STSTrack1_BLOOMZ_ZeroShot.py +++ b/assets/ar/semantics/STS/SemEval17T1STS_BLOOMZ_ZeroShot.py @@ -1,16 +1,16 @@ import os import re -from llmebench.datasets import STSArSemEval17Track1Dataset +from llmebench.datasets import SemEval17T1STSDataset from llmebench.models import BLOOMPetalModel -from llmebench.tasks import STSTrack1Task +from llmebench.tasks import STSTask def config(): return { - "dataset": STSArSemEval17Track1Dataset, + "dataset": SemEval17T1STSDataset, "dataset_args": {}, - "task": STSTrack1Task, + "task": STSTask, "task_args": {}, "model": BLOOMPetalModel, "model_args": { diff --git a/assets/benchmark_v1/STS/STSTrack1_ChatGPT_ZeroShot.py b/assets/ar/semantics/STS/SemEval17T1STS_GPT35_ZeroShot.py similarity index 90% rename from assets/benchmark_v1/STS/STSTrack1_ChatGPT_ZeroShot.py rename to assets/ar/semantics/STS/SemEval17T1STS_GPT35_ZeroShot.py index 1b99e05a..aa52ec21 100644 --- a/assets/benchmark_v1/STS/STSTrack1_ChatGPT_ZeroShot.py +++ b/assets/ar/semantics/STS/SemEval17T1STS_GPT35_ZeroShot.py @@ -1,16 +1,16 @@ import os import re -from llmebench.datasets import STSArSemEval17Track1Dataset +from llmebench.datasets import SemEval17T1STSDataset from llmebench.models import GPTModel -from llmebench.tasks import STSTrack1Task +from llmebench.tasks import STSTask def config(): return { - "dataset": STSArSemEval17Track1Dataset, + "dataset": SemEval17T1STSDataset, "dataset_args": {}, - "task": STSTrack1Task, + "task": STSTask, "task_args": {}, "model": GPTModel, "model_args": { diff --git a/assets/benchmark_v1/STS/STSTrack1_GPT4_FewShot.py 
b/assets/ar/semantics/STS/SemEval17T1STS_GPT4_FewShot.py similarity index 94% rename from assets/benchmark_v1/STS/STSTrack1_GPT4_FewShot.py rename to assets/ar/semantics/STS/SemEval17T1STS_GPT4_FewShot.py index aff4ddad..7aa362d6 100644 --- a/assets/benchmark_v1/STS/STSTrack1_GPT4_FewShot.py +++ b/assets/ar/semantics/STS/SemEval17T1STS_GPT4_FewShot.py @@ -1,15 +1,15 @@ import os -from llmebench.datasets import STSArSemEval17Track1Dataset +from llmebench.datasets import SemEval17T1STSDataset from llmebench.models import GPTChatCompletionModel -from llmebench.tasks import STSTrack1Task +from llmebench.tasks import STSTask def config(): return { - "dataset": STSArSemEval17Track1Dataset, + "dataset": SemEval17T1STSDataset, "dataset_args": {}, - "task": STSTrack1Task, + "task": STSTask, "task_args": {}, "model": GPTChatCompletionModel, "model_args": { diff --git a/assets/benchmark_v1/STS/STSTrack1_GPT4_ZeroShot.py b/assets/ar/semantics/STS/SemEval17T1STS_GPT4_ZeroShot.py similarity index 90% rename from assets/benchmark_v1/STS/STSTrack1_GPT4_ZeroShot.py rename to assets/ar/semantics/STS/SemEval17T1STS_GPT4_ZeroShot.py index 3f94a169..183b45c1 100644 --- a/assets/benchmark_v1/STS/STSTrack1_GPT4_ZeroShot.py +++ b/assets/ar/semantics/STS/SemEval17T1STS_GPT4_ZeroShot.py @@ -1,16 +1,16 @@ import os import re -from llmebench.datasets import STSArSemEval17Track1Dataset +from llmebench.datasets import SemEval17T1STSDataset from llmebench.models import GPTChatCompletionModel -from llmebench.tasks import STSTrack1Task +from llmebench.tasks import STSTask def config(): return { - "dataset": STSArSemEval17Track1Dataset, + "dataset": SemEval17T1STSDataset, "dataset_args": {}, - "task": STSTrack1Task, + "task": STSTask, "task_args": {}, "model": GPTChatCompletionModel, "model_args": { diff --git a/assets/benchmark_v1/STS/STSTrack2_BLOOMZ_ZeroShot.py b/assets/ar/semantics/STS/SemEval17T2STS_BLOOMZ_ZeroShot.py similarity index 88% rename from 
assets/benchmark_v1/STS/STSTrack2_BLOOMZ_ZeroShot.py rename to assets/ar/semantics/STS/SemEval17T2STS_BLOOMZ_ZeroShot.py index 9b2e6efe..3f57b14a 100644 --- a/assets/benchmark_v1/STS/STSTrack2_BLOOMZ_ZeroShot.py +++ b/assets/ar/semantics/STS/SemEval17T2STS_BLOOMZ_ZeroShot.py @@ -1,16 +1,16 @@ import os import re -from llmebench.datasets import STSArSemEval17Track1Dataset +from llmebench.datasets import SemEval17T2STSDataset from llmebench.models import BLOOMPetalModel -from llmebench.tasks import STSTrack1Task +from llmebench.tasks import STSTask def config(): return { - "dataset": STSArSemEval17Track1Dataset, + "dataset": SemEval17T2STSDataset, "dataset_args": {}, - "task": STSTrack1Task, + "task": STSTask, "task_args": {}, "model": BLOOMPetalModel, "model_args": { diff --git a/assets/benchmark_v1/STS/STSTrack2_ChatGPT_ZeroShot.py b/assets/ar/semantics/STS/SemEval17T2STS_GPT35_ZeroShot.py similarity index 87% rename from assets/benchmark_v1/STS/STSTrack2_ChatGPT_ZeroShot.py rename to assets/ar/semantics/STS/SemEval17T2STS_GPT35_ZeroShot.py index da4dff44..b45cc30c 100644 --- a/assets/benchmark_v1/STS/STSTrack2_ChatGPT_ZeroShot.py +++ b/assets/ar/semantics/STS/SemEval17T2STS_GPT35_ZeroShot.py @@ -1,16 +1,15 @@ import os -import re -from llmebench.datasets import STSArSemEval17Track2Dataset -from llmebench.models import GPTModel, RandomGPTModel -from llmebench.tasks import STSTrack2Task +from llmebench.datasets import SemEval17T2STSDataset +from llmebench.models import GPTModel +from llmebench.tasks import STSTask def config(): return { - "dataset": STSArSemEval17Track2Dataset, + "dataset": SemEval17T2STSDataset, "dataset_args": {}, - "task": STSTrack2Task, + "task": STSTask, "task_args": {}, "model": GPTModel, "model_args": { diff --git a/assets/benchmark_v1/STS/STSTrack2_GPT4_FewShot.py b/assets/ar/semantics/STS/SemEval17T2STS_GPT4_FewShot.py similarity index 94% rename from assets/benchmark_v1/STS/STSTrack2_GPT4_FewShot.py rename to 
assets/ar/semantics/STS/SemEval17T2STS_GPT4_FewShot.py index a4e9e840..0573e60a 100644 --- a/assets/benchmark_v1/STS/STSTrack2_GPT4_FewShot.py +++ b/assets/ar/semantics/STS/SemEval17T2STS_GPT4_FewShot.py @@ -1,15 +1,15 @@ import os -from llmebench.datasets import STSArSemEval17Track2Dataset +from llmebench.datasets import SemEval17T2STSDataset from llmebench.models import GPTChatCompletionModel -from llmebench.tasks import STSTrack2Task +from llmebench.tasks import STSTask def config(): return { - "dataset": STSArSemEval17Track2Dataset, + "dataset": SemEval17T2STSDataset, "dataset_args": {}, - "task": STSTrack2Task, + "task": STSTask, "task_args": {}, "model": GPTChatCompletionModel, "model_args": { diff --git a/assets/benchmark_v1/STS/STSTrack2_GPT4_ZeroShot.py b/assets/ar/semantics/STS/SemEval17T2STS_GPT4_ZeroShot.py similarity index 90% rename from assets/benchmark_v1/STS/STSTrack2_GPT4_ZeroShot.py rename to assets/ar/semantics/STS/SemEval17T2STS_GPT4_ZeroShot.py index 4be3334c..ae4c8f29 100644 --- a/assets/benchmark_v1/STS/STSTrack2_GPT4_ZeroShot.py +++ b/assets/ar/semantics/STS/SemEval17T2STS_GPT4_ZeroShot.py @@ -1,16 +1,16 @@ import os import re -from llmebench.datasets import STSArSemEval17Track2Dataset +from llmebench.datasets import SemEval17T2STSDataset from llmebench.models import GPTChatCompletionModel -from llmebench.tasks import STSTrack2Task +from llmebench.tasks import STSTask def config(): return { - "dataset": STSArSemEval17Track2Dataset, + "dataset": SemEval17T2STSDataset, "dataset_args": {}, - "task": STSTrack2Task, + "task": STSTask, "task_args": {}, "model": GPTChatCompletionModel, "model_args": { diff --git a/assets/benchmark_v1/sentiment/emotion/Emotion_BLOOMZ_ZeroShot.py b/assets/ar/sentiment_emotion_others/emotion/Emotion_BLOOMZ_ZeroShot.py similarity index 100% rename from assets/benchmark_v1/sentiment/emotion/Emotion_BLOOMZ_ZeroShot.py rename to assets/ar/sentiment_emotion_others/emotion/Emotion_BLOOMZ_ZeroShot.py diff --git 
a/assets/benchmark_v1/sentiment/emotion/Emotion_ChatGPT_ZeroShot.py b/assets/ar/sentiment_emotion_others/emotion/Emotion_GPT35_ZeroShot.py similarity index 100% rename from assets/benchmark_v1/sentiment/emotion/Emotion_ChatGPT_ZeroShot.py rename to assets/ar/sentiment_emotion_others/emotion/Emotion_GPT35_ZeroShot.py diff --git a/assets/benchmark_v1/sentiment/emotion/Emotion_GPTChatCompletion_FewShot.py b/assets/ar/sentiment_emotion_others/emotion/Emotion_GPT4_FewShot.py similarity index 100% rename from assets/benchmark_v1/sentiment/emotion/Emotion_GPTChatCompletion_FewShot.py rename to assets/ar/sentiment_emotion_others/emotion/Emotion_GPT4_FewShot.py diff --git a/assets/benchmark_v1/sentiment/emotion/Emotion_GPTChatCompletion_ZeroShot.py b/assets/ar/sentiment_emotion_others/emotion/Emotion_GPT4_ZeroShot.py similarity index 100% rename from assets/benchmark_v1/sentiment/emotion/Emotion_GPTChatCompletion_ZeroShot.py rename to assets/ar/sentiment_emotion_others/emotion/Emotion_GPT4_ZeroShot.py diff --git a/assets/benchmark_v1/sarcasm/ArSarcasm2_GPT3_Zeroshot.py b/assets/ar/sentiment_emotion_others/sarcasm/ArSarcasm2_GPT3_ZeroShot.py similarity index 100% rename from assets/benchmark_v1/sarcasm/ArSarcasm2_GPT3_Zeroshot.py rename to assets/ar/sentiment_emotion_others/sarcasm/ArSarcasm2_GPT3_ZeroShot.py diff --git a/assets/benchmark_v1/sarcasm/ArSarcasm2_GPT4_FewShot.py b/assets/ar/sentiment_emotion_others/sarcasm/ArSarcasm2_GPT4_FewShot.py similarity index 100% rename from assets/benchmark_v1/sarcasm/ArSarcasm2_GPT4_FewShot.py rename to assets/ar/sentiment_emotion_others/sarcasm/ArSarcasm2_GPT4_FewShot.py diff --git a/assets/benchmark_v1/sarcasm/ArSarcasm2_GPT4_Zeroshot.py b/assets/ar/sentiment_emotion_others/sarcasm/ArSarcasm2_GPT4_ZeroShot.py similarity index 100% rename from assets/benchmark_v1/sarcasm/ArSarcasm2_GPT4_Zeroshot.py rename to assets/ar/sentiment_emotion_others/sarcasm/ArSarcasm2_GPT4_ZeroShot.py diff --git 
a/assets/benchmark_v1/sarcasm/ArSarcasm_BLOOMZ_Zeroshot.py b/assets/ar/sentiment_emotion_others/sarcasm/ArSarcasm_BLOOMZ_ZeroShot.py similarity index 100% rename from assets/benchmark_v1/sarcasm/ArSarcasm_BLOOMZ_Zeroshot.py rename to assets/ar/sentiment_emotion_others/sarcasm/ArSarcasm_BLOOMZ_ZeroShot.py diff --git a/assets/benchmark_v1/sarcasm/ArSarcasm_GPT3_Zeroshot.py b/assets/ar/sentiment_emotion_others/sarcasm/ArSarcasm_GPT3_ZeroShot.py similarity index 100% rename from assets/benchmark_v1/sarcasm/ArSarcasm_GPT3_Zeroshot.py rename to assets/ar/sentiment_emotion_others/sarcasm/ArSarcasm_GPT3_ZeroShot.py diff --git a/assets/benchmark_v1/sarcasm/ArSarcasm_GPT4_Fewshot.py b/assets/ar/sentiment_emotion_others/sarcasm/ArSarcasm_GPT4_FewShot.py similarity index 100% rename from assets/benchmark_v1/sarcasm/ArSarcasm_GPT4_Fewshot.py rename to assets/ar/sentiment_emotion_others/sarcasm/ArSarcasm_GPT4_FewShot.py diff --git a/assets/benchmark_v1/sarcasm/ArSarcasm_GPT4_Zeroshot.py b/assets/ar/sentiment_emotion_others/sarcasm/ArSarcasm_GPT4_ZeroShot.py similarity index 100% rename from assets/benchmark_v1/sarcasm/ArSarcasm_GPT4_Zeroshot.py rename to assets/ar/sentiment_emotion_others/sarcasm/ArSarcasm_GPT4_ZeroShot.py diff --git a/assets/benchmark_v1/sentiment/sentiment/ArSASSentiment_BLOOMZ_ZeroShot.py b/assets/ar/sentiment_emotion_others/sentiment/ArSAS_BLOOMZ_ZeroShot.py similarity index 91% rename from assets/benchmark_v1/sentiment/sentiment/ArSASSentiment_BLOOMZ_ZeroShot.py rename to assets/ar/sentiment_emotion_others/sentiment/ArSAS_BLOOMZ_ZeroShot.py index 57dd78e2..f3269a7c 100644 --- a/assets/benchmark_v1/sentiment/sentiment/ArSASSentiment_BLOOMZ_ZeroShot.py +++ b/assets/ar/sentiment_emotion_others/sentiment/ArSAS_BLOOMZ_ZeroShot.py @@ -1,13 +1,13 @@ import os -from llmebench.datasets import ArSASSentimentDataset +from llmebench.datasets import ArSASDataset from llmebench.models import BLOOMPetalModel from llmebench.tasks import SentimentTask def config(): return { 
- "dataset": ArSASSentimentDataset, + "dataset": ArSASDataset, "dataset_args": {}, "task": SentimentTask, "task_args": {}, diff --git a/assets/benchmark_v1/sentiment/sentiment/ArSASSentiment_ChatGPT_ZeroShot.py b/assets/ar/sentiment_emotion_others/sentiment/ArSAS_GPT35_ZeroShot.py similarity index 92% rename from assets/benchmark_v1/sentiment/sentiment/ArSASSentiment_ChatGPT_ZeroShot.py rename to assets/ar/sentiment_emotion_others/sentiment/ArSAS_GPT35_ZeroShot.py index 7fac34f6..a8d7798c 100644 --- a/assets/benchmark_v1/sentiment/sentiment/ArSASSentiment_ChatGPT_ZeroShot.py +++ b/assets/ar/sentiment_emotion_others/sentiment/ArSAS_GPT35_ZeroShot.py @@ -1,13 +1,13 @@ import os -from llmebench.datasets import ArSASSentimentDataset +from llmebench.datasets import ArSASDataset from llmebench.models import GPTModel from llmebench.tasks import SentimentTask def config(): return { - "dataset": ArSASSentimentDataset, + "dataset": ArSASDataset, "dataset_args": {}, "task": SentimentTask, "task_args": {}, diff --git a/assets/benchmark_v1/sentiment/sentiment/ArSASSentiment_GPTChatCompletion_ZeroShot.py b/assets/ar/sentiment_emotion_others/sentiment/ArSAS_GPT4_ZeroShot.py similarity index 93% rename from assets/benchmark_v1/sentiment/sentiment/ArSASSentiment_GPTChatCompletion_ZeroShot.py rename to assets/ar/sentiment_emotion_others/sentiment/ArSAS_GPT4_ZeroShot.py index b9220fb9..88e09958 100644 --- a/assets/benchmark_v1/sentiment/sentiment/ArSASSentiment_GPTChatCompletion_ZeroShot.py +++ b/assets/ar/sentiment_emotion_others/sentiment/ArSAS_GPT4_ZeroShot.py @@ -1,13 +1,13 @@ import os -from llmebench.datasets import ArSASSentimentDataset +from llmebench.datasets import ArSASDataset from llmebench.models import GPTChatCompletionModel from llmebench.tasks import SentimentTask def config(): return { - "dataset": ArSASSentimentDataset, + "dataset": ArSASDataset, "dataset_args": {}, "task": SentimentTask, "task_args": {}, diff --git 
a/assets/benchmark_v1/sentiment_emotion_others/StanceKhouja20_BLOOMZ_ZeroShot.py b/assets/ar/sentiment_emotion_others/stance_detection/Khouja20Stance_BLOOMZ_ZeroShot.py similarity index 84% rename from assets/benchmark_v1/sentiment_emotion_others/StanceKhouja20_BLOOMZ_ZeroShot.py rename to assets/ar/sentiment_emotion_others/stance_detection/Khouja20Stance_BLOOMZ_ZeroShot.py index 76f29a4d..54499a07 100644 --- a/assets/benchmark_v1/sentiment_emotion_others/StanceKhouja20_BLOOMZ_ZeroShot.py +++ b/assets/ar/sentiment_emotion_others/stance_detection/Khouja20Stance_BLOOMZ_ZeroShot.py @@ -1,15 +1,15 @@ import os -from llmebench.datasets import StanceKhouja20Dataset +from llmebench.datasets import Khouja20StanceDataset from llmebench.models import BLOOMPetalModel -from llmebench.tasks import StanceKhouja20Task +from llmebench.tasks import StanceTask def config(): return { - "dataset": StanceKhouja20Dataset, + "dataset": Khouja20StanceDataset, "dataset_args": {}, - "task": StanceKhouja20Task, + "task": StanceTask, "task_args": {}, "model": BLOOMPetalModel, "model_args": { diff --git a/assets/benchmark_v1/sentiment_emotion_others/StanceKhouja20_ChatGPT_ZeroShot.py b/assets/ar/sentiment_emotion_others/stance_detection/Khouja20Stance_GPT35_ZeroShot.py similarity index 83% rename from assets/benchmark_v1/sentiment_emotion_others/StanceKhouja20_ChatGPT_ZeroShot.py rename to assets/ar/sentiment_emotion_others/stance_detection/Khouja20Stance_GPT35_ZeroShot.py index d4fbdb93..4981ff29 100644 --- a/assets/benchmark_v1/sentiment_emotion_others/StanceKhouja20_ChatGPT_ZeroShot.py +++ b/assets/ar/sentiment_emotion_others/stance_detection/Khouja20Stance_GPT35_ZeroShot.py @@ -1,15 +1,15 @@ import os -from llmebench.datasets import StanceKhouja20Dataset -from llmebench.models import GPTModel, RandomGPTModel -from llmebench.tasks import StanceKhouja20Task +from llmebench.datasets import Khouja20StanceDataset +from llmebench.models import GPTModel +from llmebench.tasks import StanceTask def 
config(): return { - "dataset": StanceKhouja20Dataset, + "dataset": Khouja20StanceDataset, "dataset_args": {}, - "task": StanceKhouja20Task, + "task": StanceTask, "task_args": {}, "model": GPTModel, "model_args": { diff --git a/assets/benchmark_v1/sentiment_emotion_others/StanceKhouja20_GPTChatCompletion_FewShot.py b/assets/ar/sentiment_emotion_others/stance_detection/Khouja20Stance_GPT4_FewShot.py similarity index 93% rename from assets/benchmark_v1/sentiment_emotion_others/StanceKhouja20_GPTChatCompletion_FewShot.py rename to assets/ar/sentiment_emotion_others/stance_detection/Khouja20Stance_GPT4_FewShot.py index 1caa14ef..84d164ae 100644 --- a/assets/benchmark_v1/sentiment_emotion_others/StanceKhouja20_GPTChatCompletion_FewShot.py +++ b/assets/ar/sentiment_emotion_others/stance_detection/Khouja20Stance_GPT4_FewShot.py @@ -1,16 +1,15 @@ import os -import re -from llmebench.datasets import StanceKhouja20Dataset +from llmebench.datasets import Khouja20StanceDataset from llmebench.models import GPTChatCompletionModel -from llmebench.tasks import StanceKhouja20Task +from llmebench.tasks import StanceTask def config(): return { - "dataset": StanceKhouja20Dataset, + "dataset": Khouja20StanceDataset, "dataset_args": {}, - "task": StanceKhouja20Task, + "task": StanceTask, "task_args": {}, "model": GPTChatCompletionModel, "model_args": { diff --git a/assets/benchmark_v1/sentiment_emotion_others/StanceKhouja20_GPTChatCompletion_ZeroShot.py b/assets/ar/sentiment_emotion_others/stance_detection/Khouja20Stance_GPT4_ZeroShot.py similarity index 90% rename from assets/benchmark_v1/sentiment_emotion_others/StanceKhouja20_GPTChatCompletion_ZeroShot.py rename to assets/ar/sentiment_emotion_others/stance_detection/Khouja20Stance_GPT4_ZeroShot.py index 3958ccb6..23d8ea90 100644 --- a/assets/benchmark_v1/sentiment_emotion_others/StanceKhouja20_GPTChatCompletion_ZeroShot.py +++ b/assets/ar/sentiment_emotion_others/stance_detection/Khouja20Stance_GPT4_ZeroShot.py @@ -1,16 +1,15 @@ 
import os -import re -from llmebench.datasets import StanceKhouja20Dataset +from llmebench.datasets import Khouja20StanceDataset from llmebench.models import GPTChatCompletionModel -from llmebench.tasks import StanceKhouja20Task +from llmebench.tasks import StanceTask def config(): return { - "dataset": StanceKhouja20Dataset, + "dataset": Khouja20StanceDataset, "dataset_args": {}, - "task": StanceKhouja20Task, + "task": StanceTask, "task_args": {}, "model": GPTChatCompletionModel, "model_args": { diff --git a/assets/benchmark_v1/sentiment_emotion_others/StanceUnifiedFC_BLOOMZ_ZeroShot.py b/assets/ar/sentiment_emotion_others/stance_detection/UnifiedFCStance_BLOOMZ_ZeroShot.py similarity index 87% rename from assets/benchmark_v1/sentiment_emotion_others/StanceUnifiedFC_BLOOMZ_ZeroShot.py rename to assets/ar/sentiment_emotion_others/stance_detection/UnifiedFCStance_BLOOMZ_ZeroShot.py index 9ac0bb5a..0c32b52d 100644 --- a/assets/benchmark_v1/sentiment_emotion_others/StanceUnifiedFC_BLOOMZ_ZeroShot.py +++ b/assets/ar/sentiment_emotion_others/stance_detection/UnifiedFCStance_BLOOMZ_ZeroShot.py @@ -1,15 +1,15 @@ import os -from llmebench.datasets import StanceUnifiedFCDataset +from llmebench.datasets import UnifiedFCStanceDataset from llmebench.models import BLOOMPetalModel -from llmebench.tasks import StanceUnifiedFCTask +from llmebench.tasks import StanceTask def config(): return { - "dataset": StanceUnifiedFCDataset, + "dataset": UnifiedFCStanceDataset, "dataset_args": {}, - "task": StanceUnifiedFCTask, + "task": StanceTask, "task_args": {}, "model": BLOOMPetalModel, "model_args": { diff --git a/assets/benchmark_v1/sentiment_emotion_others/StanceUnifiedFC_ChatGPT_ZeroShot.py b/assets/ar/sentiment_emotion_others/stance_detection/UnifiedFCStance_GPT35_ZeroShot.py similarity index 87% rename from assets/benchmark_v1/sentiment_emotion_others/StanceUnifiedFC_ChatGPT_ZeroShot.py rename to assets/ar/sentiment_emotion_others/stance_detection/UnifiedFCStance_GPT35_ZeroShot.py 
index 8c0f0568..f3bdd2a6 100644 --- a/assets/benchmark_v1/sentiment_emotion_others/StanceUnifiedFC_ChatGPT_ZeroShot.py +++ b/assets/ar/sentiment_emotion_others/stance_detection/UnifiedFCStance_GPT35_ZeroShot.py @@ -1,15 +1,15 @@ import os -from llmebench.datasets import StanceUnifiedFCDataset -from llmebench.models import GPTModel, RandomGPTModel -from llmebench.tasks import StanceUnifiedFCTask +from llmebench.datasets import UnifiedFCStanceDataset +from llmebench.models import GPTModel +from llmebench.tasks import StanceTask def config(): return { - "dataset": StanceUnifiedFCDataset, + "dataset": UnifiedFCStanceDataset, "dataset_args": {}, - "task": StanceUnifiedFCTask, + "task": StanceTask, "task_args": {}, "model": GPTModel, "model_args": { diff --git a/assets/benchmark_v1/sentiment_emotion_others/StanceUnifiedFC_GPTChatCompletion_FewShot.py b/assets/ar/sentiment_emotion_others/stance_detection/UnifiedFCStance_GPT4_FewShot.py similarity index 93% rename from assets/benchmark_v1/sentiment_emotion_others/StanceUnifiedFC_GPTChatCompletion_FewShot.py rename to assets/ar/sentiment_emotion_others/stance_detection/UnifiedFCStance_GPT4_FewShot.py index 3d27a3d5..766a96f2 100644 --- a/assets/benchmark_v1/sentiment_emotion_others/StanceUnifiedFC_GPTChatCompletion_FewShot.py +++ b/assets/ar/sentiment_emotion_others/stance_detection/UnifiedFCStance_GPT4_FewShot.py @@ -1,16 +1,16 @@ import os import re -from llmebench.datasets import StanceUnifiedFCDataset +from llmebench.datasets import UnifiedFCStanceDataset from llmebench.models import GPTChatCompletionModel -from llmebench.tasks import StanceUnifiedFCTask +from llmebench.tasks import StanceTask def config(): return { - "dataset": StanceUnifiedFCDataset, + "dataset": UnifiedFCStanceDataset, "dataset_args": {}, - "task": StanceUnifiedFCTask, + "task": StanceTask, "task_args": {}, "model": GPTChatCompletionModel, "model_args": { diff --git 
a/assets/benchmark_v1/sentiment_emotion_others/StanceUnifiedFC_GPTChatCompletion_ZeroShot.py b/assets/ar/sentiment_emotion_others/stance_detection/UnifiedFCStance_GPT4_ZeroShot.py similarity index 91% rename from assets/benchmark_v1/sentiment_emotion_others/StanceUnifiedFC_GPTChatCompletion_ZeroShot.py rename to assets/ar/sentiment_emotion_others/stance_detection/UnifiedFCStance_GPT4_ZeroShot.py index a435a768..c2e0f163 100644 --- a/assets/benchmark_v1/sentiment_emotion_others/StanceUnifiedFC_GPTChatCompletion_ZeroShot.py +++ b/assets/ar/sentiment_emotion_others/stance_detection/UnifiedFCStance_GPT4_ZeroShot.py @@ -1,16 +1,16 @@ import os import re -from llmebench.datasets import StanceUnifiedFCDataset +from llmebench.datasets import UnifiedFCStanceDataset from llmebench.models import GPTChatCompletionModel -from llmebench.tasks import StanceUnifiedFCTask +from llmebench.tasks import StanceTask def config(): return { - "dataset": StanceUnifiedFCDataset, + "dataset": UnifiedFCStanceDataset, "dataset_args": {}, - "task": StanceUnifiedFCTask, + "task": StanceTask, "task_args": {}, "model": GPTChatCompletionModel, "model_args": { diff --git a/assets/benchmark_v1/NER/NERANERcorp_ChatGPT_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/NER/ANERcorp_GPT35_ZeroShot.py similarity index 100% rename from assets/benchmark_v1/NER/NERANERcorp_ChatGPT_ZeroShot.py rename to assets/ar/sequence_tagging_and_information_extraction/NER/ANERcorp_GPT35_ZeroShot.py diff --git a/assets/benchmark_v1/NER/NERANERcorp_GPTChatCompletion_FewShot.py b/assets/ar/sequence_tagging_and_information_extraction/NER/ANERcorp_GPT4_FewShot.py similarity index 100% rename from assets/benchmark_v1/NER/NERANERcorp_GPTChatCompletion_FewShot.py rename to assets/ar/sequence_tagging_and_information_extraction/NER/ANERcorp_GPT4_FewShot.py diff --git a/assets/benchmark_v1/NER/NERANERcorp_GPTChatCompletion_ZeroShot.py 
b/assets/ar/sequence_tagging_and_information_extraction/NER/ANERcorp_GPT4_ZeroShot.py similarity index 100% rename from assets/benchmark_v1/NER/NERANERcorp_GPTChatCompletion_ZeroShot.py rename to assets/ar/sequence_tagging_and_information_extraction/NER/ANERcorp_GPT4_ZeroShot.py diff --git a/assets/benchmark_v1/NER/NERAqmar_ChatGPT_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/NER/Aqmar_GPT35_ZeroShot.py similarity index 100% rename from assets/benchmark_v1/NER/NERAqmar_ChatGPT_ZeroShot.py rename to assets/ar/sequence_tagging_and_information_extraction/NER/Aqmar_GPT35_ZeroShot.py diff --git a/assets/benchmark_v1/NER/NERAqmar_GPTChatCompletion_FewShot.py b/assets/ar/sequence_tagging_and_information_extraction/NER/Aqmar_GPT4_FewShot.py similarity index 100% rename from assets/benchmark_v1/NER/NERAqmar_GPTChatCompletion_FewShot.py rename to assets/ar/sequence_tagging_and_information_extraction/NER/Aqmar_GPT4_FewShot.py diff --git a/assets/benchmark_v1/NER/NERAqmar_GPTChatCompletion_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/NER/Aqmar_GPT4_ZeroShot.py similarity index 100% rename from assets/benchmark_v1/NER/NERAqmar_GPTChatCompletion_ZeroShot.py rename to assets/ar/sequence_tagging_and_information_extraction/NER/Aqmar_GPT4_ZeroShot.py diff --git a/assets/benchmark_v1/NER/MGBWords_ChatGPT_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/NER/MGBWords_GPT35_ZeroShot.py similarity index 100% rename from assets/benchmark_v1/NER/MGBWords_ChatGPT_ZeroShot.py rename to assets/ar/sequence_tagging_and_information_extraction/NER/MGBWords_GPT35_ZeroShot.py diff --git a/assets/benchmark_v1/NER/MGBWords_GPTChatCompletion_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/NER/MGBWords_GPT4_ZeroShot.py similarity index 100% rename from assets/benchmark_v1/NER/MGBWords_GPTChatCompletion_ZeroShot.py rename to assets/ar/sequence_tagging_and_information_extraction/NER/MGBWords_GPT4_ZeroShot.py diff --git 
a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/POS_GPT4_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/POS/QCRIDialectalArabicPOS_GPT4_ZeroShot.py similarity index 94% rename from assets/benchmark_v1/sequence_tagging_ner_pos_etc/POS_GPT4_ZeroShot.py rename to assets/ar/sequence_tagging_and_information_extraction/POS/QCRIDialectalArabicPOS_GPT4_ZeroShot.py index 18db4d89..102afdb9 100644 --- a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/POS_GPT4_ZeroShot.py +++ b/assets/ar/sequence_tagging_and_information_extraction/POS/QCRIDialectalArabicPOS_GPT4_ZeroShot.py @@ -1,7 +1,7 @@ import os import re -from llmebench.datasets import ArabicPOSDataset +from llmebench.datasets import QCRIDialectalArabicPOSDataset from llmebench.models import GPTChatCompletionModel from llmebench.tasks import ArabicPOSTask @@ -100,8 +100,6 @@ def config(): ("glf", "glf.pos/glf.data_5.test.src-trg.sent"), ("mgr", "mgr.pos/mgr.data_5.test.src-trg.sent"), ("lev", "lev.pos/lev.data_5.test.src-trg.sent"), - ("msa", "WikiNewsTruth.txt"), - ("XGLUE", "XGLUE/ar.test.src-tgt.txt"), ] configs = [] for name, testset in sets: @@ -109,7 +107,7 @@ def config(): { "name": name, "config": { - "dataset": ArabicPOSDataset, + "dataset": QCRIDialectalArabicPOSDataset, "dataset_args": {}, "task": ArabicPOSTask, "task_args": {}, @@ -123,7 +121,7 @@ def config(): "max_tries": 3, }, "general_args": { - "data_path": "data/sequence_tagging_ner_pos_etc/POS/" + testset + "data_path": f"data/sequence_tagging_ner_pos_etc/POS/{testset}" }, }, } diff --git a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/POS_ChatGPT_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/POS/QCRIDialectalArabic_GPT35_ZeroShot.py similarity index 94% rename from assets/benchmark_v1/sequence_tagging_ner_pos_etc/POS_ChatGPT_ZeroShot.py rename to assets/ar/sequence_tagging_and_information_extraction/POS/QCRIDialectalArabic_GPT35_ZeroShot.py index 69f4fe5c..ada3e5d3 100644 --- 
a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/POS_ChatGPT_ZeroShot.py +++ b/assets/ar/sequence_tagging_and_information_extraction/POS/QCRIDialectalArabic_GPT35_ZeroShot.py @@ -1,7 +1,7 @@ import os import re -from llmebench.datasets import ArabicPOSDataset +from llmebench.datasets import QCRIDialectalArabicPOSDataset from llmebench.models import GPTModel, RandomGPTModel from llmebench.tasks import ArabicPOSTask @@ -100,8 +100,6 @@ def config(): ("glf", "glf.pos/glf.data_5.test.src-trg.sent"), ("mgr", "mgr.pos/mgr.data_5.test.src-trg.sent"), ("lev", "lev.pos/lev.data_5.test.src-trg.sent"), - ("msa", "WikiNewsTruth.txt"), - ("XGLUE", "XGLUE/ar.test.src-tgt.txt"), ] configs = [] for name, testset in sets: @@ -109,7 +107,7 @@ def config(): { "name": name, "config": { - "dataset": ArabicPOSDataset, + "dataset": QCRIDialectalArabicPOSDataset, "dataset_args": {}, "task": ArabicPOSTask, "task_args": {}, @@ -123,7 +121,7 @@ def config(): "max_tries": 3, }, "general_args": { - "data_path": "data/sequence_tagging_ner_pos_etc/POS/" + testset + "data_path": f"data/sequence_tagging_ner_pos_etc/POS/{testset}" }, }, } diff --git a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/POS_GPTChatCompletion_FewShot.py b/assets/ar/sequence_tagging_and_information_extraction/POS/QCRIDialectalArabic_GPT4_FewShot.py similarity index 91% rename from assets/benchmark_v1/sequence_tagging_ner_pos_etc/POS_GPTChatCompletion_FewShot.py rename to assets/ar/sequence_tagging_and_information_extraction/POS/QCRIDialectalArabic_GPT4_FewShot.py index 70cf5815..a10d80bc 100644 --- a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/POS_GPTChatCompletion_FewShot.py +++ b/assets/ar/sequence_tagging_and_information_extraction/POS/QCRIDialectalArabic_GPT4_FewShot.py @@ -1,7 +1,7 @@ import os import re -from llmebench.datasets import ArabicPOSDataset +from llmebench.datasets import QCRIDialectalArabicPOSDataset from llmebench.models import GPTChatCompletionModel from llmebench.tasks import ArabicPOSTask @@ 
-116,8 +116,6 @@ def config(): "lev.pos/lev.data_5.test.src-trg.sent", "lev.pos/lev.data_5.dev.src-trg.sent", ), - ("msa", "WikiNewsTruth.txt.POS.tab", "WikiNewsTruthDev.txt"), - ("XGLUE", "XGLUE/ar.test.src-trg.txt", "XGLUE/ar.dev.src-trg.txt"), ] configs = [] for name, testset, devset in sets: @@ -125,7 +123,7 @@ def config(): { "name": name, "config": { - "dataset": ArabicPOSDataset, + "dataset": QCRIDialectalArabicPOSDataset, "dataset_args": {}, "task": ArabicPOSTask, "task_args": {}, @@ -136,14 +134,12 @@ def config(): "api_base": os.environ["AZURE_API_URL"], "api_key": os.environ["AZURE_API_KEY"], "engine_name": os.environ["ENGINE_NAME"], - # "class_labels": ["m", "f"], "max_tries": 30, }, "general_args": { - "data_path": "data/sequence_tagging_ner_pos_etc/POS/" + testset, + "data_path": f"data/sequence_tagging_ner_pos_etc/POS/{testset}", "fewshot": { - "train_data_path": "data/sequence_tagging_ner_pos_etc/POS/" - + devset + "train_data_path": f"data/sequence_tagging_ner_pos_etc/POS/{devset}" }, }, }, diff --git a/assets/ar/sequence_tagging_and_information_extraction/POS/WikiNews_GPT35_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/POS/WikiNews_GPT35_ZeroShot.py new file mode 100644 index 00000000..2e5517a5 --- /dev/null +++ b/assets/ar/sequence_tagging_and_information_extraction/POS/WikiNews_GPT35_ZeroShot.py @@ -0,0 +1,155 @@ +import os +import re + +from llmebench.datasets import WikiNewsPOSDataset +from llmebench.models import GPTModel, RandomGPTModel +from llmebench.tasks import ArabicPOSTask + +mapTags = { + "UNK": "UNK", + "EOS": "EOS", + ".": "PUNC", + "NNP": "NOUN", + "JJR": "ADJ", + "CD": "NOUN", + "ADJ": "ADJ", + "adjective": "ADJ", + "JJ": "ADJ", + "MD": "ADJ", + "ADJF": "ADJ", + "ADV": "ADV", + "ADP": "ADV", + "adverb": "ADV", + "conjunction": "CONJ", + "CONJ": "CONJ", + "SCONJ": "CONJ", + "CCONJ": "CONJ", + "CC": "CONJ", + "DT": "DET", + "DET": "DET", + "hashtag": "HASH", + "NOUN": "NOUN", + "noun": "NOUN", + "N": "NOUN", + 
"NN": "NOUN", + "INTJ": "NOUN", + "PROPN": "NOUN", + "NEG": "PART", + "PART": "PART", + "NEG_PART": "PART", + "IN": "PART", + "preposition": "PREP", + "P": "PREP", + "PREP": "PREP", + "PRP": "PREP", + "PRON": "PRON", + "pronoun": "PRON", + "REL": "PRON", + "DEM": "PRON", + "PUNC": "PUNC", + "punctuation": "PUNC", + "PUNCT": "PUNC", + "SYM": "PUNC", + "verb": "V", + "VERB": "V", + "V": "V", + "VB": "V", + "RB": "ADV", + "VBG": "V", + "VBZ": "V", + "PRO": "PRON", + "conj": "CONJ", + "punct": "PUNC", + "neg": "PART", + "pron": "PRON", + "prep": "PREP", + "COMP": "ADJ", + "interjection": "PART", + "number": "NOUN", + "MOD": "PART", + "NUM": "NOUN", + "determiner": "DET", + "negation": "PART", + "url": "URL", + "demonstrative": "PRON", + "particle": "PART", + "HASHTAG": "HASH", + "NPROP": "NOUN", + "EMOJI": "EMOJI", + ",": "PUNC", + "RELPRO": "PRON", + "X": "NOUN", + "MENTION": "MENTION", + "اسم": "NOUN", + "اسم علم": "NOUN", + "حرف جر": "PREP", + "حرف شرطي": "PART", + "حرف عطف": "CONJ", + "حرف نداء": "PART", + "حرف نفي": "PART", + "عدد": "NOUN", + "فاصلة": "ADJ", + "فعل": "V", + "": "", +} + + +def config(): + return { + "dataset": WikiNewsPOSDataset, + "dataset_args": {}, + "task": ArabicPOSTask, + "task_args": {}, + "model": GPTModel, + "model_args": { + "api_type": "azure", + "api_version": "2023-03-15-preview", + "api_base": os.environ["AZURE_API_URL"], + "api_key": os.environ["AZURE_API_KEY"], + "engine_name": os.environ["ENGINE_NAME"], + "max_tries": 3, + }, + "general_args": { + "data_path": "data/sequence_tagging_ner_pos_etc/POS/WikiNewsTruth.txt.POS.tab" + }, + } + + +def prompt(input_sample): + return { + "system_message": "You are an AI assistant that helps people find information.", + "messages": [ + { + "sender": "user", + "text": f'Please provide the POS tags for each word in the input sentence. The input will be a list of words in the sentence. 
\ + The output format should be a list of tuples, where each tuple consists of a word from the input text and its \ + corresponding POS tag label from the tag label set: \ + ["ABBREV", "ADJ", "ADV", "CASE", "CONJ", "DET", "EMOT", "FOREIGN", "FUT_PART", "HASH", "MENTION", "NEG_PART", "NOUN", \ + "NSUFF", "NUM", "PART", "PREP", "PROG_PART", "PRON", "PUNC", "URL", "V"].\ + Note: Your response should include only a list of tuples, in the order that the words appear in the input sentence, \ + with each tuple containing the corresponding POS tag label for a word. Input:+: {input_sample}', + } + ], + } + + +def post_process(response): + text = response["choices"][0]["text"] + + if "Sorry, I cannot" in text or "Unfortunately" in text: + return None + + text = re.sub(r"Here's the segmented sentence in a JSON format:", "", text) + + pattern = r"\([\"\']([^\"\']+)[\'\"], [\"\']([^\"\']+)[\'\"]\)" + matches = re.finditer(pattern, text) + results = [] + + for m in matches: + tag = m.group(2) + ntag = [] + for t in tag.split("+"): + ntag.append(mapTags[t] if t in mapTags else t) + results.append("+".join(ntag)) + + return " ".join(results) diff --git a/assets/ar/sequence_tagging_and_information_extraction/POS/WikiNews_GPT4_FewShot.py b/assets/ar/sequence_tagging_and_information_extraction/POS/WikiNews_GPT4_FewShot.py new file mode 100644 index 00000000..21e8b2b8 --- /dev/null +++ b/assets/ar/sequence_tagging_and_information_extraction/POS/WikiNews_GPT4_FewShot.py @@ -0,0 +1,176 @@ +import os +import re + +from llmebench.datasets import WikiNewsPOSDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import ArabicPOSTask + +mapTags = { + "UNK": "UNK", + "EOS": "EOS", + ".": "PUNC", + "NNP": "NOUN", + "JJR": "ADJ", + "CD": "NOUN", + "ADJ": "ADJ", + "adjective": "ADJ", + "JJ": "ADJ", + "MD": "ADJ", + "ADJF": "ADJ", + "ADV": "ADV", + "ADP": "ADV", + "adverb": "ADV", + "conjunction": "CONJ", + "CONJ": "CONJ", + "SCONJ": "CONJ", + "CCONJ": "CONJ", + "CC": 
"CONJ", + "DT": "DET", + "DET": "DET", + "hashtag": "HASH", + "NOUN": "NOUN", + "noun": "NOUN", + "N": "NOUN", + "NN": "NOUN", + "INTJ": "NOUN", + "PROPN": "NOUN", + "NEG": "PART", + "PART": "PART", + "NEG_PART": "PART", + "IN": "PART", + "preposition": "PREP", + "P": "PREP", + "PREP": "PREP", + "PRP": "PREP", + "PRON": "PRON", + "pronoun": "PRON", + "REL": "PRON", + "DEM": "PRON", + "PUNC": "PUNC", + "punctuation": "PUNC", + "PUNCT": "PUNC", + "SYM": "PUNC", + "verb": "V", + "VERB": "V", + "V": "V", + "VB": "V", + "RB": "ADV", + "VBG": "V", + "VBZ": "V", + "PRO": "PRON", + "conj": "CONJ", + "punct": "PUNC", + "neg": "PART", + "pron": "PRON", + "prep": "PREP", + "COMP": "ADJ", + "interjection": "PART", + "number": "NOUN", + "MOD": "PART", + "NUM": "NOUN", + "determiner": "DET", + "negation": "PART", + "url": "URL", + "demonstrative": "PRON", + "particle": "PART", + "HASHTAG": "HASH", + "NPROP": "NOUN", + "EMOJI": "EMOJI", + ",": "PUNC", + "RELPRO": "PRON", + "X": "NOUN", + "MENTION": "MENTION", + "اسم": "NOUN", + "اسم علم": "NOUN", + "حرف جر": "PREP", + "حرف شرطي": "PART", + "حرف عطف": "CONJ", + "حرف نداء": "PART", + "حرف نفي": "PART", + "عدد": "NOUN", + "فاصلة": "ADJ", + "فعل": "V", + "": "", +} + + +def config(): + return { + "dataset": WikiNewsPOSDataset, + "dataset_args": {}, + "task": ArabicPOSTask, + "task_args": {}, + "model": GPTChatCompletionModel, + "model_args": { + "api_type": "azure", + "api_version": "2023-03-15-preview", + "api_base": os.environ["AZURE_API_URL"], + "api_key": os.environ["AZURE_API_KEY"], + "engine_name": os.environ["ENGINE_NAME"], + "max_tries": 30, + }, + "general_args": { + "data_path": "data/sequence_tagging_ner_pos_etc/POS/WikiNewsTruth.txt.POS.tab", + "fewshot": { + "train_data_path": "data/sequence_tagging_ner_pos_etc/POS/WikiNewsTruthDev.txt" + }, + }, + } + + +def few_shot_prompt(input_sample, base_prompt, examples): + output_prompt = base_prompt + "\n" + for example in examples: + tokens = example["input"] + label = 
example["label"] + sample = list(zip(tokens.split(), label.split())) + output_prompt = ( + output_prompt + + f"Sentence: {tokens.split()}" + + "\n" + + f"Labels: {sample}" + + "\n" + ) + output_prompt = ( + output_prompt + f"Sentence: {input_sample.split()}" + "\n" + "Labels:" + ) + return output_prompt + + +def prompt(input_sample, examples): + base_prompt = f'Please provide the POS tags for each word in the input sentence. The input will be a list of words in the sentence. The output format should be a list of tuples, where each tuple consists of a word from the input text and its corresponding POS tag label from the tag label set: ["ABBREV", "ADJ", "ADV", "CASE", "CONJ", "DET", "EMOT", "FOREIGN", "FUT_PART", "HASH", "MENTION", "NEG_PART", "NOUN", "NSUFF", "NUM", "PART", "PREP", "PROG_PART", "PRON", "PUNC", "URL", "V"]. Note: Your response should include only a list of tuples, in the order that the words appear in the input sentence, with each tuple containing the corresponding POS tag label for a word.' 
+ + return [ + { + "role": "system", + "content": "You are a linguist that helps in annotating data.", + }, + { + "role": "user", + "content": few_shot_prompt(input_sample, base_prompt, examples), + }, + ] + + +def post_process(response): + text = response["choices"][0]["message"]["content"] + matches = re.findall(r"\((.*?)\)", text) + if matches: + cleaned_response = [] + for match in matches: + elements = match.split(",") + try: + cleaned_response.append(elements[1]) + except: + if ":" in elements[0]: + cleaned_response.append("EMOT") + elif len(elements[0].replace("'", "").strip()) == 0: + cleaned_response.append("PUNCT") + + cleaned_response = [ + sample.replace("'", "").strip() for sample in cleaned_response + ] + cleaned_response = " ".join(cleaned_response) + else: + cleaned_response = None + return cleaned_response diff --git a/assets/ar/sequence_tagging_and_information_extraction/POS/WikiNews_GPT4_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/POS/WikiNews_GPT4_ZeroShot.py new file mode 100644 index 00000000..a514fa88 --- /dev/null +++ b/assets/ar/sequence_tagging_and_information_extraction/POS/WikiNews_GPT4_ZeroShot.py @@ -0,0 +1,159 @@ +import os +import re + +from llmebench.datasets import WikiNewsPOSDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import ArabicPOSTask + +mapTags = { + "UNK": "UNK", + "EOS": "EOS", + ".": "PUNC", + "NNP": "NOUN", + "JJR": "ADJ", + "CD": "NOUN", + "ADJ": "ADJ", + "adjective": "ADJ", + "JJ": "ADJ", + "MD": "ADJ", + "ADJF": "ADJ", + "ADV": "ADV", + "ADP": "ADV", + "adverb": "ADV", + "conjunction": "CONJ", + "CONJ": "CONJ", + "SCONJ": "CONJ", + "CCONJ": "CONJ", + "CC": "CONJ", + "DT": "DET", + "DET": "DET", + "hashtag": "HASH", + "NOUN": "NOUN", + "noun": "NOUN", + "N": "NOUN", + "NN": "NOUN", + "INTJ": "NOUN", + "PROPN": "NOUN", + "NEG": "PART", + "PART": "PART", + "NEG_PART": "PART", + "IN": "PART", + "preposition": "PREP", + "P": "PREP", + "PREP": "PREP", + "PRP": 
"PREP", + "PRON": "PRON", + "pronoun": "PRON", + "REL": "PRON", + "DEM": "PRON", + "PUNC": "PUNC", + "punctuation": "PUNC", + "PUNCT": "PUNC", + "SYM": "PUNC", + "verb": "V", + "VERB": "V", + "V": "V", + "VB": "V", + "RB": "ADV", + "VBG": "V", + "VBZ": "V", + "PRO": "PRON", + "conj": "CONJ", + "punct": "PUNC", + "neg": "PART", + "pron": "PRON", + "prep": "PREP", + "COMP": "ADJ", + "interjection": "PART", + "number": "NOUN", + "MOD": "PART", + "NUM": "NOUN", + "determiner": "DET", + "negation": "PART", + "url": "URL", + "demonstrative": "PRON", + "particle": "PART", + "HASHTAG": "HASH", + "NPROP": "NOUN", + "EMOJI": "EMOJI", + ",": "PUNC", + "RELPRO": "PRON", + "X": "NOUN", + "MENTION": "MENTION", + "اسم": "NOUN", + "اسم علم": "NOUN", + "حرف جر": "PREP", + "حرف شرطي": "PART", + "حرف عطف": "CONJ", + "حرف نداء": "PART", + "حرف نفي": "PART", + "عدد": "NOUN", + "فاصلة": "ADJ", + "فعل": "V", + "": "", +} + + +def config(): + return { + "dataset": WikiNewsPOSDataset, + "dataset_args": {}, + "task": ArabicPOSTask, + "task_args": {}, + "model": GPTChatCompletionModel, + "model_args": { + "api_type": "azure", + "api_version": "2023-03-15-preview", + "api_base": os.environ["AZURE_API_URL"], + "api_key": os.environ["AZURE_API_KEY"], + "engine_name": os.environ["ENGINE_NAME"], + "max_tries": 3, + }, + "general_args": { + "data_path": "data/sequence_tagging_ner_pos_etc/POS/WikiNewsTruth.txt.POS.tab" + }, + } + + +def prompt(input_sample): + return [ + { + "role": "system", + "content": "You are a linguist that helps in annotating data.", + }, + { + "role": "user", + "content": + # f"Assign POS tag to each morphological segment within each word. group the tags for each word with +: {input_sample}" + # + ".\nThe output should be in the format: [{word: label}, {word: label}]", + f'Please provide the POS tags for each word in the input sentence. The input will be a list of words in the sentence. 
\ + The output format should be a list of tuples, where each tuple consists of a word from the input text and its \ + corresponding POS tag label from the tag label set: \ + ["ABBREV", "ADJ", "ADV", "CASE", "CONJ", "DET", "EMOT", "FOREIGN", "FUT_PART", "HASH", "MENTION", "NEG_PART", "NOUN", \ + "NSUFF", "NUM", "PART", "PREP", "PROG_PART", "PRON", "PUNC", "URL", "V"].\ + Note: Your response should include only a list of tuples, in the order that the words appear in the input sentence, \ + with each tuple containing the corresponding POS tag label for a word. Input: {input_sample}', + }, + ] + + +def post_process(response): + text = response["choices"][0]["message"]["content"] + + if "Sorry, I cannot" in text or "Unfortunately" in text: + return None + + text = re.sub(r"Here's the segmented sentence in a JSON format:", "", text) + + pattern = r"\(\"([^\"]+)\", \"([^\"]+)\"\)" + matches = re.finditer(pattern, text) + results = [] + + for m in matches: + tag = m.group(2) + ntag = [] + for t in tag.split("+"): + ntag.append(mapTags[t] if t in mapTags else t) + results.append("+".join(ntag)) + + return " ".join(results) diff --git a/assets/ar/sequence_tagging_and_information_extraction/POS/XGLUE_GPT35_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/POS/XGLUE_GPT35_ZeroShot.py new file mode 100644 index 00000000..2348e38a --- /dev/null +++ b/assets/ar/sequence_tagging_and_information_extraction/POS/XGLUE_GPT35_ZeroShot.py @@ -0,0 +1,155 @@ +import os +import re + +from llmebench.datasets import XGLUEPOSDataset +from llmebench.models import GPTModel, RandomGPTModel +from llmebench.tasks import ArabicPOSTask + +mapTags = { + "UNK": "UNK", + "EOS": "EOS", + ".": "PUNC", + "NNP": "NOUN", + "JJR": "ADJ", + "CD": "NOUN", + "ADJ": "ADJ", + "adjective": "ADJ", + "JJ": "ADJ", + "MD": "ADJ", + "ADJF": "ADJ", + "ADV": "ADV", + "ADP": "ADV", + "adverb": "ADV", + "conjunction": "CONJ", + "CONJ": "CONJ", + "SCONJ": "CONJ", + "CCONJ": "CONJ", + "CC": "CONJ", +
"DT": "DET", + "DET": "DET", + "hashtag": "HASH", + "NOUN": "NOUN", + "noun": "NOUN", + "N": "NOUN", + "NN": "NOUN", + "INTJ": "NOUN", + "PROPN": "NOUN", + "NEG": "PART", + "PART": "PART", + "NEG_PART": "PART", + "IN": "PART", + "preposition": "PREP", + "P": "PREP", + "PREP": "PREP", + "PRP": "PREP", + "PRON": "PRON", + "pronoun": "PRON", + "REL": "PRON", + "DEM": "PRON", + "PUNC": "PUNC", + "punctuation": "PUNC", + "PUNCT": "PUNC", + "SYM": "PUNC", + "verb": "V", + "VERB": "V", + "V": "V", + "VB": "V", + "RB": "ADV", + "VBG": "V", + "VBZ": "V", + "PRO": "PRON", + "conj": "CONJ", + "punct": "PUNC", + "neg": "PART", + "pron": "PRON", + "prep": "PREP", + "COMP": "ADJ", + "interjection": "PART", + "number": "NOUN", + "MOD": "PART", + "NUM": "NOUN", + "determiner": "DET", + "negation": "PART", + "url": "URL", + "demonstrative": "PRON", + "particle": "PART", + "HASHTAG": "HASH", + "NPROP": "NOUN", + "EMOJI": "EMOJI", + ",": "PUNC", + "RELPRO": "PRON", + "X": "NOUN", + "MENTION": "MENTION", + "اسم": "NOUN", + "اسم علم": "NOUN", + "حرف جر": "PREP", + "حرف شرطي": "PART", + "حرف عطف": "CONJ", + "حرف نداء": "PART", + "حرف نفي": "PART", + "عدد": "NOUN", + "فاصلة": "ADJ", + "فعل": "V", + "": "", +} + + +def config(): + return { + "dataset": XGLUEPOSDataset, + "dataset_args": {}, + "task": ArabicPOSTask, + "task_args": {}, + "model": GPTModel, + "model_args": { + "api_type": "azure", + "api_version": "2023-03-15-preview", + "api_base": os.environ["AZURE_API_URL"], + "api_key": os.environ["AZURE_API_KEY"], + "engine_name": os.environ["ENGINE_NAME"], + "max_tries": 3, + }, + "general_args": { + "data_path": "data/sequence_tagging_ner_pos_etc/POS/XGLUE/ar.test.src-trg.txt" + }, + } + + +def prompt(input_sample): + return { + "system_message": "You are an AI assistant that helps people find information.", + "messages": [ + { + "sender": "user", + "text": f'Please provide the POS tags for each word in the input sentence. The input will be a list of words in the sentence. 
\ + The output format should be a list of tuples, where each tuple consists of a word from the input text and its \ + corresponding POS tag label from the tag label set: \ + ["ABBREV", "ADJ", "ADV", "CASE", "CONJ", "DET", "EMOT", "FOREIGN", "FUT_PART", "HASH", "MENTION", "NEG_PART", "NOUN", \ + "NSUFF", "NUM", "PART", "PREP", "PROG_PART", "PRON", "PUNC", "URL", "V"].\ + Note: Your response should include only a list of tuples, in the order that the words appear in the input sentence, \ + with each tuple containing the corresponding POS tag label for a word. Input: {input_sample}', + } + ], + } + + +def post_process(response): + text = response["choices"][0]["text"] + + if "Sorry, I cannot" in text or "Unfortunately" in text: + return None + + text = re.sub(r"Here's the segmented sentence in a JSON format:", "", text) + + pattern = r"\([\"\']([^\"\']+)[\'\"], [\"\']([^\"\']+)[\'\"]\)" + matches = re.finditer(pattern, text) + results = [] + + for m in matches: + tag = m.group(2) + ntag = [] + for t in tag.split("+"): + ntag.append(mapTags[t] if t in mapTags else t) + results.append("+".join(ntag)) + + return " ".join(results) diff --git a/assets/ar/sequence_tagging_and_information_extraction/POS/XGLUE_GPT4_FewShot.py b/assets/ar/sequence_tagging_and_information_extraction/POS/XGLUE_GPT4_FewShot.py new file mode 100644 index 00000000..1f44cf17 --- /dev/null +++ b/assets/ar/sequence_tagging_and_information_extraction/POS/XGLUE_GPT4_FewShot.py @@ -0,0 +1,176 @@ +import os +import re + +from llmebench.datasets import XGLUEPOSDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import ArabicPOSTask + +mapTags = { + "UNK": "UNK", + "EOS": "EOS", + ".": "PUNC", + "NNP": "NOUN", + "JJR": "ADJ", + "CD": "NOUN", + "ADJ": "ADJ", + "adjective": "ADJ", + "JJ": "ADJ", + "MD": "ADJ", + "ADJF": "ADJ", + "ADV": "ADV", + "ADP": "ADV", + "adverb": "ADV", + "conjunction": "CONJ", + "CONJ": "CONJ", + "SCONJ": "CONJ", + "CCONJ": "CONJ", + "CC": "CONJ", +
"DT": "DET", + "DET": "DET", + "hashtag": "HASH", + "NOUN": "NOUN", + "noun": "NOUN", + "N": "NOUN", + "NN": "NOUN", + "INTJ": "NOUN", + "PROPN": "NOUN", + "NEG": "PART", + "PART": "PART", + "NEG_PART": "PART", + "IN": "PART", + "preposition": "PREP", + "P": "PREP", + "PREP": "PREP", + "PRP": "PREP", + "PRON": "PRON", + "pronoun": "PRON", + "REL": "PRON", + "DEM": "PRON", + "PUNC": "PUNC", + "punctuation": "PUNC", + "PUNCT": "PUNC", + "SYM": "PUNC", + "verb": "V", + "VERB": "V", + "V": "V", + "VB": "V", + "RB": "ADV", + "VBG": "V", + "VBZ": "V", + "PRO": "PRON", + "conj": "CONJ", + "punct": "PUNC", + "neg": "PART", + "pron": "PRON", + "prep": "PREP", + "COMP": "ADJ", + "interjection": "PART", + "number": "NOUN", + "MOD": "PART", + "NUM": "NOUN", + "determiner": "DET", + "negation": "PART", + "url": "URL", + "demonstrative": "PRON", + "particle": "PART", + "HASHTAG": "HASH", + "NPROP": "NOUN", + "EMOJI": "EMOJI", + ",": "PUNC", + "RELPRO": "PRON", + "X": "NOUN", + "MENTION": "MENTION", + "اسم": "NOUN", + "اسم علم": "NOUN", + "حرف جر": "PREP", + "حرف شرطي": "PART", + "حرف عطف": "CONJ", + "حرف نداء": "PART", + "حرف نفي": "PART", + "عدد": "NOUN", + "فاصلة": "ADJ", + "فعل": "V", + "": "", +} + + +def config(): + return { + "dataset": XGLUEPOSDataset, + "dataset_args": {}, + "task": ArabicPOSTask, + "task_args": {}, + "model": GPTChatCompletionModel, + "model_args": { + "api_type": "azure", + "api_version": "2023-03-15-preview", + "api_base": os.environ["AZURE_API_URL"], + "api_key": os.environ["AZURE_API_KEY"], + "engine_name": os.environ["ENGINE_NAME"], + "max_tries": 30, + }, + "general_args": { + "data_path": "data/sequence_tagging_ner_pos_etc/POS/XGLUE/ar.test.src-trg.txt", + "fewshot": { + "train_data_path": "data/sequence_tagging_ner_pos_etc/POS/XGLUE/ar.dev.src-trg.txt" + }, + }, + } + + +def few_shot_prompt(input_sample, base_prompt, examples): + output_prompt = base_prompt + "\n" + for example in examples: + tokens = example["input"] + label = example["label"] 
+ sample = list(zip(tokens.split(), label.split())) + output_prompt = ( + output_prompt + + f"Sentence: {tokens.split()}" + + "\n" + + f"Labels: {sample}" + + "\n" + ) + output_prompt = ( + output_prompt + f"Sentence: {input_sample.split()}" + "\n" + "Labels:" + ) + return output_prompt + + +def prompt(input_sample, examples): + base_prompt = f'Please provide the POS tags for each word in the input sentence. The input will be a list of words in the sentence. The output format should be a list of tuples, where each tuple consists of a word from the input text and its corresponding POS tag label from the tag label set: ["ABBREV", "ADJ", "ADV", "CASE", "CONJ", "DET", "EMOT", "FOREIGN", "FUT_PART", "HASH", "MENTION", "NEG_PART", "NOUN", "NSUFF", "NUM", "PART", "PREP", "PROG_PART", "PRON", "PUNC", "URL", "V"]. Note: Your response should include only a list of tuples, in the order that the words appear in the input sentence, with each tuple containing the corresponding POS tag label for a word.' 
+ + return [ + { + "role": "system", + "content": "You are a linguist that helps in annotating data.", + }, + { + "role": "user", + "content": few_shot_prompt(input_sample, base_prompt, examples), + }, + ] + + +def post_process(response): + text = response["choices"][0]["message"]["content"] + matches = re.findall(r"\((.*?)\)", text) + if matches: + cleaned_response = [] + for match in matches: + elements = match.split(",") + try: + cleaned_response.append(elements[1]) + except: + if ":" in elements[0]: + cleaned_response.append("EMOT") + elif len(elements[0].replace("'", "").strip()) == 0: + cleaned_response.append("PUNCT") + + cleaned_response = [ + sample.replace("'", "").strip() for sample in cleaned_response + ] + cleaned_response = " ".join(cleaned_response) + else: + cleaned_response = None + return cleaned_response diff --git a/assets/ar/sequence_tagging_and_information_extraction/POS/XGLUE_GPT4_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/POS/XGLUE_GPT4_ZeroShot.py new file mode 100644 index 00000000..79cdfba0 --- /dev/null +++ b/assets/ar/sequence_tagging_and_information_extraction/POS/XGLUE_GPT4_ZeroShot.py @@ -0,0 +1,159 @@ +import os +import re + +from llmebench.datasets import XGLUEPOSDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import ArabicPOSTask + +mapTags = { + "UNK": "UNK", + "EOS": "EOS", + ".": "PUNC", + "NNP": "NOUN", + "JJR": "ADJ", + "CD": "NOUN", + "ADJ": "ADJ", + "adjective": "ADJ", + "JJ": "ADJ", + "MD": "ADJ", + "ADJF": "ADJ", + "ADV": "ADV", + "ADP": "ADV", + "adverb": "ADV", + "conjunction": "CONJ", + "CONJ": "CONJ", + "SCONJ": "CONJ", + "CCONJ": "CONJ", + "CC": "CONJ", + "DT": "DET", + "DET": "DET", + "hashtag": "HASH", + "NOUN": "NOUN", + "noun": "NOUN", + "N": "NOUN", + "NN": "NOUN", + "INTJ": "NOUN", + "PROPN": "NOUN", + "NEG": "PART", + "PART": "PART", + "NEG_PART": "PART", + "IN": "PART", + "preposition": "PREP", + "P": "PREP", + "PREP": "PREP", + "PRP": "PREP", + 
"PRON": "PRON", + "pronoun": "PRON", + "REL": "PRON", + "DEM": "PRON", + "PUNC": "PUNC", + "punctuation": "PUNC", + "PUNCT": "PUNC", + "SYM": "PUNC", + "verb": "V", + "VERB": "V", + "V": "V", + "VB": "V", + "RB": "ADV", + "VBG": "V", + "VBZ": "V", + "PRO": "PRON", + "conj": "CONJ", + "punct": "PUNC", + "neg": "PART", + "pron": "PRON", + "prep": "PREP", + "COMP": "ADJ", + "interjection": "PART", + "number": "NOUN", + "MOD": "PART", + "NUM": "NOUN", + "determiner": "DET", + "negation": "PART", + "url": "URL", + "demonstrative": "PRON", + "particle": "PART", + "HASHTAG": "HASH", + "NPROP": "NOUN", + "EMOJI": "EMOJI", + ",": "PUNC", + "RELPRO": "PRON", + "X": "NOUN", + "MENTION": "MENTION", + "اسم": "NOUN", + "اسم علم": "NOUN", + "حرف جر": "PREP", + "حرف شرطي": "PART", + "حرف عطف": "CONJ", + "حرف نداء": "PART", + "حرف نفي": "PART", + "عدد": "NOUN", + "فاصلة": "ADJ", + "فعل": "V", + "": "", +} + + +def config(): + return { + "dataset": XGLUEPOSDataset, + "dataset_args": {}, + "task": ArabicPOSTask, + "task_args": {}, + "model": GPTChatCompletionModel, + "model_args": { + "api_type": "azure", + "api_version": "2023-03-15-preview", + "api_base": os.environ["AZURE_API_URL"], + "api_key": os.environ["AZURE_API_KEY"], + "engine_name": os.environ["ENGINE_NAME"], + "max_tries": 3, + }, + "general_args": { + "data_path": "data/sequence_tagging_ner_pos_etc/POS/XGLUE/ar.test.src-trg.txt" + }, + } + + +def prompt(input_sample): + return [ + { + "role": "system", + "content": "You are a linguist that helps in annotating data.", + }, + { + "role": "user", + "content": + # f"Assign POS tag to each morphological segment within each word. group the tags for each word with +: {input_sample}" + # + ".\nThe output should be in the format: [{word: label}, {word: label}]", + f'Please provide the POS tags for each word in the input sentence. The input will be a list of words in the sentence. 
\ + The output format should be a list of tuples, where each tuple consists of a word from the input text and its \ + corresponding POS tag label from the tag label set: \ + ["ABBREV", "ADJ", "ADV", "CASE", "CONJ", "DET", "EMOT", "FOREIGN", "FUT_PART", "HASH", "MENTION", "NEG_PART", "NOUN", \ + "NSUFF", "NUM", "PART", "PREP", "PROG_PART", "PRON", "PUNC", "URL", "V"].\ + Note: Your response should include only a list of tuples, in the order that the words appear in the input sentence, \ + with each tuple containing the corresponding POS tag label for a word. Input: {input_sample}', + }, + ] + + +def post_process(response): + text = response["choices"][0]["message"]["content"] + + if "Sorry, I cannot" in text or "Unfortunately" in text: + return None + + text = re.sub(r"Here's the segmented sentence in a JSON format:", "", text) + + pattern = r"\(\"([^\"]+)\", \"([^\"]+)\"\)" + matches = re.finditer(pattern, text) + results = [] + + for m in matches: + tag = m.group(2) + ntag = [] + for t in tag.split("+"): + ntag.append(mapTags[t] if t in mapTags else t) + results.append("+".join(ntag)) + + return " ".join(results) diff --git a/assets/ar/sequence_tagging_and_information_extraction/diacritization/BibleMaghrebi_GPT35_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/diacritization/BibleMaghrebi_GPT35_ZeroShot.py new file mode 100644 index 00000000..e3637234 --- /dev/null +++ b/assets/ar/sequence_tagging_and_information_extraction/diacritization/BibleMaghrebi_GPT35_ZeroShot.py @@ -0,0 +1,55 @@ +import os + +from llmebench.datasets import BibleMaghrebiDiacritizationDataset +from llmebench.models import GPTModel +from llmebench.tasks import ArabicDiacritizationTask + + +def config(): + sets = [ + ("mor", "morrocan_f05.test.src-tgt.txt"), + ("tun", "tunisian_f05.test.src-tgt.txt"), + ] + configs = [] + for name, testset in sets: + configs.append( + { + "name": name, + "config": { + "dataset": BibleMaghrebiDiacritizationDataset, + "dataset_args": {}, +
"task": ArabicDiacritizationTask, + "task_args": {}, + "model": GPTModel, + "model_args": { + "api_type": "azure", + "api_version": "2023-03-15-preview", + "api_base": os.environ["AZURE_API_URL"], + "api_key": os.environ["AZURE_API_KEY"], + "engine_name": os.environ["ENGINE_NAME"], + "max_tries": 3, + }, + "general_args": { + "data_path": f"data/sequence_tagging_ner_pos_etc/diacritization/{testset}" + }, + }, + } + ) + + return configs + + +def prompt(input_sample): + return { + "system_message": "You are an AI assistant that helps people find information.", + "messages": [ + { + "sender": "user", + "text": f"Diacritize fully the following Arabic sentence: {input_sample}", + } + ], + } + + +def post_process(response): + return response["choices"][0]["text"] diff --git a/assets/ar/sequence_tagging_and_information_extraction/diacritization/BibleMaghrebi_GPT4_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/diacritization/BibleMaghrebi_GPT4_ZeroShot.py new file mode 100644 index 00000000..219e704f --- /dev/null +++ b/assets/ar/sequence_tagging_and_information_extraction/diacritization/BibleMaghrebi_GPT4_ZeroShot.py @@ -0,0 +1,59 @@ +import os + +from llmebench.datasets import BibleMaghrebiDiacritizationDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import ArabicDiacritizationTask + + +def config(): + sets = [ + ("mor", "morrocan_f05.test.src-tgt.txt"), + ("tun", "tunisian_f05.test.src-tgt.txt"), + ] + configs = [] + for name, testset in sets: + configs.append( + { + "name": name, + "config": { + "dataset": BibleMaghrebiDiacritizationDataset, + "dataset_args": {}, + "task": ArabicDiacritizationTask, + "task_args": {}, + "model": GPTChatCompletionModel, + "model_args": { + "api_type": "azure", + "api_version": "2023-03-15-preview", + "api_base": os.environ["AZURE_API_URL"], + "api_key": os.environ["AZURE_API_KEY"], + "engine_name": os.environ["ENGINE_NAME"], + "max_tries": 3, + }, + "general_args": { + "data_path": 
f"data/sequence_tagging_ner_pos_etc/diacritization/{testset}" + }, + }, + } + ) + return configs + + +def prompt(input_sample): + return [ + { + "role": "system", + "content": "You are a linguist that helps in annotating data.", + }, + { + "role": "user", + "content": f"Diacritize fully the following Arabic sentence including adding case endings:\n {input_sample}\n\ + Make sure to put back non-Arabic tokens intact into the output sentence.\ + ", + }, + ] + + +def post_process(response): + text = response["choices"][0]["message"]["content"] + + return text diff --git a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/diacritization_ChatGPT_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/diacritization/WikiNews_GPT35_ZeroShot.py similarity index 90% rename from assets/benchmark_v1/sequence_tagging_ner_pos_etc/diacritization_ChatGPT_ZeroShot.py rename to assets/ar/sequence_tagging_and_information_extraction/diacritization/WikiNews_GPT35_ZeroShot.py index 0634ab80..281ceee5 100644 --- a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/diacritization_ChatGPT_ZeroShot.py +++ b/assets/ar/sequence_tagging_and_information_extraction/diacritization/WikiNews_GPT35_ZeroShot.py @@ -1,13 +1,13 @@ import os -from llmebench.datasets import ArabicDiacritizationDataset +from llmebench.datasets import WikiNewsDiacritizationDataset from llmebench.models import GPTModel from llmebench.tasks import ArabicDiacritizationTask def config(): return { - "dataset": ArabicDiacritizationDataset, + "dataset": WikiNewsDiacritizationDataset, "dataset_args": {}, "task": ArabicDiacritizationTask, "task_args": {}, diff --git a/assets/ar/sequence_tagging_and_information_extraction/diacritization/WikiNews_GPT4_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/diacritization/WikiNews_GPT4_ZeroShot.py new file mode 100644 index 00000000..cebc37db --- /dev/null +++ b/assets/ar/sequence_tagging_and_information_extraction/diacritization/WikiNews_GPT4_ZeroShot.py @@ -0,0 
+1,47 @@ +import os + +from llmebench.datasets import WikiNewsDiacritizationDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import ArabicDiacritizationTask + + +def config(): + return { + "dataset": WikiNewsDiacritizationDataset, + "dataset_args": {}, + "task": ArabicDiacritizationTask, + "task_args": {}, + "model": GPTChatCompletionModel, + "model_args": { + "api_type": "azure", + "api_version": "2023-03-15-preview", + "api_base": os.environ["AZURE_API_URL"], + "api_key": os.environ["AZURE_API_KEY"], + "engine_name": os.environ["ENGINE_NAME"], + "max_tries": 3, + }, + "general_args": { + "data_path": "data/sequence_tagging_ner_pos_etc/diacritization/WikiNewsTruth.txt" + }, + } + + +def prompt(input_sample): + return [ + { + "role": "system", + "content": "You are a linguist that helps in annotating data.", + }, + { + "role": "user", + "content": f"Diacritize fully the following Arabic sentence including adding case endings:\n {input_sample}\n\ + Make sure to put back non-Arabic tokens intact into the output sentence.\ + ", + }, + ] + + +def post_process(response): + text = response["choices"][0]["message"]["content"] + + return text diff --git a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectADI_BLOOMZ_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ADI_BLOOMZ_ZeroShot.py similarity index 96% rename from assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectADI_BLOOMZ_ZeroShot.py rename to assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ADI_BLOOMZ_ZeroShot.py index 1fbdfa10..380eaf38 100644 --- a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectADI_BLOOMZ_ZeroShot.py +++ b/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ADI_BLOOMZ_ZeroShot.py @@ -1,13 +1,13 @@ import os -from llmebench.datasets import DialectADIDataset +from llmebench.datasets import ADIDataset from llmebench.models import BLOOMPetalModel from 
llmebench.tasks import DialectIDTask def config(): return { - "dataset": DialectADIDataset, + "dataset": ADIDataset, "dataset_args": {}, "task": DialectIDTask, "task_args": {}, diff --git a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectADI_ChatGPT_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ADI_GPT35_ZeroShot.py similarity index 96% rename from assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectADI_ChatGPT_ZeroShot.py rename to assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ADI_GPT35_ZeroShot.py index 8e243727..cb324f3d 100644 --- a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectADI_ChatGPT_ZeroShot.py +++ b/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ADI_GPT35_ZeroShot.py @@ -1,13 +1,13 @@ import os -from llmebench.datasets import DialectADIDataset +from llmebench.datasets import ADIDataset from llmebench.models import GPTModel, RandomGPTModel from llmebench.tasks import DialectIDTask def config(): return { - "dataset": DialectADIDataset, + "dataset": ADIDataset, "dataset_args": {}, "task": DialectIDTask, "task_args": {}, diff --git a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectADI_GPTChatCompletion_FewShot.py b/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ADI_GPT4_FewShot.py similarity index 97% rename from assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectADI_GPTChatCompletion_FewShot.py rename to assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ADI_GPT4_FewShot.py index ccfe05f2..0230eca5 100644 --- a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectADI_GPTChatCompletion_FewShot.py +++ b/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ADI_GPT4_FewShot.py @@ -1,13 +1,13 @@ import os -from llmebench.datasets import DialectADIDataset +from llmebench.datasets import ADIDataset from llmebench.models import 
GPTChatCompletionModel, RandomGPTModel from llmebench.tasks import DialectIDTask def config(): return { - "dataset": DialectADIDataset, + "dataset": ADIDataset, "dataset_args": {}, "task": DialectIDTask, "task_args": {}, diff --git a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectADI_GPTChatCompletion_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ADI_GPT4_ZeroShot.py similarity index 96% rename from assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectADI_GPTChatCompletion_ZeroShot.py rename to assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ADI_GPT4_ZeroShot.py index 6cf4d539..d4875d14 100644 --- a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectADI_GPTChatCompletion_ZeroShot.py +++ b/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ADI_GPT4_ZeroShot.py @@ -1,13 +1,13 @@ import os -from llmebench.datasets import DialectADIDataset +from llmebench.datasets import ADIDataset from llmebench.models import GPTChatCompletionModel, RandomGPTModel from llmebench.tasks import DialectIDTask def config(): return { - "dataset": DialectADIDataset, + "dataset": ADIDataset, "dataset_args": {}, "task": DialectIDTask, "task_args": {}, diff --git a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectID_QADI_ChatGPT_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/QADI_GPT35_ZeroShot.py similarity index 100% rename from assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectID_QADI_ChatGPT_ZeroShot.py rename to assets/ar/sequence_tagging_and_information_extraction/dialect_identification/QADI_GPT35_ZeroShot.py diff --git a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectID_QADI_GPTChatCompletion_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/QADI_GPT4_ZeroShot.py similarity index 100% rename from 
assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectID_QADI_GPTChatCompletion_ZeroShot.py rename to assets/ar/sequence_tagging_and_information_extraction/dialect_identification/QADI_GPT4_ZeroShot.py diff --git a/assets/ar/sequence_tagging_and_information_extraction/lemmatization/WikiNews_BLOOMZ_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/lemmatization/WikiNews_BLOOMZ_ZeroShot.py new file mode 100644 index 00000000..233869d6 --- /dev/null +++ b/assets/ar/sequence_tagging_and_information_extraction/lemmatization/WikiNews_BLOOMZ_ZeroShot.py @@ -0,0 +1,51 @@ +import os + +from llmebench.datasets import WikiNewsLemmatizationDataset +from llmebench.models import BLOOMPetalModel +from llmebench.tasks import LemmatizationTask + + +def config(): + return { + "dataset": WikiNewsLemmatizationDataset, + "dataset_args": {}, + "task": LemmatizationTask, + "task_args": {}, + "model": BLOOMPetalModel, + "model_args": { + "api_url": os.environ["API_URL"], + "max_tries": 3, + }, + "general_args": { + "data_path": "data/sequence_tagging_ner_pos_etc/lemmatization/WikiNews-26-06-2015-RefLemma.txt" + }, + } + + +def prompt(input_sample): + return { + "prompt": "for every word in the following Arabic sentence, write only the arabic lemma of the word separated by a single space without explanation.\n\n" + + "sentence: " + + input_sample + + "label: \n" + } + + +def post_process(response): + label = response["outputs"] + label = label.replace("label:", "") + label = label.replace("label", "") + + label = label.replace("", "") + label = label.replace("", "") + + if ( + label.startswith("Please provide the Arabic sentence") + or label.startswith("It seems") + or "is not" in label + ): + label = None + else: + # TODO: fix hack to handle prediction failure + label = (None, label.strip()) + return label diff --git a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/lemmatization/Lemmatization_ChatGPT_ZeroShot.py 
b/assets/ar/sequence_tagging_and_information_extraction/lemmatization/WikiNews_GPT35_ZeroShot.py similarity index 92% rename from assets/benchmark_v1/sequence_tagging_ner_pos_etc/lemmatization/Lemmatization_ChatGPT_ZeroShot.py rename to assets/ar/sequence_tagging_and_information_extraction/lemmatization/WikiNews_GPT35_ZeroShot.py index 9950ccac..b693d09a 100644 --- a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/lemmatization/Lemmatization_ChatGPT_ZeroShot.py +++ b/assets/ar/sequence_tagging_and_information_extraction/lemmatization/WikiNews_GPT35_ZeroShot.py @@ -1,13 +1,13 @@ import os -from llmebench.datasets import LemmatizationDataset +from llmebench.datasets import WikiNewsLemmatizationDataset from llmebench.models import GPTModel, RandomGPTModel from llmebench.tasks import LemmatizationTask def config(): return { - "dataset": LemmatizationDataset, + "dataset": WikiNewsLemmatizationDataset, "dataset_args": {}, "task": LemmatizationTask, "task_args": {}, diff --git a/assets/ar/sequence_tagging_and_information_extraction/lemmatization/WikiNews_GPT4_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/lemmatization/WikiNews_GPT4_ZeroShot.py new file mode 100644 index 00000000..7ca5c3f6 --- /dev/null +++ b/assets/ar/sequence_tagging_and_information_extraction/lemmatization/WikiNews_GPT4_ZeroShot.py @@ -0,0 +1,54 @@ +import os + +from llmebench.datasets import WikiNewsLemmatizationDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import LemmatizationTask + + +def config(): + return { + "dataset": WikiNewsLemmatizationDataset, + "dataset_args": {}, + "task": LemmatizationTask, + "task_args": {}, + "model": GPTChatCompletionModel, + "model_args": { + "api_type": "azure", + "api_version": "2023-03-15-preview", + "api_base": os.environ["AZURE_API_URL"], + "api_key": os.environ["AZURE_API_KEY"], + "engine_name": os.environ["ENGINE_NAME"], + "max_tries": 3, + }, + "general_args": { + "data_path": 
"data/sequence_tagging_ner_pos_etc/lemmatization/WikiNews-26-06-2015-RefLemma.txt" + }, + } + + +def prompt(input_sample): + return [ + { + "role": "system", + "content": "You are a language expert, you can identify the lemma of any word within a sentence.", + }, + { + "role": "user", + "content": f"for every word in the following Arabic word, write only the lemma without diacritics separated by a single space without explanation:\n {input_sample}", + }, + ] + + +def post_process(response): + x = response["choices"][0]["message"]["content"] + if ( + x.startswith("Please provide the Arabic sentence") + or x.startswith("It seems") + or "is not" in x + ): + out = None + else: + # TODO: fix hack to handle prediction failure + out = (None, x) + + return out diff --git a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/parsing_ChatGPT_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/parsing/PADT_GPT35_ZeroShot.py similarity index 94% rename from assets/benchmark_v1/sequence_tagging_ner_pos_etc/parsing_ChatGPT_ZeroShot.py rename to assets/ar/sequence_tagging_and_information_extraction/parsing/PADT_GPT35_ZeroShot.py index 77a175b7..8f9c55f9 100644 --- a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/parsing_ChatGPT_ZeroShot.py +++ b/assets/ar/sequence_tagging_and_information_extraction/parsing/PADT_GPT35_ZeroShot.py @@ -1,14 +1,14 @@ import os import re -from llmebench.datasets import ArabicParsingDataset +from llmebench.datasets import PADTDataset from llmebench.models import GPTModel, RandomGPTModel from llmebench.tasks import ArabicParsingTask def config(): return { - "dataset": ArabicParsingDataset, + "dataset": PADTDataset, "dataset_args": {}, "task": ArabicParsingTask, "task_args": {}, diff --git a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/parsing_GPT4_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/parsing/PADT_GPT4_ZeroShot.py similarity index 94% rename from 
assets/benchmark_v1/sequence_tagging_ner_pos_etc/parsing_GPT4_ZeroShot.py rename to assets/ar/sequence_tagging_and_information_extraction/parsing/PADT_GPT4_ZeroShot.py index 36e3d02f..6edde39c 100644 --- a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/parsing_GPT4_ZeroShot.py +++ b/assets/ar/sequence_tagging_and_information_extraction/parsing/PADT_GPT4_ZeroShot.py @@ -1,14 +1,14 @@ import os import re -from llmebench.datasets import ArabicParsingDataset +from llmebench.datasets import PADTDataset from llmebench.models import GPTChatCompletionModel from llmebench.tasks import ArabicParsingTask def config(): return { - "dataset": ArabicParsingDataset, + "dataset": PADTDataset, "dataset_args": {}, "task": ArabicParsingTask, "task_args": {}, diff --git a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/segmentation_ChatGPT_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/segmentation/QCRIDialectalArabic_GPT35_ZeroShot.py similarity index 85% rename from assets/benchmark_v1/sequence_tagging_ner_pos_etc/segmentation_ChatGPT_ZeroShot.py rename to assets/ar/sequence_tagging_and_information_extraction/segmentation/QCRIDialectalArabic_GPT35_ZeroShot.py index d808ce15..53a1c722 100644 --- a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/segmentation_ChatGPT_ZeroShot.py +++ b/assets/ar/sequence_tagging_and_information_extraction/segmentation/QCRIDialectalArabic_GPT35_ZeroShot.py @@ -1,8 +1,8 @@ import os import re -from llmebench.datasets import ArabicSegmentationDataset -from llmebench.models import GPTModel, RandomGPTModel +from llmebench.datasets import QCRIDialectalArabicSegmentationDataset +from llmebench.models import GPTModel from llmebench.tasks import ArabicSegmentationTask @@ -12,7 +12,6 @@ def config(): ("glf", "glf.seg/glf.data_5.test.src.sent"), ("mgr", "mgr.seg/mgr.data_5.test.src.sent"), ("lev", "lev.seg/lev.data_5.test.src.sent"), - ("msa", "WikiNewsTruth.txt"), ] configs = [] for name, testset in sets: @@ -20,7 +19,7 @@ def config(): { 
"name": name, "config": { - "dataset": ArabicSegmentationDataset, + "dataset": QCRIDialectalArabicSegmentationDataset, "dataset_args": {}, "task": ArabicSegmentationTask, "task_args": {}, @@ -31,12 +30,10 @@ def config(): "api_base": os.environ["AZURE_API_URL"], "api_key": os.environ["AZURE_API_KEY"], "engine_name": os.environ["ENGINE_NAME"], - # "class_labels": ["m", "f"], "max_tries": 3, }, "general_args": { - "data_path": "data/sequence_tagging_ner_pos_etc/segmentation/" - + testset + "data_path": f"data/sequence_tagging_ner_pos_etc/segmentation/{testset}" }, }, } diff --git a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/segmentation_GPT4_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/segmentation/QCRIDialectalArabic_GPT4_ZeroShot.py similarity index 92% rename from assets/benchmark_v1/sequence_tagging_ner_pos_etc/segmentation_GPT4_ZeroShot.py rename to assets/ar/sequence_tagging_and_information_extraction/segmentation/QCRIDialectalArabic_GPT4_ZeroShot.py index e6a3fc8d..63b39dad 100644 --- a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/segmentation_GPT4_ZeroShot.py +++ b/assets/ar/sequence_tagging_and_information_extraction/segmentation/QCRIDialectalArabic_GPT4_ZeroShot.py @@ -1,7 +1,7 @@ import os import re -from llmebench.datasets import ArabicSegmentationDataset +from llmebench.datasets import QCRIDialectalArabicSegmentationDataset from llmebench.models import GPTChatCompletionModel from llmebench.tasks import ArabicSegmentationTask @@ -12,7 +12,6 @@ def config(): ("glf", "glf.seg/glf.data_5.test.src.sent"), ("mgr", "mgr.seg/mgr.data_5.test.src.sent"), ("lev", "lev.seg/lev.data_5.test.src.sent"), - ("msa", "WikiNewsTruth.txt"), ] configs = [] for name, testset in sets: @@ -20,7 +19,7 @@ def config(): { "name": name, "config": { - "dataset": ArabicSegmentationDataset, + "dataset": QCRIDialectalArabicSegmentationDataset, "dataset_args": {}, "task": ArabicSegmentationTask, "task_args": {}, @@ -31,7 +30,6 @@ def config(): "api_base": 
os.environ["AZURE_API_URL"], "api_key": os.environ["AZURE_API_KEY"], "engine_name": os.environ["ENGINE_NAME"], - # "class_labels": ["m", "f"], "max_tries": 3, }, "general_args": { diff --git a/assets/ar/sequence_tagging_and_information_extraction/segmentation/WikiNews_GPT35_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/segmentation/WikiNews_GPT35_ZeroShot.py new file mode 100644 index 00000000..00d5b5ea --- /dev/null +++ b/assets/ar/sequence_tagging_and_information_extraction/segmentation/WikiNews_GPT35_ZeroShot.py @@ -0,0 +1,58 @@ +import os +import re + +from llmebench.datasets import WikiNewsSegmentationDataset +from llmebench.models import GPTModel +from llmebench.tasks import ArabicSegmentationTask + + +def config(): + return { + "dataset": WikiNewsSegmentationDataset, + "dataset_args": {}, + "task": ArabicSegmentationTask, + "task_args": {}, + "model": GPTModel, + "model_args": { + "api_type": "azure", + "api_version": "2023-03-15-preview", + "api_base": os.environ["AZURE_API_URL"], + "api_key": os.environ["AZURE_API_KEY"], + "engine_name": os.environ["ENGINE_NAME"], + "max_tries": 3, + }, + "general_args": { + "data_path": "data/sequence_tagging_ner_pos_etc/segmentation/WikiNewsTruth.txt" + }, + } + + +def prompt(input_sample): + return { + "system_message": "You are a linguist that helps in annotating data.", + "messages": [ + { + "sender": "user", + "text": f"A word can be composed of one root and one or multiple affixed, \ + segment the following sentence into its morphological constituents:\n {input_sample}\ + The input will be a list of words in the sentence. 
\ + The output format should be a list of tuples, where each tuple consists of a word from the input text and its segmented form joined by a + sign.\ + ", + } + ], + } + + +def post_process(response): + results = [] + text = response["choices"][0]["text"] + pattern = "\([\"']([^\"']+)[\"'], [\"']([^\"']+)[\"']\)" + matches = re.finditer(pattern, text) + for m in matches: + results.append(m.group(2)) + text = " ".join(results) + + # Remove extra spaces + text = re.sub(r"\s+", " ", text) + + return text diff --git a/assets/ar/sequence_tagging_and_information_extraction/segmentation/WikiNews_GPT4_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/segmentation/WikiNews_GPT4_ZeroShot.py new file mode 100644 index 00000000..748bb75c --- /dev/null +++ b/assets/ar/sequence_tagging_and_information_extraction/segmentation/WikiNews_GPT4_ZeroShot.py @@ -0,0 +1,60 @@ +import os +import re + +from llmebench.datasets import WikiNewsSegmentationDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import ArabicSegmentationTask + + +def config(): + return { + "dataset": WikiNewsSegmentationDataset, + "dataset_args": {}, + "task": ArabicSegmentationTask, + "task_args": {}, + "model": GPTChatCompletionModel, + "model_args": { + "api_type": "azure", + "api_version": "2023-03-15-preview", + "api_base": os.environ["AZURE_API_URL"], + "api_key": os.environ["AZURE_API_KEY"], + "engine_name": os.environ["ENGINE_NAME"], + "max_tries": 3, + }, + "general_args": { + "data_path": "data/sequence_tagging_ner_pos_etc/segmentation/WikiNewsTruth.txt" + }, + } + + +def prompt(input_sample): + return [ + { + "role": "system", + "content": "You are a linguist that helps in annotating data.", + }, + { + "role": "user", + "content": f"A word can be composed of one root and one or multiple affixed, \ + segment the following sentence into its morphological constituents:\n {input_sample}\ + The input will be a list of words in the sentence. 
\ + The output format should be a list of tuples, where each tuple consists of a word from the input text and its segmented form joined by a + sign.\ + ", + }, + ] + + +def post_process(response): + results = [] + text = response["choices"][0]["message"]["content"] + pattern = "\([\"']([^\"']+)[\"'], [\"']([^\"']+)[\"']\)" + matches = re.finditer(pattern, text) + for m in matches: + results.append(m.group(2)) + + text = " ".join(results) + + # Remove extra spaces + text = re.sub(r"\s+", " ", text) + + return text diff --git a/assets/benchmark_v1/QA/.keep b/assets/benchmark_v1/QA/.keep deleted file mode 100644 index 139597f9..00000000 --- a/assets/benchmark_v1/QA/.keep +++ /dev/null @@ -1,2 +0,0 @@ - - diff --git a/assets/benchmark_v1/STS/.keep b/assets/benchmark_v1/STS/.keep deleted file mode 100644 index 139597f9..00000000 --- a/assets/benchmark_v1/STS/.keep +++ /dev/null @@ -1,2 +0,0 @@ - - diff --git a/assets/benchmark_v1/dialect_identification/.keep b/assets/benchmark_v1/dialect_identification/.keep deleted file mode 100644 index 139597f9..00000000 --- a/assets/benchmark_v1/dialect_identification/.keep +++ /dev/null @@ -1,2 +0,0 @@ - - diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/COVClaimDetect_CGPT35_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/COVClaimDetect_CGPT35_ZeroShot.py deleted file mode 100644 index f2107913..00000000 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/COVClaimDetect_CGPT35_ZeroShot.py +++ /dev/null @@ -1,65 +0,0 @@ -import os - -from llmebench.datasets import CovidClaimDataset -from llmebench.models import GPTModel -from llmebench.tasks import ClaimDetectionTask - - -def config(): - return { - "dataset": CovidClaimDataset, - "dataset_args": {}, - "task": ClaimDetectionTask, - "task_args": {}, - "model": GPTModel, - "model_args": { - "api_type": "azure", - "api_version": "2023-03-15-preview", - "api_base": os.environ["AZURE_API_URL"], - "api_key": 
os.environ["AZURE_API_KEY"], - "engine_name": "gpt", - "max_tries": 3, - }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/claim_covid19/CT22_arabic_1B_claim_test_gold.tsv" - }, - } - - -def prompt(input_sample): - return { - "system_message": "You are an AI assistant that helps people find information.", - "messages": [ - { - "sender": "user", - "text": "Does this sentence contain a factual claim? Answer only by yes or no." - "\n\nsentence: " + input_sample + "label: \n", - } - ], - } - - -def post_process(response): - input_label = response["choices"][0]["text"] - input_label = input_label.replace(".", "").strip().lower() - pred_label = "" - - if ( - "yes" in input_label - or "contains a factual claim" in input_label - or "label: 1" in input_label - ): - pred_label = "1" - if ( - input_label == "no" - or "label: 0" in input_label - or "label: no" in input_label - or "not contain a factual claim" in input_label - or "doesn't contain a factual claim" in input_label - ): - pred_label = "0" - - if pred_label == "": - pred_label = None - - return pred_label diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/COVHarmfulDetect_CGPT35_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/COVHarmfulDetect_CGPT35_ZeroShot.py deleted file mode 100644 index fdcf21f5..00000000 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/COVHarmfulDetect_CGPT35_ZeroShot.py +++ /dev/null @@ -1,60 +0,0 @@ -import os - -from llmebench.datasets import CovidHarmfulDataset -from llmebench.models import GPTModel -from llmebench.tasks import HarmfulDetectionTask - - -def config(): - return { - "dataset": CovidHarmfulDataset, - "dataset_args": {}, - "task": HarmfulDetectionTask, - "task_args": {}, - "model": GPTModel, - "model_args": { - "api_type": "azure", - "api_version": "2023-03-15-preview", - "api_base": os.environ["AZURE_API_URL"], - "api_key": os.environ["AZURE_API_KEY"], - "engine_name": 
"gpt", - "max_tries": 3, - }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/harmful/CT22_arabic_1C_harmful_test_gold.tsv", - }, - } - - -def prompt(input_sample): - return { - "system_message": "You are an AI assistant that helps people find information.", - "messages": [ - { - "sender": "user", - "text": "Classify the sentence as harmful or not harmful. Answer only by yes or no. Provide only " - "label.\n\n" + "sentence: " + input_sample + "label: \n", - } - ], - } - - -def post_process(response): - pred_label = response["choices"][0]["text"] - pred_label = pred_label.replace(".", "").strip().lower() - - if ( - pred_label.startswith("harmful") - or pred_label.startswith("yes") - or "label: harmful" in pred_label - ): - pred_label = "1" - - if ( - pred_label.startswith("no") - or pred_label == "label: safe" - or "not harmful" in pred_label - ): - pred_label = "0" - - return pred_label diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_ZeroShot.py deleted file mode 100644 index 5311470f..00000000 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_ZeroShot.py +++ /dev/null @@ -1,148 +0,0 @@ -import os - -import regex as re - -from llmebench.datasets import PropagandaTweetDataset -from llmebench.models import GPTChatCompletionModel -from llmebench.tasks import PropagandaMultilabelTask - - -def config(): - return { - "dataset": PropagandaTweetDataset, - "dataset_args": { - "techniques_path": "data/factuality_disinformation_harmful_content/propaganda/classes.txt" - }, - "task": PropagandaMultilabelTask, - "task_args": {}, - "model": GPTChatCompletionModel, - "model_args": { - "api_type": "azure", - "api_version": "2023-03-15-preview", - "api_base": os.environ["AZURE_API_URL"], - "api_key": os.environ["AZURE_API_KEY"], - "engine_name": 
os.environ["ENGINE_NAME"], - "max_tries": 3, - }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/propaganda/task1_test_gold_label_final.json" - }, - } - - -def prompt(input_sample): - return [ - { - "role": "system", - "content": "## INSTRUCTION\nYou are an expert social media content analyst.\n\n", - }, - { - "role": "user", - "content": 'Label this "Text" based on the following propaganda techniques: ' - + "'no technique' , 'Smears' , 'Exaggeration/Minimisation' , 'Loaded Language' , 'Appeal to fear/prejudice' , 'Name calling/Labeling' , 'Slogans' , 'Repetition' , 'Doubt' , 'Obfuscation, Intentional vagueness, Confusion' , 'Flag-waving' , 'Glittering generalities (Virtue)' , 'Misrepresentation of Someone's Position (Straw Man)' , 'Presenting Irrelevant Data (Red Herring)' , 'Appeal to authority' , 'Whataboutism' , 'Black-and-white Fallacy/Dictatorship' , 'Thought-terminating cliché' , 'Causal Oversimplification'" - + "\n Answer (only yes/no) in the following format: \n" - + "'Doubt': 'yes', " - + "'Smears': 'no', \n\n" - + "## Text: " - + input_sample - + "\n\n" - + "## Response: \n", - }, - ] - - -def fix_label(pred_label): - if "used in this text" in pred_label: - return ["no technique"] - - labels_fixed = [] - pred_label = pred_label.replace('"', "'").split("', '") - pred_labels = [] - - for l in pred_label: - splits = l.replace(",", "").split(":") - if "no" in splits[1]: - continue - pred_labels.append(splits[0].replace("'", "")) - - if len(pred_labels) == 0: - return ["no technique"] - - for label in pred_labels: - label = label.replace(".", "").strip() - label = re.sub("-", " ", label) - label = label.strip().lower() - - # Handle case of single word labels like "Smears" so we just capitalize it - label_fixed = label.capitalize() - - # print(label) - if "slogan" in label: - label_fixed = "Slogans" - if "loaded" in label: - label_fixed = "Loaded Language" - if "prejudice" in label or "fear" in label or "mongering" in label: 
- label_fixed = "Appeal to fear/prejudice" - if "terminating" in label or "thought" in label: - label_fixed = "Thought-terminating cliché" - if "calling" in label or label == "name c": - label_fixed = "Name calling/Labeling" - if "minimisation" in label or label == "exaggeration minim": - label_fixed = "Exaggeration/Minimisation" - if "glittering" in label: - label_fixed = "Glittering generalities (Virtue)" - if "flag" in label: - label_fixed = "Flag-waving" - if "obfuscation" in label: - label_fixed = "Obfuscation, Intentional vagueness, Confusion" - if "oversimplification" in label or "causal" in label: - label_fixed = "Causal Oversimplification" - if "authority" in label: - label_fixed = "Appeal to authority" - if "dictatorship" in label or "black" in label or "white" in label: - label_fixed = "Black-and-white Fallacy/Dictatorship" - if "herring" in label or "irrelevant" in label: - label_fixed = "Presenting Irrelevant Data (Red Herring)" - if "straw" in label or "misrepresentation" in label: - label_fixed = "Misrepresentation of Someone's Position (Straw Man)" - if "whataboutism" in label: - label_fixed = "Whataboutism" - - if ( - "no propaganda" in label - or "technique" in label - or label == "" - or label == "no" - or label == "appeal to history" - or label == "appeal to emotion" - or label == "appeal to" - or label == "appeal" - or label == "appeal to author" - or label == "emotional appeal" - or "no techn" in label - or "hashtag" in label - or "theory" in label - or "specific mention" in label - or "religious" in label - or "gratitude" in label - ): - label_fixed = "no technique" - - labels_fixed.append(label_fixed) - - out_put_labels = [] - # Remove no technique label when we have other techniques for the same text - if len(labels_fixed) > 1: - for flabel in labels_fixed: - if flabel != "no technique": - out_put_labels.append(flabel) - return out_put_labels - - return labels_fixed - - -def post_process(response): - pred_label = 
response["choices"][0]["message"]["content"] - pred_label = fix_label(pred_label) - - return pred_label diff --git a/assets/benchmark_v1/semantics/.keep b/assets/benchmark_v1/semantics/.keep deleted file mode 100644 index 139597f9..00000000 --- a/assets/benchmark_v1/semantics/.keep +++ /dev/null @@ -1,2 +0,0 @@ - - diff --git a/assets/benchmark_v1/sentiment_emotion_others/.keep b/assets/benchmark_v1/sentiment_emotion_others/.keep deleted file mode 100644 index 139597f9..00000000 --- a/assets/benchmark_v1/sentiment_emotion_others/.keep +++ /dev/null @@ -1,2 +0,0 @@ - - diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_BGZeroShot.py b/assets/bg/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_BLOOMZ_ZeroShot.py similarity index 92% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_BGZeroShot.py rename to assets/bg/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_BLOOMZ_ZeroShot.py index 10c2a3bf..ff303850 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_BGZeroShot.py +++ b/assets/bg/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_BLOOMZ_ZeroShot.py @@ -1,13 +1,13 @@ import os -from llmebench.datasets import CheckworthinessDataset +from llmebench.datasets import CT22CheckworthinessDataset from llmebench.models import BLOOMPetalModel from llmebench.tasks import CheckworthinessTask def config(): return { - "dataset": CheckworthinessDataset, + "dataset": CT22CheckworthinessDataset, "dataset_args": {}, "task": CheckworthinessTask, "task_args": {}, diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_BGFewShot.py b/assets/bg/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot.py similarity index 96% rename from 
assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_BGFewShot.py rename to assets/bg/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot.py index 495d05d2..37cfbea5 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_BGFewShot.py +++ b/assets/bg/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot.py @@ -1,14 +1,14 @@ import os import re -from llmebench.datasets import CheckworthinessDataset +from llmebench.datasets import CT22CheckworthinessDataset from llmebench.models import GPTChatCompletionModel from llmebench.tasks import CheckworthinessTask def config(): return { - "dataset": CheckworthinessDataset, + "dataset": CT22CheckworthinessDataset, "dataset_args": {}, "task": CheckworthinessTask, "task_args": {}, diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_BGZeroShot.py b/assets/bg/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot.py similarity index 95% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_BGZeroShot.py rename to assets/bg/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot.py index 2d118f14..408435e3 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_BGZeroShot.py +++ b/assets/bg/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot.py @@ -1,14 +1,14 @@ import os import re -from llmebench.datasets import CheckworthinessDataset +from llmebench.datasets import CT22CheckworthinessDataset from llmebench.models import GPTChatCompletionModel from llmebench.tasks import CheckworthinessTask def config(): return { - "dataset": CheckworthinessDataset, + "dataset": CT22CheckworthinessDataset, 
"dataset_args": {}, "task": CheckworthinessTask, "task_args": {}, diff --git a/assets/benchmark_v1/sentiment/sentiment/BanglaSentiment_BLOOMZ_ZeroShot.py b/assets/bn/sentiment_emotion_others/sentiment/BanglaSentiment_BLOOMZ_ZeroShot.py similarity index 100% rename from assets/benchmark_v1/sentiment/sentiment/BanglaSentiment_BLOOMZ_ZeroShot.py rename to assets/bn/sentiment_emotion_others/sentiment/BanglaSentiment_BLOOMZ_ZeroShot.py diff --git a/assets/benchmark_v1/sentiment/sentiment/BanglaSentiment_GPT4_FewShot.py b/assets/bn/sentiment_emotion_others/sentiment/BanglaSentiment_GPT4_FewShot.py similarity index 100% rename from assets/benchmark_v1/sentiment/sentiment/BanglaSentiment_GPT4_FewShot.py rename to assets/bn/sentiment_emotion_others/sentiment/BanglaSentiment_GPT4_FewShot.py diff --git a/assets/benchmark_v1/sentiment/sentiment/BanglaSentiment_GPT4_ZeroShot.py b/assets/bn/sentiment_emotion_others/sentiment/BanglaSentiment_GPT4_ZeroShot.py similarity index 100% rename from assets/benchmark_v1/sentiment/sentiment/BanglaSentiment_GPT4_ZeroShot.py rename to assets/bn/sentiment_emotion_others/sentiment/BanglaSentiment_GPT4_ZeroShot.py diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_GEZeroShot.py b/assets/de/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_BLOOMZ_ZeroShot.py similarity index 96% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_GEZeroShot.py rename to assets/de/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_BLOOMZ_ZeroShot.py index 7f38863d..5d5c070c 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_GEZeroShot.py +++ b/assets/de/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_BLOOMZ_ZeroShot.py @@ -1,18 +1,18 @@ import os import re -from llmebench.datasets import PropagandaSemEval23Dataset +from llmebench.datasets import 
SemEval23T3PropagandaDataset from llmebench.models import BLOOMPetalModel -from llmebench.tasks import PropagandaMultilabelSemEval23Task +from llmebench.tasks import MultilabelPropagandaTask def config(): return { - "dataset": PropagandaSemEval23Dataset, + "dataset": SemEval23T3PropagandaDataset, "dataset_args": { "techniques_path": "data/factuality_disinformation_harmful_content/propaganda_semeval23/techniques_subtask3.txt" }, - "task": PropagandaMultilabelSemEval23Task, + "task": MultilabelPropagandaTask, "task_args": {}, "model": BLOOMPetalModel, "model_args": { diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_GEFewShot.py b/assets/de/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_FewShot.py similarity index 97% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_GEFewShot.py rename to assets/de/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_FewShot.py index 70523f8e..bdac0ca6 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_GEFewShot.py +++ b/assets/de/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_FewShot.py @@ -1,18 +1,18 @@ import os import re -from llmebench.datasets import PropagandaSemEval23Dataset +from llmebench.datasets import SemEval23T3PropagandaDataset from llmebench.models import GPTChatCompletionModel -from llmebench.tasks import PropagandaMultilabelSemEval23Task +from llmebench.tasks import MultilabelPropagandaTask def config(): return { - "dataset": PropagandaSemEval23Dataset, + "dataset": SemEval23T3PropagandaDataset, "dataset_args": { "techniques_path": "data/factuality_disinformation_harmful_content/propaganda_semeval23/techniques_subtask3.txt" }, - "task": PropagandaMultilabelSemEval23Task, + "task": MultilabelPropagandaTask, "task_args": {}, "model": GPTChatCompletionModel, "model_args": { diff --git 
a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_GEZeroShot.py b/assets/de/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_ZeroShot.py similarity index 97% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_GEZeroShot.py rename to assets/de/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_ZeroShot.py index e7836e14..dce30585 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_GEZeroShot.py +++ b/assets/de/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_ZeroShot.py @@ -2,18 +2,18 @@ import regex as re -from llmebench.datasets import PropagandaSemEval23Dataset +from llmebench.datasets import SemEval23T3PropagandaDataset from llmebench.models import GPTChatCompletionModel -from llmebench.tasks import PropagandaMultilabelSemEval23Task +from llmebench.tasks import MultilabelPropagandaTask def config(): return { - "dataset": PropagandaSemEval23Dataset, + "dataset": SemEval23T3PropagandaDataset, "dataset_args": { "techniques_path": "data/factuality_disinformation_harmful_content/propaganda_semeval23/techniques_subtask3.txt" }, - "task": PropagandaMultilabelSemEval23Task, + "task": MultilabelPropagandaTask, "task_args": {}, "model": GPTChatCompletionModel, "model_args": { diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_ENZeroShot.py b/assets/en/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_BLOOMZ_ZeroShot.py similarity index 92% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_ENZeroShot.py rename to assets/en/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_BLOOMZ_ZeroShot.py index 5bf0e294..296280ce 100644 --- 
a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_ENZeroShot.py +++ b/assets/en/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_BLOOMZ_ZeroShot.py @@ -1,13 +1,13 @@ import os -from llmebench.datasets import CheckworthinessDataset +from llmebench.datasets import CT22CheckworthinessDataset from llmebench.models import BLOOMPetalModel from llmebench.tasks import CheckworthinessTask def config(): return { - "dataset": CheckworthinessDataset, + "dataset": CT22CheckworthinessDataset, "dataset_args": {}, "task": CheckworthinessTask, "task_args": {}, diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ENFewShot.py b/assets/en/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot.py similarity index 96% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ENFewShot.py rename to assets/en/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot.py index d0b67175..bf4b0537 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ENFewShot.py +++ b/assets/en/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot.py @@ -1,14 +1,14 @@ import os import re -from llmebench.datasets import CheckworthinessDataset +from llmebench.datasets import CT22CheckworthinessDataset from llmebench.models import GPTChatCompletionModel from llmebench.tasks import CheckworthinessTask def config(): return { - "dataset": CheckworthinessDataset, + "dataset": CT22CheckworthinessDataset, "dataset_args": {}, "task": CheckworthinessTask, "task_args": {}, diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ENZeroShot.py 
b/assets/en/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot.py similarity index 95% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ENZeroShot.py rename to assets/en/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot.py index 8cb18b32..fa4d5af5 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ENZeroShot.py +++ b/assets/en/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot.py @@ -1,14 +1,14 @@ import os import re -from llmebench.datasets import CheckworthinessDataset +from llmebench.datasets import CT22CheckworthinessDataset from llmebench.models import GPTChatCompletionModel from llmebench.tasks import CheckworthinessTask def config(): return { - "dataset": CheckworthinessDataset, + "dataset": CT22CheckworthinessDataset, "dataset_args": {}, "task": CheckworthinessTask, "task_args": {}, diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_ENZeroShot.py b/assets/en/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_BLOOMZ_ZeroShot.py similarity index 96% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_ENZeroShot.py rename to assets/en/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_BLOOMZ_ZeroShot.py index 6fa517a7..3c8d7fba 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_ENZeroShot.py +++ b/assets/en/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_BLOOMZ_ZeroShot.py @@ -1,18 +1,18 @@ import os import re -from llmebench.datasets import PropagandaSemEval23Dataset +from llmebench.datasets import SemEval23T3PropagandaDataset from llmebench.models import BLOOMPetalModel -from llmebench.tasks import 
PropagandaMultilabelSemEval23Task +from llmebench.tasks import MultilabelPropagandaTask def config(): return { - "dataset": PropagandaSemEval23Dataset, + "dataset": SemEval23T3PropagandaDataset, "dataset_args": { "techniques_path": "data/factuality_disinformation_harmful_content/propaganda_semeval23/techniques_subtask3.txt" }, - "task": PropagandaMultilabelSemEval23Task, + "task": MultilabelPropagandaTask, "task_args": {}, "model": BLOOMPetalModel, "model_args": { diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_ENFewShot.py b/assets/en/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_FewShot.py similarity index 97% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_ENFewShot.py rename to assets/en/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_FewShot.py index e7f33f39..117b160c 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_ENFewShot.py +++ b/assets/en/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_FewShot.py @@ -1,18 +1,18 @@ import os import re -from llmebench.datasets import PropagandaSemEval23Dataset +from llmebench.datasets import SemEval23T3PropagandaDataset from llmebench.models import GPTChatCompletionModel -from llmebench.tasks import PropagandaMultilabelSemEval23Task +from llmebench.tasks import MultilabelPropagandaTask def config(): return { - "dataset": PropagandaSemEval23Dataset, + "dataset": SemEval23T3PropagandaDataset, "dataset_args": { "techniques_path": "data/factuality_disinformation_harmful_content/propaganda_semeval23/techniques_subtask3.txt" }, - "task": PropagandaMultilabelSemEval23Task, + "task": MultilabelPropagandaTask, "task_args": {}, "model": GPTChatCompletionModel, "model_args": { diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_ENZeroShot.py 
b/assets/en/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_ZeroShot.py similarity index 97% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_ENZeroShot.py rename to assets/en/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_ZeroShot.py index 590ca1ab..a01b0369 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_ENZeroShot.py +++ b/assets/en/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_ZeroShot.py @@ -2,18 +2,18 @@ import regex as re -from llmebench.datasets import PropagandaSemEval23Dataset +from llmebench.datasets import SemEval23T3PropagandaDataset from llmebench.models import GPTChatCompletionModel -from llmebench.tasks import PropagandaMultilabelSemEval23Task +from llmebench.tasks import MultilabelPropagandaTask def config(): return { - "dataset": PropagandaSemEval23Dataset, + "dataset": SemEval23T3PropagandaDataset, "dataset_args": { "techniques_path": "data/factuality_disinformation_harmful_content/propaganda_semeval23/techniques_subtask3.txt" }, - "task": PropagandaMultilabelSemEval23Task, + "task": MultilabelPropagandaTask, "task_args": {}, "model": GPTChatCompletionModel, "model_args": { diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_ESZeroShot.py b/assets/es/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_BLOOMZ_ZeroShot.py similarity index 91% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_ESZeroShot.py rename to assets/es/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_BLOOMZ_ZeroShot.py index d8c1a989..977f9864 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_ESZeroShot.py +++ 
b/assets/es/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_BLOOMZ_ZeroShot.py @@ -1,13 +1,13 @@ import os -from llmebench.datasets import CheckworthinessDataset +from llmebench.datasets import CT22CheckworthinessDataset from llmebench.models import BLOOMPetalModel from llmebench.tasks import CheckworthinessTask def config(): return { - "dataset": CheckworthinessDataset, + "dataset": CT22CheckworthinessDataset, "dataset_args": {}, "task": CheckworthinessTask, "task_args": {}, diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ESFewShot.py b/assets/es/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot.py similarity index 96% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ESFewShot.py rename to assets/es/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot.py index e6099674..0f5c195f 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ESFewShot.py +++ b/assets/es/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot.py @@ -1,14 +1,14 @@ import os import re -from llmebench.datasets import CheckworthinessDataset +from llmebench.datasets import CT22CheckworthinessDataset from llmebench.models import GPTChatCompletionModel from llmebench.tasks import CheckworthinessTask def config(): return { - "dataset": CheckworthinessDataset, + "dataset": CT22CheckworthinessDataset, "dataset_args": {}, "task": CheckworthinessTask, "task_args": {}, diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ESZeroShot.py b/assets/es/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot.py similarity index 95% rename from 
assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ESZeroShot.py rename to assets/es/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot.py index 3ed072e6..bb7c65fd 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ESZeroShot.py +++ b/assets/es/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot.py @@ -1,14 +1,14 @@ import os import re -from llmebench.datasets import CheckworthinessDataset +from llmebench.datasets import CT22CheckworthinessDataset from llmebench.models import GPTChatCompletionModel from llmebench.tasks import CheckworthinessTask def config(): return { - "dataset": CheckworthinessDataset, + "dataset": CT22CheckworthinessDataset, "dataset_args": {}, "task": CheckworthinessTask, "task_args": {}, diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_FRZeroShot.py b/assets/fr/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_BLOOMZ_ZeroShot.py similarity index 96% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_FRZeroShot.py rename to assets/fr/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_BLOOMZ_ZeroShot.py index cc58c362..3f3f00e4 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_FRZeroShot.py +++ b/assets/fr/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_BLOOMZ_ZeroShot.py @@ -1,18 +1,18 @@ import os import re -from llmebench.datasets import PropagandaSemEval23Dataset +from llmebench.datasets import SemEval23T3PropagandaDataset from llmebench.models import BLOOMPetalModel -from llmebench.tasks import PropagandaMultilabelSemEval23Task +from llmebench.tasks import MultilabelPropagandaTask def config(): return { - "dataset": PropagandaSemEval23Dataset, + 
"dataset": SemEval23T3PropagandaDataset, "dataset_args": { "techniques_path": "data/factuality_disinformation_harmful_content/propaganda_semeval23/techniques_subtask3.txt" }, - "task": PropagandaMultilabelSemEval23Task, + "task": MultilabelPropagandaTask, "task_args": {}, "model": BLOOMPetalModel, "model_args": { diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_FRFewShot.py b/assets/fr/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_FewShot.py similarity index 97% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_FRFewShot.py rename to assets/fr/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_FewShot.py index 2c642c7f..c4f64cf9 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_FRFewShot.py +++ b/assets/fr/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_FewShot.py @@ -1,18 +1,18 @@ import os import re -from llmebench.datasets import PropagandaSemEval23Dataset +from llmebench.datasets import SemEval23T3PropagandaDataset from llmebench.models import GPTChatCompletionModel -from llmebench.tasks import PropagandaMultilabelSemEval23Task +from llmebench.tasks import MultilabelPropagandaTask def config(): return { - "dataset": PropagandaSemEval23Dataset, + "dataset": SemEval23T3PropagandaDataset, "dataset_args": { "techniques_path": "data/factuality_disinformation_harmful_content/propaganda_semeval23/techniques_subtask3.txt" }, - "task": PropagandaMultilabelSemEval23Task, + "task": MultilabelPropagandaTask, "task_args": {}, "model": GPTChatCompletionModel, "model_args": { diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_FRZeroShot.py b/assets/fr/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_ZeroShot.py similarity index 97% rename from 
assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_FRZeroShot.py rename to assets/fr/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_ZeroShot.py index 6c7c4dc7..80f68cf8 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_FRZeroShot.py +++ b/assets/fr/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_ZeroShot.py @@ -2,18 +2,18 @@ import regex as re -from llmebench.datasets import PropagandaSemEval23Dataset +from llmebench.datasets import SemEval23T3PropagandaDataset from llmebench.models import GPTChatCompletionModel -from llmebench.tasks import PropagandaMultilabelSemEval23Task +from llmebench.tasks import MultilabelPropagandaTask def config(): return { - "dataset": PropagandaSemEval23Dataset, + "dataset": SemEval23T3PropagandaDataset, "dataset_args": { "techniques_path": "data/factuality_disinformation_harmful_content/propaganda_semeval23/techniques_subtask3.txt" }, - "task": PropagandaMultilabelSemEval23Task, + "task": MultilabelPropagandaTask, "task_args": {}, "model": GPTChatCompletionModel, "model_args": { diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_ITZeroShot.py b/assets/it/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_BLOOMZ_ZeroShot.py similarity index 96% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_ITZeroShot.py rename to assets/it/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_BLOOMZ_ZeroShot.py index 9103c03c..d5c931ed 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_ITZeroShot.py +++ b/assets/it/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_BLOOMZ_ZeroShot.py @@ -1,18 +1,18 @@ import os import re -from llmebench.datasets import PropagandaSemEval23Dataset +from llmebench.datasets 
import SemEval23T3PropagandaDataset from llmebench.models import BLOOMPetalModel -from llmebench.tasks import PropagandaMultilabelSemEval23Task +from llmebench.tasks import MultilabelPropagandaTask def config(): return { - "dataset": PropagandaSemEval23Dataset, + "dataset": SemEval23T3PropagandaDataset, "dataset_args": { "techniques_path": "data/factuality_disinformation_harmful_content/propaganda_semeval23/techniques_subtask3.txt" }, - "task": PropagandaMultilabelSemEval23Task, + "task": MultilabelPropagandaTask, "task_args": {}, "model": BLOOMPetalModel, "model_args": { diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_ITFewShot.py b/assets/it/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_FewShot.py similarity index 97% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_ITFewShot.py rename to assets/it/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_FewShot.py index 2b782ca3..75bad866 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_ITFewShot.py +++ b/assets/it/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_FewShot.py @@ -1,18 +1,18 @@ import os import re -from llmebench.datasets import PropagandaSemEval23Dataset +from llmebench.datasets import SemEval23T3PropagandaDataset from llmebench.models import GPTChatCompletionModel -from llmebench.tasks import PropagandaMultilabelSemEval23Task +from llmebench.tasks import MultilabelPropagandaTask def config(): return { - "dataset": PropagandaSemEval23Dataset, + "dataset": SemEval23T3PropagandaDataset, "dataset_args": { "techniques_path": "data/factuality_disinformation_harmful_content/propaganda_semeval23/techniques_subtask3.txt" }, - "task": PropagandaMultilabelSemEval23Task, + "task": MultilabelPropagandaTask, "task_args": {}, "model": GPTChatCompletionModel, "model_args": { diff --git 
a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_ITZeroShot.py b/assets/it/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_ZeroShot.py similarity index 97% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_ITZeroShot.py rename to assets/it/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_ZeroShot.py index 511ca715..eb978d48 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_ITZeroShot.py +++ b/assets/it/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_ZeroShot.py @@ -2,18 +2,18 @@ import regex as re -from llmebench.datasets import PropagandaSemEval23Dataset +from llmebench.datasets import SemEval23T3PropagandaDataset from llmebench.models import GPTChatCompletionModel -from llmebench.tasks import PropagandaMultilabelSemEval23Task +from llmebench.tasks import MultilabelPropagandaTask def config(): return { - "dataset": PropagandaSemEval23Dataset, + "dataset": SemEval23T3PropagandaDataset, "dataset_args": { "techniques_path": "data/factuality_disinformation_harmful_content/propaganda_semeval23/techniques_subtask3.txt" }, - "task": PropagandaMultilabelSemEval23Task, + "task": MultilabelPropagandaTask, "task_args": {}, "model": GPTChatCompletionModel, "model_args": { diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_NLZeroShot.py b/assets/nl/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_BLOOMZ_ZeroShot.py similarity index 92% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_NLZeroShot.py rename to assets/nl/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_BLOOMZ_ZeroShot.py index 06793d52..c373d29d 100644 --- 
a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_NLZeroShot.py +++ b/assets/nl/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_BLOOMZ_ZeroShot.py @@ -1,13 +1,13 @@ import os -from llmebench.datasets import CheckworthinessDataset +from llmebench.datasets import CT22CheckworthinessDataset from llmebench.models import BLOOMPetalModel from llmebench.tasks import CheckworthinessTask def config(): return { - "dataset": CheckworthinessDataset, + "dataset": CT22CheckworthinessDataset, "dataset_args": {}, "task": CheckworthinessTask, "task_args": {}, diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_NLFewShot.py b/assets/nl/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot.py similarity index 96% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_NLFewShot.py rename to assets/nl/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot.py index 7f241509..a7677af1 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_NLFewShot.py +++ b/assets/nl/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot.py @@ -1,14 +1,14 @@ import os import re -from llmebench.datasets import CheckworthinessDataset +from llmebench.datasets import CT22CheckworthinessDataset from llmebench.models import GPTChatCompletionModel from llmebench.tasks import CheckworthinessTask def config(): return { - "dataset": CheckworthinessDataset, + "dataset": CT22CheckworthinessDataset, "dataset_args": {}, "task": CheckworthinessTask, "task_args": {}, diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_NLZeroShot.py 
b/assets/nl/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot.py similarity index 95% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_NLZeroShot.py rename to assets/nl/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot.py index f52b350c..4caf1405 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_NLZeroShot.py +++ b/assets/nl/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot.py @@ -1,14 +1,14 @@ import os import re -from llmebench.datasets import CheckworthinessDataset +from llmebench.datasets import CT22CheckworthinessDataset from llmebench.models import GPTChatCompletionModel from llmebench.tasks import CheckworthinessTask def config(): return { - "dataset": CheckworthinessDataset, + "dataset": CT22CheckworthinessDataset, "dataset_args": {}, "task": CheckworthinessTask, "task_args": {}, diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_POZeroShot.py b/assets/pl/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_BLOOMZ_ZeroShot.py similarity index 96% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_POZeroShot.py rename to assets/pl/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_BLOOMZ_ZeroShot.py index 4fca8321..33ade972 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_POZeroShot.py +++ b/assets/pl/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_BLOOMZ_ZeroShot.py @@ -1,18 +1,18 @@ import os import re -from llmebench.datasets import PropagandaSemEval23Dataset +from llmebench.datasets import SemEval23T3PropagandaDataset from llmebench.models import BLOOMPetalModel -from llmebench.tasks import 
PropagandaMultilabelSemEval23Task +from llmebench.tasks import MultilabelPropagandaTask def config(): return { - "dataset": PropagandaSemEval23Dataset, + "dataset": SemEval23T3PropagandaDataset, "dataset_args": { "techniques_path": "data/factuality_disinformation_harmful_content/propaganda_semeval23/techniques_subtask3.txt" }, - "task": PropagandaMultilabelSemEval23Task, + "task": MultilabelPropagandaTask, "task_args": {}, "model": BLOOMPetalModel, "model_args": { diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_POFewShot.py b/assets/pl/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_FewShot.py similarity index 97% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_POFewShot.py rename to assets/pl/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_FewShot.py index d9447aaa..5d6acf12 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_POFewShot.py +++ b/assets/pl/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_FewShot.py @@ -1,18 +1,18 @@ import os import re -from llmebench.datasets import PropagandaSemEval23Dataset +from llmebench.datasets import SemEval23T3PropagandaDataset from llmebench.models import GPTChatCompletionModel -from llmebench.tasks import PropagandaMultilabelSemEval23Task +from llmebench.tasks import MultilabelPropagandaTask def config(): return { - "dataset": PropagandaSemEval23Dataset, + "dataset": SemEval23T3PropagandaDataset, "dataset_args": { "techniques_path": "data/factuality_disinformation_harmful_content/propaganda_semeval23/techniques_subtask3.txt" }, - "task": PropagandaMultilabelSemEval23Task, + "task": MultilabelPropagandaTask, "task_args": {}, "model": GPTChatCompletionModel, "model_args": { diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_POZeroShot.py 
b/assets/pl/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_ZeroShot.py similarity index 97% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_POZeroShot.py rename to assets/pl/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_ZeroShot.py index cc0a834b..4f2371ec 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_POZeroShot.py +++ b/assets/pl/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_ZeroShot.py @@ -2,18 +2,18 @@ import regex as re -from llmebench.datasets import PropagandaSemEval23Dataset +from llmebench.datasets import SemEval23T3PropagandaDataset from llmebench.models import GPTChatCompletionModel -from llmebench.tasks import PropagandaMultilabelSemEval23Task +from llmebench.tasks import MultilabelPropagandaTask def config(): return { - "dataset": PropagandaSemEval23Dataset, + "dataset": SemEval23T3PropagandaDataset, "dataset_args": { "techniques_path": "data/factuality_disinformation_harmful_content/propaganda_semeval23/techniques_subtask3.txt" }, - "task": PropagandaMultilabelSemEval23Task, + "task": MultilabelPropagandaTask, "task_args": {}, "model": GPTChatCompletionModel, "model_args": { diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_RUZeroShot.py b/assets/ru/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_BLOOMZ_ZeroShot.py similarity index 96% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_RUZeroShot.py rename to assets/ru/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_BLOOMZ_ZeroShot.py index 9b3114e6..91c23317 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_RUZeroShot.py +++ 
b/assets/ru/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_BLOOMZ_ZeroShot.py @@ -1,18 +1,18 @@ import os import re -from llmebench.datasets import PropagandaSemEval23Dataset +from llmebench.datasets import SemEval23T3PropagandaDataset from llmebench.models import BLOOMPetalModel -from llmebench.tasks import PropagandaMultilabelSemEval23Task +from llmebench.tasks import MultilabelPropagandaTask def config(): return { - "dataset": PropagandaSemEval23Dataset, + "dataset": SemEval23T3PropagandaDataset, "dataset_args": { "techniques_path": "data/factuality_disinformation_harmful_content/propaganda_semeval23/techniques_subtask3.txt" }, - "task": PropagandaMultilabelSemEval23Task, + "task": MultilabelPropagandaTask, "task_args": {}, "model": BLOOMPetalModel, "model_args": { diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_RUFewShot.py b/assets/ru/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_FewShot.py similarity index 97% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_RUFewShot.py rename to assets/ru/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_FewShot.py index d4abcaac..4a89abb4 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_RUFewShot.py +++ b/assets/ru/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_FewShot.py @@ -1,18 +1,18 @@ import os import re -from llmebench.datasets import PropagandaSemEval23Dataset +from llmebench.datasets import SemEval23T3PropagandaDataset from llmebench.models import GPTChatCompletionModel -from llmebench.tasks import PropagandaMultilabelSemEval23Task +from llmebench.tasks import MultilabelPropagandaTask def config(): return { - "dataset": PropagandaSemEval23Dataset, + "dataset": SemEval23T3PropagandaDataset, "dataset_args": { "techniques_path": 
"data/factuality_disinformation_harmful_content/propaganda_semeval23/techniques_subtask3.txt" }, - "task": PropagandaMultilabelSemEval23Task, + "task": MultilabelPropagandaTask, "task_args": {}, "model": GPTChatCompletionModel, "model_args": { diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_RUZeroShot.py b/assets/ru/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_ZeroShot.py similarity index 97% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_RUZeroShot.py rename to assets/ru/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_ZeroShot.py index e2557173..563ac5a6 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_RUZeroShot.py +++ b/assets/ru/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_ZeroShot.py @@ -2,18 +2,18 @@ import regex as re -from llmebench.datasets import PropagandaSemEval23Dataset +from llmebench.datasets import SemEval23T3PropagandaDataset from llmebench.models import GPTChatCompletionModel -from llmebench.tasks import PropagandaMultilabelSemEval23Task +from llmebench.tasks import MultilabelPropagandaTask def config(): return { - "dataset": PropagandaSemEval23Dataset, + "dataset": SemEval23T3PropagandaDataset, "dataset_args": { "techniques_path": "data/factuality_disinformation_harmful_content/propaganda_semeval23/techniques_subtask3.txt" }, - "task": PropagandaMultilabelSemEval23Task, + "task": MultilabelPropagandaTask, "task_args": {}, "model": GPTChatCompletionModel, "model_args": { diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_TRZeroShot.py b/assets/tr/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_BLOOMZ_ZeroShot.py similarity index 91% rename from 
assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_TRZeroShot.py rename to assets/tr/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_BLOOMZ_ZeroShot.py index 61338f04..a3c08771 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_TRZeroShot.py +++ b/assets/tr/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_BLOOMZ_ZeroShot.py @@ -1,13 +1,13 @@ import os -from llmebench.datasets import CheckworthinessDataset +from llmebench.datasets import CT22CheckworthinessDataset from llmebench.models import BLOOMPetalModel from llmebench.tasks import CheckworthinessTask def config(): return { - "dataset": CheckworthinessDataset, + "dataset": CT22CheckworthinessDataset, "dataset_args": {}, "task": CheckworthinessTask, "task_args": {}, diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_TRFewShot.py b/assets/tr/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot.py similarity index 96% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_TRFewShot.py rename to assets/tr/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot.py index c1ae1b57..c940592f 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_TRFewShot.py +++ b/assets/tr/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot.py @@ -1,14 +1,14 @@ import os import re -from llmebench.datasets import CheckworthinessDataset +from llmebench.datasets import CT22CheckworthinessDataset from llmebench.models import GPTChatCompletionModel from llmebench.tasks import CheckworthinessTask def config(): return { - "dataset": CheckworthinessDataset, + "dataset": CT22CheckworthinessDataset, "dataset_args": {}, "task": 
CheckworthinessTask, "task_args": {}, diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_TRZeroShot.py b/assets/tr/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot.py similarity index 95% rename from assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_TRZeroShot.py rename to assets/tr/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot.py index 14ebc259..5041a952 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_TRZeroShot.py +++ b/assets/tr/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot.py @@ -1,14 +1,14 @@ import os import re -from llmebench.datasets import CheckworthinessDataset +from llmebench.datasets import CT22CheckworthinessDataset from llmebench.models import GPTChatCompletionModel from llmebench.tasks import CheckworthinessTask def config(): return { - "dataset": CheckworthinessDataset, + "dataset": CT22CheckworthinessDataset, "dataset_args": {}, "task": CheckworthinessTask, "task_args": {}, diff --git a/llmebench/datasets/DialectADI.py b/llmebench/datasets/ADI.py similarity index 90% rename from llmebench/datasets/DialectADI.py rename to llmebench/datasets/ADI.py index f3cda355..580775ba 100644 --- a/llmebench/datasets/DialectADI.py +++ b/llmebench/datasets/ADI.py @@ -3,9 +3,9 @@ from llmebench.datasets.dataset_base import DatasetBase -class DialectADIDataset(DatasetBase): +class ADIDataset(DatasetBase): def __init__(self, **kwargs): - super(DialectADIDataset, self).__init__(**kwargs) + super(ADIDataset, self).__init__(**kwargs) def get_data_sample(self): return {"input": "some tweet", "label": "no_not_interesting"} diff --git a/llmebench/datasets/ArSASSentiment.py b/llmebench/datasets/ArSAS.py similarity index 89% rename from llmebench/datasets/ArSASSentiment.py rename 
to llmebench/datasets/ArSAS.py index f85b6c6c..a5bf0858 100644 --- a/llmebench/datasets/ArSASSentiment.py +++ b/llmebench/datasets/ArSAS.py @@ -1,9 +1,9 @@ from llmebench.datasets.dataset_base import DatasetBase -class ArSASSentimentDataset(DatasetBase): +class ArSASDataset(DatasetBase): def __init__(self, **kwargs): - super(ArSASSentimentDataset, self).__init__(**kwargs) + super(ArSASDataset, self).__init__(**kwargs) def metadata(): return { diff --git a/llmebench/datasets/ArabicDiacritization.py b/llmebench/datasets/ArabicDiacritization.py deleted file mode 100644 index d855e6bc..00000000 --- a/llmebench/datasets/ArabicDiacritization.py +++ /dev/null @@ -1,50 +0,0 @@ -from llmebench.datasets.dataset_base import DatasetBase - - -class ArabicDiacritizationDataset(DatasetBase): - def __init__(self, **kwargs): - super(ArabicDiacritizationDataset, self).__init__(**kwargs) - - def metadata(): - return { - "language": "ar", - "citation": """@article{10.1145/3434235, - author = {Darwish, Kareem and Abdelali, Ahmed and Mubarak, Hamdy and Eldesouki, Mohamed}, - title = {Arabic Diacritic Recovery Using a Feature-Rich BiLSTM Model}, - year = {2021}, - issue_date = {March 2021}, - publisher = {Association for Computing Machinery}, - address = {New York, NY, USA}, - volume = {20}, - number = {2}, - issn = {2375-4699}, - url = {https://doi.org/10.1145/3434235}, - doi = {10.1145/3434235}, - journal = {ACM Trans. Asian Low-Resour. Lang. Inf. 
Process.}, - month = {apr}, - articleno = {33}, - numpages = {18}, - }""", - } - - def get_data_sample(self): - return { - "input": "Original sentence", - "label": "Sentence with diacritized words", - } - - def load_data(self, data_path, no_labels=False): - data = [] - - with open(data_path, "r") as fp: - for line_idx, line in enumerate(fp): - text, diacritized_text = line.split("\t") - data.append( - { - "input": text.strip(), - "label": diacritized_text.strip(), - "line_number": line_idx, - } - ) - - return data diff --git a/llmebench/datasets/ArabicPOS.py b/llmebench/datasets/ArabicPOS.py deleted file mode 100644 index aa13354f..00000000 --- a/llmebench/datasets/ArabicPOS.py +++ /dev/null @@ -1,50 +0,0 @@ -from llmebench.datasets.dataset_base import DatasetBase - - -class ArabicPOSDataset(DatasetBase): - def __init__(self, **kwargs): - super(ArabicPOSDataset, self).__init__(**kwargs) - - def metadata(): - return { - "language": "ar", - "citation": """@inproceedings{samih-etal-2017-learning, - title = "Learning from Relatives: Unified Dialectal {A}rabic Segmentation", - author = "Samih, Younes and - Eldesouki, Mohamed and - Attia, Mohammed and - Darwish, Kareem and - Abdelali, Ahmed and - Mubarak, Hamdy and - Kallmeyer, Laura", - booktitle = "Proceedings of the 21st Conference on Computational Natural Language Learning ({C}o{NLL} 2017)", - month = aug, - year = "2017", - address = "Vancouver, Canada", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/K17-1043", - doi = "10.18653/v1/K17-1043", - pages = "432--441" - }""", - } - - def get_data_sample(self): - return { - "input": "Original sentence", - "label": "Sentence with POS tags", - } - - def load_data(self, data_path, no_labels=False): - data = [] - - with open(data_path, "r") as fp: - for line_idx, line in enumerate(fp): - data.append( - { - "input": line.strip().split("\t")[0], - "label": line.strip().split("\t")[1], - "line_number": line_idx, - } - ) - - return 
data diff --git a/llmebench/datasets/ArabicParsing.py b/llmebench/datasets/ArabicParsing.py deleted file mode 100644 index 631e62ea..00000000 --- a/llmebench/datasets/ArabicParsing.py +++ /dev/null @@ -1,63 +0,0 @@ -from llmebench.datasets.dataset_base import DatasetBase - - -class ArabicParsingDataset(DatasetBase): - def __init__(self, **kwargs): - super(ArabicParsingDataset, self).__init__(**kwargs) - - def metadata(): - return { - "language": "ar", - "citation": """@inproceedings{samih-etal-2017-learning, - title = "Learning from Relatives: Unified Dialectal {A}rabic Segmentation", - author = "Samih, Younes and - Eldesouki, Mohamed and - Attia, Mohammed and - Darwish, Kareem and - Abdelali, Ahmed and - Mubarak, Hamdy and - Kallmeyer, Laura", - booktitle = "Proceedings of the 21st Conference on Computational Natural Language Learning ({C}o{NLL} 2017)", - month = aug, - year = "2017", - address = "Vancouver, Canada", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/K17-1043", - doi = "10.18653/v1/K17-1043", - pages = "432--441" - }""", - } - - def get_data_sample(self): - return { - "input": "Original sentence", - "label": { - "1": "2", - "2": "0", - }, - } - - def load_data(self, data_path, no_labels=False): - data = [] - send_id = 0 - sent_lab = {} - sent_src = [] - with open(data_path, "r") as fp: - for line_idx, line in enumerate(fp): - if len(line.split("\t")) < 6: - data.append( - { - "input": "\n".join(sent_src), - "label": sent_lab, - "sent_number": send_id, - } - ) - send_id += 1 - sent_lab = {} - sent_src = [] - else: - sent_src.append("\t".join(line.split("\t")[:6])) - lid = line.split("\t")[0] - sent_lab[lid] = line.split("\t")[6] - - return data diff --git a/llmebench/datasets/BibleMaghrebiDiacritization.py b/llmebench/datasets/BibleMaghrebiDiacritization.py new file mode 100644 index 00000000..99553d57 --- /dev/null +++ b/llmebench/datasets/BibleMaghrebiDiacritization.py @@ -0,0 +1,47 @@ +from 
llmebench.datasets.dataset_base import DatasetBase + + +class BibleMaghrebiDiacritizationDataset(DatasetBase): + def __init__(self, **kwargs): + super(BibleMaghrebiDiacritizationDataset, self).__init__(**kwargs) + + def metadata(): + return { + "language": "ar", + "citation": """@InProceedings{DARWISH18.20, + author = {Kareem Darwish , Ahmed Abdelali , Hamdy Mubarak , Younes Samih and Mohammed Attia}, + title = {Diacritization of Moroccan and Tunisian Arabic Dialects: A CRF Approach}, + booktitle = {Proceedings of the Eleventh International Conference on Language Resources and Evaluation (LREC 2018)}, + year = {2018}, + month = {may}, + date = {7-12}, + location = {Miyazaki, Japan}, + editor = {Hend Al-Khalifa and King Saud University and KSA Walid Magdy and University of Edinburgh and UK Kareem Darwish and Qatar Computing Research Institute and Qatar Tamer Elsayed and Qatar University and Qatar}, + publisher = {European Language Resources Association (ELRA)}, + address = {Paris, France}, + isbn = {979-10-95546-25-2}, + language = {english} + }""", + } + + def get_data_sample(self): + return { + "input": "Original sentence", + "label": "Sentence with diacritized words", + } + + def load_data(self, data_path, no_labels=False): + data = [] + + with open(data_path, "r") as fp: + for line_idx, line in enumerate(fp): + text, diacritized_text = line.split("\t") + data.append( + { + "input": text.strip(), + "label": diacritized_text.strip(), + "line_number": line_idx, + } + ) + + return data diff --git a/llmebench/datasets/FactualityCOVID19.py b/llmebench/datasets/COVID19Factuality.py similarity index 95% rename from llmebench/datasets/FactualityCOVID19.py rename to llmebench/datasets/COVID19Factuality.py index cd6227c5..a3e48a61 100644 --- a/llmebench/datasets/FactualityCOVID19.py +++ b/llmebench/datasets/COVID19Factuality.py @@ -3,9 +3,9 @@ from llmebench.datasets.dataset_base import DatasetBase -class FactualityCOVID19Dataset(DatasetBase): +class 
COVID19FactualityDataset(DatasetBase): def __init__(self, **kwargs): - super(FactualityCOVID19Dataset, self).__init__(**kwargs) + super(COVID19FactualityDataset, self).__init__(**kwargs) def get_data_sample(self): return {"input": "some tweet", "label": "no"} diff --git a/llmebench/datasets/Attentionworthy.py b/llmebench/datasets/CT22Attentionworthy.py similarity index 90% rename from llmebench/datasets/Attentionworthy.py rename to llmebench/datasets/CT22Attentionworthy.py index f8978a5b..8119b977 100644 --- a/llmebench/datasets/Attentionworthy.py +++ b/llmebench/datasets/CT22Attentionworthy.py @@ -3,16 +3,16 @@ from llmebench.datasets.dataset_base import DatasetBase -class AttentionworthyDataset(DatasetBase): +class CT22AttentionworthyDataset(DatasetBase): def __init__(self, **kwargs): - super(AttentionworthyDataset, self).__init__(**kwargs) + super(CT22AttentionworthyDataset, self).__init__(**kwargs) def get_data_sample(self): return {"input": "some tweet", "label": "no_not_interesting"} def metadata(): return { - "language": "ar", + "language": ["ar", "bg", "nl", "en", "tr"], "citation": """@InProceedings{clef-checkthat:2022:task1, author = {Nakov, Preslav and Barr\\'{o}n-Cede\\~{n}o, Alberto and Da San Martino, Giovanni and Alam, Firoj and M\\'{\\i}guez, Rub\'{e}n and Caselli, Tommaso and Kutlu, Mucahid and Zaghouani, Wajdi and Li, Chengkai and Shaar, Shaden and Mubarak, Hamdy and Nikolov, Alex and Kartal, Yavuz Selim and Beltr\\'{a}n, Javier}, title = "Overview of the {CLEF}-2022 {CheckThat}! 
Lab Task 1 on Identifying Relevant Claims in Tweets", diff --git a/llmebench/datasets/Checkworthiness.py b/llmebench/datasets/CT22Checkworthiness.py similarity index 89% rename from llmebench/datasets/Checkworthiness.py rename to llmebench/datasets/CT22Checkworthiness.py index 9556d768..b7d9806c 100644 --- a/llmebench/datasets/Checkworthiness.py +++ b/llmebench/datasets/CT22Checkworthiness.py @@ -3,21 +3,21 @@ from llmebench.datasets.dataset_base import DatasetBase -class CheckworthinessDataset(DatasetBase): +class CT22CheckworthinessDataset(DatasetBase): def __init__(self, **kwargs): - super(CheckworthinessDataset, self).__init__(**kwargs) + super(CT22CheckworthinessDataset, self).__init__(**kwargs) def get_data_sample(self): return { "input": "some tweet", - "label": "checkworthy", + "label": "1", "input_id": 0, "line_number": 0, } def metadata(): return { - "language": "ar", + "language": ["ar", "bg", "nl", "en", "es", "tr"], "citation": """@inproceedings{nakov2022overview, title={Overview of the clef--2022 checkthat! 
lab on fighting the covid-19 infodemic and fake news detection}, author={Nakov, Preslav and Barr{\\'o}n-Cede{\\~n}o, Alberto and da San Martino, Giovanni and Alam, Firoj and Stru{\\ss}, Julia Maria and Mandl, Thomas and M{\\'\\i}guez, Rub{\\'e}n and Caselli, Tommaso and Kutlu, Mucahid and Zaghouani, Wajdi and others}, diff --git a/llmebench/datasets/Claim.py b/llmebench/datasets/CT22Claim.py similarity index 90% rename from llmebench/datasets/Claim.py rename to llmebench/datasets/CT22Claim.py index 923ad528..83e8faf2 100644 --- a/llmebench/datasets/Claim.py +++ b/llmebench/datasets/CT22Claim.py @@ -1,15 +1,13 @@ -import pandas as pd - from llmebench.datasets.dataset_base import DatasetBase -class CovidClaimDataset(DatasetBase): +class CT22ClaimDataset(DatasetBase): def __init__(self, **kwargs): - super(CovidClaimDataset, self).__init__(**kwargs) + super(CT22ClaimDataset, self).__init__(**kwargs) def metadata(): return { - "language": "ar", + "language": ["ar", "bg", "nl", "en", "tr"], "citation": """@inproceedings{nakov2022overview, title={Overview of the CLEF-2022 CheckThat! 
lab task 1 on identifying relevant claims in tweets}, author={Nakov, Preslav and Barr{\\o}n-Cede{\\~n}o, Alberto and Da San Martino, Giovanni and Alam, Firoj and Kutlu, Mucahid and Zaghouani, Wajdi and Li, Chengkai and Shaar, Shaden and Mubarak, Hamdy and Nikolov, Alex}, diff --git a/llmebench/datasets/Harmful.py b/llmebench/datasets/CT22Harmful.py similarity index 90% rename from llmebench/datasets/Harmful.py rename to llmebench/datasets/CT22Harmful.py index a711c1f3..c19049e5 100644 --- a/llmebench/datasets/Harmful.py +++ b/llmebench/datasets/CT22Harmful.py @@ -1,15 +1,13 @@ -import pandas as pd - from llmebench.datasets.dataset_base import DatasetBase -class CovidHarmfulDataset(DatasetBase): +class CT22HarmfulDataset(DatasetBase): def __init__(self, **kwargs): - super(CovidHarmfulDataset, self).__init__(**kwargs) + super(CT22HarmfulDataset, self).__init__(**kwargs) def metadata(): return { - "language": "ar", + "language": ["ar", "bg", "nl", "en", "tr"], "citation": """@inproceedings{nakov2022overview, title={Overview of the CLEF-2022 CheckThat! 
lab task 1 on identifying relevant claims in tweets}, author={Nakov, Preslav and Barr{\\'o}n-Cede{\\~n}o, Alberto and Da San Martino, Giovanni and Alam, Firoj and Kutlu, Mucahid and Zaghouani, Wajdi and Li, Chengkai and Shaar, Shaden and Mubarak, Hamdy and Nikolov, Alex}, diff --git a/llmebench/datasets/Subjectivity.py b/llmebench/datasets/CT23Subjectivity.py similarity index 93% rename from llmebench/datasets/Subjectivity.py rename to llmebench/datasets/CT23Subjectivity.py index 64299c04..b1ba7fe4 100644 --- a/llmebench/datasets/Subjectivity.py +++ b/llmebench/datasets/CT23Subjectivity.py @@ -3,9 +3,9 @@ from llmebench.datasets.dataset_base import DatasetBase -class SubjectivityDataset(DatasetBase): +class CT23SubjectivityDataset(DatasetBase): def __init__(self, **kwargs): - super(SubjectivityDataset, self).__init__(**kwargs) + super(CT23SubjectivityDataset, self).__init__(**kwargs) def get_data_sample(self): return {"input": "some tweet", "label": "SUBJ"} diff --git a/llmebench/datasets/FactualityKhouja20.py b/llmebench/datasets/Khouja20Factuality.py similarity index 90% rename from llmebench/datasets/FactualityKhouja20.py rename to llmebench/datasets/Khouja20Factuality.py index aaedbb81..af5e2abe 100644 --- a/llmebench/datasets/FactualityKhouja20.py +++ b/llmebench/datasets/Khouja20Factuality.py @@ -1,11 +1,9 @@ -import pandas as pd - from llmebench.datasets.dataset_base import DatasetBase -class FactualityKhouja20Dataset(DatasetBase): +class Khouja20FactualityDataset(DatasetBase): def __init__(self, **kwargs): - super(FactualityKhouja20Dataset, self).__init__(**kwargs) + super(Khouja20FactualityDataset, self).__init__(**kwargs) def metadata(): return { diff --git a/llmebench/datasets/StanceKhouja20.py b/llmebench/datasets/Khouja20Stance.py similarity index 92% rename from llmebench/datasets/StanceKhouja20.py rename to llmebench/datasets/Khouja20Stance.py index 025fa8af..5be7c387 100644 --- a/llmebench/datasets/StanceKhouja20.py +++ 
b/llmebench/datasets/Khouja20Stance.py @@ -1,9 +1,9 @@ from llmebench.datasets.dataset_base import DatasetBase -class StanceKhouja20Dataset(DatasetBase): +class Khouja20StanceDataset(DatasetBase): def __init__(self, **kwargs): - super(StanceKhouja20Dataset, self).__init__(**kwargs) + super(Khouja20StanceDataset, self).__init__(**kwargs) def metadata(): return { diff --git a/llmebench/datasets/Offensive.py b/llmebench/datasets/OSACT4SubtaskA.py similarity index 92% rename from llmebench/datasets/Offensive.py rename to llmebench/datasets/OSACT4SubtaskA.py index 98277449..dbeb9c92 100644 --- a/llmebench/datasets/Offensive.py +++ b/llmebench/datasets/OSACT4SubtaskA.py @@ -1,9 +1,9 @@ from llmebench.datasets.dataset_base import DatasetBase -class OffensiveDataset(DatasetBase): +class OSACT4SubtaskADataset(DatasetBase): def __init__(self, **kwargs): - super(OffensiveDataset, self).__init__(**kwargs) + super(OSACT4SubtaskADataset, self).__init__(**kwargs) def metadata(): return { diff --git a/llmebench/datasets/HateSpeech.py b/llmebench/datasets/OSACT4SubtaskB.py similarity index 93% rename from llmebench/datasets/HateSpeech.py rename to llmebench/datasets/OSACT4SubtaskB.py index 15bacebe..784e6a95 100644 --- a/llmebench/datasets/HateSpeech.py +++ b/llmebench/datasets/OSACT4SubtaskB.py @@ -1,9 +1,9 @@ from llmebench.datasets.dataset_base import DatasetBase -class HateSpeechDataset(DatasetBase): +class OSACT4SubtaskBDataset(DatasetBase): def __init__(self, **kwargs): - super(HateSpeechDataset, self).__init__(**kwargs) + super(OSACT4SubtaskBDataset, self).__init__(**kwargs) def metadata(): return { diff --git a/llmebench/datasets/PADT.py b/llmebench/datasets/PADT.py new file mode 100644 index 00000000..baf7ee36 --- /dev/null +++ b/llmebench/datasets/PADT.py @@ -0,0 +1,71 @@ +from llmebench.datasets.dataset_base import DatasetBase + + +class PADTDataset(DatasetBase): + def __init__(self, **kwargs): + super(PADTDataset, self).__init__(**kwargs) + + def metadata(): + return { 
+ "language": "ar", + "citation": """@inproceedings{buchholz-marsi-2006-conll, + title = "{C}o{NLL}-{X} Shared Task on Multilingual Dependency Parsing", + author = "Buchholz, Sabine and + Marsi, Erwin", + booktitle = "Proceedings of the Tenth Conference on Computational Natural Language Learning ({C}o{NLL}-X)", + month = jun, + year = "2006", + address = "New York City", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/W06-2920", + pages = "149--164", + } + @inproceedings{smrz2002prague, + title={Prague dependency treebank for Arabic: Multi-level annotation of Arabic corpus}, + author={Smrz, Otakar and {\\v{S}}naidauf, Jan and Zem{\\'a}nek, Petr}, + booktitle={Proc. of the Intern. Symposium on Processing of Arabic}, + pages={147--155}, + year={2002} + } + @misc{hajic2004prague, + title={Prague Arabic Dependency Treebank 1.0. LDC2004T23}, + author={Hajic, Jan and Smrz, Otakar and Zem{\'a}nek, Petr and Pajas, Petr and {\v{S}}naidauf, Jan and Be{\v{s}}ka, Emanuel and Kr{\'a}cmar, Jakub and Hassanov{\'a}, Kamila}, + year={2004}, + publisher={2004a} + } + """, + } + + def get_data_sample(self): + return { + "input": "Original sentence", + "label": { + "1": "2", + "2": "0", + }, + } + + def load_data(self, data_path, no_labels=False): + data = [] + send_id = 0 + sent_lab = {} + sent_src = [] + with open(data_path, "r") as fp: + for line_idx, line in enumerate(fp): + if len(line.split("\t")) < 6: + data.append( + { + "input": "\n".join(sent_src), + "label": sent_lab, + "sent_number": send_id, + } + ) + send_id += 1 + sent_lab = {} + sent_src = [] + else: + sent_src.append("\t".join(line.split("\t")[:6])) + lid = line.split("\t")[0] + sent_lab[lid] = line.split("\t")[6] + + return data diff --git a/llmebench/datasets/QCRIDialectalArabicPOS.py b/llmebench/datasets/QCRIDialectalArabicPOS.py new file mode 100644 index 00000000..573a3a8d --- /dev/null +++ b/llmebench/datasets/QCRIDialectalArabicPOS.py @@ -0,0 +1,46 @@ +from 
llmebench.datasets.dataset_base import DatasetBase + + +class QCRIDialectalArabicPOSDataset(DatasetBase): + def __init__(self, **kwargs): + super(QCRIDialectalArabicPOSDataset, self).__init__(**kwargs) + + def metadata(): + return { + "language": "ar", + "citation": """@InProceedings{DARWISH18.562, + author = {Kareem Darwish , Hamdy Mubarak , Ahmed Abdelali , Mohamed Eldesouki , Younes Samih , Randah Alharbi , Mohammed Attia , Walid Magdy and Laura Kallmeyer}, + title = {Multi-Dialect Arabic POS Tagging: A CRF Approach}, + booktitle = {Proceedings of the Eleventh International Conference on Language Resources and Evaluation (LREC 2018)}, + year = {2018}, + month = {may}, + date = {7-12}, + location = {Miyazaki, Japan}, + editor = {Nicoletta Calzolari (Conference chair) and Khalid Choukri and Christopher Cieri and Thierry Declerck and Sara Goggi and Koiti Hasida and Hitoshi Isahara and Bente Maegaard and Joseph Mariani and Hélène Mazo and Asuncion Moreno and Jan Odijk and Stelios Piperidis and Takenobu Tokunaga}, + publisher = {European Language Resources Association (ELRA)}, + address = {Paris, France}, + isbn = {979-10-95546-00-9}, + language = {english} + }""", + } + + def get_data_sample(self): + return { + "input": "Original sentence", + "label": "Sentence with POS tags", + } + + def load_data(self, data_path, no_labels=False): + data = [] + + with open(data_path, "r") as fp: + for line_idx, line in enumerate(fp): + data.append( + { + "input": line.strip().split("\t")[0], + "label": line.strip().split("\t")[1], + "line_number": line_idx, + } + ) + + return data diff --git a/llmebench/datasets/ArabicSegmentation.py b/llmebench/datasets/QCRIDialectalArabicSegmentation.py similarity index 92% rename from llmebench/datasets/ArabicSegmentation.py rename to llmebench/datasets/QCRIDialectalArabicSegmentation.py index 017ccb9f..9293c4d7 100644 --- a/llmebench/datasets/ArabicSegmentation.py +++ b/llmebench/datasets/QCRIDialectalArabicSegmentation.py @@ -1,9 +1,9 @@ from 
llmebench.datasets.dataset_base import DatasetBase -class ArabicSegmentationDataset(DatasetBase): +class QCRIDialectalArabicSegmentationDataset(DatasetBase): def __init__(self, **kwargs): - super(ArabicSegmentationDataset, self).__init__(**kwargs) + super(QCRIDialectalArabicSegmentationDataset, self).__init__(**kwargs) def metadata(): return { diff --git a/llmebench/datasets/STSArSemEval17Track1.py b/llmebench/datasets/SemEval17T1STS.py similarity index 94% rename from llmebench/datasets/STSArSemEval17Track1.py rename to llmebench/datasets/SemEval17T1STS.py index a08f0ffb..e5412f4f 100644 --- a/llmebench/datasets/STSArSemEval17Track1.py +++ b/llmebench/datasets/SemEval17T1STS.py @@ -1,9 +1,9 @@ from llmebench.datasets.dataset_base import DatasetBase -class STSArSemEval17Track1Dataset(DatasetBase): +class SemEval17T1STSDataset(DatasetBase): def __init__(self, **kwargs): - super(STSArSemEval17Track1Dataset, self).__init__(**kwargs) + super(SemEval17T1STSDataset, self).__init__(**kwargs) def metadata(): return { diff --git a/llmebench/datasets/STSArSemEval17Track2.py b/llmebench/datasets/SemEval17T2STS.py similarity index 94% rename from llmebench/datasets/STSArSemEval17Track2.py rename to llmebench/datasets/SemEval17T2STS.py index 4f08e183..8b916880 100644 --- a/llmebench/datasets/STSArSemEval17Track2.py +++ b/llmebench/datasets/SemEval17T2STS.py @@ -1,9 +1,9 @@ from llmebench.datasets.dataset_base import DatasetBase -class STSArSemEval17Track2Dataset(DatasetBase): +class SemEval17T2STSDataset(DatasetBase): def __init__(self, **kwargs): - super(STSArSemEval17Track2Dataset, self).__init__(**kwargs) + super(SemEval17T2STSDataset, self).__init__(**kwargs) def metadata(): return { diff --git a/llmebench/datasets/PropagandaSemEval23.py b/llmebench/datasets/SemEval23T3Propaganda.py similarity index 95% rename from llmebench/datasets/PropagandaSemEval23.py rename to llmebench/datasets/SemEval23T3Propaganda.py index cc130442..5cbcb671 100644 --- 
a/llmebench/datasets/PropagandaSemEval23.py +++ b/llmebench/datasets/SemEval23T3Propaganda.py @@ -1,19 +1,18 @@ import json -import os from pathlib import Path from llmebench.datasets.dataset_base import DatasetBase -class PropagandaSemEval23Dataset(DatasetBase): +class SemEval23T3PropagandaDataset(DatasetBase): def __init__(self, techniques_path=None, **kwargs): # Get the path to the file listing the target techniques self.techniques_path = Path(techniques_path) if techniques_path else None - super(PropagandaSemEval23Dataset, self).__init__(**kwargs) + super(SemEval23T3PropagandaDataset, self).__init__(**kwargs) def metadata(): return { - "language": "multilingual", + "language": ["en", "es", "fr", "de", "el", "it", "ka", "pl", "ru"], "citation": """@inproceedings{piskorski-etal-2023-semeval, title = "{S}em{E}val-2023 Task 3: Detecting the Category, the Framing, and the Persuasion Techniques in Online News in a Multi-lingual Setup", author = "Piskorski, Jakub and diff --git a/llmebench/datasets/FactualityUnifiedFC.py b/llmebench/datasets/UnifiedFCFactuality.py similarity index 95% rename from llmebench/datasets/FactualityUnifiedFC.py rename to llmebench/datasets/UnifiedFCFactuality.py index bffe9b22..672c4907 100644 --- a/llmebench/datasets/FactualityUnifiedFC.py +++ b/llmebench/datasets/UnifiedFCFactuality.py @@ -1,9 +1,9 @@ from llmebench.datasets.dataset_base import DatasetBase -class FactualityUnifiedFCDataset(DatasetBase): +class UnifiedFCFactualityDataset(DatasetBase): def __init__(self, **kwargs): - super(FactualityUnifiedFCDataset, self).__init__(**kwargs) + super(UnifiedFCFactualityDataset, self).__init__(**kwargs) def metadata(): return { diff --git a/llmebench/datasets/StanceUnifiedFC.py b/llmebench/datasets/UnifiedFCStance.py similarity index 95% rename from llmebench/datasets/StanceUnifiedFC.py rename to llmebench/datasets/UnifiedFCStance.py index f4365acc..0e478261 100644 --- a/llmebench/datasets/StanceUnifiedFC.py +++ 
b/llmebench/datasets/UnifiedFCStance.py @@ -1,14 +1,11 @@ import json -import os - -import pandas as pd from llmebench.datasets.dataset_base import DatasetBase -class StanceUnifiedFCDataset(DatasetBase): +class UnifiedFCStanceDataset(DatasetBase): def __init__(self, **kwargs): - super(StanceUnifiedFCDataset, self).__init__(**kwargs) + super(UnifiedFCStanceDataset, self).__init__(**kwargs) def metadata(): return { diff --git a/llmebench/datasets/Propaganda.py b/llmebench/datasets/WANLP22T3Propaganda.py similarity index 76% rename from llmebench/datasets/Propaganda.py rename to llmebench/datasets/WANLP22T3Propaganda.py index e3e7074f..fc623ca8 100644 --- a/llmebench/datasets/Propaganda.py +++ b/llmebench/datasets/WANLP22T3Propaganda.py @@ -5,17 +5,22 @@ from llmebench.datasets.dataset_base import DatasetBase -class PropagandaTweetDataset(DatasetBase): +class WANLP22T3PropagandaDataset(DatasetBase): def __init__(self, techniques_path=None, **kwargs): # Get the path to the file listing the target techniques self.techniques_path = Path(techniques_path) if techniques_path else None - super(PropagandaTweetDataset, self).__init__(**kwargs) + super(WANLP22T3PropagandaDataset, self).__init__(**kwargs) def metadata(): return { "language": "ar", - "citation": """@article{wanlp2023, - year={2023} + "citation": """@inproceedings{alam2022overview, + title={Overview of the $\{$WANLP$\}$ 2022 Shared Task on Propaganda Detection in $\{$A$\}$ rabic}, + author={Alam, Firoj and Mubarak, Hamdy and Zaghouani, Wajdi and Da San Martino, Giovanni and Nakov, Preslav and others}, + booktitle={Proceedings of the The Seventh Arabic Natural Language Processing Workshop (WANLP)}, + pages={108--118}, + year={2022}, + organization={Association for Computational Linguistics} }""", } diff --git a/llmebench/datasets/WikiNewsDiacritization.py b/llmebench/datasets/WikiNewsDiacritization.py new file mode 100644 index 00000000..04bf7801 --- /dev/null +++ b/llmebench/datasets/WikiNewsDiacritization.py @@ 
-0,0 +1,47 @@ +from llmebench.datasets.dataset_base import DatasetBase + + +class WikiNewsDiacritizationDataset(DatasetBase): + def __init__(self, **kwargs): + super(WikiNewsDiacritizationDataset, self).__init__(**kwargs) + + def metadata(): + return { + "language": "ar", + "citation": """@inproceedings{darwish-etal-2017-arabic, + title = "{A}rabic Diacritization: Stats, Rules, and Hacks", + author = "Darwish, Kareem and + Mubarak, Hamdy and + Abdelali, Ahmed", + booktitle = "Proceedings of the Third {A}rabic Natural Language Processing Workshop", + month = apr, + year = "2017", + address = "Valencia, Spain", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/W17-1302", + doi = "10.18653/v1/W17-1302", + pages = "9--17", + }""", + } + + def get_data_sample(self): + return { + "input": "Original sentence", + "label": "Sentence with diacritized words", + } + + def load_data(self, data_path, no_labels=False): + data = [] + + with open(data_path, "r") as fp: + for line_idx, line in enumerate(fp): + text, diacritized_text = line.split("\t") + data.append( + { + "input": text.strip(), + "label": diacritized_text.strip(), + "line_number": line_idx, + } + ) + + return data diff --git a/llmebench/datasets/Lemmatization.py b/llmebench/datasets/WikiNewsLemmatization.py similarity index 61% rename from llmebench/datasets/Lemmatization.py rename to llmebench/datasets/WikiNewsLemmatization.py index 6f31c461..94a9ba0d 100644 --- a/llmebench/datasets/Lemmatization.py +++ b/llmebench/datasets/WikiNewsLemmatization.py @@ -1,18 +1,22 @@ from llmebench.datasets.dataset_base import DatasetBase -class LemmatizationDataset(DatasetBase): +class WikiNewsLemmatizationDataset(DatasetBase): def __init__(self, **kwargs): - super(LemmatizationDataset, self).__init__(**kwargs) + super(WikiNewsLemmatizationDataset, self).__init__(**kwargs) def metadata(): return { "language": "ar", - "citation": """@inproceedings{mubarak2018build, - title={Build Fast and 
Accurate Lemmatization for Arabic}, - author={Mubarak, Hamdy}, - booktitle={Proceedings of the Eleventh International Conference on Language Resources and Evaluation (LREC 2018)}, - year={2018} + "citation": """@inproceedings{mubarak-2018-build, + title = "Build Fast and Accurate Lemmatization for {A}rabic", + author = "Mubarak, Hamdy", + booktitle = "Proceedings of the Eleventh International Conference on Language Resources and Evaluation ({LREC} 2018)", + month = may, + year = "2018", + address = "Miyazaki, Japan", + publisher = "European Language Resources Association (ELRA)", + url = "https://aclanthology.org/L18-1181", }""", } @@ -23,7 +27,6 @@ def get_data_sample(self): } def load_data(self, data_path, no_labels=False): - # TODO: modify to iterator # Format: words \t lemmas data = [] with open(data_path, "r") as fp: diff --git a/llmebench/datasets/WikiNewsPOS.py b/llmebench/datasets/WikiNewsPOS.py new file mode 100644 index 00000000..c4bc3b94 --- /dev/null +++ b/llmebench/datasets/WikiNewsPOS.py @@ -0,0 +1,39 @@ +from llmebench.datasets.dataset_base import DatasetBase + + +class WikiNewsPOSDataset(DatasetBase): + def __init__(self, **kwargs): + super(WikiNewsPOSDataset, self).__init__(**kwargs) + + def metadata(): + return { + "language": "ar", + "citation": """@inproceedings{darwish2017arabic, + title={Arabic {POS} tagging: Don’t abandon feature engineering just yet}, + author={Darwish, Kareem and Mubarak, Hamdy and Abdelali, Ahmed and Eldesouki, Mohamed}, + booktitle={Proceedings of the third arabic natural language processing workshop}, + pages={130--137}, + year={2017} + }""", + } + + def get_data_sample(self): + return { + "input": "Original sentence", + "label": "Sentence with POS tags", + } + + def load_data(self, data_path, no_labels=False): + data = [] + + with open(data_path, "r") as fp: + for line_idx, line in enumerate(fp): + data.append( + { + "input": line.strip().split("\t")[0], + "label": line.strip().split("\t")[1], + "line_number": line_idx, 
+ } + ) + + return data diff --git a/llmebench/datasets/WikiNewsSegmentation.py b/llmebench/datasets/WikiNewsSegmentation.py new file mode 100644 index 00000000..639615d4 --- /dev/null +++ b/llmebench/datasets/WikiNewsSegmentation.py @@ -0,0 +1,40 @@ +from llmebench.datasets.dataset_base import DatasetBase + + +class WikiNewsSegmentationDataset(DatasetBase): + def __init__(self, **kwargs): + super(WikiNewsSegmentationDataset, self).__init__(**kwargs) + + def metadata(): + return { + "language": "ar", + "citation": """@inproceedings{darwish2016farasa, + title={Farasa: A new fast and accurate {A}rabic word segmenter}, + author={Darwish, Kareem and Mubarak, Hamdy}, + booktitle={Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC'16)}, + pages={1070--1074}, + year={2016} + }""", + } + + def get_data_sample(self): + return { + "input": "Original sentence", + "label": "Sentence with segmented words", + } + + def load_data(self, data_path, no_labels=False): + # TODO: modify to iterator + data = [] + + with open(data_path, "r") as fp: + for line_idx, line in enumerate(fp): + data.append( + { + "input": line.replace("+", "").strip(), + "label": line.strip(), + "line_number": line_idx, + } + ) + + return data diff --git a/llmebench/datasets/XGLUEPOS.py b/llmebench/datasets/XGLUEPOS.py new file mode 100644 index 00000000..b90c9821 --- /dev/null +++ b/llmebench/datasets/XGLUEPOS.py @@ -0,0 +1,39 @@ +from llmebench.datasets.dataset_base import DatasetBase + + +class XGLUEPOSDataset(DatasetBase): + def __init__(self, **kwargs): + super(XGLUEPOSDataset, self).__init__(**kwargs) + + def metadata(): + return { + "language": "ar", + "citation": """@inproceedings{liang2020xglue, + title={XGLUE: A new benchmark datasetfor cross-lingual pre-training, understanding and generation}, + author={Liang, Yaobo and Duan, Nan and Gong, Yeyun and Wu, Ning and Guo, Fenfei and Qi, Weizhen and Gong, Ming and Shou, Linjun and Jiang, Daxin and Cao, Guihong and 
others}, + booktitle={Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)}, + pages={6008--6018}, + year={2020} + }""", + } + + def get_data_sample(self): + return { + "input": "Original sentence", + "label": "Sentence with POS tags", + } + + def load_data(self, data_path, no_labels=False): + data = [] + + with open(data_path, "r") as fp: + for line_idx, line in enumerate(fp): + data.append( + { + "input": line.strip().split("\t")[0], + "label": line.strip().split("\t")[1], + "line_number": line_idx, + } + ) + + return data diff --git a/llmebench/datasets/__init__.py b/llmebench/datasets/__init__.py index 721f1f72..65cba2c7 100644 --- a/llmebench/datasets/__init__.py +++ b/llmebench/datasets/__init__.py @@ -1,28 +1,24 @@ +from .ADI import ADIDataset from .Adult import AdultDataset from .ANERcorp import ANERcorpDataset from .Aqmar import AqmarDataset from .AraBench import AraBenchDataset from .ArabGend import ArabGendDataset -from .ArabicDiacritization import ArabicDiacritizationDataset -from .ArabicParsing import ArabicParsingDataset -from .ArabicPOS import ArabicPOSDataset -from .ArabicSegmentation import ArabicSegmentationDataset from .ArapTweet import ArapTweetDataset from .ARCD import ARCDDataset from .ArSarcasm import ArSarcasmDataset -from .ArSASSentiment import ArSASSentimentDataset -from .Attentionworthy import AttentionworthyDataset +from .ArSAS import ArSASDataset from .BanglaSentiment import BanglaSentimentDataset -from .Checkworthiness import CheckworthinessDataset -from .Claim import CovidClaimDataset -from .DialectADI import DialectADIDataset +from .BibleMaghrebiDiacritization import BibleMaghrebiDiacritizationDataset +from .COVID19Factuality import COVID19FactualityDataset +from .CT22Attentionworthy import CT22AttentionworthyDataset +from .CT22Checkworthiness import CT22CheckworthinessDataset +from .CT22Claim import CT22ClaimDataset +from .CT22Harmful import CT22HarmfulDataset +from .CT23Subjectivity import 
CT23SubjectivityDataset from .Emotion import EmotionDataset -from .FactualityCOVID19 import FactualityCOVID19Dataset -from .FactualityKhouja20 import FactualityKhouja20Dataset -from .FactualityUnifiedFC import FactualityUnifiedFCDataset -from .Harmful import CovidHarmfulDataset -from .HateSpeech import HateSpeechDataset -from .Lemmatization import LemmatizationDataset +from .Khouja20Factuality import Khouja20FactualityDataset +from .Khouja20Stance import Khouja20StanceDataset from .Location import LocationDataset from .MGBWords import MGBWordsDataset from .MLQA import MLQADataset @@ -31,17 +27,25 @@ from .NewsCatAlArabiya import NewsCatAlArabiyaDataset from .NewsCatAlKhaleej import NewsCatAlKhaleejDataset from .NewsCatASND import NewsCatASNDDataset -from .Offensive import OffensiveDataset -from .Propaganda import PropagandaTweetDataset -from .PropagandaSemEval23 import PropagandaSemEval23Dataset +from .OSACT4SubtaskA import OSACT4SubtaskADataset +from .OSACT4SubtaskB import OSACT4SubtaskBDataset +from .PADT import PADTDataset from .QADI import QADIDataset +from .QCRIDialectalArabicPOS import QCRIDialectalArabicPOSDataset +from .QCRIDialectalArabicSegmentation import QCRIDialectalArabicSegmentationDataset +from .SemEval17T1STS import SemEval17T1STSDataset +from .SemEval17T2STS import SemEval17T2STSDataset +from .SemEval23T3Propaganda import SemEval23T3PropagandaDataset from .Spam import SpamDataset -from .StanceKhouja20 import StanceKhouja20Dataset -from .StanceUnifiedFC import StanceUnifiedFCDataset -from .STSArSemEval17Track1 import STSArSemEval17Track1Dataset -from .STSArSemEval17Track2 import STSArSemEval17Track2Dataset from .STSQ2Q import Q2QSimDataset -from .Subjectivity import SubjectivityDataset from .TyDiQA import TyDiQADataset +from .UnifiedFCFactuality import UnifiedFCFactualityDataset +from .UnifiedFCStance import UnifiedFCStanceDataset +from .WANLP22T3Propaganda import WANLP22T3PropagandaDataset +from .WikiNewsDiacritization import 
WikiNewsDiacritizationDataset +from .WikiNewsLemmatization import WikiNewsLemmatizationDataset +from .WikiNewsPOS import WikiNewsPOSDataset +from .WikiNewsSegmentation import WikiNewsSegmentationDataset +from .XGLUEPOS import XGLUEPOSDataset from .XNLI import XNLIDataset from .XQuAD import XQuADDataset diff --git a/llmebench/tasks/FactualityUnifiedFC.py b/llmebench/tasks/Factuality.py similarity index 65% rename from llmebench/tasks/FactualityUnifiedFC.py rename to llmebench/tasks/Factuality.py index d87c8fd4..53caf17f 100644 --- a/llmebench/tasks/FactualityUnifiedFC.py +++ b/llmebench/tasks/Factuality.py @@ -3,9 +3,9 @@ from llmebench.tasks.task_base import TaskBase -class FactualityUnifiedFCTask(TaskBase): +class FactualityTask(TaskBase): def __init__(self, **kwargs): - super(FactualityUnifiedFCTask, self).__init__(**kwargs) + super(FactualityTask, self).__init__(**kwargs) def evaluate(self, gold_labels, pred_labels): pred_labels = [ @@ -16,12 +16,19 @@ def evaluate(self, gold_labels, pred_labels): precision = precision_score(gold_labels, pred_labels, average="macro") recall = recall_score(gold_labels, pred_labels, average="macro") f1 = f1_score(gold_labels, pred_labels, average="macro") + + w_precision = precision_score(gold_labels, pred_labels, average="weighted") + w_recall = recall_score(gold_labels, pred_labels, average="weighted") + w_f1 = f1_score(gold_labels, pred_labels, average="weighted") + results = { "accuracy": acc, "macro-precision": precision, "macro-recall": recall, "macro-f1": f1, - "msg": "performance with respect macro-F1.", + "w-precision": w_precision, + "w-recall": w_recall, + "w-f1": w_f1, } return results diff --git a/llmebench/tasks/FactualityCOVID19.py b/llmebench/tasks/FactualityCOVID19.py deleted file mode 100644 index 38f108c5..00000000 --- a/llmebench/tasks/FactualityCOVID19.py +++ /dev/null @@ -1,27 +0,0 @@ -from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score - -from llmebench.tasks.task_base import 
TaskBase - - -class FactualityCOVID19Task(TaskBase): - def __init__(self, **kwargs): - super(FactualityCOVID19Task, self).__init__(**kwargs) - - def evaluate(self, gold_labels, pred_labels): - pred_labels = [ - p if p else self.get_random_prediction(set(gold_labels)) - for p in pred_labels - ] - acc = accuracy_score(gold_labels, pred_labels) - precision = precision_score(gold_labels, pred_labels, average="weighted") - recall = recall_score(gold_labels, pred_labels, average="weighted") - f1 = f1_score(gold_labels, pred_labels, average="weighted") - results = { - "accuracy": acc, - "w-precision": precision, - "w-recall": recall, - "w-f1": f1, - "msg": "performance with respect weighted-F1. W-F1 - official measure. Ref: CheckThat-2022", - } - - return results diff --git a/llmebench/tasks/FactualityKhouja20.py b/llmebench/tasks/FactualityKhouja20.py deleted file mode 100644 index 079d2c68..00000000 --- a/llmebench/tasks/FactualityKhouja20.py +++ /dev/null @@ -1,15 +0,0 @@ -from sklearn.metrics import f1_score - -from llmebench.tasks.task_base import TaskBase - - -class FactualityKhouja20Task(TaskBase): - def __init__(self, **kwargs): - super(FactualityKhouja20Task, self).__init__(**kwargs) - - def evaluate(self, true_labels, predicted_labels): - predicted_labels = [ - p if p else self.get_random_prediction(set(true_labels)) - for p in predicted_labels - ] - return {"Macro F1": f1_score(true_labels, predicted_labels, average="macro")} diff --git a/llmebench/tasks/PropagandaMultilabelSemEval23.py b/llmebench/tasks/MultilabelPropaganda.py similarity index 61% rename from llmebench/tasks/PropagandaMultilabelSemEval23.py rename to llmebench/tasks/MultilabelPropaganda.py index 5e9b6dcd..a09713f9 100644 --- a/llmebench/tasks/PropagandaMultilabelSemEval23.py +++ b/llmebench/tasks/MultilabelPropaganda.py @@ -1,23 +1,29 @@ -import itertools - from sklearn import preprocessing from sklearn.metrics import f1_score from llmebench.tasks.task_base import TaskBase -class 
PropagandaMultilabelSemEval23Task(TaskBase): +class MultilabelPropagandaTask(TaskBase): def __init__(self, **kwargs): - super(PropagandaMultilabelSemEval23Task, self).__init__(**kwargs) + super(MultilabelPropagandaTask, self).__init__(**kwargs) def evaluate(self, true_labels, predicted_labels): - # Handle cases when model fails! - # Flatten true labels as it is a list of lists - predicted_labels = [p if p else ["no_technique"] for p in predicted_labels] - # Need the pre-defined list of techniques techniques = self.dataset.get_predefined_techniques() + # To generalize task to multiple datasets, since we use "no technique" as the random label next + no_technique_label = "no_technique" + for tch in techniques: + if "technique" in tch: + no_technique_label = tch + break + + # Handle cases when model fails! + # use no_technique_label as the random label + predicted_labels = [p if p else [no_technique_label] for p in predicted_labels] + + # Flatten true labels as it is a list of lists # Binarize labels and use them for multi-label evaluation mlb = preprocessing.MultiLabelBinarizer(classes=techniques) mlb.fit([techniques]) diff --git a/llmebench/tasks/NewsCatAkhbarona.py b/llmebench/tasks/NewsCatAkhbarona.py deleted file mode 100644 index c25bbfb9..00000000 --- a/llmebench/tasks/NewsCatAkhbarona.py +++ /dev/null @@ -1,34 +0,0 @@ -from sklearn.metrics import ( - accuracy_score, - f1_score, - precision_recall_fscore_support, - precision_score, - recall_score, -) - -from llmebench.tasks.task_base import TaskBase - - -class NewsCatAkhbaronaTask(TaskBase): - def __init__(self, **kwargs): - super(NewsCatAkhbaronaTask, self).__init__(**kwargs) - - def evaluate(self, gold_labels, pred_labels): - pred_labels = [ - p if p else self.get_random_prediction(set(gold_labels)) - for p in pred_labels - ] - - acc = accuracy_score(gold_labels, pred_labels) - precision = precision_score(gold_labels, pred_labels, average="macro") - recall = recall_score(gold_labels, pred_labels, 
average="macro") - f1 = f1_score(gold_labels, pred_labels, average="macro") - results = { - "accuracy": acc, - "macro-precision": precision, - "macro-recall": recall, - "macro-f1": f1, - "msg": "performance with respect macro-F1.", - } - - return results diff --git a/llmebench/tasks/NewsCatAlArabiya.py b/llmebench/tasks/NewsCatAlArabiya.py deleted file mode 100644 index 0ed8e697..00000000 --- a/llmebench/tasks/NewsCatAlArabiya.py +++ /dev/null @@ -1,34 +0,0 @@ -from sklearn.metrics import ( - accuracy_score, - f1_score, - precision_recall_fscore_support, - precision_score, - recall_score, -) - -from llmebench.tasks.task_base import TaskBase - - -class NewsCatAlArabiyaTask(TaskBase): - def __init__(self, **kwargs): - super(NewsCatAlArabiyaTask, self).__init__(**kwargs) - - def evaluate(self, gold_labels, pred_labels): - pred_labels = [ - p if p else self.get_random_prediction(set(gold_labels)) - for p in pred_labels - ] - - acc = accuracy_score(gold_labels, pred_labels) - precision = precision_score(gold_labels, pred_labels, average="macro") - recall = recall_score(gold_labels, pred_labels, average="macro") - f1 = f1_score(gold_labels, pred_labels, average="macro") - results = { - "accuracy": acc, - "macro-precision": precision, - "macro-recall": recall, - "macro-f1": f1, - "msg": "performance with respect macro-F1.", - } - - return results diff --git a/llmebench/tasks/NewsCatAlKhaleej.py b/llmebench/tasks/NewsCatAlKhaleej.py deleted file mode 100644 index 176a08ba..00000000 --- a/llmebench/tasks/NewsCatAlKhaleej.py +++ /dev/null @@ -1,34 +0,0 @@ -from sklearn.metrics import ( - accuracy_score, - f1_score, - precision_recall_fscore_support, - precision_score, - recall_score, -) - -from llmebench.tasks.task_base import TaskBase - - -class NewsCatAlKhaleejTask(TaskBase): - def __init__(self, **kwargs): - super(NewsCatAlKhaleejTask, self).__init__(**kwargs) - - def evaluate(self, gold_labels, pred_labels): - pred_labels = [ - p if p else 
self.get_random_prediction(set(gold_labels)) - for p in pred_labels - ] - - acc = accuracy_score(gold_labels, pred_labels) - precision = precision_score(gold_labels, pred_labels, average="macro") - recall = recall_score(gold_labels, pred_labels, average="macro") - f1 = f1_score(gold_labels, pred_labels, average="macro") - results = { - "accuracy": acc, - "macro-precision": precision, - "macro-recall": recall, - "macro-f1": f1, - "msg": "performance with respect macro-F1.", - } - - return results diff --git a/llmebench/tasks/NewsCatASND.py b/llmebench/tasks/NewsCategorization.py similarity index 90% rename from llmebench/tasks/NewsCatASND.py rename to llmebench/tasks/NewsCategorization.py index 5ab6091e..8d453354 100644 --- a/llmebench/tasks/NewsCatASND.py +++ b/llmebench/tasks/NewsCategorization.py @@ -9,9 +9,9 @@ from llmebench.tasks.task_base import TaskBase -class NewsCatASNDTask(TaskBase): +class NewsCategorizationTask(TaskBase): def __init__(self, **kwargs): - super(NewsCatASNDTask, self).__init__(**kwargs) + super(NewsCategorizationTask, self).__init__(**kwargs) def evaluate(self, gold_labels, pred_labels): pred_labels = [ diff --git a/llmebench/tasks/PropagandaMultilabel.py b/llmebench/tasks/PropagandaMultilabel.py deleted file mode 100644 index c661a2a1..00000000 --- a/llmebench/tasks/PropagandaMultilabel.py +++ /dev/null @@ -1,29 +0,0 @@ -import itertools - -from sklearn import preprocessing -from sklearn.metrics import f1_score - -from llmebench.tasks.task_base import TaskBase - - -class PropagandaMultilabelTask(TaskBase): - def __init__(self, **kwargs): - super(PropagandaMultilabelTask, self).__init__(**kwargs) - - def evaluate(self, true_labels, predicted_labels): - # Handle cases when model fails! 
- # Flatten true labels as it is a list of lists - predicted_labels = [p if p else ["no technique"] for p in predicted_labels] - - # Need the pre-defined list of techniques - techniques = self.dataset.get_predefined_techniques() - - # Binarize labels and use them for multi-label evaluation - mlb = preprocessing.MultiLabelBinarizer() - mlb.fit([techniques]) - gold = mlb.transform(true_labels) - pred = mlb.transform(predicted_labels) - - micro_f1 = f1_score(gold, pred, average="micro") - - return {"Micro F1": micro_f1} diff --git a/llmebench/tasks/STSTrack1.py b/llmebench/tasks/STS.py similarity index 69% rename from llmebench/tasks/STSTrack1.py rename to llmebench/tasks/STS.py index 20b9a0e0..5c7ce9b2 100644 --- a/llmebench/tasks/STSTrack1.py +++ b/llmebench/tasks/STS.py @@ -1,13 +1,11 @@ -import math - import numpy as np from llmebench.tasks.task_base import TaskBase -class STSTrack1Task(TaskBase): +class STSTask(TaskBase): def __init__(self, **kwargs): - super(STSTrack1Task, self).__init__(**kwargs) + super(STSTask, self).__init__(**kwargs) def evaluate(self, true_scores, predicted_scores): score_range = [0, 5] @@ -15,5 +13,5 @@ def evaluate(self, true_scores, predicted_scores): p if p is not None else self.get_random_continuous_prediction(score_range) for p in predicted_scores ] - # Pearson Correction is the off-diagnal of the symmetric correlation 2x2 matrix + # Pearson Correction is the off-diagonal of the symmetric correlation 2x2 matrix return {"PC": np.corrcoef(true_scores, predicted_scores)[0, 1]} diff --git a/llmebench/tasks/STSTrack2.py b/llmebench/tasks/STSTrack2.py deleted file mode 100644 index 562af7a6..00000000 --- a/llmebench/tasks/STSTrack2.py +++ /dev/null @@ -1,19 +0,0 @@ -import math - -import numpy as np - -from llmebench.tasks.task_base import TaskBase - - -class STSTrack2Task(TaskBase): - def __init__(self, **kwargs): - super(STSTrack2Task, self).__init__(**kwargs) - - def evaluate(self, true_scores, predicted_scores): - score_range = [0, 5] 
- predicted_scores = [ - p if p is not None else self.get_random_continuous_prediction(score_range) - for p in predicted_scores - ] - # Pearson Correction is the off-diagnal of the symmetric correlation 2x2 matrix - return {"PC": np.corrcoef(true_scores, predicted_scores)[0, 1]} diff --git a/llmebench/tasks/StanceKhouja20.py b/llmebench/tasks/Stance.py similarity index 81% rename from llmebench/tasks/StanceKhouja20.py rename to llmebench/tasks/Stance.py index bbb5930c..9dcd8748 100644 --- a/llmebench/tasks/StanceKhouja20.py +++ b/llmebench/tasks/Stance.py @@ -3,9 +3,9 @@ from llmebench.tasks.task_base import TaskBase -class StanceKhouja20Task(TaskBase): +class StanceTask(TaskBase): def __init__(self, **kwargs): - super(StanceKhouja20Task, self).__init__(**kwargs) + super(StanceTask, self).__init__(**kwargs) def evaluate(self, true_labels, predicted_labels): predicted_labels = [ diff --git a/llmebench/tasks/StanceUnifiedFC.py b/llmebench/tasks/StanceUnifiedFC.py deleted file mode 100644 index 5ab04762..00000000 --- a/llmebench/tasks/StanceUnifiedFC.py +++ /dev/null @@ -1,16 +0,0 @@ -from sklearn.metrics import f1_score - -from llmebench.tasks.task_base import TaskBase - - -class StanceUnifiedFCTask(TaskBase): - def __init__(self, **kwargs): - super(StanceUnifiedFCTask, self).__init__(**kwargs) - - def evaluate(self, true_labels, predicted_labels): - predicted_labels = [ - p if p else self.get_random_prediction(set(true_labels)) - for p in predicted_labels - ] - - return {"Macro F1": f1_score(true_labels, predicted_labels, average="macro")} diff --git a/llmebench/tasks/__init__.py b/llmebench/tasks/__init__.py index f5255d83..29997b19 100644 --- a/llmebench/tasks/__init__.py +++ b/llmebench/tasks/__init__.py @@ -11,29 +11,21 @@ from .DemographyNameInfo import DemographyNameInfoTask from .DialectID import DialectIDTask from .Emotion import EmotionTask -from .FactualityCOVID19 import FactualityCOVID19Task -from .FactualityKhouja20 import FactualityKhouja20Task -from 
.FactualityUnifiedFC import FactualityUnifiedFCTask +from .Factuality import FactualityTask from .HarmfulDetection import HarmfulDetectionTask from .HateSpeech import HateSpeechTask from .Lemmatization import LemmatizationTask from .MachineTranslation import MachineTranslationTask +from .MultilabelPropaganda import MultilabelPropagandaTask from .NER import NERTask -from .NewsCatAkhbarona import NewsCatAkhbaronaTask -from .NewsCatAlArabiya import NewsCatAlArabiyaTask -from .NewsCatAlKhaleej import NewsCatAlKhaleejTask -from .NewsCatASND import NewsCatASNDTask +from .NewsCategorization import NewsCategorizationTask from .Offensive import OffensiveTask -from .PropagandaMultilabel import PropagandaMultilabelTask -from .PropagandaMultilabelSemEval23 import PropagandaMultilabelSemEval23Task from .Q2QSimDetect import Q2QSimDetectionTask from .QA import QATask from .Sarcasm import SarcasmTask from .Sentiment import SentimentTask from .Spam import SpamTask -from .StanceKhouja20 import StanceKhouja20Task -from .StanceUnifiedFC import StanceUnifiedFCTask -from .STSTrack1 import STSTrack1Task -from .STSTrack2 import STSTrack2Task +from .Stance import StanceTask +from .STS import STSTask from .Subjectivity import SubjectivityTask from .XNLI import XNLITask diff --git a/tests/datasets/test_metadata.py b/tests/datasets/test_metadata.py index 425c63d9..37cd721f 100644 --- a/tests/datasets/test_metadata.py +++ b/tests/datasets/test_metadata.py @@ -17,7 +17,7 @@ def setUpClass(cls): [m[1] for m in inspect.getmembers(datasets, inspect.isclass)] ) - def test_dataset_exports(self): + def test_dataset_metadata(self): "Test if all datasets export the required metadata" for dataset in self.datasets: @@ -32,4 +32,7 @@ def test_dataset_exports(self): languages = [languages] for language in languages: - self.assertTrue(language == "multilingual" or tag_is_valid(language)) + self.assertTrue( + language == "multilingual" or tag_is_valid(language), + f"{language} is not a valid language", + )