From cfa450403d667258c3d16cb060d1f9fe79ff12fa Mon Sep 17 00:00:00 2001
From: Fahim Dalvi <faimaduddin@hbku.edu.qa>
Date: Wed, 23 Aug 2023 10:09:24 +0300
Subject: [PATCH] Rename package to `llmebench` (#174)

This commit renames the top-level package to `llmebench` to highlight the
multilingual nature of the framework. All assets have been modified to use
the new package name as well.
---
 .github/workflows/code-formatting.yml         |  2 +-
 README.md                                     | 26 +++++++++----------
 .../MT/AraBench_Ara2Eng_BLOOMZ_ZeroShot.py    |  6 ++---
 .../MT/AraBench_Ara2Eng_ChatGPT4_ZeroShot.py  |  6 ++---
 .../MT/AraBench_Ara2Eng_ChatGPT_ZeroShot.py   |  6 ++---
 .../NER/MGBWords_ChatGPT_ZeroShot.py          |  6 ++---
 .../MGBWords_GPTChatCompletion_ZeroShot.py    |  6 ++---
 .../NER/NERANERcorp_ChatGPT_ZeroShot.py       |  6 ++---
 .../NERANERcorp_GPTChatCompletion_FewShot.py  |  6 ++---
 .../NERANERcorp_GPTChatCompletion_ZeroShot.py |  6 ++---
 .../NER/NERAqmar_ChatGPT_ZeroShot.py          |  6 ++---
 .../NER/NERAqmar_GPTChatCompletion_FewShot.py |  6 ++---
 .../NERAqmar_GPTChatCompletion_ZeroShot.py    |  6 ++---
 .../benchmark_v1/QA/ARCD_BLOOMZ_ZeroShot.py   |  6 ++---
 .../benchmark_v1/QA/ARCD_ChatGPT_ZeroShot.py  |  6 ++---
 .../QA/ARCD_GPTChatCompletion_FewShot.py      |  6 ++---
 .../QA/ARCD_GPTChatCompletion_ZeroShot.py     |  6 ++---
 .../benchmark_v1/QA/MLQA_BLOOMZ_ZeroShot.py   |  6 ++---
 .../benchmark_v1/QA/MLQA_ChatGPT_ZeroShot.py  |  6 ++---
 .../QA/MLQA_GPTChatCompletion_FewShot.py      |  6 ++---
 .../QA/MLQA_GPTChatCompletion_ZeroShot.py     |  6 ++---
 .../benchmark_v1/QA/TyDiQA_BLOOMZ_ZeroShot.py |  6 ++---
 .../QA/TyDiQA_ChatGPT_ZeroShot.py             |  6 ++---
 .../QA/TyDiQA_GPTChatCompletion_FewShot.py    |  6 ++---
 .../QA/TydiQA_GPTChatCompletion_ZeroShot.py   |  6 ++---
 .../benchmark_v1/QA/XQuAD_BLOOMZ_ZeroShot.py  |  6 ++---
 .../benchmark_v1/QA/XQuAD_ChatGPT_ZeroShot.py |  6 ++---
 .../QA/XQuAD_GPTChatCompletion_ZeroShot.py    |  6 ++---
 .../QA/XQuaD_GPTChatCompletion_FewShot.py     |  6 ++---
 .../STS/Q2QSim_BLOOMZ_ZeroShot.py             |  6 ++---
 .../STS/Q2QSim_ChatGPT_ZeroShot.py            |  6 ++---
 .../benchmark_v1/STS/Q2QSim_GPT4_FewShot.py   |  6 ++---
 .../benchmark_v1/STS/Q2QSim_GPT4_ZeroShot.py  |  6 ++---
 .../STS/STSTrack1_BLOOMZ_ZeroShot.py          |  6 ++---
 .../STS/STSTrack1_ChatGPT_ZeroShot.py         |  6 ++---
 .../STS/STSTrack1_GPT4_FewShot.py             |  6 ++---
 .../STS/STSTrack1_GPT4_ZeroShot.py            |  6 ++---
 .../STS/STSTrack2_BLOOMZ_ZeroShot.py          |  6 ++---
 .../STS/STSTrack2_ChatGPT_ZeroShot.py         |  6 ++---
 .../STS/STSTrack2_GPT4_FewShot.py             |  6 ++---
 .../STS/STSTrack2_GPT4_ZeroShot.py            |  6 ++---
 .../gender/GenderArabGend_BLOOMZ_ZeroShot.py  |  6 ++---
 .../gender/GenderArabGend_ChatGPT_ZeroShot.py |  6 ++---
 ...nderArabGend_GPTChatCompletion_ZeroShot.py |  6 ++---
 .../gender/GenderArapTweet_BLOOMZ_ZeroShot.py |  6 ++---
 .../GenderArapTweet_ChatGPT_ZeroShot.py       |  6 ++---
 ...nderArapTweet_GPTChatCompletion_FewShot.py |  6 ++---
 ...derArapTweet_GPTChatCompletion_ZeroShot.py |  6 ++---
 .../location/Location_BLOOMZ_ZeroShot.py      |  6 ++---
 .../location/Location_ChatGPT_ZeroShot.py     |  6 ++---
 .../Location_GPTChatCompletion_FewShot.py     |  6 ++---
 .../Location_GPTChatCompletion_ZeroShot.py    |  6 ++---
 .../name_info/NameInfo_BLOOMZ_ZeroShot.py     |  6 ++---
 .../name_info/NameInfo_ChatGPT_ZeroShot.py    |  6 ++---
 .../NameInfo_GPTChatCompletion_FewShot.py     |  6 ++---
 .../NameInfo_GPTChatCompletion_ZeroShot.py    |  6 ++---
 .../Adult_BLOOMZ_ZeroShot.py                  |  6 ++---
 .../Adult_ChatGPT_ZeroShot.py                 |  6 ++---
 .../Adult_GPTChatCompletion_FewShot.py        |  6 ++---
 .../Adult_GPTChatCompletion_ZeroShot.py       |  6 ++---
 .../Attentionworthy_BLOOMZ_ZeroShot.py        |  6 ++---
 .../Attentionworthy_ChatGPT_ZeroShot.py       |  6 ++---
 ...tentionworthy_GPTChatCompletion_Fewshot.py |  6 ++---
 ...entionworthy_GPTChatCompletion_ZeroShot.py |  6 ++---
 .../COVClaimDetect_BLOOMZ_ZeroShot.py         |  6 ++---
 .../COVClaimDetect_CGPT35_ZeroShot.py         |  6 ++---
 .../COVClaimDetect_GPT4_FewShot.py            |  6 ++---
 .../COVHarmfulDetect_BLOOMZ_ZeroShot.py       |  6 ++---
 .../COVHarmfulDetect_CGPT35_ZeroShot.py       |  6 ++---
 .../COVHarmfulDetect_GPT4_FewShot.py          |  6 ++---
 .../Checkworthiness_BLOOMZ_BGZeroShot.py      |  6 ++---
 .../Checkworthiness_BLOOMZ_ENZeroShot.py      |  6 ++---
 .../Checkworthiness_BLOOMZ_ESZeroShot.py      |  6 ++---
 .../Checkworthiness_BLOOMZ_NLZeroShot.py      |  6 ++---
 .../Checkworthiness_BLOOMZ_TRZeroShot.py      |  6 ++---
 .../Checkworthiness_BLOOMZ_ZeroShot.py        |  6 ++---
 .../Checkworthiness_ChatGPT_ZeroShot.py       |  6 ++---
 ...kworthiness_GPTChatCompletion_BGFewShot.py |  6 ++---
 ...worthiness_GPTChatCompletion_BGZeroShot.py |  6 ++---
 ...kworthiness_GPTChatCompletion_ENFewShot.py |  6 ++---
 ...worthiness_GPTChatCompletion_ENZeroShot.py |  6 ++---
 ...kworthiness_GPTChatCompletion_ESFewShot.py |  6 ++---
 ...worthiness_GPTChatCompletion_ESZeroShot.py |  6 ++---
 ...eckworthiness_GPTChatCompletion_FewShot.py |  6 ++---
 ...kworthiness_GPTChatCompletion_NLFewShot.py |  6 ++---
 ...worthiness_GPTChatCompletion_NLZeroShot.py |  6 ++---
 ...kworthiness_GPTChatCompletion_TRFewShot.py |  6 ++---
 ...worthiness_GPTChatCompletion_TRZeroShot.py |  6 ++---
 ...ckworthiness_GPTChatCompletion_ZeroShot.py |  6 ++---
 .../ClaimDetectCOVID19_CGPT35_ZeroShot.py     |  6 ++---
 ...etectCOVID19_GPTChatCompletion_ZeroShot.py |  6 ++---
 .../FactualityCOVID19_BLOOMZ_ZeroShot.py      |  6 ++---
 ...ualityCOVID19_GPTChatCompletion_FewShot.py |  6 ++---
 ...alityCOVID19_GPTChatCompletion_ZeroShot.py |  6 ++---
 .../FactualityKhouja20_BLOOMZ_ZeroShot.py     |  6 ++---
 .../FactualityKhouja20_ChatGPT_ZeroShot.py    |  6 ++---
 .../FactualityKhouja20_GPT4_FewShot.py        |  6 ++---
 .../FactualityKhouja20_GPT4_ZeroShot.py       |  6 ++---
 .../FactualityUnifiedFC_BLOOMZ_ZeroShot.py    |  6 ++---
 .../FactualityUnifiedFC_GPT4_FewShot.py       |  6 ++---
 ...ityUnifiedFC_GPTChatCompletion_ZeroShot.py |  6 ++---
 .../HarmfulDetectCOVID19_CGPT35_ZeroShot.py   |  6 ++---
 ...etectCOVID19_GPTChatCompletion_ZeroShot.py |  6 ++---
 .../HateSpeech_ChatGPT_ZeroShot.py            |  6 ++---
 .../HateSpeech_GPTChatCompletion_FewShot.py   |  6 ++---
 .../Offensive_GPTChatCompletion_FewShot.py    |  6 ++---
 .../PropMultilabel_BLOOMZ_ENZeroShot.py       |  6 ++---
 .../PropMultilabel_BLOOMZ_FRZeroShot.py       |  6 ++---
 .../PropMultilabel_BLOOMZ_GEZeroShot.py       |  6 ++---
 .../PropMultilabel_BLOOMZ_ITZeroShot.py       |  6 ++---
 .../PropMultilabel_BLOOMZ_POZeroShot.py       |  6 ++---
 .../PropMultilabel_BLOOMZ_RUZeroShot.py       |  6 ++---
 .../PropMultilabel_BLOOMZ_ZeroShot.py         |  6 ++---
 .../PropMultilabel_CGPT35_ZeroShot.py         |  6 ++---
 .../PropMultilabel_GPT4_ENFewShot.py          |  6 ++---
 .../PropMultilabel_GPT4_ENZeroShot.py         |  6 ++---
 .../PropMultilabel_GPT4_FRFewShot.py          |  6 ++---
 .../PropMultilabel_GPT4_FRZeroShot.py         |  6 ++---
 .../PropMultilabel_GPT4_GEFewShot.py          |  6 ++---
 .../PropMultilabel_GPT4_GEZeroShot.py         |  6 ++---
 .../PropMultilabel_GPT4_ITFewShot.py          |  6 ++---
 .../PropMultilabel_GPT4_ITZeroShot.py         |  6 ++---
 .../PropMultilabel_GPT4_POFewShot.py          |  6 ++---
 .../PropMultilabel_GPT4_POZeroShot.py         |  6 ++---
 .../PropMultilabel_GPT4_RUFewShot.py          |  6 ++---
 .../PropMultilabel_GPT4_RUZeroShot.py         |  6 ++---
 .../PropMultilabel_GPT4_ZeroShot.py           |  6 ++---
 ...ropMultilabel_GPTChatCompletion_FewShot.py |  6 ++---
 ...opMultilabel_GPTChatCompletion_ZeroShot.py |  6 ++---
 .../Subjectivity_BLOOMZ_ZeroShot.py           |  6 ++---
 .../Subjectivity_ChatGPT_ZeroShot.py          |  6 ++---
 .../Subjectivity_GPTChatCompletion_FewShot.py |  6 ++---
 ...Subjectivity_GPTChatCompletion_ZeroShot.py |  6 ++---
 .../NewsCat_ASND_BLOOMZ_ZeroShot.py           |  6 ++---
 .../NewsCat_ASND_ChatGPT_ZeroShot.py          |  6 ++---
 .../NewsCat_ASND_GPTChatCompletion_FewShot.py |  6 ++---
 ...NewsCat_ASND_GPTChatCompletion_ZeroShot.py |  6 ++---
 .../NewsCat_Akhbarona_BLOOMZ_ZeroShot.py      |  6 ++---
 .../NewsCat_Akhbarona_ChatGPT_ZeroShot.py     |  6 ++---
 ...Cat_Akhbarona_GPTChatCompletion_FewShot.py |  6 ++---
 ...at_Akhbarona_GPTChatCompletion_ZeroShot.py |  6 ++---
 .../NewsCat_AlArabiya_BLOOMZ_ZeroShot.py      |  6 ++---
 .../NewsCat_AlArabiya_ChatGPT_ZeroShot.py     |  6 ++---
 ...Cat_AlArabiya_GPTChatCompletion_FewShot.py |  6 ++---
 ...at_AlArabiya_GPTChatCompletion_ZeroShot.py |  6 ++---
 .../NewsCat_AlKhaleej_BLOOMZ_ZeroShot.py      |  6 ++---
 .../NewsCat_AlKhaleej_ChatGPT_ZeroShot.py     |  6 ++---
 ...Cat_AlKhaleej_GPTChatCompletion_FewShot.py |  6 ++---
 ...at_AlKhaleej_GPTChatCompletion_ZeroShot.py |  6 ++---
 .../sarcasm/ArSarcasm2_GPT3_Zeroshot.py       |  6 ++---
 .../sarcasm/ArSarcasm2_GPT4_FewShot.py        |  6 ++---
 .../sarcasm/ArSarcasm2_GPT4_Zeroshot.py       |  6 ++---
 .../sarcasm/ArSarcasm_BLOOMZ_Zeroshot.py      |  6 ++---
 .../sarcasm/ArSarcasm_GPT3_Zeroshot.py        |  6 ++---
 .../sarcasm/ArSarcasm_GPT4_Fewshot.py         |  6 ++---
 .../sarcasm/ArSarcasm_GPT4_Zeroshot.py        |  6 ++---
 .../semantics/XNLI_BLOOMZ_ZeroShot.py         |  6 ++---
 .../semantics/XNLI_CGPT4_FewShot.py           |  6 ++---
 .../semantics/XNLI_CGPT4_ZeroShot.py          |  6 ++---
 .../semantics/XNLI_ChatGPT_ZeroShot.py        |  6 ++---
 .../emotion/Emotion_BLOOMZ_ZeroShot.py        |  6 ++---
 .../emotion/Emotion_ChatGPT_ZeroShot.py       |  6 ++---
 .../Emotion_GPTChatCompletion_FewShot.py      |  6 ++---
 .../Emotion_GPTChatCompletion_ZeroShot.py     |  6 ++---
 .../offensive/Offensive_BLOOMZ_ZeroShot.py    |  6 ++---
 .../offensive/Offensive_ChatGPT_ZeroShot.py   |  6 ++---
 .../Offensive_GPTChatCompletion_ZeroShot.py   |  6 ++---
 .../ArSASSentiment_BLOOMZ_ZeroShot.py         |  6 ++---
 .../ArSASSentiment_ChatGPT_ZeroShot.py        |  6 ++---
 ...SASSentiment_GPTChatCompletion_ZeroShot.py |  6 ++---
 .../BanglaSentiment_BLOOMZ_ZeroShot.py        |  6 ++---
 .../sentiment/BanglaSentiment_GPT4_FewShot.py |  6 ++---
 .../BanglaSentiment_GPT4_ZeroShot.py          |  6 ++---
 .../sentiment/spam/Spam_BLOOMZ_ZeroShot.py    |  6 ++---
 .../sentiment/spam/Spam_ChatGPT_ZeroShot.py   |  6 ++---
 .../spam/Spam_GPTChatCompletion_ZeroShot.py   |  6 ++---
 .../StanceKhouja20_BLOOMZ_ZeroShot.py         |  6 ++---
 .../StanceKhouja20_ChatGPT_ZeroShot.py        |  6 ++---
 ...tanceKhouja20_GPTChatCompletion_FewShot.py |  6 ++---
 ...anceKhouja20_GPTChatCompletion_ZeroShot.py |  6 ++---
 .../StanceUnifiedFC_BLOOMZ_ZeroShot.py        |  6 ++---
 .../StanceUnifiedFC_ChatGPT_ZeroShot.py       |  6 ++---
 ...anceUnifiedFC_GPTChatCompletion_FewShot.py |  6 ++---
 ...nceUnifiedFC_GPTChatCompletion_ZeroShot.py |  6 ++---
 .../DialectADI_BLOOMZ_ZeroShot.py             |  6 ++---
 .../DialectADI_ChatGPT_ZeroShot.py            |  6 ++---
 .../DialectADI_GPTChatCompletion_FewShot.py   |  6 ++---
 .../DialectADI_GPTChatCompletion_ZeroShot.py  |  6 ++---
 .../DialectID_QADI_ChatGPT_ZeroShot.py        |  6 ++---
 ...alectID_QADI_GPTChatCompletion_ZeroShot.py |  6 ++---
 .../POS_ChatGPT_ZeroShot.py                   |  6 ++---
 .../POS_GPT4_ZeroShot.py                      |  6 ++---
 .../POS_GPTChatCompletion_FewShot.py          |  6 ++---
 .../diacritization_ChatGPT_ZeroShot.py        |  6 ++---
 .../Lemmatization_ChatGPT_ZeroShot.py         |  6 ++---
 .../parsing_ChatGPT_ZeroShot.py               |  6 ++---
 .../parsing_GPT4_ZeroShot.py                  |  6 ++---
 .../segmentation_ChatGPT_ZeroShot.py          |  6 ++---
 .../segmentation_GPT4_ZeroShot.py             |  6 ++---
 .../__init__.py                               |  0
 .../__main__.py                               |  0
 .../benchmark.py                              |  0
 .../datasets/ANERcorp.py                      |  2 +-
 .../datasets/ARCD.py                          |  2 +-
 .../datasets/Adult.py                         |  2 +-
 .../datasets/Aqmar.py                         |  2 +-
 .../datasets/ArSASSentiment.py                |  2 +-
 .../datasets/ArSarcasm.py                     |  2 +-
 .../datasets/AraBench.py                      |  2 +-
 .../datasets/ArabGend.py                      |  2 +-
 .../datasets/ArabicDiacritization.py          |  2 +-
 .../datasets/ArabicPOS.py                     |  2 +-
 .../datasets/ArabicParsing.py                 |  2 +-
 .../datasets/ArabicSegmentation.py            |  2 +-
 .../datasets/ArapTweet.py                     |  2 +-
 .../datasets/Attentionworthy.py               |  2 +-
 .../datasets/BanglaSentiment.py               |  2 +-
 .../datasets/Checkworthiness.py               |  2 +-
 .../datasets/Claim.py                         |  2 +-
 .../datasets/DialectADI.py                    |  2 +-
 .../datasets/Emotion.py                       |  2 +-
 .../datasets/FactualityCOVID19.py             |  2 +-
 .../datasets/FactualityKhouja20.py            |  2 +-
 .../datasets/FactualityUnifiedFC.py           |  2 +-
 .../datasets/Harmful.py                       |  2 +-
 .../datasets/HateSpeech.py                    |  2 +-
 .../datasets/Lemmatization.py                 |  2 +-
 .../datasets/Location.py                      |  2 +-
 .../datasets/MGBWords.py                      |  2 +-
 .../datasets/MLQA.py                          |  2 +-
 .../datasets/NameInfo.py                      |  2 +-
 .../datasets/NewsCatASND.py                   |  2 +-
 .../datasets/NewsCatAkhbarona.py              |  2 +-
 .../datasets/NewsCatAlArabiya.py              |  2 +-
 .../datasets/NewsCatAlKhaleej.py              |  2 +-
 .../datasets/Offensive.py                     |  2 +-
 .../datasets/Propaganda.py                    |  2 +-
 .../datasets/PropagandaSemEval23.py           |  2 +-
 .../datasets/QADI.py                          |  2 +-
 .../datasets/SQuADBase.py                     |  2 +-
 .../datasets/STSArSemEval17Track1.py          |  2 +-
 .../datasets/STSArSemEval17Track2.py          |  2 +-
 .../datasets/STSQ2Q.py                        |  2 +-
 .../datasets/Spam.py                          |  2 +-
 .../datasets/StanceKhouja20.py                |  2 +-
 .../datasets/StanceUnifiedFC.py               |  2 +-
 .../datasets/Subjectivity.py                  |  2 +-
 .../datasets/TyDiQA.py                        |  2 +-
 .../datasets/XNLI.py                          |  2 +-
 .../datasets/XQuAD.py                         |  2 +-
 .../datasets/__init__.py                      |  0
 .../datasets/dataset_base.py                  |  0
 .../models/BLOOMPetal.py                      |  2 +-
 .../models/GPT.py                             |  2 +-
 .../models/RandomGPT.py                       |  2 +-
 .../models/__init__.py                        |  0
 .../models/model_base.py                      |  0
 .../tasks/Adult.py                            |  2 +-
 .../tasks/ArabicDiacritization.py             |  2 +-
 .../tasks/ArabicPOS.py                        |  2 +-
 .../tasks/ArabicParsing.py                    |  2 +-
 .../tasks/ArabicSegmentation.py               |  2 +-
 .../tasks/Attentionworthy.py                  |  2 +-
 .../tasks/Checkworthiness.py                  |  2 +-
 .../tasks/ClaimDetection.py                   |  2 +-
 .../tasks/DemographyGender.py                 |  2 +-
 .../tasks/DemographyLocation.py               |  2 +-
 .../tasks/DemographyNameInfo.py               |  2 +-
 .../tasks/DialectID.py                        |  2 +-
 .../tasks/Emotion.py                          |  2 +-
 .../tasks/FactualityCOVID19.py                |  2 +-
 .../tasks/FactualityKhouja20.py               |  2 +-
 .../tasks/FactualityUnifiedFC.py              |  2 +-
 .../tasks/HarmfulDetection.py                 |  2 +-
 .../tasks/HateSpeech.py                       |  2 +-
 .../tasks/Lemmatization.py                    |  2 +-
 .../tasks/MachineTranslation.py               |  2 +-
 .../tasks/NER.py                              |  2 +-
 .../tasks/NewsCatASND.py                      |  2 +-
 .../tasks/NewsCatAkhbarona.py                 |  2 +-
 .../tasks/NewsCatAlArabiya.py                 |  2 +-
 .../tasks/NewsCatAlKhaleej.py                 |  2 +-
 .../tasks/Offensive.py                        |  2 +-
 .../tasks/PropagandaMultilabel.py             |  2 +-
 .../tasks/PropagandaMultilabelSemEval23.py    |  2 +-
 .../tasks/Q2QSimDetect.py                     |  2 +-
 .../tasks/QA.py                               |  2 +-
 .../tasks/STSTrack1.py                        |  2 +-
 .../tasks/STSTrack2.py                        |  2 +-
 .../tasks/Sarcasm.py                          |  2 +-
 .../tasks/Sentiment.py                        |  2 +-
 .../tasks/Spam.py                             |  2 +-
 .../tasks/StanceKhouja20.py                   |  2 +-
 .../tasks/StanceUnifiedFC.py                  |  2 +-
 .../tasks/Subjectivity.py                     |  2 +-
 .../tasks/XNLI.py                             |  2 +-
 .../tasks/__init__.py                         |  0
 .../tasks/task_base.py                        |  0
 {arabic_llm_benchmark => llmebench}/utils.py  |  0
 scripts/format_code.sh                        |  4 +--
 scripts/run_tests.sh                          |  2 +-
 setup.cfg                                     |  6 ++---
 tests/datasets/test_exports.py                |  8 +++---
 tests/models/test_BLOOMPetal.py               |  4 +--
 tests/models/test_GPT.py                      |  4 +--
 tests/models/test_GPTChatCompletion.py        |  4 +--
 tests/models/test_exports.py                  |  8 +++---
 tests/tasks/test_evaluation.py                |  2 +-
 tests/tasks/test_exports.py                   |  8 +++---
 tests/test_benchmark_assets.py                |  2 +-
 310 files changed, 721 insertions(+), 721 deletions(-)
 rename {arabic_llm_benchmark => llmebench}/__init__.py (100%)
 rename {arabic_llm_benchmark => llmebench}/__main__.py (100%)
 rename {arabic_llm_benchmark => llmebench}/benchmark.py (100%)
 rename {arabic_llm_benchmark => llmebench}/datasets/ANERcorp.py (98%)
 rename {arabic_llm_benchmark => llmebench}/datasets/ARCD.py (88%)
 rename {arabic_llm_benchmark => llmebench}/datasets/Adult.py (95%)
 rename {arabic_llm_benchmark => llmebench}/datasets/Aqmar.py (98%)
 rename {arabic_llm_benchmark => llmebench}/datasets/ArSASSentiment.py (92%)
 rename {arabic_llm_benchmark => llmebench}/datasets/ArSarcasm.py (95%)
 rename {arabic_llm_benchmark => llmebench}/datasets/AraBench.py (96%)
 rename {arabic_llm_benchmark => llmebench}/datasets/ArabGend.py (93%)
 rename {arabic_llm_benchmark => llmebench}/datasets/ArabicDiacritization.py (95%)
 rename {arabic_llm_benchmark => llmebench}/datasets/ArabicPOS.py (95%)
 rename {arabic_llm_benchmark => llmebench}/datasets/ArabicParsing.py (96%)
 rename {arabic_llm_benchmark => llmebench}/datasets/ArabicSegmentation.py (96%)
 rename {arabic_llm_benchmark => llmebench}/datasets/ArapTweet.py (97%)
 rename {arabic_llm_benchmark => llmebench}/datasets/Attentionworthy.py (95%)
 rename {arabic_llm_benchmark => llmebench}/datasets/BanglaSentiment.py (95%)
 rename {arabic_llm_benchmark => llmebench}/datasets/Checkworthiness.py (96%)
 rename {arabic_llm_benchmark => llmebench}/datasets/Claim.py (95%)
 rename {arabic_llm_benchmark => llmebench}/datasets/DialectADI.py (92%)
 rename {arabic_llm_benchmark => llmebench}/datasets/Emotion.py (94%)
 rename {arabic_llm_benchmark => llmebench}/datasets/FactualityCOVID19.py (97%)
 rename {arabic_llm_benchmark => llmebench}/datasets/FactualityKhouja20.py (94%)
 rename {arabic_llm_benchmark => llmebench}/datasets/FactualityUnifiedFC.py (97%)
 rename {arabic_llm_benchmark => llmebench}/datasets/Harmful.py (95%)
 rename {arabic_llm_benchmark => llmebench}/datasets/HateSpeech.py (95%)
 rename {arabic_llm_benchmark => llmebench}/datasets/Lemmatization.py (95%)
 rename {arabic_llm_benchmark => llmebench}/datasets/Location.py (94%)
 rename {arabic_llm_benchmark => llmebench}/datasets/MGBWords.py (96%)
 rename {arabic_llm_benchmark => llmebench}/datasets/MLQA.py (88%)
 rename {arabic_llm_benchmark => llmebench}/datasets/NameInfo.py (93%)
 rename {arabic_llm_benchmark => llmebench}/datasets/NewsCatASND.py (95%)
 rename {arabic_llm_benchmark => llmebench}/datasets/NewsCatAkhbarona.py (95%)
 rename {arabic_llm_benchmark => llmebench}/datasets/NewsCatAlArabiya.py (95%)
 rename {arabic_llm_benchmark => llmebench}/datasets/NewsCatAlKhaleej.py (95%)
 rename {arabic_llm_benchmark => llmebench}/datasets/Offensive.py (94%)
 rename {arabic_llm_benchmark => llmebench}/datasets/Propaganda.py (96%)
 rename {arabic_llm_benchmark => llmebench}/datasets/PropagandaSemEval23.py (98%)
 rename {arabic_llm_benchmark => llmebench}/datasets/QADI.py (94%)
 rename {arabic_llm_benchmark => llmebench}/datasets/SQuADBase.py (95%)
 rename {arabic_llm_benchmark => llmebench}/datasets/STSArSemEval17Track1.py (96%)
 rename {arabic_llm_benchmark => llmebench}/datasets/STSArSemEval17Track2.py (96%)
 rename {arabic_llm_benchmark => llmebench}/datasets/STSQ2Q.py (95%)
 rename {arabic_llm_benchmark => llmebench}/datasets/Spam.py (94%)
 rename {arabic_llm_benchmark => llmebench}/datasets/StanceKhouja20.py (94%)
 rename {arabic_llm_benchmark => llmebench}/datasets/StanceUnifiedFC.py (97%)
 rename {arabic_llm_benchmark => llmebench}/datasets/Subjectivity.py (95%)
 rename {arabic_llm_benchmark => llmebench}/datasets/TyDiQA.py (91%)
 rename {arabic_llm_benchmark => llmebench}/datasets/XNLI.py (96%)
 rename {arabic_llm_benchmark => llmebench}/datasets/XQuAD.py (90%)
 rename {arabic_llm_benchmark => llmebench}/datasets/__init__.py (100%)
 rename {arabic_llm_benchmark => llmebench}/datasets/dataset_base.py (100%)
 rename {arabic_llm_benchmark => llmebench}/models/BLOOMPetal.py (97%)
 rename {arabic_llm_benchmark => llmebench}/models/GPT.py (98%)
 rename {arabic_llm_benchmark => llmebench}/models/RandomGPT.py (94%)
 rename {arabic_llm_benchmark => llmebench}/models/__init__.py (100%)
 rename {arabic_llm_benchmark => llmebench}/models/model_base.py (100%)
 rename {arabic_llm_benchmark => llmebench}/tasks/Adult.py (88%)
 rename {arabic_llm_benchmark => llmebench}/tasks/ArabicDiacritization.py (98%)
 rename {arabic_llm_benchmark => llmebench}/tasks/ArabicPOS.py (94%)
 rename {arabic_llm_benchmark => llmebench}/tasks/ArabicParsing.py (93%)
 rename {arabic_llm_benchmark => llmebench}/tasks/ArabicSegmentation.py (94%)
 rename {arabic_llm_benchmark => llmebench}/tasks/Attentionworthy.py (94%)
 rename {arabic_llm_benchmark => llmebench}/tasks/Checkworthiness.py (94%)
 rename {arabic_llm_benchmark => llmebench}/tasks/ClaimDetection.py (89%)
 rename {arabic_llm_benchmark => llmebench}/tasks/DemographyGender.py (88%)
 rename {arabic_llm_benchmark => llmebench}/tasks/DemographyLocation.py (88%)
 rename {arabic_llm_benchmark => llmebench}/tasks/DemographyNameInfo.py (89%)
 rename {arabic_llm_benchmark => llmebench}/tasks/DialectID.py (93%)
 rename {arabic_llm_benchmark => llmebench}/tasks/Emotion.py (89%)
 rename {arabic_llm_benchmark => llmebench}/tasks/FactualityCOVID19.py (94%)
 rename {arabic_llm_benchmark => llmebench}/tasks/FactualityKhouja20.py (88%)
 rename {arabic_llm_benchmark => llmebench}/tasks/FactualityUnifiedFC.py (94%)
 rename {arabic_llm_benchmark => llmebench}/tasks/HarmfulDetection.py (90%)
 rename {arabic_llm_benchmark => llmebench}/tasks/HateSpeech.py (88%)
 rename {arabic_llm_benchmark => llmebench}/tasks/Lemmatization.py (92%)
 rename {arabic_llm_benchmark => llmebench}/tasks/MachineTranslation.py (87%)
 rename {arabic_llm_benchmark => llmebench}/tasks/NER.py (96%)
 rename {arabic_llm_benchmark => llmebench}/tasks/NewsCatASND.py (94%)
 rename {arabic_llm_benchmark => llmebench}/tasks/NewsCatAkhbarona.py (94%)
 rename {arabic_llm_benchmark => llmebench}/tasks/NewsCatAlArabiya.py (94%)
 rename {arabic_llm_benchmark => llmebench}/tasks/NewsCatAlKhaleej.py (94%)
 rename {arabic_llm_benchmark => llmebench}/tasks/Offensive.py (88%)
 rename {arabic_llm_benchmark => llmebench}/tasks/PropagandaMultilabel.py (94%)
 rename {arabic_llm_benchmark => llmebench}/tasks/PropagandaMultilabelSemEval23.py (94%)
 rename {arabic_llm_benchmark => llmebench}/tasks/Q2QSimDetect.py (88%)
 rename {arabic_llm_benchmark => llmebench}/tasks/QA.py (97%)
 rename {arabic_llm_benchmark => llmebench}/tasks/STSTrack1.py (90%)
 rename {arabic_llm_benchmark => llmebench}/tasks/STSTrack2.py (90%)
 rename {arabic_llm_benchmark => llmebench}/tasks/Sarcasm.py (89%)
 rename {arabic_llm_benchmark => llmebench}/tasks/Sentiment.py (94%)
 rename {arabic_llm_benchmark => llmebench}/tasks/Spam.py (88%)
 rename {arabic_llm_benchmark => llmebench}/tasks/StanceKhouja20.py (88%)
 rename {arabic_llm_benchmark => llmebench}/tasks/StanceUnifiedFC.py (88%)
 rename {arabic_llm_benchmark => llmebench}/tasks/Subjectivity.py (95%)
 rename {arabic_llm_benchmark => llmebench}/tasks/XNLI.py (88%)
 rename {arabic_llm_benchmark => llmebench}/tasks/__init__.py (100%)
 rename {arabic_llm_benchmark => llmebench}/tasks/task_base.py (100%)
 rename {arabic_llm_benchmark => llmebench}/utils.py (100%)

diff --git a/.github/workflows/code-formatting.yml b/.github/workflows/code-formatting.yml
index 18f89895..ee02756b 100644
--- a/.github/workflows/code-formatting.yml
+++ b/.github/workflows/code-formatting.yml
@@ -33,7 +33,7 @@ jobs:
           pip install '.[dev]'
       - name: Run ufmt check on framework code
         run: |
-          ufmt check arabic_llm_benchmark
+          ufmt check llmebench
       - name: Run ufmt check on test code
         run: |
           ufmt check tests
diff --git a/README.md b/README.md
index 74e5bb8a..a6f22d51 100644
--- a/README.md
+++ b/README.md
@@ -5,14 +5,14 @@
 
 Clone this repository:
 ```bash
-git clone https://github.com/qcri/Arabic_LLM_Benchmark.git
-cd Arabic_LLM_Benchmark
+git clone https://github.com/qcri/LLMeBench.git
+cd LLMeBench
 ```
 
 Create a virtual environment:
 ```bash
-python -m venv .envs/arabic_llm_benchmark
-source .envs/arabic_llm_benchmark/bin/activate
+python -m venv .envs/llmebench
+source .envs/llmebench/bin/activate
 ```
 
 Install the dependencies and benchmarking package:
@@ -21,7 +21,7 @@ pip install -e '.[dev,fewshot]'
 ```
 
 ## Get the benchmark data
-Download the benchmark from [here](https://neurox.qcri.org/projects/arabic_llm_benchmark/arabic_llm_benchmark_data.zip), and unzip it into the `Arabic_LLM_Benchmark` folder. After this process, there should be a `data` directory inside the top-level folder of the repository, with roughly the following contents:
+Download the benchmark from [here](https://neurox.qcri.org/projects/llmebench/arabic_llm_benchmark_data.zip), and unzip it into the `LLMeBench` folder. After this process, there should be a `data` directory inside the top-level folder of the repository, with roughly the following contents:
 
 ```bash
 $ ls data/
@@ -39,7 +39,7 @@ speech
 A sample benchmark is available in `assets/benchmark_v1`. To run the benchmark,
 
 ```bash
-python -m arabic_llm_benchmark <benchmark-dir> <results-dir>
+python -m llmebench <benchmark-dir> <results-dir>
 ```
 
 where `<benchmark-dir>` can point to `assets/benchmark_v1` for example. The
@@ -58,7 +58,7 @@ git checkout -b feat/sarcasm_task
 ```
 
 ### Dataset
-Check if the dataset used by your task already has an implementation in `arabic_llm_benchmark/datasets`. If not, implement a new dataset module (e.g. `arabic_llm_benchmark/datasets/SemEval23.py`), which implements a class (e.g. `SemEval23Dataset`) which subclasses `DatasetBase`. See an existing dataset module for inspiration. Each new dataset class requires implementing three functions:
+Check if the dataset used by your task already has an implementation in `llmebench/datasets`. If not, implement a new dataset module (e.g. `llmebench/datasets/SemEval23.py`), which implements a class (e.g. `SemEval23Dataset`) which subclasses `DatasetBase`. See an existing dataset module for inspiration. Each new dataset class requires implementing three functions:
 
 ```python
 class NewDataset(DatasetBase):
@@ -78,10 +78,10 @@ class NewDataset(DatasetBase):
 		#   "label": this will be used for evaluation
 ```
 
-Once the `Dataset` is implemented, export it in `arabic_llm_benchmark/datasets/__init__.py`.
+Once the `Dataset` is implemented, export it in `llmebench/datasets/__init__.py`.
 
 ### Task
-Check if the task you are adding to the benchmark already has an implementation in `arabic_llm_benchmark/tasks`. If not, implement a new dataset module (e.g. `arabic_llm_benchmark/tasks/Sarcasm.py`), which implements a class (e.g. `SarcasmTask`) which subclasses `TaskBase`. See an existing task module for inspiration. Each new task class requires implementing two functions:
+Check if the task you are adding to the benchmark already has an implementation in `llmebench/tasks`. If not, implement a new task module (e.g. `llmebench/tasks/Sarcasm.py`), which implements a class (e.g. `SarcasmTask`) which subclasses `TaskBase`. See an existing task module for inspiration. Each new task class requires implementing two functions:
 
 ```python
 class NewTask(TaskBase):
@@ -97,10 +97,10 @@ class NewTask(TaskBase):
 		# post_process function
 ```
 
-Once the `Task` is implemented, export it in `arabic_llm_benchmark/tasks/__init__.py`.
+Once the `Task` is implemented, export it in `llmebench/tasks/__init__.py`.
 
 ### Model
-Next, check if the model you are trying to run the benchmark for has an implementation in `arabic_llm_benchmark/models`. If not, implement a new model module (e.g. `arabic_llm_benchmark/models/QARiB.py`), which implements a class (e.g. `QARiBModel`) which subclasses `ModelBase`. See an existing model module for inspiration. Each new model class requires implementing two functions:
+Next, check if the model you are trying to run the benchmark for has an implementation in `llmebench/models`. If not, implement a new model module (e.g. `llmebench/models/QARiB.py`), which implements a class (e.g. `QARiBModel`) which subclasses `ModelBase`. See an existing model module for inspiration. Each new model class requires implementing two functions:
 
 ```python
 class NewModel(TaskBase):
@@ -115,7 +115,7 @@ class NewModel(TaskBase):
 		# run the actual model and return model outputs
 ```
 
-Once the `Model` is implemented, export it in `arabic_llm_benchmark/models/__init__.py`.
+Once the `Model` is implemented, export it in `llmebench/models/__init__.py`.
 
 ### Benchmark Asset
 Now that the Dataset, Task and Model are defined, the framework expects a given benchmark asset (e.g. "ArabGender" dataset, "GenderClassification" task, "GPT" model and "ZeroShot" prompting setting) to have a `*.py` file with three functions:
@@ -145,7 +145,7 @@ def post_process(response):
 The benchmarking module allows one to run a specific asset instead of the entire benchmark using the `--filter` option. It is also a good idea to use the `--limit` option to limit the tests to few (e.g. 5 samples). Sample command below:
 
 ```bash
-python -m arabic_llm_benchmark --filter 'demography/gender/AraGend_ChatGPT_ZeroShot' --limit 5 --ignore_cache <benchmark-dir> <results-dir>
+python -m llmebench --filter 'demography/gender/AraGend_ChatGPT_ZeroShot' --limit 5 --ignore_cache <benchmark-dir> <results-dir>
 ```
 
 Make sure to also run `scripts/run_tests.sh` before submitting your code, and once you are ready, you can commit your changes locally and push them to a remote branch:
diff --git a/assets/benchmark_v1/MT/AraBench_Ara2Eng_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/MT/AraBench_Ara2Eng_BLOOMZ_ZeroShot.py
index 9c95d312..3db0e90f 100644
--- a/assets/benchmark_v1/MT/AraBench_Ara2Eng_BLOOMZ_ZeroShot.py
+++ b/assets/benchmark_v1/MT/AraBench_Ara2Eng_BLOOMZ_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import AraBenchDataset
-from arabic_llm_benchmark.models import BLOOMPetalModel
-from arabic_llm_benchmark.tasks import MachineTranslationTask
+from llmebench.datasets import AraBenchDataset
+from llmebench.models import BLOOMPetalModel
+from llmebench.tasks import MachineTranslationTask
 
 
 def config():
diff --git a/assets/benchmark_v1/MT/AraBench_Ara2Eng_ChatGPT4_ZeroShot.py b/assets/benchmark_v1/MT/AraBench_Ara2Eng_ChatGPT4_ZeroShot.py
index 486bb51c..68db7bb7 100644
--- a/assets/benchmark_v1/MT/AraBench_Ara2Eng_ChatGPT4_ZeroShot.py
+++ b/assets/benchmark_v1/MT/AraBench_Ara2Eng_ChatGPT4_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import AraBenchDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import MachineTranslationTask
+from llmebench.datasets import AraBenchDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import MachineTranslationTask
 
 
 def config():
diff --git a/assets/benchmark_v1/MT/AraBench_Ara2Eng_ChatGPT_ZeroShot.py b/assets/benchmark_v1/MT/AraBench_Ara2Eng_ChatGPT_ZeroShot.py
index 245c9b46..147859b7 100644
--- a/assets/benchmark_v1/MT/AraBench_Ara2Eng_ChatGPT_ZeroShot.py
+++ b/assets/benchmark_v1/MT/AraBench_Ara2Eng_ChatGPT_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import AraBenchDataset
-from arabic_llm_benchmark.models import GPTModel
-from arabic_llm_benchmark.tasks import MachineTranslationTask
+from llmebench.datasets import AraBenchDataset
+from llmebench.models import GPTModel
+from llmebench.tasks import MachineTranslationTask
 
 
 def config():
diff --git a/assets/benchmark_v1/NER/MGBWords_ChatGPT_ZeroShot.py b/assets/benchmark_v1/NER/MGBWords_ChatGPT_ZeroShot.py
index 75be7f08..5541a741 100644
--- a/assets/benchmark_v1/NER/MGBWords_ChatGPT_ZeroShot.py
+++ b/assets/benchmark_v1/NER/MGBWords_ChatGPT_ZeroShot.py
@@ -1,9 +1,9 @@
 import os
 import re
 
-from arabic_llm_benchmark.datasets import MGBWordsDataset
-from arabic_llm_benchmark.models import GPTModel
-from arabic_llm_benchmark.tasks import NERTask
+from llmebench.datasets import MGBWordsDataset
+from llmebench.models import GPTModel
+from llmebench.tasks import NERTask
 
 
 def config():
diff --git a/assets/benchmark_v1/NER/MGBWords_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/NER/MGBWords_GPTChatCompletion_ZeroShot.py
index 42f047d9..d2abd099 100644
--- a/assets/benchmark_v1/NER/MGBWords_GPTChatCompletion_ZeroShot.py
+++ b/assets/benchmark_v1/NER/MGBWords_GPTChatCompletion_ZeroShot.py
@@ -1,9 +1,9 @@
 import os
 import re
 
-from arabic_llm_benchmark.datasets import MGBWordsDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import NERTask
+from llmebench.datasets import MGBWordsDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import NERTask
 
 
 def config():
diff --git a/assets/benchmark_v1/NER/NERANERcorp_ChatGPT_ZeroShot.py b/assets/benchmark_v1/NER/NERANERcorp_ChatGPT_ZeroShot.py
index 9b644df8..984d1044 100644
--- a/assets/benchmark_v1/NER/NERANERcorp_ChatGPT_ZeroShot.py
+++ b/assets/benchmark_v1/NER/NERANERcorp_ChatGPT_ZeroShot.py
@@ -1,9 +1,9 @@
 import os
 import re
 
-from arabic_llm_benchmark.datasets import ANERcorpDataset
-from arabic_llm_benchmark.models import GPTModel, RandomGPTModel
-from arabic_llm_benchmark.tasks import NERTask
+from llmebench.datasets import ANERcorpDataset
+from llmebench.models import GPTModel, RandomGPTModel
+from llmebench.tasks import NERTask
 
 
 def config():
diff --git a/assets/benchmark_v1/NER/NERANERcorp_GPTChatCompletion_FewShot.py b/assets/benchmark_v1/NER/NERANERcorp_GPTChatCompletion_FewShot.py
index 6a053e27..10710072 100644
--- a/assets/benchmark_v1/NER/NERANERcorp_GPTChatCompletion_FewShot.py
+++ b/assets/benchmark_v1/NER/NERANERcorp_GPTChatCompletion_FewShot.py
@@ -1,9 +1,9 @@
 import os
 import re
 
-from arabic_llm_benchmark.datasets import ANERcorpDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import NERTask
+from llmebench.datasets import ANERcorpDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import NERTask
 
 
 def config():
diff --git a/assets/benchmark_v1/NER/NERANERcorp_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/NER/NERANERcorp_GPTChatCompletion_ZeroShot.py
index e507445f..6091e611 100644
--- a/assets/benchmark_v1/NER/NERANERcorp_GPTChatCompletion_ZeroShot.py
+++ b/assets/benchmark_v1/NER/NERANERcorp_GPTChatCompletion_ZeroShot.py
@@ -1,9 +1,9 @@
 import os
 import re
 
-from arabic_llm_benchmark.datasets import ANERcorpDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import NERTask
+from llmebench.datasets import ANERcorpDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import NERTask
 
 
 def config():
diff --git a/assets/benchmark_v1/NER/NERAqmar_ChatGPT_ZeroShot.py b/assets/benchmark_v1/NER/NERAqmar_ChatGPT_ZeroShot.py
index 6b8f21f7..5f44fbea 100644
--- a/assets/benchmark_v1/NER/NERAqmar_ChatGPT_ZeroShot.py
+++ b/assets/benchmark_v1/NER/NERAqmar_ChatGPT_ZeroShot.py
@@ -1,9 +1,9 @@
 import os
 import re
 
-from arabic_llm_benchmark.datasets import AqmarDataset
-from arabic_llm_benchmark.models import GPTModel, RandomGPTModel
-from arabic_llm_benchmark.tasks import NERTask
+from llmebench.datasets import AqmarDataset
+from llmebench.models import GPTModel, RandomGPTModel
+from llmebench.tasks import NERTask
 
 
 def config():
diff --git a/assets/benchmark_v1/NER/NERAqmar_GPTChatCompletion_FewShot.py b/assets/benchmark_v1/NER/NERAqmar_GPTChatCompletion_FewShot.py
index a9865520..662d1230 100644
--- a/assets/benchmark_v1/NER/NERAqmar_GPTChatCompletion_FewShot.py
+++ b/assets/benchmark_v1/NER/NERAqmar_GPTChatCompletion_FewShot.py
@@ -1,9 +1,9 @@
 import os
 import re
 
-from arabic_llm_benchmark.datasets import AqmarDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import NERTask
+from llmebench.datasets import AqmarDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import NERTask
 
 
 def config():
diff --git a/assets/benchmark_v1/NER/NERAqmar_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/NER/NERAqmar_GPTChatCompletion_ZeroShot.py
index b42e55bf..05eb8401 100644
--- a/assets/benchmark_v1/NER/NERAqmar_GPTChatCompletion_ZeroShot.py
+++ b/assets/benchmark_v1/NER/NERAqmar_GPTChatCompletion_ZeroShot.py
@@ -1,9 +1,9 @@
 import os
 import re
 
-from arabic_llm_benchmark.datasets import AqmarDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import NERTask
+from llmebench.datasets import AqmarDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import NERTask
 
 
 def config():
diff --git a/assets/benchmark_v1/QA/ARCD_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/QA/ARCD_BLOOMZ_ZeroShot.py
index 340e6c07..9e3fe217 100644
--- a/assets/benchmark_v1/QA/ARCD_BLOOMZ_ZeroShot.py
+++ b/assets/benchmark_v1/QA/ARCD_BLOOMZ_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import ARCDDataset
-from arabic_llm_benchmark.models import BLOOMPetalModel
-from arabic_llm_benchmark.tasks import QATask
+from llmebench.datasets import ARCDDataset
+from llmebench.models import BLOOMPetalModel
+from llmebench.tasks import QATask
 
 
 def config():
diff --git a/assets/benchmark_v1/QA/ARCD_ChatGPT_ZeroShot.py b/assets/benchmark_v1/QA/ARCD_ChatGPT_ZeroShot.py
index 35c59d67..44cefc77 100644
--- a/assets/benchmark_v1/QA/ARCD_ChatGPT_ZeroShot.py
+++ b/assets/benchmark_v1/QA/ARCD_ChatGPT_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import ARCDDataset
-from arabic_llm_benchmark.models import GPTModel, RandomGPTModel
-from arabic_llm_benchmark.tasks import QATask
+from llmebench.datasets import ARCDDataset
+from llmebench.models import GPTModel, RandomGPTModel
+from llmebench.tasks import QATask
 
 
 def config():
diff --git a/assets/benchmark_v1/QA/ARCD_GPTChatCompletion_FewShot.py b/assets/benchmark_v1/QA/ARCD_GPTChatCompletion_FewShot.py
index 1a5c7bfa..91ec1a02 100644
--- a/assets/benchmark_v1/QA/ARCD_GPTChatCompletion_FewShot.py
+++ b/assets/benchmark_v1/QA/ARCD_GPTChatCompletion_FewShot.py
@@ -1,9 +1,9 @@
 import os
 import random
 
-from arabic_llm_benchmark.datasets import ARCDDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import QATask
+from llmebench.datasets import ARCDDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import QATask
 
 random.seed(3333)
 
diff --git a/assets/benchmark_v1/QA/ARCD_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/QA/ARCD_GPTChatCompletion_ZeroShot.py
index 513f81c4..b0fe38da 100644
--- a/assets/benchmark_v1/QA/ARCD_GPTChatCompletion_ZeroShot.py
+++ b/assets/benchmark_v1/QA/ARCD_GPTChatCompletion_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import ARCDDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import QATask
+from llmebench.datasets import ARCDDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import QATask
 
 
 def config():
diff --git a/assets/benchmark_v1/QA/MLQA_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/QA/MLQA_BLOOMZ_ZeroShot.py
index b06c9a5b..8e19a535 100644
--- a/assets/benchmark_v1/QA/MLQA_BLOOMZ_ZeroShot.py
+++ b/assets/benchmark_v1/QA/MLQA_BLOOMZ_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import MLQADataset
-from arabic_llm_benchmark.models import BLOOMPetalModel
-from arabic_llm_benchmark.tasks import QATask
+from llmebench.datasets import MLQADataset
+from llmebench.models import BLOOMPetalModel
+from llmebench.tasks import QATask
 
 
 def config():
diff --git a/assets/benchmark_v1/QA/MLQA_ChatGPT_ZeroShot.py b/assets/benchmark_v1/QA/MLQA_ChatGPT_ZeroShot.py
index 9524c05c..ba57acbb 100644
--- a/assets/benchmark_v1/QA/MLQA_ChatGPT_ZeroShot.py
+++ b/assets/benchmark_v1/QA/MLQA_ChatGPT_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import MLQADataset
-from arabic_llm_benchmark.models import GPTModel, RandomGPTModel
-from arabic_llm_benchmark.tasks import QATask
+from llmebench.datasets import MLQADataset
+from llmebench.models import GPTModel, RandomGPTModel
+from llmebench.tasks import QATask
 
 
 def config():
diff --git a/assets/benchmark_v1/QA/MLQA_GPTChatCompletion_FewShot.py b/assets/benchmark_v1/QA/MLQA_GPTChatCompletion_FewShot.py
index a4ab0d80..aa6bef95 100644
--- a/assets/benchmark_v1/QA/MLQA_GPTChatCompletion_FewShot.py
+++ b/assets/benchmark_v1/QA/MLQA_GPTChatCompletion_FewShot.py
@@ -1,9 +1,9 @@
 import os
 import random
 
-from arabic_llm_benchmark.datasets import MLQADataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import QATask
+from llmebench.datasets import MLQADataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import QATask
 
 random.seed(3333)
 
diff --git a/assets/benchmark_v1/QA/MLQA_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/QA/MLQA_GPTChatCompletion_ZeroShot.py
index 47030eba..31ac98de 100644
--- a/assets/benchmark_v1/QA/MLQA_GPTChatCompletion_ZeroShot.py
+++ b/assets/benchmark_v1/QA/MLQA_GPTChatCompletion_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import MLQADataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import QATask
+from llmebench.datasets import MLQADataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import QATask
 
 
 def config():
diff --git a/assets/benchmark_v1/QA/TyDiQA_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/QA/TyDiQA_BLOOMZ_ZeroShot.py
index 6a3539c8..4ad3bc10 100644
--- a/assets/benchmark_v1/QA/TyDiQA_BLOOMZ_ZeroShot.py
+++ b/assets/benchmark_v1/QA/TyDiQA_BLOOMZ_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import TyDiQADataset
-from arabic_llm_benchmark.models import BLOOMPetalModel
-from arabic_llm_benchmark.tasks import QATask
+from llmebench.datasets import TyDiQADataset
+from llmebench.models import BLOOMPetalModel
+from llmebench.tasks import QATask
 
 
 def config():
diff --git a/assets/benchmark_v1/QA/TyDiQA_ChatGPT_ZeroShot.py b/assets/benchmark_v1/QA/TyDiQA_ChatGPT_ZeroShot.py
index a98e1352..4e8a91e7 100644
--- a/assets/benchmark_v1/QA/TyDiQA_ChatGPT_ZeroShot.py
+++ b/assets/benchmark_v1/QA/TyDiQA_ChatGPT_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import TyDiQADataset
-from arabic_llm_benchmark.models import GPTModel, RandomGPTModel
-from arabic_llm_benchmark.tasks import QATask
+from llmebench.datasets import TyDiQADataset
+from llmebench.models import GPTModel, RandomGPTModel
+from llmebench.tasks import QATask
 
 
 def config():
diff --git a/assets/benchmark_v1/QA/TyDiQA_GPTChatCompletion_FewShot.py b/assets/benchmark_v1/QA/TyDiQA_GPTChatCompletion_FewShot.py
index f4b83806..339fbe75 100644
--- a/assets/benchmark_v1/QA/TyDiQA_GPTChatCompletion_FewShot.py
+++ b/assets/benchmark_v1/QA/TyDiQA_GPTChatCompletion_FewShot.py
@@ -1,9 +1,9 @@
 import os
 import random
 
-from arabic_llm_benchmark.datasets import TyDiQADataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import QATask
+from llmebench.datasets import TyDiQADataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import QATask
 
 random.seed(3333)
 
diff --git a/assets/benchmark_v1/QA/TydiQA_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/QA/TydiQA_GPTChatCompletion_ZeroShot.py
index c2105b5d..e5d80a12 100644
--- a/assets/benchmark_v1/QA/TydiQA_GPTChatCompletion_ZeroShot.py
+++ b/assets/benchmark_v1/QA/TydiQA_GPTChatCompletion_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import TyDiQADataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import QATask
+from llmebench.datasets import TyDiQADataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import QATask
 
 
 def config():
diff --git a/assets/benchmark_v1/QA/XQuAD_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/QA/XQuAD_BLOOMZ_ZeroShot.py
index 7b8de231..ccc89880 100644
--- a/assets/benchmark_v1/QA/XQuAD_BLOOMZ_ZeroShot.py
+++ b/assets/benchmark_v1/QA/XQuAD_BLOOMZ_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import XQuADDataset
-from arabic_llm_benchmark.models import BLOOMPetalModel
-from arabic_llm_benchmark.tasks import QATask
+from llmebench.datasets import XQuADDataset
+from llmebench.models import BLOOMPetalModel
+from llmebench.tasks import QATask
 
 
 def config():
diff --git a/assets/benchmark_v1/QA/XQuAD_ChatGPT_ZeroShot.py b/assets/benchmark_v1/QA/XQuAD_ChatGPT_ZeroShot.py
index afb0ba6f..9729931e 100644
--- a/assets/benchmark_v1/QA/XQuAD_ChatGPT_ZeroShot.py
+++ b/assets/benchmark_v1/QA/XQuAD_ChatGPT_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import XQuADDataset
-from arabic_llm_benchmark.models import GPTModel
-from arabic_llm_benchmark.tasks import QATask
+from llmebench.datasets import XQuADDataset
+from llmebench.models import GPTModel
+from llmebench.tasks import QATask
 
 
 def config():
diff --git a/assets/benchmark_v1/QA/XQuAD_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/QA/XQuAD_GPTChatCompletion_ZeroShot.py
index caab5c27..86413e14 100644
--- a/assets/benchmark_v1/QA/XQuAD_GPTChatCompletion_ZeroShot.py
+++ b/assets/benchmark_v1/QA/XQuAD_GPTChatCompletion_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import XQuADDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import QATask
+from llmebench.datasets import XQuADDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import QATask
 
 
 def config():
diff --git a/assets/benchmark_v1/QA/XQuaD_GPTChatCompletion_FewShot.py b/assets/benchmark_v1/QA/XQuaD_GPTChatCompletion_FewShot.py
index a1df0eb9..01a143c2 100644
--- a/assets/benchmark_v1/QA/XQuaD_GPTChatCompletion_FewShot.py
+++ b/assets/benchmark_v1/QA/XQuaD_GPTChatCompletion_FewShot.py
@@ -1,9 +1,9 @@
 import os
 import random
 
-from arabic_llm_benchmark.datasets import XQuADDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import QATask
+from llmebench.datasets import XQuADDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import QATask
 
 random.seed(3333)
 
diff --git a/assets/benchmark_v1/STS/Q2QSim_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/STS/Q2QSim_BLOOMZ_ZeroShot.py
index d9aae05b..ea8ceca0 100644
--- a/assets/benchmark_v1/STS/Q2QSim_BLOOMZ_ZeroShot.py
+++ b/assets/benchmark_v1/STS/Q2QSim_BLOOMZ_ZeroShot.py
@@ -1,9 +1,9 @@
 import os
 import re
 
-from arabic_llm_benchmark.datasets import Q2QSimDataset
-from arabic_llm_benchmark.models import BLOOMPetalModel
-from arabic_llm_benchmark.tasks import Q2QSimDetectionTask
+from llmebench.datasets import Q2QSimDataset
+from llmebench.models import BLOOMPetalModel
+from llmebench.tasks import Q2QSimDetectionTask
 
 
 def config():
diff --git a/assets/benchmark_v1/STS/Q2QSim_ChatGPT_ZeroShot.py b/assets/benchmark_v1/STS/Q2QSim_ChatGPT_ZeroShot.py
index 839e94e3..c23e3f7e 100644
--- a/assets/benchmark_v1/STS/Q2QSim_ChatGPT_ZeroShot.py
+++ b/assets/benchmark_v1/STS/Q2QSim_ChatGPT_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import Q2QSimDataset
-from arabic_llm_benchmark.models import GPTModel
-from arabic_llm_benchmark.tasks import Q2QSimDetectionTask
+from llmebench.datasets import Q2QSimDataset
+from llmebench.models import GPTModel
+from llmebench.tasks import Q2QSimDetectionTask
 
 
 def config():
diff --git a/assets/benchmark_v1/STS/Q2QSim_GPT4_FewShot.py b/assets/benchmark_v1/STS/Q2QSim_GPT4_FewShot.py
index 9112d402..791fd0e3 100644
--- a/assets/benchmark_v1/STS/Q2QSim_GPT4_FewShot.py
+++ b/assets/benchmark_v1/STS/Q2QSim_GPT4_FewShot.py
@@ -1,9 +1,9 @@
 import os
 import re
 
-from arabic_llm_benchmark.datasets import Q2QSimDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import Q2QSimDetectionTask
+from llmebench.datasets import Q2QSimDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import Q2QSimDetectionTask
 
 
 def config():
diff --git a/assets/benchmark_v1/STS/Q2QSim_GPT4_ZeroShot.py b/assets/benchmark_v1/STS/Q2QSim_GPT4_ZeroShot.py
index 926684fa..20ed5bb3 100644
--- a/assets/benchmark_v1/STS/Q2QSim_GPT4_ZeroShot.py
+++ b/assets/benchmark_v1/STS/Q2QSim_GPT4_ZeroShot.py
@@ -1,9 +1,9 @@
 import os
 import re
 
-from arabic_llm_benchmark.datasets import Q2QSimDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import Q2QSimDetectionTask
+from llmebench.datasets import Q2QSimDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import Q2QSimDetectionTask
 
 
 def config():
diff --git a/assets/benchmark_v1/STS/STSTrack1_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/STS/STSTrack1_BLOOMZ_ZeroShot.py
index 6fb16a88..9fd822f1 100644
--- a/assets/benchmark_v1/STS/STSTrack1_BLOOMZ_ZeroShot.py
+++ b/assets/benchmark_v1/STS/STSTrack1_BLOOMZ_ZeroShot.py
@@ -1,9 +1,9 @@
 import os
 import re
 
-from arabic_llm_benchmark.datasets import STSArSemEval17Track1Dataset
-from arabic_llm_benchmark.models import BLOOMPetalModel
-from arabic_llm_benchmark.tasks import STSTrack1Task
+from llmebench.datasets import STSArSemEval17Track1Dataset
+from llmebench.models import BLOOMPetalModel
+from llmebench.tasks import STSTrack1Task
 
 
 def config():
diff --git a/assets/benchmark_v1/STS/STSTrack1_ChatGPT_ZeroShot.py b/assets/benchmark_v1/STS/STSTrack1_ChatGPT_ZeroShot.py
index 1ec7f78a..1b99e05a 100644
--- a/assets/benchmark_v1/STS/STSTrack1_ChatGPT_ZeroShot.py
+++ b/assets/benchmark_v1/STS/STSTrack1_ChatGPT_ZeroShot.py
@@ -1,9 +1,9 @@
 import os
 import re
 
-from arabic_llm_benchmark.datasets import STSArSemEval17Track1Dataset
-from arabic_llm_benchmark.models import GPTModel
-from arabic_llm_benchmark.tasks import STSTrack1Task
+from llmebench.datasets import STSArSemEval17Track1Dataset
+from llmebench.models import GPTModel
+from llmebench.tasks import STSTrack1Task
 
 
 def config():
diff --git a/assets/benchmark_v1/STS/STSTrack1_GPT4_FewShot.py b/assets/benchmark_v1/STS/STSTrack1_GPT4_FewShot.py
index d1f645a6..aff4ddad 100644
--- a/assets/benchmark_v1/STS/STSTrack1_GPT4_FewShot.py
+++ b/assets/benchmark_v1/STS/STSTrack1_GPT4_FewShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import STSArSemEval17Track1Dataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import STSTrack1Task
+from llmebench.datasets import STSArSemEval17Track1Dataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import STSTrack1Task
 
 
 def config():
diff --git a/assets/benchmark_v1/STS/STSTrack1_GPT4_ZeroShot.py b/assets/benchmark_v1/STS/STSTrack1_GPT4_ZeroShot.py
index 01c3a640..3f94a169 100644
--- a/assets/benchmark_v1/STS/STSTrack1_GPT4_ZeroShot.py
+++ b/assets/benchmark_v1/STS/STSTrack1_GPT4_ZeroShot.py
@@ -1,9 +1,9 @@
 import os
 import re
 
-from arabic_llm_benchmark.datasets import STSArSemEval17Track1Dataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import STSTrack1Task
+from llmebench.datasets import STSArSemEval17Track1Dataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import STSTrack1Task
 
 
 def config():
diff --git a/assets/benchmark_v1/STS/STSTrack2_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/STS/STSTrack2_BLOOMZ_ZeroShot.py
index e688befd..9b2e6efe 100644
--- a/assets/benchmark_v1/STS/STSTrack2_BLOOMZ_ZeroShot.py
+++ b/assets/benchmark_v1/STS/STSTrack2_BLOOMZ_ZeroShot.py
@@ -1,9 +1,9 @@
 import os
 import re
 
-from arabic_llm_benchmark.datasets import STSArSemEval17Track1Dataset
-from arabic_llm_benchmark.models import BLOOMPetalModel
-from arabic_llm_benchmark.tasks import STSTrack1Task
+from llmebench.datasets import STSArSemEval17Track1Dataset
+from llmebench.models import BLOOMPetalModel
+from llmebench.tasks import STSTrack1Task
 
 
 def config():
diff --git a/assets/benchmark_v1/STS/STSTrack2_ChatGPT_ZeroShot.py b/assets/benchmark_v1/STS/STSTrack2_ChatGPT_ZeroShot.py
index fb7a6776..da4dff44 100644
--- a/assets/benchmark_v1/STS/STSTrack2_ChatGPT_ZeroShot.py
+++ b/assets/benchmark_v1/STS/STSTrack2_ChatGPT_ZeroShot.py
@@ -1,9 +1,9 @@
 import os
 import re
 
-from arabic_llm_benchmark.datasets import STSArSemEval17Track2Dataset
-from arabic_llm_benchmark.models import GPTModel, RandomGPTModel
-from arabic_llm_benchmark.tasks import STSTrack2Task
+from llmebench.datasets import STSArSemEval17Track2Dataset
+from llmebench.models import GPTModel, RandomGPTModel
+from llmebench.tasks import STSTrack2Task
 
 
 def config():
diff --git a/assets/benchmark_v1/STS/STSTrack2_GPT4_FewShot.py b/assets/benchmark_v1/STS/STSTrack2_GPT4_FewShot.py
index c445c4c6..a4e9e840 100644
--- a/assets/benchmark_v1/STS/STSTrack2_GPT4_FewShot.py
+++ b/assets/benchmark_v1/STS/STSTrack2_GPT4_FewShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import STSArSemEval17Track2Dataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import STSTrack2Task
+from llmebench.datasets import STSArSemEval17Track2Dataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import STSTrack2Task
 
 
 def config():
diff --git a/assets/benchmark_v1/STS/STSTrack2_GPT4_ZeroShot.py b/assets/benchmark_v1/STS/STSTrack2_GPT4_ZeroShot.py
index 9ce5a288..4be3334c 100644
--- a/assets/benchmark_v1/STS/STSTrack2_GPT4_ZeroShot.py
+++ b/assets/benchmark_v1/STS/STSTrack2_GPT4_ZeroShot.py
@@ -1,9 +1,9 @@
 import os
 import re
 
-from arabic_llm_benchmark.datasets import STSArSemEval17Track2Dataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import STSTrack2Task
+from llmebench.datasets import STSArSemEval17Track2Dataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import STSTrack2Task
 
 
 def config():
diff --git a/assets/benchmark_v1/demography/gender/GenderArabGend_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/demography/gender/GenderArabGend_BLOOMZ_ZeroShot.py
index 6877a162..915e1dc9 100644
--- a/assets/benchmark_v1/demography/gender/GenderArabGend_BLOOMZ_ZeroShot.py
+++ b/assets/benchmark_v1/demography/gender/GenderArabGend_BLOOMZ_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import ArabGendDataset
-from arabic_llm_benchmark.models import BLOOMPetalModel
-from arabic_llm_benchmark.tasks import DemographyGenderTask
+from llmebench.datasets import ArabGendDataset
+from llmebench.models import BLOOMPetalModel
+from llmebench.tasks import DemographyGenderTask
 
 
 def config():
diff --git a/assets/benchmark_v1/demography/gender/GenderArabGend_ChatGPT_ZeroShot.py b/assets/benchmark_v1/demography/gender/GenderArabGend_ChatGPT_ZeroShot.py
index 938a7e1b..11a30148 100644
--- a/assets/benchmark_v1/demography/gender/GenderArabGend_ChatGPT_ZeroShot.py
+++ b/assets/benchmark_v1/demography/gender/GenderArabGend_ChatGPT_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import ArabGendDataset
-from arabic_llm_benchmark.models import GPTModel, RandomGPTModel
-from arabic_llm_benchmark.tasks import DemographyGenderTask
+from llmebench.datasets import ArabGendDataset
+from llmebench.models import GPTModel, RandomGPTModel
+from llmebench.tasks import DemographyGenderTask
 
 
 def config():
diff --git a/assets/benchmark_v1/demography/gender/GenderArabGend_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/demography/gender/GenderArabGend_GPTChatCompletion_ZeroShot.py
index 8311a1f1..4cda2896 100644
--- a/assets/benchmark_v1/demography/gender/GenderArabGend_GPTChatCompletion_ZeroShot.py
+++ b/assets/benchmark_v1/demography/gender/GenderArabGend_GPTChatCompletion_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import ArabGendDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import DemographyGenderTask
+from llmebench.datasets import ArabGendDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import DemographyGenderTask
 
 
 def config():
diff --git a/assets/benchmark_v1/demography/gender/GenderArapTweet_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/demography/gender/GenderArapTweet_BLOOMZ_ZeroShot.py
index 1f538c86..d9acb381 100644
--- a/assets/benchmark_v1/demography/gender/GenderArapTweet_BLOOMZ_ZeroShot.py
+++ b/assets/benchmark_v1/demography/gender/GenderArapTweet_BLOOMZ_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import ArapTweetDataset
-from arabic_llm_benchmark.models import BLOOMPetalModel
-from arabic_llm_benchmark.tasks import DemographyGenderTask
+from llmebench.datasets import ArapTweetDataset
+from llmebench.models import BLOOMPetalModel
+from llmebench.tasks import DemographyGenderTask
 
 
 def config():
diff --git a/assets/benchmark_v1/demography/gender/GenderArapTweet_ChatGPT_ZeroShot.py b/assets/benchmark_v1/demography/gender/GenderArapTweet_ChatGPT_ZeroShot.py
index 5ad89801..5c5f0e4d 100644
--- a/assets/benchmark_v1/demography/gender/GenderArapTweet_ChatGPT_ZeroShot.py
+++ b/assets/benchmark_v1/demography/gender/GenderArapTweet_ChatGPT_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import ArapTweetDataset
-from arabic_llm_benchmark.models import GPTModel
-from arabic_llm_benchmark.tasks import DemographyGenderTask
+from llmebench.datasets import ArapTweetDataset
+from llmebench.models import GPTModel
+from llmebench.tasks import DemographyGenderTask
 
 
 def config():
diff --git a/assets/benchmark_v1/demography/gender/GenderArapTweet_GPTChatCompletion_FewShot.py b/assets/benchmark_v1/demography/gender/GenderArapTweet_GPTChatCompletion_FewShot.py
index 73f0c875..587648a2 100644
--- a/assets/benchmark_v1/demography/gender/GenderArapTweet_GPTChatCompletion_FewShot.py
+++ b/assets/benchmark_v1/demography/gender/GenderArapTweet_GPTChatCompletion_FewShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import ArapTweetDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import DemographyGenderTask
+from llmebench.datasets import ArapTweetDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import DemographyGenderTask
 
 
 def config():
diff --git a/assets/benchmark_v1/demography/gender/GenderArapTweet_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/demography/gender/GenderArapTweet_GPTChatCompletion_ZeroShot.py
index cd19deb7..f6e59513 100644
--- a/assets/benchmark_v1/demography/gender/GenderArapTweet_GPTChatCompletion_ZeroShot.py
+++ b/assets/benchmark_v1/demography/gender/GenderArapTweet_GPTChatCompletion_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import ArapTweetDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import DemographyGenderTask
+from llmebench.datasets import ArapTweetDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import DemographyGenderTask
 
 
 def config():
diff --git a/assets/benchmark_v1/demography/location/Location_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/demography/location/Location_BLOOMZ_ZeroShot.py
index 685fcdf7..bf4db900 100644
--- a/assets/benchmark_v1/demography/location/Location_BLOOMZ_ZeroShot.py
+++ b/assets/benchmark_v1/demography/location/Location_BLOOMZ_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import LocationDataset
-from arabic_llm_benchmark.models import BLOOMPetalModel
-from arabic_llm_benchmark.tasks import DemographyLocationTask
+from llmebench.datasets import LocationDataset
+from llmebench.models import BLOOMPetalModel
+from llmebench.tasks import DemographyLocationTask
 
 
 def config():
diff --git a/assets/benchmark_v1/demography/location/Location_ChatGPT_ZeroShot.py b/assets/benchmark_v1/demography/location/Location_ChatGPT_ZeroShot.py
index 29a98e99..50e28ef0 100644
--- a/assets/benchmark_v1/demography/location/Location_ChatGPT_ZeroShot.py
+++ b/assets/benchmark_v1/demography/location/Location_ChatGPT_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import LocationDataset
-from arabic_llm_benchmark.models import GPTModel, RandomGPTModel
-from arabic_llm_benchmark.tasks import DemographyLocationTask
+from llmebench.datasets import LocationDataset
+from llmebench.models import GPTModel, RandomGPTModel
+from llmebench.tasks import DemographyLocationTask
 
 
 def config():
diff --git a/assets/benchmark_v1/demography/location/Location_GPTChatCompletion_FewShot.py b/assets/benchmark_v1/demography/location/Location_GPTChatCompletion_FewShot.py
index 81ba853e..e342ebd5 100644
--- a/assets/benchmark_v1/demography/location/Location_GPTChatCompletion_FewShot.py
+++ b/assets/benchmark_v1/demography/location/Location_GPTChatCompletion_FewShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import LocationDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import DemographyLocationTask
+from llmebench.datasets import LocationDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import DemographyLocationTask
 
 
 def config():
diff --git a/assets/benchmark_v1/demography/location/Location_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/demography/location/Location_GPTChatCompletion_ZeroShot.py
index bb39abd1..42e5b344 100644
--- a/assets/benchmark_v1/demography/location/Location_GPTChatCompletion_ZeroShot.py
+++ b/assets/benchmark_v1/demography/location/Location_GPTChatCompletion_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import LocationDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import DemographyLocationTask
+from llmebench.datasets import LocationDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import DemographyLocationTask
 
 
 def config():
diff --git a/assets/benchmark_v1/demography/name_info/NameInfo_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/demography/name_info/NameInfo_BLOOMZ_ZeroShot.py
index 9424f35a..c9aa0bf9 100644
--- a/assets/benchmark_v1/demography/name_info/NameInfo_BLOOMZ_ZeroShot.py
+++ b/assets/benchmark_v1/demography/name_info/NameInfo_BLOOMZ_ZeroShot.py
@@ -1,9 +1,9 @@
 import os
 import re
 
-from arabic_llm_benchmark.datasets import NameInfoDataset
-from arabic_llm_benchmark.models import BLOOMPetalModel
-from arabic_llm_benchmark.tasks import DemographyNameInfoTask
+from llmebench.datasets import NameInfoDataset
+from llmebench.models import BLOOMPetalModel
+from llmebench.tasks import DemographyNameInfoTask
 
 
 def config():
diff --git a/assets/benchmark_v1/demography/name_info/NameInfo_ChatGPT_ZeroShot.py b/assets/benchmark_v1/demography/name_info/NameInfo_ChatGPT_ZeroShot.py
index 181b5981..0e6bf76a 100644
--- a/assets/benchmark_v1/demography/name_info/NameInfo_ChatGPT_ZeroShot.py
+++ b/assets/benchmark_v1/demography/name_info/NameInfo_ChatGPT_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import NameInfoDataset
-from arabic_llm_benchmark.models import GPTModel, RandomGPTModel
-from arabic_llm_benchmark.tasks import DemographyNameInfoTask
+from llmebench.datasets import NameInfoDataset
+from llmebench.models import GPTModel, RandomGPTModel
+from llmebench.tasks import DemographyNameInfoTask
 
 
 def config():
diff --git a/assets/benchmark_v1/demography/name_info/NameInfo_GPTChatCompletion_FewShot.py b/assets/benchmark_v1/demography/name_info/NameInfo_GPTChatCompletion_FewShot.py
index 70c89a13..7f7b5fee 100644
--- a/assets/benchmark_v1/demography/name_info/NameInfo_GPTChatCompletion_FewShot.py
+++ b/assets/benchmark_v1/demography/name_info/NameInfo_GPTChatCompletion_FewShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import NameInfoDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import DemographyNameInfoTask
+from llmebench.datasets import NameInfoDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import DemographyNameInfoTask
 
 
 def config():
diff --git a/assets/benchmark_v1/demography/name_info/NameInfo_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/demography/name_info/NameInfo_GPTChatCompletion_ZeroShot.py
index 75f580f3..dda2109b 100644
--- a/assets/benchmark_v1/demography/name_info/NameInfo_GPTChatCompletion_ZeroShot.py
+++ b/assets/benchmark_v1/demography/name_info/NameInfo_GPTChatCompletion_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import NameInfoDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import DemographyNameInfoTask
+from llmebench.datasets import NameInfoDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import DemographyNameInfoTask
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Adult_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Adult_BLOOMZ_ZeroShot.py
index afcbdd81..70c7ffb5 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Adult_BLOOMZ_ZeroShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Adult_BLOOMZ_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import AdultDataset
-from arabic_llm_benchmark.models import BLOOMPetalModel
-from arabic_llm_benchmark.tasks import AdultTask
+from llmebench.datasets import AdultDataset
+from llmebench.models import BLOOMPetalModel
+from llmebench.tasks import AdultTask
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Adult_ChatGPT_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Adult_ChatGPT_ZeroShot.py
index dbb3da49..9d1b05e8 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Adult_ChatGPT_ZeroShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Adult_ChatGPT_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import AdultDataset
-from arabic_llm_benchmark.models import GPTModel, RandomGPTModel
-from arabic_llm_benchmark.tasks import AdultTask
+from llmebench.datasets import AdultDataset
+from llmebench.models import GPTModel, RandomGPTModel
+from llmebench.tasks import AdultTask
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Adult_GPTChatCompletion_FewShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Adult_GPTChatCompletion_FewShot.py
index 8b71f386..685bd33e 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Adult_GPTChatCompletion_FewShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Adult_GPTChatCompletion_FewShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import AdultDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import AdultTask
+from llmebench.datasets import AdultDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import AdultTask
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Adult_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Adult_GPTChatCompletion_ZeroShot.py
index 6c2a6d83..e59830f3 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Adult_GPTChatCompletion_ZeroShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Adult_GPTChatCompletion_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import AdultDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import AdultTask
+from llmebench.datasets import AdultDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import AdultTask
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Attentionworthy_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Attentionworthy_BLOOMZ_ZeroShot.py
index a8c29cb9..37f15dde 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Attentionworthy_BLOOMZ_ZeroShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Attentionworthy_BLOOMZ_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import AttentionworthyDataset
-from arabic_llm_benchmark.models import BLOOMPetalModel
-from arabic_llm_benchmark.tasks import AttentionworthyTask
+from llmebench.datasets import AttentionworthyDataset
+from llmebench.models import BLOOMPetalModel
+from llmebench.tasks import AttentionworthyTask
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Attentionworthy_ChatGPT_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Attentionworthy_ChatGPT_ZeroShot.py
index d96ecb17..23d7f66c 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Attentionworthy_ChatGPT_ZeroShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Attentionworthy_ChatGPT_ZeroShot.py
@@ -1,9 +1,9 @@
 import os
 import re
 
-from arabic_llm_benchmark.datasets import AttentionworthyDataset
-from arabic_llm_benchmark.models import GPTModel, RandomGPTModel
-from arabic_llm_benchmark.tasks import AttentionworthyTask
+from llmebench.datasets import AttentionworthyDataset
+from llmebench.models import GPTModel, RandomGPTModel
+from llmebench.tasks import AttentionworthyTask
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Attentionworthy_GPTChatCompletion_Fewshot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Attentionworthy_GPTChatCompletion_Fewshot.py
index 75852f05..92c7a32b 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Attentionworthy_GPTChatCompletion_Fewshot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Attentionworthy_GPTChatCompletion_Fewshot.py
@@ -2,9 +2,9 @@
 import random
 import re
 
-from arabic_llm_benchmark.datasets import AttentionworthyDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import AttentionworthyTask
+from llmebench.datasets import AttentionworthyDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import AttentionworthyTask
 
 
 random.seed(1333)
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Attentionworthy_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Attentionworthy_GPTChatCompletion_ZeroShot.py
index 57018b43..69da7868 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Attentionworthy_GPTChatCompletion_ZeroShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Attentionworthy_GPTChatCompletion_ZeroShot.py
@@ -2,9 +2,9 @@
 import random
 import re
 
-from arabic_llm_benchmark.datasets import AttentionworthyDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import AttentionworthyTask
+from llmebench.datasets import AttentionworthyDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import AttentionworthyTask
 
 
 random.seed(1333)
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/COVClaimDetect_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/COVClaimDetect_BLOOMZ_ZeroShot.py
index e23814b7..63be543d 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/COVClaimDetect_BLOOMZ_ZeroShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/COVClaimDetect_BLOOMZ_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import CovidClaimDataset
-from arabic_llm_benchmark.models import BLOOMPetalModel
-from arabic_llm_benchmark.tasks import ClaimDetectionTask
+from llmebench.datasets import CovidClaimDataset
+from llmebench.models import BLOOMPetalModel
+from llmebench.tasks import ClaimDetectionTask
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/COVClaimDetect_CGPT35_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/COVClaimDetect_CGPT35_ZeroShot.py
index f71b72cf..f2107913 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/COVClaimDetect_CGPT35_ZeroShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/COVClaimDetect_CGPT35_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import CovidClaimDataset
-from arabic_llm_benchmark.models import GPTModel
-from arabic_llm_benchmark.tasks import ClaimDetectionTask
+from llmebench.datasets import CovidClaimDataset
+from llmebench.models import GPTModel
+from llmebench.tasks import ClaimDetectionTask
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/COVClaimDetect_GPT4_FewShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/COVClaimDetect_GPT4_FewShot.py
index f9fda31f..8ed99f9a 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/COVClaimDetect_GPT4_FewShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/COVClaimDetect_GPT4_FewShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import CovidClaimDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import ClaimDetectionTask
+from llmebench.datasets import CovidClaimDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import ClaimDetectionTask
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/COVHarmfulDetect_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/COVHarmfulDetect_BLOOMZ_ZeroShot.py
index 6f8dff3d..20cb07f7 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/COVHarmfulDetect_BLOOMZ_ZeroShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/COVHarmfulDetect_BLOOMZ_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import CovidHarmfulDataset
-from arabic_llm_benchmark.models import BLOOMPetalModel
-from arabic_llm_benchmark.tasks import HarmfulDetectionTask
+from llmebench.datasets import CovidHarmfulDataset
+from llmebench.models import BLOOMPetalModel
+from llmebench.tasks import HarmfulDetectionTask
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/COVHarmfulDetect_CGPT35_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/COVHarmfulDetect_CGPT35_ZeroShot.py
index c1b6871d..fdcf21f5 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/COVHarmfulDetect_CGPT35_ZeroShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/COVHarmfulDetect_CGPT35_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import CovidHarmfulDataset
-from arabic_llm_benchmark.models import GPTModel
-from arabic_llm_benchmark.tasks import HarmfulDetectionTask
+from llmebench.datasets import CovidHarmfulDataset
+from llmebench.models import GPTModel
+from llmebench.tasks import HarmfulDetectionTask
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/COVHarmfulDetect_GPT4_FewShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/COVHarmfulDetect_GPT4_FewShot.py
index 05f558c8..fe497189 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/COVHarmfulDetect_GPT4_FewShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/COVHarmfulDetect_GPT4_FewShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import CovidHarmfulDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import HarmfulDetectionTask
+from llmebench.datasets import CovidHarmfulDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import HarmfulDetectionTask
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_BGZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_BGZeroShot.py
index e94c24a8..10c2a3bf 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_BGZeroShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_BGZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import CheckworthinessDataset
-from arabic_llm_benchmark.models import BLOOMPetalModel
-from arabic_llm_benchmark.tasks import CheckworthinessTask
+from llmebench.datasets import CheckworthinessDataset
+from llmebench.models import BLOOMPetalModel
+from llmebench.tasks import CheckworthinessTask
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_ENZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_ENZeroShot.py
index c3add3ff..5bf0e294 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_ENZeroShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_ENZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import CheckworthinessDataset
-from arabic_llm_benchmark.models import BLOOMPetalModel
-from arabic_llm_benchmark.tasks import CheckworthinessTask
+from llmebench.datasets import CheckworthinessDataset
+from llmebench.models import BLOOMPetalModel
+from llmebench.tasks import CheckworthinessTask
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_ESZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_ESZeroShot.py
index f2242843..d8c1a989 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_ESZeroShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_ESZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import CheckworthinessDataset
-from arabic_llm_benchmark.models import BLOOMPetalModel
-from arabic_llm_benchmark.tasks import CheckworthinessTask
+from llmebench.datasets import CheckworthinessDataset
+from llmebench.models import BLOOMPetalModel
+from llmebench.tasks import CheckworthinessTask
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_NLZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_NLZeroShot.py
index 82b72bce..06793d52 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_NLZeroShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_NLZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import CheckworthinessDataset
-from arabic_llm_benchmark.models import BLOOMPetalModel
-from arabic_llm_benchmark.tasks import CheckworthinessTask
+from llmebench.datasets import CheckworthinessDataset
+from llmebench.models import BLOOMPetalModel
+from llmebench.tasks import CheckworthinessTask
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_TRZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_TRZeroShot.py
index c8b8650a..61338f04 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_TRZeroShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_TRZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import CheckworthinessDataset
-from arabic_llm_benchmark.models import BLOOMPetalModel
-from arabic_llm_benchmark.tasks import CheckworthinessTask
+from llmebench.datasets import CheckworthinessDataset
+from llmebench.models import BLOOMPetalModel
+from llmebench.tasks import CheckworthinessTask
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_ZeroShot.py
index b6ee3933..7d7c512f 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_ZeroShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import CheckworthinessDataset
-from arabic_llm_benchmark.models import BLOOMPetalModel
-from arabic_llm_benchmark.tasks import CheckworthinessTask
+from llmebench.datasets import CheckworthinessDataset
+from llmebench.models import BLOOMPetalModel
+from llmebench.tasks import CheckworthinessTask
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_ChatGPT_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_ChatGPT_ZeroShot.py
index 5ab232ea..5e60e8a5 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_ChatGPT_ZeroShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_ChatGPT_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import CheckworthinessDataset
-from arabic_llm_benchmark.models import GPTModel, RandomGPTModel
-from arabic_llm_benchmark.tasks import CheckworthinessTask
+from llmebench.datasets import CheckworthinessDataset
+from llmebench.models import GPTModel, RandomGPTModel
+from llmebench.tasks import CheckworthinessTask
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_BGFewShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_BGFewShot.py
index 79607dbe..495d05d2 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_BGFewShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_BGFewShot.py
@@ -1,9 +1,9 @@
 import os
 import re
 
-from arabic_llm_benchmark.datasets import CheckworthinessDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import CheckworthinessTask
+from llmebench.datasets import CheckworthinessDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import CheckworthinessTask
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_BGZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_BGZeroShot.py
index 1fe1ba4b..2d118f14 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_BGZeroShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_BGZeroShot.py
@@ -1,9 +1,9 @@
 import os
 import re
 
-from arabic_llm_benchmark.datasets import CheckworthinessDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import CheckworthinessTask
+from llmebench.datasets import CheckworthinessDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import CheckworthinessTask
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ENFewShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ENFewShot.py
index 57c61e5a..d0b67175 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ENFewShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ENFewShot.py
@@ -1,9 +1,9 @@
 import os
 import re
 
-from arabic_llm_benchmark.datasets import CheckworthinessDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import CheckworthinessTask
+from llmebench.datasets import CheckworthinessDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import CheckworthinessTask
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ENZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ENZeroShot.py
index 75800b2e..8cb18b32 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ENZeroShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ENZeroShot.py
@@ -1,9 +1,9 @@
 import os
 import re
 
-from arabic_llm_benchmark.datasets import CheckworthinessDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import CheckworthinessTask
+from llmebench.datasets import CheckworthinessDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import CheckworthinessTask
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ESFewShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ESFewShot.py
index 0d535e0b..e6099674 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ESFewShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ESFewShot.py
@@ -1,9 +1,9 @@
 import os
 import re
 
-from arabic_llm_benchmark.datasets import CheckworthinessDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import CheckworthinessTask
+from llmebench.datasets import CheckworthinessDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import CheckworthinessTask
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ESZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ESZeroShot.py
index eff5ead0..3ed072e6 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ESZeroShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ESZeroShot.py
@@ -1,9 +1,9 @@
 import os
 import re
 
-from arabic_llm_benchmark.datasets import CheckworthinessDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import CheckworthinessTask
+from llmebench.datasets import CheckworthinessDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import CheckworthinessTask
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_FewShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_FewShot.py
index 90a3adef..444fb4de 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_FewShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_FewShot.py
@@ -2,9 +2,9 @@
 import random
 import re
 
-from arabic_llm_benchmark.datasets import CheckworthinessDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import CheckworthinessTask
+from llmebench.datasets import CheckworthinessDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import CheckworthinessTask
 
 
 random.seed(1333)
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_NLFewShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_NLFewShot.py
index a79f6e33..7f241509 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_NLFewShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_NLFewShot.py
@@ -1,9 +1,9 @@
 import os
 import re
 
-from arabic_llm_benchmark.datasets import CheckworthinessDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import CheckworthinessTask
+from llmebench.datasets import CheckworthinessDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import CheckworthinessTask
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_NLZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_NLZeroShot.py
index 5b527b26..f52b350c 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_NLZeroShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_NLZeroShot.py
@@ -1,9 +1,9 @@
 import os
 import re
 
-from arabic_llm_benchmark.datasets import CheckworthinessDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import CheckworthinessTask
+from llmebench.datasets import CheckworthinessDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import CheckworthinessTask
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_TRFewShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_TRFewShot.py
index decf403b..c1ae1b57 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_TRFewShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_TRFewShot.py
@@ -1,9 +1,9 @@
 import os
 import re
 
-from arabic_llm_benchmark.datasets import CheckworthinessDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import CheckworthinessTask
+from llmebench.datasets import CheckworthinessDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import CheckworthinessTask
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_TRZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_TRZeroShot.py
index f4e3fdb0..14ebc259 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_TRZeroShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_TRZeroShot.py
@@ -1,9 +1,9 @@
 import os
 import re
 
-from arabic_llm_benchmark.datasets import CheckworthinessDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import CheckworthinessTask
+from llmebench.datasets import CheckworthinessDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import CheckworthinessTask
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ZeroShot.py
index 71e0a98f..d90cb0ce 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ZeroShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ZeroShot.py
@@ -2,9 +2,9 @@
 import random
 import re
 
-from arabic_llm_benchmark.datasets import CheckworthinessDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import CheckworthinessTask
+from llmebench.datasets import CheckworthinessDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import CheckworthinessTask
 
 
 random.seed(1333)
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/ClaimDetectCOVID19_CGPT35_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/ClaimDetectCOVID19_CGPT35_ZeroShot.py
index cdcf77fa..f8e2ba13 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/ClaimDetectCOVID19_CGPT35_ZeroShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/ClaimDetectCOVID19_CGPT35_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import CovidClaimDataset
-from arabic_llm_benchmark.models import GPTModel
-from arabic_llm_benchmark.tasks import ClaimDetectionTask
+from llmebench.datasets import CovidClaimDataset
+from llmebench.models import GPTModel
+from llmebench.tasks import ClaimDetectionTask
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/ClaimDetectCOVID19_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/ClaimDetectCOVID19_GPTChatCompletion_ZeroShot.py
index eb83bac4..1e9c3c73 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/ClaimDetectCOVID19_GPTChatCompletion_ZeroShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/ClaimDetectCOVID19_GPTChatCompletion_ZeroShot.py
@@ -2,9 +2,9 @@
 import random
 import re
 
-from arabic_llm_benchmark.datasets import CheckworthinessDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import CheckworthinessTask
+from llmebench.datasets import CheckworthinessDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import CheckworthinessTask
 
 
 random.seed(1333)
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityCOVID19_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityCOVID19_BLOOMZ_ZeroShot.py
index 66294b18..fa4dc0dd 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityCOVID19_BLOOMZ_ZeroShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityCOVID19_BLOOMZ_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import FactualityCOVID19Dataset
-from arabic_llm_benchmark.models import BLOOMPetalModel
-from arabic_llm_benchmark.tasks import FactualityCOVID19Task
+from llmebench.datasets import FactualityCOVID19Dataset
+from llmebench.models import BLOOMPetalModel
+from llmebench.tasks import FactualityCOVID19Task
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityCOVID19_GPTChatCompletion_FewShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityCOVID19_GPTChatCompletion_FewShot.py
index 264d3207..e165251a 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityCOVID19_GPTChatCompletion_FewShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityCOVID19_GPTChatCompletion_FewShot.py
@@ -2,9 +2,9 @@
 import random
 import re
 
-from arabic_llm_benchmark.datasets import FactualityCOVID19Dataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import FactualityCOVID19Task
+from llmebench.datasets import FactualityCOVID19Dataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import FactualityCOVID19Task
 
 
 random.seed(1333)
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityCOVID19_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityCOVID19_GPTChatCompletion_ZeroShot.py
index c11207c7..6b74eb7e 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityCOVID19_GPTChatCompletion_ZeroShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityCOVID19_GPTChatCompletion_ZeroShot.py
@@ -2,9 +2,9 @@
 import random
 import re
 
-from arabic_llm_benchmark.datasets import FactualityCOVID19Dataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import FactualityCOVID19Task
+from llmebench.datasets import FactualityCOVID19Dataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import FactualityCOVID19Task
 
 
 random.seed(1333)
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityKhouja20_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityKhouja20_BLOOMZ_ZeroShot.py
index bc5b1389..7fd21752 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityKhouja20_BLOOMZ_ZeroShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityKhouja20_BLOOMZ_ZeroShot.py
@@ -2,9 +2,9 @@
 import random
 import re
 
-from arabic_llm_benchmark.datasets import FactualityKhouja20Dataset
-from arabic_llm_benchmark.models import BLOOMPetalModel
-from arabic_llm_benchmark.tasks import FactualityKhouja20Task
+from llmebench.datasets import FactualityKhouja20Dataset
+from llmebench.models import BLOOMPetalModel
+from llmebench.tasks import FactualityKhouja20Task
 
 
 random.seed(1333)
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityKhouja20_ChatGPT_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityKhouja20_ChatGPT_ZeroShot.py
index a087781f..1dceb259 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityKhouja20_ChatGPT_ZeroShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityKhouja20_ChatGPT_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import FactualityKhouja20Dataset
-from arabic_llm_benchmark.models import GPTModel, RandomGPTModel
-from arabic_llm_benchmark.tasks import FactualityKhouja20Task
+from llmebench.datasets import FactualityKhouja20Dataset
+from llmebench.models import GPTModel, RandomGPTModel
+from llmebench.tasks import FactualityKhouja20Task
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityKhouja20_GPT4_FewShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityKhouja20_GPT4_FewShot.py
index febb0e1c..82fb95ef 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityKhouja20_GPT4_FewShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityKhouja20_GPT4_FewShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import FactualityKhouja20Dataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import FactualityKhouja20Task
+from llmebench.datasets import FactualityKhouja20Dataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import FactualityKhouja20Task
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityKhouja20_GPT4_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityKhouja20_GPT4_ZeroShot.py
index 8816f134..add924df 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityKhouja20_GPT4_ZeroShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityKhouja20_GPT4_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import FactualityKhouja20Dataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import FactualityKhouja20Task
+from llmebench.datasets import FactualityKhouja20Dataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import FactualityKhouja20Task
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityUnifiedFC_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityUnifiedFC_BLOOMZ_ZeroShot.py
index 606e401f..47d99663 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityUnifiedFC_BLOOMZ_ZeroShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityUnifiedFC_BLOOMZ_ZeroShot.py
@@ -2,9 +2,9 @@
 import random
 import re
 
-from arabic_llm_benchmark.datasets import FactualityUnifiedFCDataset
-from arabic_llm_benchmark.models import BLOOMPetalModel
-from arabic_llm_benchmark.tasks import FactualityUnifiedFCTask
+from llmebench.datasets import FactualityUnifiedFCDataset
+from llmebench.models import BLOOMPetalModel
+from llmebench.tasks import FactualityUnifiedFCTask
 
 
 random.seed(1333)
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityUnifiedFC_GPT4_FewShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityUnifiedFC_GPT4_FewShot.py
index c5336d73..a85f1182 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityUnifiedFC_GPT4_FewShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityUnifiedFC_GPT4_FewShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import FactualityUnifiedFCDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import FactualityUnifiedFCTask
+from llmebench.datasets import FactualityUnifiedFCDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import FactualityUnifiedFCTask
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityUnifiedFC_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityUnifiedFC_GPTChatCompletion_ZeroShot.py
index 4f592e02..0c1d21c2 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityUnifiedFC_GPTChatCompletion_ZeroShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityUnifiedFC_GPTChatCompletion_ZeroShot.py
@@ -2,9 +2,9 @@
 import random
 import re
 
-from arabic_llm_benchmark.datasets import FactualityUnifiedFCDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import FactualityUnifiedFCTask
+from llmebench.datasets import FactualityUnifiedFCDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import FactualityUnifiedFCTask
 
 
 random.seed(1333)
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/HarmfulDetectCOVID19_CGPT35_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/HarmfulDetectCOVID19_CGPT35_ZeroShot.py
index 262ab383..8db951ae 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/HarmfulDetectCOVID19_CGPT35_ZeroShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/HarmfulDetectCOVID19_CGPT35_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import CovidHarmfulDataset
-from arabic_llm_benchmark.models import GPTModel
-from arabic_llm_benchmark.tasks import HarmfulDetectionTask
+from llmebench.datasets import CovidHarmfulDataset
+from llmebench.models import GPTModel
+from llmebench.tasks import HarmfulDetectionTask
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/HarmfulDetectCOVID19_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/HarmfulDetectCOVID19_GPTChatCompletion_ZeroShot.py
index b814dc23..8b0813b0 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/HarmfulDetectCOVID19_GPTChatCompletion_ZeroShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/HarmfulDetectCOVID19_GPTChatCompletion_ZeroShot.py
@@ -2,9 +2,9 @@
 import random
 import re
 
-from arabic_llm_benchmark.datasets import CheckworthinessDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import CheckworthinessTask
+from llmebench.datasets import CheckworthinessDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import CheckworthinessTask
 
 
 random.seed(1333)
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/HateSpeech_ChatGPT_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/HateSpeech_ChatGPT_ZeroShot.py
index f3ded397..2cdd6740 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/HateSpeech_ChatGPT_ZeroShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/HateSpeech_ChatGPT_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import HateSpeechDataset
-from arabic_llm_benchmark.models import GPTModel, RandomGPTModel
-from arabic_llm_benchmark.tasks import HateSpeechTask
+from llmebench.datasets import HateSpeechDataset
+from llmebench.models import GPTModel, RandomGPTModel
+from llmebench.tasks import HateSpeechTask
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/HateSpeech_GPTChatCompletion_FewShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/HateSpeech_GPTChatCompletion_FewShot.py
index 4c07b90e..28e883e8 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/HateSpeech_GPTChatCompletion_FewShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/HateSpeech_GPTChatCompletion_FewShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import HateSpeechDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import HateSpeechTask
+from llmebench.datasets import HateSpeechDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import HateSpeechTask
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Offensive_GPTChatCompletion_FewShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Offensive_GPTChatCompletion_FewShot.py
index 1e76972f..044abf2e 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Offensive_GPTChatCompletion_FewShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Offensive_GPTChatCompletion_FewShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import OffensiveDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import OffensiveTask
+from llmebench.datasets import OffensiveDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import OffensiveTask
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_ENZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_ENZeroShot.py
index ff0942f5..6fa517a7 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_ENZeroShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_ENZeroShot.py
@@ -1,9 +1,9 @@
 import os
 import re
 
-from arabic_llm_benchmark.datasets import PropagandaSemEval23Dataset
-from arabic_llm_benchmark.models import BLOOMPetalModel
-from arabic_llm_benchmark.tasks import PropagandaMultilabelSemEval23Task
+from llmebench.datasets import PropagandaSemEval23Dataset
+from llmebench.models import BLOOMPetalModel
+from llmebench.tasks import PropagandaMultilabelSemEval23Task
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_FRZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_FRZeroShot.py
index f4716374..cc58c362 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_FRZeroShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_FRZeroShot.py
@@ -1,9 +1,9 @@
 import os
 import re
 
-from arabic_llm_benchmark.datasets import PropagandaSemEval23Dataset
-from arabic_llm_benchmark.models import BLOOMPetalModel
-from arabic_llm_benchmark.tasks import PropagandaMultilabelSemEval23Task
+from llmebench.datasets import PropagandaSemEval23Dataset
+from llmebench.models import BLOOMPetalModel
+from llmebench.tasks import PropagandaMultilabelSemEval23Task
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_GEZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_GEZeroShot.py
index b57e14d9..7f38863d 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_GEZeroShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_GEZeroShot.py
@@ -1,9 +1,9 @@
 import os
 import re
 
-from arabic_llm_benchmark.datasets import PropagandaSemEval23Dataset
-from arabic_llm_benchmark.models import BLOOMPetalModel
-from arabic_llm_benchmark.tasks import PropagandaMultilabelSemEval23Task
+from llmebench.datasets import PropagandaSemEval23Dataset
+from llmebench.models import BLOOMPetalModel
+from llmebench.tasks import PropagandaMultilabelSemEval23Task
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_ITZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_ITZeroShot.py
index ec09905a..9103c03c 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_ITZeroShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_ITZeroShot.py
@@ -1,9 +1,9 @@
 import os
 import re
 
-from arabic_llm_benchmark.datasets import PropagandaSemEval23Dataset
-from arabic_llm_benchmark.models import BLOOMPetalModel
-from arabic_llm_benchmark.tasks import PropagandaMultilabelSemEval23Task
+from llmebench.datasets import PropagandaSemEval23Dataset
+from llmebench.models import BLOOMPetalModel
+from llmebench.tasks import PropagandaMultilabelSemEval23Task
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_POZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_POZeroShot.py
index e25f4b3b..4fca8321 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_POZeroShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_POZeroShot.py
@@ -1,9 +1,9 @@
 import os
 import re
 
-from arabic_llm_benchmark.datasets import PropagandaSemEval23Dataset
-from arabic_llm_benchmark.models import BLOOMPetalModel
-from arabic_llm_benchmark.tasks import PropagandaMultilabelSemEval23Task
+from llmebench.datasets import PropagandaSemEval23Dataset
+from llmebench.models import BLOOMPetalModel
+from llmebench.tasks import PropagandaMultilabelSemEval23Task
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_RUZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_RUZeroShot.py
index 27acdbf8..9b3114e6 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_RUZeroShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_RUZeroShot.py
@@ -1,9 +1,9 @@
 import os
 import re
 
-from arabic_llm_benchmark.datasets import PropagandaSemEval23Dataset
-from arabic_llm_benchmark.models import BLOOMPetalModel
-from arabic_llm_benchmark.tasks import PropagandaMultilabelSemEval23Task
+from llmebench.datasets import PropagandaSemEval23Dataset
+from llmebench.models import BLOOMPetalModel
+from llmebench.tasks import PropagandaMultilabelSemEval23Task
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_ZeroShot.py
index 40d0e5d5..0247eaa6 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_ZeroShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_ZeroShot.py
@@ -2,9 +2,9 @@
 import random
 import re
 
-from arabic_llm_benchmark.datasets import PropagandaTweetDataset
-from arabic_llm_benchmark.models import BLOOMPetalModel
-from arabic_llm_benchmark.tasks import PropagandaMultilabelTask
+from llmebench.datasets import PropagandaTweetDataset
+from llmebench.models import BLOOMPetalModel
+from llmebench.tasks import PropagandaMultilabelTask
 
 
 random.seed(1333)
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_CGPT35_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_CGPT35_ZeroShot.py
index a3052a44..b9ad7f81 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_CGPT35_ZeroShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_CGPT35_ZeroShot.py
@@ -2,9 +2,9 @@
 
 import regex as re
 
-from arabic_llm_benchmark.datasets import PropagandaTweetDataset
-from arabic_llm_benchmark.models import GPTModel
-from arabic_llm_benchmark.tasks import PropagandaMultilabelTask
+from llmebench.datasets import PropagandaTweetDataset
+from llmebench.models import GPTModel
+from llmebench.tasks import PropagandaMultilabelTask
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_ENFewShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_ENFewShot.py
index 00fe1d12..e7f33f39 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_ENFewShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_ENFewShot.py
@@ -1,9 +1,9 @@
 import os
 import re
 
-from arabic_llm_benchmark.datasets import PropagandaSemEval23Dataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import PropagandaMultilabelSemEval23Task
+from llmebench.datasets import PropagandaSemEval23Dataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import PropagandaMultilabelSemEval23Task
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_ENZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_ENZeroShot.py
index 8dba8ba8..590ca1ab 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_ENZeroShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_ENZeroShot.py
@@ -2,9 +2,9 @@
 
 import regex as re
 
-from arabic_llm_benchmark.datasets import PropagandaSemEval23Dataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import PropagandaMultilabelSemEval23Task
+from llmebench.datasets import PropagandaSemEval23Dataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import PropagandaMultilabelSemEval23Task
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_FRFewShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_FRFewShot.py
index cd74909d..2c642c7f 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_FRFewShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_FRFewShot.py
@@ -1,9 +1,9 @@
 import os
 import re
 
-from arabic_llm_benchmark.datasets import PropagandaSemEval23Dataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import PropagandaMultilabelSemEval23Task
+from llmebench.datasets import PropagandaSemEval23Dataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import PropagandaMultilabelSemEval23Task
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_FRZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_FRZeroShot.py
index bd3baa14..6c7c4dc7 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_FRZeroShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_FRZeroShot.py
@@ -2,9 +2,9 @@
 
 import regex as re
 
-from arabic_llm_benchmark.datasets import PropagandaSemEval23Dataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import PropagandaMultilabelSemEval23Task
+from llmebench.datasets import PropagandaSemEval23Dataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import PropagandaMultilabelSemEval23Task
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_GEFewShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_GEFewShot.py
index edeb07ed..70523f8e 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_GEFewShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_GEFewShot.py
@@ -1,9 +1,9 @@
 import os
 import re
 
-from arabic_llm_benchmark.datasets import PropagandaSemEval23Dataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import PropagandaMultilabelSemEval23Task
+from llmebench.datasets import PropagandaSemEval23Dataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import PropagandaMultilabelSemEval23Task
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_GEZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_GEZeroShot.py
index 92e2779e..e7836e14 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_GEZeroShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_GEZeroShot.py
@@ -2,9 +2,9 @@
 
 import regex as re
 
-from arabic_llm_benchmark.datasets import PropagandaSemEval23Dataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import PropagandaMultilabelSemEval23Task
+from llmebench.datasets import PropagandaSemEval23Dataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import PropagandaMultilabelSemEval23Task
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_ITFewShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_ITFewShot.py
index 1a061438..2b782ca3 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_ITFewShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_ITFewShot.py
@@ -1,9 +1,9 @@
 import os
 import re
 
-from arabic_llm_benchmark.datasets import PropagandaSemEval23Dataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import PropagandaMultilabelSemEval23Task
+from llmebench.datasets import PropagandaSemEval23Dataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import PropagandaMultilabelSemEval23Task
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_ITZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_ITZeroShot.py
index 9c1aeda3..511ca715 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_ITZeroShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_ITZeroShot.py
@@ -2,9 +2,9 @@
 
 import regex as re
 
-from arabic_llm_benchmark.datasets import PropagandaSemEval23Dataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import PropagandaMultilabelSemEval23Task
+from llmebench.datasets import PropagandaSemEval23Dataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import PropagandaMultilabelSemEval23Task
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_POFewShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_POFewShot.py
index c716bbda..d9447aaa 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_POFewShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_POFewShot.py
@@ -1,9 +1,9 @@
 import os
 import re
 
-from arabic_llm_benchmark.datasets import PropagandaSemEval23Dataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import PropagandaMultilabelSemEval23Task
+from llmebench.datasets import PropagandaSemEval23Dataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import PropagandaMultilabelSemEval23Task
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_POZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_POZeroShot.py
index d5699927..cc0a834b 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_POZeroShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_POZeroShot.py
@@ -2,9 +2,9 @@
 
 import regex as re
 
-from arabic_llm_benchmark.datasets import PropagandaSemEval23Dataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import PropagandaMultilabelSemEval23Task
+from llmebench.datasets import PropagandaSemEval23Dataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import PropagandaMultilabelSemEval23Task
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_RUFewShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_RUFewShot.py
index 75cdeac2..d4abcaac 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_RUFewShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_RUFewShot.py
@@ -1,9 +1,9 @@
 import os
 import re
 
-from arabic_llm_benchmark.datasets import PropagandaSemEval23Dataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import PropagandaMultilabelSemEval23Task
+from llmebench.datasets import PropagandaSemEval23Dataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import PropagandaMultilabelSemEval23Task
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_RUZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_RUZeroShot.py
index e1366663..e2557173 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_RUZeroShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_RUZeroShot.py
@@ -2,9 +2,9 @@
 
 import regex as re
 
-from arabic_llm_benchmark.datasets import PropagandaSemEval23Dataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import PropagandaMultilabelSemEval23Task
+from llmebench.datasets import PropagandaSemEval23Dataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import PropagandaMultilabelSemEval23Task
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_ZeroShot.py
index a34dc865..5311470f 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_ZeroShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_ZeroShot.py
@@ -2,9 +2,9 @@
 
 import regex as re
 
-from arabic_llm_benchmark.datasets import PropagandaTweetDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import PropagandaMultilabelTask
+from llmebench.datasets import PropagandaTweetDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import PropagandaMultilabelTask
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPTChatCompletion_FewShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPTChatCompletion_FewShot.py
index 22d0c25c..0b1f913b 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPTChatCompletion_FewShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPTChatCompletion_FewShot.py
@@ -2,9 +2,9 @@
 import random
 import re
 
-from arabic_llm_benchmark.datasets import PropagandaTweetDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import PropagandaMultilabelTask
+from llmebench.datasets import PropagandaTweetDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import PropagandaMultilabelTask
 
 
 random.seed(1333)
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPTChatCompletion_ZeroShot.py
index 9cd765d7..271385fe 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPTChatCompletion_ZeroShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPTChatCompletion_ZeroShot.py
@@ -2,9 +2,9 @@
 import random
 import re
 
-from arabic_llm_benchmark.datasets import PropagandaTweetDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import PropagandaMultilabelTask
+from llmebench.datasets import PropagandaTweetDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import PropagandaMultilabelTask
 
 
 random.seed(1333)
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Subjectivity_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Subjectivity_BLOOMZ_ZeroShot.py
index 9f95128f..63fa49b4 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Subjectivity_BLOOMZ_ZeroShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Subjectivity_BLOOMZ_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import SubjectivityDataset
-from arabic_llm_benchmark.models import BLOOMPetalModel
-from arabic_llm_benchmark.tasks import SubjectivityTask
+from llmebench.datasets import SubjectivityDataset
+from llmebench.models import BLOOMPetalModel
+from llmebench.tasks import SubjectivityTask
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Subjectivity_ChatGPT_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Subjectivity_ChatGPT_ZeroShot.py
index 8aa0d8b8..ac41046e 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Subjectivity_ChatGPT_ZeroShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Subjectivity_ChatGPT_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import SubjectivityDataset
-from arabic_llm_benchmark.models import GPTModel, RandomGPTModel
-from arabic_llm_benchmark.tasks import SubjectivityTask
+from llmebench.datasets import SubjectivityDataset
+from llmebench.models import GPTModel, RandomGPTModel
+from llmebench.tasks import SubjectivityTask
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Subjectivity_GPTChatCompletion_FewShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Subjectivity_GPTChatCompletion_FewShot.py
index 7e25e6de..93859b9f 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Subjectivity_GPTChatCompletion_FewShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Subjectivity_GPTChatCompletion_FewShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import SubjectivityDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import SubjectivityTask
+from llmebench.datasets import SubjectivityDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import SubjectivityTask
 
 
 def config():
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Subjectivity_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Subjectivity_GPTChatCompletion_ZeroShot.py
index 79cce313..95c46f73 100644
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Subjectivity_GPTChatCompletion_ZeroShot.py
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Subjectivity_GPTChatCompletion_ZeroShot.py
@@ -2,9 +2,9 @@
 import random
 import re
 
-from arabic_llm_benchmark.datasets import SubjectivityDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import SubjectivityTask
+from llmebench.datasets import SubjectivityDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import SubjectivityTask
 
 
 random.seed(1333)
diff --git a/assets/benchmark_v1/news_categorization/NewsCat_ASND_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/news_categorization/NewsCat_ASND_BLOOMZ_ZeroShot.py
index 0d55e1ef..48f34c4c 100644
--- a/assets/benchmark_v1/news_categorization/NewsCat_ASND_BLOOMZ_ZeroShot.py
+++ b/assets/benchmark_v1/news_categorization/NewsCat_ASND_BLOOMZ_ZeroShot.py
@@ -1,9 +1,9 @@
 import os
 import random
 
-from arabic_llm_benchmark.datasets import NewsCatASNDDataset
-from arabic_llm_benchmark.models import BLOOMPetalModel
-from arabic_llm_benchmark.tasks import NewsCatASNDTask
+from llmebench.datasets import NewsCatASNDDataset
+from llmebench.models import BLOOMPetalModel
+from llmebench.tasks import NewsCatASNDTask
 
 random.seed(1333)
 
diff --git a/assets/benchmark_v1/news_categorization/NewsCat_ASND_ChatGPT_ZeroShot.py b/assets/benchmark_v1/news_categorization/NewsCat_ASND_ChatGPT_ZeroShot.py
index 14decb4a..b08edeec 100644
--- a/assets/benchmark_v1/news_categorization/NewsCat_ASND_ChatGPT_ZeroShot.py
+++ b/assets/benchmark_v1/news_categorization/NewsCat_ASND_ChatGPT_ZeroShot.py
@@ -1,9 +1,9 @@
 import os
 import random
 
-from arabic_llm_benchmark.datasets import NewsCatASNDDataset
-from arabic_llm_benchmark.models import GPTModel, RandomGPTModel
-from arabic_llm_benchmark.tasks import NewsCatASNDTask
+from llmebench.datasets import NewsCatASNDDataset
+from llmebench.models import GPTModel, RandomGPTModel
+from llmebench.tasks import NewsCatASNDTask
 
 random.seed(1333)
 
diff --git a/assets/benchmark_v1/news_categorization/NewsCat_ASND_GPTChatCompletion_FewShot.py b/assets/benchmark_v1/news_categorization/NewsCat_ASND_GPTChatCompletion_FewShot.py
index df934a29..b1d0059e 100644
--- a/assets/benchmark_v1/news_categorization/NewsCat_ASND_GPTChatCompletion_FewShot.py
+++ b/assets/benchmark_v1/news_categorization/NewsCat_ASND_GPTChatCompletion_FewShot.py
@@ -1,9 +1,9 @@
 import os
 import random
 
-from arabic_llm_benchmark.datasets import NewsCatASNDDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import NewsCatASNDTask
+from llmebench.datasets import NewsCatASNDDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import NewsCatASNDTask
 
 
 random.seed(1333)
diff --git a/assets/benchmark_v1/news_categorization/NewsCat_ASND_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/news_categorization/NewsCat_ASND_GPTChatCompletion_ZeroShot.py
index b2a6f30a..67c0c625 100644
--- a/assets/benchmark_v1/news_categorization/NewsCat_ASND_GPTChatCompletion_ZeroShot.py
+++ b/assets/benchmark_v1/news_categorization/NewsCat_ASND_GPTChatCompletion_ZeroShot.py
@@ -1,9 +1,9 @@
 import os
 import random
 
-from arabic_llm_benchmark.datasets import NewsCatASNDDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import NewsCatASNDTask
+from llmebench.datasets import NewsCatASNDDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import NewsCatASNDTask
 
 
 random.seed(1333)
diff --git a/assets/benchmark_v1/news_categorization/NewsCat_Akhbarona_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/news_categorization/NewsCat_Akhbarona_BLOOMZ_ZeroShot.py
index 7d4b193b..a23188ff 100644
--- a/assets/benchmark_v1/news_categorization/NewsCat_Akhbarona_BLOOMZ_ZeroShot.py
+++ b/assets/benchmark_v1/news_categorization/NewsCat_Akhbarona_BLOOMZ_ZeroShot.py
@@ -1,9 +1,9 @@
 import os
 import random
 
-from arabic_llm_benchmark.datasets import NewsCatAkhbaronaDataset
-from arabic_llm_benchmark.models import BLOOMPetalModel
-from arabic_llm_benchmark.tasks import NewsCatAkhbaronaTask
+from llmebench.datasets import NewsCatAkhbaronaDataset
+from llmebench.models import BLOOMPetalModel
+from llmebench.tasks import NewsCatAkhbaronaTask
 
 random.seed(1333)
 
diff --git a/assets/benchmark_v1/news_categorization/NewsCat_Akhbarona_ChatGPT_ZeroShot.py b/assets/benchmark_v1/news_categorization/NewsCat_Akhbarona_ChatGPT_ZeroShot.py
index 6fd6ea90..aa44e595 100644
--- a/assets/benchmark_v1/news_categorization/NewsCat_Akhbarona_ChatGPT_ZeroShot.py
+++ b/assets/benchmark_v1/news_categorization/NewsCat_Akhbarona_ChatGPT_ZeroShot.py
@@ -1,9 +1,9 @@
 import os
 import random
 
-from arabic_llm_benchmark.datasets import NewsCatAkhbaronaDataset
-from arabic_llm_benchmark.models import GPTModel
-from arabic_llm_benchmark.tasks import NewsCatAkhbaronaTask
+from llmebench.datasets import NewsCatAkhbaronaDataset
+from llmebench.models import GPTModel
+from llmebench.tasks import NewsCatAkhbaronaTask
 
 random.seed(1333)
 
diff --git a/assets/benchmark_v1/news_categorization/NewsCat_Akhbarona_GPTChatCompletion_FewShot.py b/assets/benchmark_v1/news_categorization/NewsCat_Akhbarona_GPTChatCompletion_FewShot.py
index 76c0ad68..698e26f6 100644
--- a/assets/benchmark_v1/news_categorization/NewsCat_Akhbarona_GPTChatCompletion_FewShot.py
+++ b/assets/benchmark_v1/news_categorization/NewsCat_Akhbarona_GPTChatCompletion_FewShot.py
@@ -1,9 +1,9 @@
 import os
 import random
 
-from arabic_llm_benchmark.datasets import NewsCatAkhbaronaDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import NewsCatAkhbaronaTask
+from llmebench.datasets import NewsCatAkhbaronaDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import NewsCatAkhbaronaTask
 
 
 random.seed(1333)
diff --git a/assets/benchmark_v1/news_categorization/NewsCat_Akhbarona_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/news_categorization/NewsCat_Akhbarona_GPTChatCompletion_ZeroShot.py
index 5ec5f3a9..c029118d 100644
--- a/assets/benchmark_v1/news_categorization/NewsCat_Akhbarona_GPTChatCompletion_ZeroShot.py
+++ b/assets/benchmark_v1/news_categorization/NewsCat_Akhbarona_GPTChatCompletion_ZeroShot.py
@@ -1,9 +1,9 @@
 import os
 import random
 
-from arabic_llm_benchmark.datasets import NewsCatAkhbaronaDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import NewsCatAkhbaronaTask
+from llmebench.datasets import NewsCatAkhbaronaDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import NewsCatAkhbaronaTask
 
 
 random.seed(1333)
diff --git a/assets/benchmark_v1/news_categorization/NewsCat_AlArabiya_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/news_categorization/NewsCat_AlArabiya_BLOOMZ_ZeroShot.py
index 4a4f5613..1ce28d74 100644
--- a/assets/benchmark_v1/news_categorization/NewsCat_AlArabiya_BLOOMZ_ZeroShot.py
+++ b/assets/benchmark_v1/news_categorization/NewsCat_AlArabiya_BLOOMZ_ZeroShot.py
@@ -1,9 +1,9 @@
 import os
 import random
 
-from arabic_llm_benchmark.datasets import NewsCatAlArabiyaDataset
-from arabic_llm_benchmark.models import BLOOMPetalModel
-from arabic_llm_benchmark.tasks import NewsCatAlArabiyaTask
+from llmebench.datasets import NewsCatAlArabiyaDataset
+from llmebench.models import BLOOMPetalModel
+from llmebench.tasks import NewsCatAlArabiyaTask
 
 random.seed(1333)
 
diff --git a/assets/benchmark_v1/news_categorization/NewsCat_AlArabiya_ChatGPT_ZeroShot.py b/assets/benchmark_v1/news_categorization/NewsCat_AlArabiya_ChatGPT_ZeroShot.py
index 70df073f..458761f2 100644
--- a/assets/benchmark_v1/news_categorization/NewsCat_AlArabiya_ChatGPT_ZeroShot.py
+++ b/assets/benchmark_v1/news_categorization/NewsCat_AlArabiya_ChatGPT_ZeroShot.py
@@ -1,9 +1,9 @@
 import os
 import random
 
-from arabic_llm_benchmark.datasets import NewsCatAlArabiyaDataset
-from arabic_llm_benchmark.models import GPTModel
-from arabic_llm_benchmark.tasks import NewsCatAlArabiyaTask
+from llmebench.datasets import NewsCatAlArabiyaDataset
+from llmebench.models import GPTModel
+from llmebench.tasks import NewsCatAlArabiyaTask
 
 random.seed(1333)
 
diff --git a/assets/benchmark_v1/news_categorization/NewsCat_AlArabiya_GPTChatCompletion_FewShot.py b/assets/benchmark_v1/news_categorization/NewsCat_AlArabiya_GPTChatCompletion_FewShot.py
index 70fa21f6..6f407dd5 100644
--- a/assets/benchmark_v1/news_categorization/NewsCat_AlArabiya_GPTChatCompletion_FewShot.py
+++ b/assets/benchmark_v1/news_categorization/NewsCat_AlArabiya_GPTChatCompletion_FewShot.py
@@ -1,9 +1,9 @@
 import os
 import random
 
-from arabic_llm_benchmark.datasets import NewsCatAlArabiyaDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import NewsCatAlArabiyaTask
+from llmebench.datasets import NewsCatAlArabiyaDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import NewsCatAlArabiyaTask
 
 
 random.seed(1333)
diff --git a/assets/benchmark_v1/news_categorization/NewsCat_AlArabiya_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/news_categorization/NewsCat_AlArabiya_GPTChatCompletion_ZeroShot.py
index 6f3a8b2f..333352ec 100644
--- a/assets/benchmark_v1/news_categorization/NewsCat_AlArabiya_GPTChatCompletion_ZeroShot.py
+++ b/assets/benchmark_v1/news_categorization/NewsCat_AlArabiya_GPTChatCompletion_ZeroShot.py
@@ -1,9 +1,9 @@
 import os
 import random
 
-from arabic_llm_benchmark.datasets import NewsCatAlArabiyaDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import NewsCatAlArabiyaTask
+from llmebench.datasets import NewsCatAlArabiyaDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import NewsCatAlArabiyaTask
 
 
 random.seed(1333)
diff --git a/assets/benchmark_v1/news_categorization/NewsCat_AlKhaleej_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/news_categorization/NewsCat_AlKhaleej_BLOOMZ_ZeroShot.py
index 4a7a5863..03516c58 100644
--- a/assets/benchmark_v1/news_categorization/NewsCat_AlKhaleej_BLOOMZ_ZeroShot.py
+++ b/assets/benchmark_v1/news_categorization/NewsCat_AlKhaleej_BLOOMZ_ZeroShot.py
@@ -1,9 +1,9 @@
 import os
 import random
 
-from arabic_llm_benchmark.datasets import NewsCatAlArabiyaDataset
-from arabic_llm_benchmark.models import BLOOMPetalModel
-from arabic_llm_benchmark.tasks import NewsCatAlArabiyaTask
+from llmebench.datasets import NewsCatAlArabiyaDataset
+from llmebench.models import BLOOMPetalModel
+from llmebench.tasks import NewsCatAlArabiyaTask
 
 random.seed(1333)
 
diff --git a/assets/benchmark_v1/news_categorization/NewsCat_AlKhaleej_ChatGPT_ZeroShot.py b/assets/benchmark_v1/news_categorization/NewsCat_AlKhaleej_ChatGPT_ZeroShot.py
index 23d4b7a6..63ae2363 100644
--- a/assets/benchmark_v1/news_categorization/NewsCat_AlKhaleej_ChatGPT_ZeroShot.py
+++ b/assets/benchmark_v1/news_categorization/NewsCat_AlKhaleej_ChatGPT_ZeroShot.py
@@ -1,9 +1,9 @@
 import os
 import random
 
-from arabic_llm_benchmark.datasets import NewsCatAlKhaleejDataset
-from arabic_llm_benchmark.models import GPTModel
-from arabic_llm_benchmark.tasks import NewsCatAlKhaleejTask
+from llmebench.datasets import NewsCatAlKhaleejDataset
+from llmebench.models import GPTModel
+from llmebench.tasks import NewsCatAlKhaleejTask
 
 random.seed(1333)
 
diff --git a/assets/benchmark_v1/news_categorization/NewsCat_AlKhaleej_GPTChatCompletion_FewShot.py b/assets/benchmark_v1/news_categorization/NewsCat_AlKhaleej_GPTChatCompletion_FewShot.py
index 672fd9fc..c95f9f36 100644
--- a/assets/benchmark_v1/news_categorization/NewsCat_AlKhaleej_GPTChatCompletion_FewShot.py
+++ b/assets/benchmark_v1/news_categorization/NewsCat_AlKhaleej_GPTChatCompletion_FewShot.py
@@ -1,9 +1,9 @@
 import os
 import random
 
-from arabic_llm_benchmark.datasets import NewsCatAlKhaleejDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import NewsCatAlKhaleejTask
+from llmebench.datasets import NewsCatAlKhaleejDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import NewsCatAlKhaleejTask
 
 
 random.seed(1333)
diff --git a/assets/benchmark_v1/news_categorization/NewsCat_AlKhaleej_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/news_categorization/NewsCat_AlKhaleej_GPTChatCompletion_ZeroShot.py
index ee4db9e8..b7f37f16 100644
--- a/assets/benchmark_v1/news_categorization/NewsCat_AlKhaleej_GPTChatCompletion_ZeroShot.py
+++ b/assets/benchmark_v1/news_categorization/NewsCat_AlKhaleej_GPTChatCompletion_ZeroShot.py
@@ -1,9 +1,9 @@
 import os
 import random
 
-from arabic_llm_benchmark.datasets import NewsCatAlKhaleejDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import NewsCatAlKhaleejTask
+from llmebench.datasets import NewsCatAlKhaleejDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import NewsCatAlKhaleejTask
 
 
 random.seed(1333)
diff --git a/assets/benchmark_v1/sarcasm/ArSarcasm2_GPT3_Zeroshot.py b/assets/benchmark_v1/sarcasm/ArSarcasm2_GPT3_Zeroshot.py
index f08a6396..45aceebb 100644
--- a/assets/benchmark_v1/sarcasm/ArSarcasm2_GPT3_Zeroshot.py
+++ b/assets/benchmark_v1/sarcasm/ArSarcasm2_GPT3_Zeroshot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import ArSarcasmDataset
-from arabic_llm_benchmark.models import GPTModel
-from arabic_llm_benchmark.tasks import SarcasmTask
+from llmebench.datasets import ArSarcasmDataset
+from llmebench.models import GPTModel
+from llmebench.tasks import SarcasmTask
 
 
 def config():
diff --git a/assets/benchmark_v1/sarcasm/ArSarcasm2_GPT4_FewShot.py b/assets/benchmark_v1/sarcasm/ArSarcasm2_GPT4_FewShot.py
index 8005b477..7cbaab33 100644
--- a/assets/benchmark_v1/sarcasm/ArSarcasm2_GPT4_FewShot.py
+++ b/assets/benchmark_v1/sarcasm/ArSarcasm2_GPT4_FewShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import ArSarcasmDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import SarcasmTask
+from llmebench.datasets import ArSarcasmDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import SarcasmTask
 
 
 def config():
diff --git a/assets/benchmark_v1/sarcasm/ArSarcasm2_GPT4_Zeroshot.py b/assets/benchmark_v1/sarcasm/ArSarcasm2_GPT4_Zeroshot.py
index 935aab4e..a408546b 100644
--- a/assets/benchmark_v1/sarcasm/ArSarcasm2_GPT4_Zeroshot.py
+++ b/assets/benchmark_v1/sarcasm/ArSarcasm2_GPT4_Zeroshot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import ArSarcasmDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import SarcasmTask
+from llmebench.datasets import ArSarcasmDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import SarcasmTask
 
 
 def config():
diff --git a/assets/benchmark_v1/sarcasm/ArSarcasm_BLOOMZ_Zeroshot.py b/assets/benchmark_v1/sarcasm/ArSarcasm_BLOOMZ_Zeroshot.py
index 40d7d016..27e123b2 100644
--- a/assets/benchmark_v1/sarcasm/ArSarcasm_BLOOMZ_Zeroshot.py
+++ b/assets/benchmark_v1/sarcasm/ArSarcasm_BLOOMZ_Zeroshot.py
@@ -1,10 +1,10 @@
 import os
 
-from arabic_llm_benchmark.datasets import ArSarcasmDataset
+from llmebench.datasets import ArSarcasmDataset
 
-from arabic_llm_benchmark.models import BLOOMPetalModel
+from llmebench.models import BLOOMPetalModel
 
-from arabic_llm_benchmark.tasks import SarcasmTask
+from llmebench.tasks import SarcasmTask
 
 
 def config():
diff --git a/assets/benchmark_v1/sarcasm/ArSarcasm_GPT3_Zeroshot.py b/assets/benchmark_v1/sarcasm/ArSarcasm_GPT3_Zeroshot.py
index d4f2752f..781d36ea 100644
--- a/assets/benchmark_v1/sarcasm/ArSarcasm_GPT3_Zeroshot.py
+++ b/assets/benchmark_v1/sarcasm/ArSarcasm_GPT3_Zeroshot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import ArSarcasmDataset
-from arabic_llm_benchmark.models import GPTModel
-from arabic_llm_benchmark.tasks import SarcasmTask
+from llmebench.datasets import ArSarcasmDataset
+from llmebench.models import GPTModel
+from llmebench.tasks import SarcasmTask
 
 
 def config():
diff --git a/assets/benchmark_v1/sarcasm/ArSarcasm_GPT4_Fewshot.py b/assets/benchmark_v1/sarcasm/ArSarcasm_GPT4_Fewshot.py
index 5ec5cf1e..9b984d8c 100644
--- a/assets/benchmark_v1/sarcasm/ArSarcasm_GPT4_Fewshot.py
+++ b/assets/benchmark_v1/sarcasm/ArSarcasm_GPT4_Fewshot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import ArSarcasmDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import SarcasmTask
+from llmebench.datasets import ArSarcasmDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import SarcasmTask
 
 
 def config():
diff --git a/assets/benchmark_v1/sarcasm/ArSarcasm_GPT4_Zeroshot.py b/assets/benchmark_v1/sarcasm/ArSarcasm_GPT4_Zeroshot.py
index f8a01ad1..f95798c0 100644
--- a/assets/benchmark_v1/sarcasm/ArSarcasm_GPT4_Zeroshot.py
+++ b/assets/benchmark_v1/sarcasm/ArSarcasm_GPT4_Zeroshot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import ArSarcasmDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import SarcasmTask
+from llmebench.datasets import ArSarcasmDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import SarcasmTask
 
 
 def config():
diff --git a/assets/benchmark_v1/semantics/XNLI_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/semantics/XNLI_BLOOMZ_ZeroShot.py
index c892056d..0c9a88ad 100644
--- a/assets/benchmark_v1/semantics/XNLI_BLOOMZ_ZeroShot.py
+++ b/assets/benchmark_v1/semantics/XNLI_BLOOMZ_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import XNLIDataset
-from arabic_llm_benchmark.models import BLOOMPetalModel
-from arabic_llm_benchmark.tasks import XNLITask
+from llmebench.datasets import XNLIDataset
+from llmebench.models import BLOOMPetalModel
+from llmebench.tasks import XNLITask
 
 
 def config():
diff --git a/assets/benchmark_v1/semantics/XNLI_CGPT4_FewShot.py b/assets/benchmark_v1/semantics/XNLI_CGPT4_FewShot.py
index 556a5bc5..67a6b2a1 100644
--- a/assets/benchmark_v1/semantics/XNLI_CGPT4_FewShot.py
+++ b/assets/benchmark_v1/semantics/XNLI_CGPT4_FewShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import XNLIDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import XNLITask
+from llmebench.datasets import XNLIDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import XNLITask
 
 
 def config():
diff --git a/assets/benchmark_v1/semantics/XNLI_CGPT4_ZeroShot.py b/assets/benchmark_v1/semantics/XNLI_CGPT4_ZeroShot.py
index 494a3281..f3aae913 100644
--- a/assets/benchmark_v1/semantics/XNLI_CGPT4_ZeroShot.py
+++ b/assets/benchmark_v1/semantics/XNLI_CGPT4_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import XNLIDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import XNLITask
+from llmebench.datasets import XNLIDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import XNLITask
 
 
 def config():
diff --git a/assets/benchmark_v1/semantics/XNLI_ChatGPT_ZeroShot.py b/assets/benchmark_v1/semantics/XNLI_ChatGPT_ZeroShot.py
index 12956be9..1f5126d8 100644
--- a/assets/benchmark_v1/semantics/XNLI_ChatGPT_ZeroShot.py
+++ b/assets/benchmark_v1/semantics/XNLI_ChatGPT_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import XNLIDataset
-from arabic_llm_benchmark.models import GPTModel
-from arabic_llm_benchmark.tasks import XNLITask
+from llmebench.datasets import XNLIDataset
+from llmebench.models import GPTModel
+from llmebench.tasks import XNLITask
 
 
 def config():
diff --git a/assets/benchmark_v1/sentiment/emotion/Emotion_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/sentiment/emotion/Emotion_BLOOMZ_ZeroShot.py
index 95bdf634..77fc394d 100644
--- a/assets/benchmark_v1/sentiment/emotion/Emotion_BLOOMZ_ZeroShot.py
+++ b/assets/benchmark_v1/sentiment/emotion/Emotion_BLOOMZ_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import EmotionDataset
-from arabic_llm_benchmark.models import BLOOMPetalModel
-from arabic_llm_benchmark.tasks import EmotionTask
+from llmebench.datasets import EmotionDataset
+from llmebench.models import BLOOMPetalModel
+from llmebench.tasks import EmotionTask
 
 
 def config():
diff --git a/assets/benchmark_v1/sentiment/emotion/Emotion_ChatGPT_ZeroShot.py b/assets/benchmark_v1/sentiment/emotion/Emotion_ChatGPT_ZeroShot.py
index e48dccf8..aadd03d2 100644
--- a/assets/benchmark_v1/sentiment/emotion/Emotion_ChatGPT_ZeroShot.py
+++ b/assets/benchmark_v1/sentiment/emotion/Emotion_ChatGPT_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import EmotionDataset
-from arabic_llm_benchmark.models import GPTModel, RandomGPTModel
-from arabic_llm_benchmark.tasks import EmotionTask
+from llmebench.datasets import EmotionDataset
+from llmebench.models import GPTModel, RandomGPTModel
+from llmebench.tasks import EmotionTask
 
 
 def config():
diff --git a/assets/benchmark_v1/sentiment/emotion/Emotion_GPTChatCompletion_FewShot.py b/assets/benchmark_v1/sentiment/emotion/Emotion_GPTChatCompletion_FewShot.py
index 98dba6ab..1f5b0e1d 100644
--- a/assets/benchmark_v1/sentiment/emotion/Emotion_GPTChatCompletion_FewShot.py
+++ b/assets/benchmark_v1/sentiment/emotion/Emotion_GPTChatCompletion_FewShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import EmotionDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import EmotionTask
+from llmebench.datasets import EmotionDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import EmotionTask
 
 
 def config():
diff --git a/assets/benchmark_v1/sentiment/emotion/Emotion_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/sentiment/emotion/Emotion_GPTChatCompletion_ZeroShot.py
index a936f89f..89a18cd2 100644
--- a/assets/benchmark_v1/sentiment/emotion/Emotion_GPTChatCompletion_ZeroShot.py
+++ b/assets/benchmark_v1/sentiment/emotion/Emotion_GPTChatCompletion_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import EmotionDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import EmotionTask
+from llmebench.datasets import EmotionDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import EmotionTask
 
 
 def config():
diff --git a/assets/benchmark_v1/sentiment/offensive/Offensive_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/sentiment/offensive/Offensive_BLOOMZ_ZeroShot.py
index b6ea7fa8..965bde7b 100644
--- a/assets/benchmark_v1/sentiment/offensive/Offensive_BLOOMZ_ZeroShot.py
+++ b/assets/benchmark_v1/sentiment/offensive/Offensive_BLOOMZ_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import OffensiveDataset
-from arabic_llm_benchmark.models import BLOOMPetalModel
-from arabic_llm_benchmark.tasks import OffensiveTask
+from llmebench.datasets import OffensiveDataset
+from llmebench.models import BLOOMPetalModel
+from llmebench.tasks import OffensiveTask
 
 
 def config():
diff --git a/assets/benchmark_v1/sentiment/offensive/Offensive_ChatGPT_ZeroShot.py b/assets/benchmark_v1/sentiment/offensive/Offensive_ChatGPT_ZeroShot.py
index 2e044114..8752749f 100644
--- a/assets/benchmark_v1/sentiment/offensive/Offensive_ChatGPT_ZeroShot.py
+++ b/assets/benchmark_v1/sentiment/offensive/Offensive_ChatGPT_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import OffensiveDataset
-from arabic_llm_benchmark.models import GPTModel
-from arabic_llm_benchmark.tasks import OffensiveTask
+from llmebench.datasets import OffensiveDataset
+from llmebench.models import GPTModel
+from llmebench.tasks import OffensiveTask
 
 
 def config():
diff --git a/assets/benchmark_v1/sentiment/offensive/Offensive_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/sentiment/offensive/Offensive_GPTChatCompletion_ZeroShot.py
index bbae54b8..d29b2d92 100644
--- a/assets/benchmark_v1/sentiment/offensive/Offensive_GPTChatCompletion_ZeroShot.py
+++ b/assets/benchmark_v1/sentiment/offensive/Offensive_GPTChatCompletion_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import OffensiveDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import OffensiveTask
+from llmebench.datasets import OffensiveDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import OffensiveTask
 
 
 def config():
diff --git a/assets/benchmark_v1/sentiment/sentiment/ArSASSentiment_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/sentiment/sentiment/ArSASSentiment_BLOOMZ_ZeroShot.py
index 4f811ee2..57dd78e2 100644
--- a/assets/benchmark_v1/sentiment/sentiment/ArSASSentiment_BLOOMZ_ZeroShot.py
+++ b/assets/benchmark_v1/sentiment/sentiment/ArSASSentiment_BLOOMZ_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import ArSASSentimentDataset
-from arabic_llm_benchmark.models import BLOOMPetalModel
-from arabic_llm_benchmark.tasks import SentimentTask
+from llmebench.datasets import ArSASSentimentDataset
+from llmebench.models import BLOOMPetalModel
+from llmebench.tasks import SentimentTask
 
 
 def config():
diff --git a/assets/benchmark_v1/sentiment/sentiment/ArSASSentiment_ChatGPT_ZeroShot.py b/assets/benchmark_v1/sentiment/sentiment/ArSASSentiment_ChatGPT_ZeroShot.py
index 507fde3a..7fac34f6 100644
--- a/assets/benchmark_v1/sentiment/sentiment/ArSASSentiment_ChatGPT_ZeroShot.py
+++ b/assets/benchmark_v1/sentiment/sentiment/ArSASSentiment_ChatGPT_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import ArSASSentimentDataset
-from arabic_llm_benchmark.models import GPTModel
-from arabic_llm_benchmark.tasks import SentimentTask
+from llmebench.datasets import ArSASSentimentDataset
+from llmebench.models import GPTModel
+from llmebench.tasks import SentimentTask
 
 
 def config():
diff --git a/assets/benchmark_v1/sentiment/sentiment/ArSASSentiment_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/sentiment/sentiment/ArSASSentiment_GPTChatCompletion_ZeroShot.py
index 2756384d..b9220fb9 100644
--- a/assets/benchmark_v1/sentiment/sentiment/ArSASSentiment_GPTChatCompletion_ZeroShot.py
+++ b/assets/benchmark_v1/sentiment/sentiment/ArSASSentiment_GPTChatCompletion_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import ArSASSentimentDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import SentimentTask
+from llmebench.datasets import ArSASSentimentDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import SentimentTask
 
 
 def config():
diff --git a/assets/benchmark_v1/sentiment/sentiment/BanglaSentiment_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/sentiment/sentiment/BanglaSentiment_BLOOMZ_ZeroShot.py
index 04b877d8..0a40eb64 100644
--- a/assets/benchmark_v1/sentiment/sentiment/BanglaSentiment_BLOOMZ_ZeroShot.py
+++ b/assets/benchmark_v1/sentiment/sentiment/BanglaSentiment_BLOOMZ_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import BanglaSentimentDataset
-from arabic_llm_benchmark.models import BLOOMPetalModel
-from arabic_llm_benchmark.tasks import SentimentTask
+from llmebench.datasets import BanglaSentimentDataset
+from llmebench.models import BLOOMPetalModel
+from llmebench.tasks import SentimentTask
 
 
 def config():
diff --git a/assets/benchmark_v1/sentiment/sentiment/BanglaSentiment_GPT4_FewShot.py b/assets/benchmark_v1/sentiment/sentiment/BanglaSentiment_GPT4_FewShot.py
index d6d028fc..cfbfea94 100644
--- a/assets/benchmark_v1/sentiment/sentiment/BanglaSentiment_GPT4_FewShot.py
+++ b/assets/benchmark_v1/sentiment/sentiment/BanglaSentiment_GPT4_FewShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import BanglaSentimentDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import SentimentTask
+from llmebench.datasets import BanglaSentimentDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import SentimentTask
 
 
 def config():
diff --git a/assets/benchmark_v1/sentiment/sentiment/BanglaSentiment_GPT4_ZeroShot.py b/assets/benchmark_v1/sentiment/sentiment/BanglaSentiment_GPT4_ZeroShot.py
index 59e14216..78e0fa42 100644
--- a/assets/benchmark_v1/sentiment/sentiment/BanglaSentiment_GPT4_ZeroShot.py
+++ b/assets/benchmark_v1/sentiment/sentiment/BanglaSentiment_GPT4_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import BanglaSentimentDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import SentimentTask
+from llmebench.datasets import BanglaSentimentDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import SentimentTask
 
 
 def config():
diff --git a/assets/benchmark_v1/sentiment/spam/Spam_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/sentiment/spam/Spam_BLOOMZ_ZeroShot.py
index abcea764..0d970b40 100644
--- a/assets/benchmark_v1/sentiment/spam/Spam_BLOOMZ_ZeroShot.py
+++ b/assets/benchmark_v1/sentiment/spam/Spam_BLOOMZ_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import SpamDataset
-from arabic_llm_benchmark.models import BLOOMPetalModel
-from arabic_llm_benchmark.tasks import SpamTask
+from llmebench.datasets import SpamDataset
+from llmebench.models import BLOOMPetalModel
+from llmebench.tasks import SpamTask
 
 
 def config():
diff --git a/assets/benchmark_v1/sentiment/spam/Spam_ChatGPT_ZeroShot.py b/assets/benchmark_v1/sentiment/spam/Spam_ChatGPT_ZeroShot.py
index 2271d6a4..ba0e4b30 100644
--- a/assets/benchmark_v1/sentiment/spam/Spam_ChatGPT_ZeroShot.py
+++ b/assets/benchmark_v1/sentiment/spam/Spam_ChatGPT_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import SpamDataset
-from arabic_llm_benchmark.models import GPTModel, RandomGPTModel
-from arabic_llm_benchmark.tasks import SpamTask
+from llmebench.datasets import SpamDataset
+from llmebench.models import GPTModel, RandomGPTModel
+from llmebench.tasks import SpamTask
 
 
 def config():
diff --git a/assets/benchmark_v1/sentiment/spam/Spam_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/sentiment/spam/Spam_GPTChatCompletion_ZeroShot.py
index 679e37a2..5c124c19 100644
--- a/assets/benchmark_v1/sentiment/spam/Spam_GPTChatCompletion_ZeroShot.py
+++ b/assets/benchmark_v1/sentiment/spam/Spam_GPTChatCompletion_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import SpamDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import SpamTask
+from llmebench.datasets import SpamDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import SpamTask
 
 
 def config():
diff --git a/assets/benchmark_v1/sentiment_emotion_others/StanceKhouja20_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/sentiment_emotion_others/StanceKhouja20_BLOOMZ_ZeroShot.py
index c89e9638..76f29a4d 100644
--- a/assets/benchmark_v1/sentiment_emotion_others/StanceKhouja20_BLOOMZ_ZeroShot.py
+++ b/assets/benchmark_v1/sentiment_emotion_others/StanceKhouja20_BLOOMZ_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import StanceKhouja20Dataset
-from arabic_llm_benchmark.models import BLOOMPetalModel
-from arabic_llm_benchmark.tasks import StanceKhouja20Task
+from llmebench.datasets import StanceKhouja20Dataset
+from llmebench.models import BLOOMPetalModel
+from llmebench.tasks import StanceKhouja20Task
 
 
 def config():
diff --git a/assets/benchmark_v1/sentiment_emotion_others/StanceKhouja20_ChatGPT_ZeroShot.py b/assets/benchmark_v1/sentiment_emotion_others/StanceKhouja20_ChatGPT_ZeroShot.py
index d4a7eef1..d4fbdb93 100644
--- a/assets/benchmark_v1/sentiment_emotion_others/StanceKhouja20_ChatGPT_ZeroShot.py
+++ b/assets/benchmark_v1/sentiment_emotion_others/StanceKhouja20_ChatGPT_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import StanceKhouja20Dataset
-from arabic_llm_benchmark.models import GPTModel, RandomGPTModel
-from arabic_llm_benchmark.tasks import StanceKhouja20Task
+from llmebench.datasets import StanceKhouja20Dataset
+from llmebench.models import GPTModel, RandomGPTModel
+from llmebench.tasks import StanceKhouja20Task
 
 
 def config():
diff --git a/assets/benchmark_v1/sentiment_emotion_others/StanceKhouja20_GPTChatCompletion_FewShot.py b/assets/benchmark_v1/sentiment_emotion_others/StanceKhouja20_GPTChatCompletion_FewShot.py
index 76e7aa87..1caa14ef 100644
--- a/assets/benchmark_v1/sentiment_emotion_others/StanceKhouja20_GPTChatCompletion_FewShot.py
+++ b/assets/benchmark_v1/sentiment_emotion_others/StanceKhouja20_GPTChatCompletion_FewShot.py
@@ -1,9 +1,9 @@
 import os
 import re
 
-from arabic_llm_benchmark.datasets import StanceKhouja20Dataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import StanceKhouja20Task
+from llmebench.datasets import StanceKhouja20Dataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import StanceKhouja20Task
 
 
 def config():
diff --git a/assets/benchmark_v1/sentiment_emotion_others/StanceKhouja20_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/sentiment_emotion_others/StanceKhouja20_GPTChatCompletion_ZeroShot.py
index ad281ab5..3958ccb6 100644
--- a/assets/benchmark_v1/sentiment_emotion_others/StanceKhouja20_GPTChatCompletion_ZeroShot.py
+++ b/assets/benchmark_v1/sentiment_emotion_others/StanceKhouja20_GPTChatCompletion_ZeroShot.py
@@ -1,9 +1,9 @@
 import os
 import re
 
-from arabic_llm_benchmark.datasets import StanceKhouja20Dataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import StanceKhouja20Task
+from llmebench.datasets import StanceKhouja20Dataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import StanceKhouja20Task
 
 
 def config():
diff --git a/assets/benchmark_v1/sentiment_emotion_others/StanceUnifiedFC_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/sentiment_emotion_others/StanceUnifiedFC_BLOOMZ_ZeroShot.py
index d82969db..9ac0bb5a 100644
--- a/assets/benchmark_v1/sentiment_emotion_others/StanceUnifiedFC_BLOOMZ_ZeroShot.py
+++ b/assets/benchmark_v1/sentiment_emotion_others/StanceUnifiedFC_BLOOMZ_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import StanceUnifiedFCDataset
-from arabic_llm_benchmark.models import BLOOMPetalModel
-from arabic_llm_benchmark.tasks import StanceUnifiedFCTask
+from llmebench.datasets import StanceUnifiedFCDataset
+from llmebench.models import BLOOMPetalModel
+from llmebench.tasks import StanceUnifiedFCTask
 
 
 def config():
diff --git a/assets/benchmark_v1/sentiment_emotion_others/StanceUnifiedFC_ChatGPT_ZeroShot.py b/assets/benchmark_v1/sentiment_emotion_others/StanceUnifiedFC_ChatGPT_ZeroShot.py
index 0a273f01..8c0f0568 100644
--- a/assets/benchmark_v1/sentiment_emotion_others/StanceUnifiedFC_ChatGPT_ZeroShot.py
+++ b/assets/benchmark_v1/sentiment_emotion_others/StanceUnifiedFC_ChatGPT_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import StanceUnifiedFCDataset
-from arabic_llm_benchmark.models import GPTModel, RandomGPTModel
-from arabic_llm_benchmark.tasks import StanceUnifiedFCTask
+from llmebench.datasets import StanceUnifiedFCDataset
+from llmebench.models import GPTModel, RandomGPTModel
+from llmebench.tasks import StanceUnifiedFCTask
 
 
 def config():
diff --git a/assets/benchmark_v1/sentiment_emotion_others/StanceUnifiedFC_GPTChatCompletion_FewShot.py b/assets/benchmark_v1/sentiment_emotion_others/StanceUnifiedFC_GPTChatCompletion_FewShot.py
index 1fb02e37..3d27a3d5 100644
--- a/assets/benchmark_v1/sentiment_emotion_others/StanceUnifiedFC_GPTChatCompletion_FewShot.py
+++ b/assets/benchmark_v1/sentiment_emotion_others/StanceUnifiedFC_GPTChatCompletion_FewShot.py
@@ -1,9 +1,9 @@
 import os
 import re
 
-from arabic_llm_benchmark.datasets import StanceUnifiedFCDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import StanceUnifiedFCTask
+from llmebench.datasets import StanceUnifiedFCDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import StanceUnifiedFCTask
 
 
 def config():
diff --git a/assets/benchmark_v1/sentiment_emotion_others/StanceUnifiedFC_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/sentiment_emotion_others/StanceUnifiedFC_GPTChatCompletion_ZeroShot.py
index f06901a3..a435a768 100644
--- a/assets/benchmark_v1/sentiment_emotion_others/StanceUnifiedFC_GPTChatCompletion_ZeroShot.py
+++ b/assets/benchmark_v1/sentiment_emotion_others/StanceUnifiedFC_GPTChatCompletion_ZeroShot.py
@@ -1,9 +1,9 @@
 import os
 import re
 
-from arabic_llm_benchmark.datasets import StanceUnifiedFCDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import StanceUnifiedFCTask
+from llmebench.datasets import StanceUnifiedFCDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import StanceUnifiedFCTask
 
 
 def config():
diff --git a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectADI_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectADI_BLOOMZ_ZeroShot.py
index 602253da..1fbdfa10 100644
--- a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectADI_BLOOMZ_ZeroShot.py
+++ b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectADI_BLOOMZ_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import DialectADIDataset
-from arabic_llm_benchmark.models import BLOOMPetalModel
-from arabic_llm_benchmark.tasks import DialectIDTask
+from llmebench.datasets import DialectADIDataset
+from llmebench.models import BLOOMPetalModel
+from llmebench.tasks import DialectIDTask
 
 
 def config():
diff --git a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectADI_ChatGPT_ZeroShot.py b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectADI_ChatGPT_ZeroShot.py
index f3ba850c..8e243727 100644
--- a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectADI_ChatGPT_ZeroShot.py
+++ b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectADI_ChatGPT_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import DialectADIDataset
-from arabic_llm_benchmark.models import GPTModel, RandomGPTModel
-from arabic_llm_benchmark.tasks import DialectIDTask
+from llmebench.datasets import DialectADIDataset
+from llmebench.models import GPTModel, RandomGPTModel
+from llmebench.tasks import DialectIDTask
 
 
 def config():
diff --git a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectADI_GPTChatCompletion_FewShot.py b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectADI_GPTChatCompletion_FewShot.py
index 705db4fa..ccfe05f2 100644
--- a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectADI_GPTChatCompletion_FewShot.py
+++ b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectADI_GPTChatCompletion_FewShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import DialectADIDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel, RandomGPTModel
-from arabic_llm_benchmark.tasks import DialectIDTask
+from llmebench.datasets import DialectADIDataset
+from llmebench.models import GPTChatCompletionModel, RandomGPTModel
+from llmebench.tasks import DialectIDTask
 
 
 def config():
diff --git a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectADI_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectADI_GPTChatCompletion_ZeroShot.py
index a4c12bb1..6cf4d539 100644
--- a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectADI_GPTChatCompletion_ZeroShot.py
+++ b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectADI_GPTChatCompletion_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import DialectADIDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel, RandomGPTModel
-from arabic_llm_benchmark.tasks import DialectIDTask
+from llmebench.datasets import DialectADIDataset
+from llmebench.models import GPTChatCompletionModel, RandomGPTModel
+from llmebench.tasks import DialectIDTask
 
 
 def config():
diff --git a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectID_QADI_ChatGPT_ZeroShot.py b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectID_QADI_ChatGPT_ZeroShot.py
index cc4b3138..dc87039e 100644
--- a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectID_QADI_ChatGPT_ZeroShot.py
+++ b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectID_QADI_ChatGPT_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import QADIDataset
-from arabic_llm_benchmark.models import GPTModel, RandomGPTModel
-from arabic_llm_benchmark.tasks import DialectIDTask
+from llmebench.datasets import QADIDataset
+from llmebench.models import GPTModel, RandomGPTModel
+from llmebench.tasks import DialectIDTask
 
 
 def config():
diff --git a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectID_QADI_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectID_QADI_GPTChatCompletion_ZeroShot.py
index 60071dff..44254560 100644
--- a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectID_QADI_GPTChatCompletion_ZeroShot.py
+++ b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectID_QADI_GPTChatCompletion_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import QADIDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import DialectIDTask
+from llmebench.datasets import QADIDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import DialectIDTask
 
 
 def config():
diff --git a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/POS_ChatGPT_ZeroShot.py b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/POS_ChatGPT_ZeroShot.py
index 64eef7bd..69f4fe5c 100644
--- a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/POS_ChatGPT_ZeroShot.py
+++ b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/POS_ChatGPT_ZeroShot.py
@@ -1,9 +1,9 @@
 import os
 import re
 
-from arabic_llm_benchmark.datasets import ArabicPOSDataset
-from arabic_llm_benchmark.models import GPTModel, RandomGPTModel
-from arabic_llm_benchmark.tasks import ArabicPOSTask
+from llmebench.datasets import ArabicPOSDataset
+from llmebench.models import GPTModel, RandomGPTModel
+from llmebench.tasks import ArabicPOSTask
 
 mapTags = {
     "UNK": "UNK",
diff --git a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/POS_GPT4_ZeroShot.py b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/POS_GPT4_ZeroShot.py
index 7934dec0..18db4d89 100644
--- a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/POS_GPT4_ZeroShot.py
+++ b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/POS_GPT4_ZeroShot.py
@@ -1,9 +1,9 @@
 import os
 import re
 
-from arabic_llm_benchmark.datasets import ArabicPOSDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import ArabicPOSTask
+from llmebench.datasets import ArabicPOSDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import ArabicPOSTask
 
 mapTags = {
     "UNK": "UNK",
diff --git a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/POS_GPTChatCompletion_FewShot.py b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/POS_GPTChatCompletion_FewShot.py
index 761d13ae..70cf5815 100644
--- a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/POS_GPTChatCompletion_FewShot.py
+++ b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/POS_GPTChatCompletion_FewShot.py
@@ -1,9 +1,9 @@
 import os
 import re
 
-from arabic_llm_benchmark.datasets import ArabicPOSDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import ArabicPOSTask
+from llmebench.datasets import ArabicPOSDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import ArabicPOSTask
 
 mapTags = {
     "UNK": "UNK",
diff --git a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/diacritization_ChatGPT_ZeroShot.py b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/diacritization_ChatGPT_ZeroShot.py
index cf8637a7..0634ab80 100644
--- a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/diacritization_ChatGPT_ZeroShot.py
+++ b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/diacritization_ChatGPT_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import ArabicDiacritizationDataset
-from arabic_llm_benchmark.models import GPTModel
-from arabic_llm_benchmark.tasks import ArabicDiacritizationTask
+from llmebench.datasets import ArabicDiacritizationDataset
+from llmebench.models import GPTModel
+from llmebench.tasks import ArabicDiacritizationTask
 
 
 def config():
diff --git a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/lemmatization/Lemmatization_ChatGPT_ZeroShot.py b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/lemmatization/Lemmatization_ChatGPT_ZeroShot.py
index ceae7cc9..9950ccac 100644
--- a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/lemmatization/Lemmatization_ChatGPT_ZeroShot.py
+++ b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/lemmatization/Lemmatization_ChatGPT_ZeroShot.py
@@ -1,8 +1,8 @@
 import os
 
-from arabic_llm_benchmark.datasets import LemmatizationDataset
-from arabic_llm_benchmark.models import GPTModel, RandomGPTModel
-from arabic_llm_benchmark.tasks import LemmatizationTask
+from llmebench.datasets import LemmatizationDataset
+from llmebench.models import GPTModel, RandomGPTModel
+from llmebench.tasks import LemmatizationTask
 
 
 def config():
diff --git a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/parsing_ChatGPT_ZeroShot.py b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/parsing_ChatGPT_ZeroShot.py
index 6a093fb4..77a175b7 100644
--- a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/parsing_ChatGPT_ZeroShot.py
+++ b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/parsing_ChatGPT_ZeroShot.py
@@ -1,9 +1,9 @@
 import os
 import re
 
-from arabic_llm_benchmark.datasets import ArabicParsingDataset
-from arabic_llm_benchmark.models import GPTModel, RandomGPTModel
-from arabic_llm_benchmark.tasks import ArabicParsingTask
+from llmebench.datasets import ArabicParsingDataset
+from llmebench.models import GPTModel, RandomGPTModel
+from llmebench.tasks import ArabicParsingTask
 
 
 def config():
diff --git a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/parsing_GPT4_ZeroShot.py b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/parsing_GPT4_ZeroShot.py
index 9cdfb871..36e3d02f 100644
--- a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/parsing_GPT4_ZeroShot.py
+++ b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/parsing_GPT4_ZeroShot.py
@@ -1,9 +1,9 @@
 import os
 import re
 
-from arabic_llm_benchmark.datasets import ArabicParsingDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import ArabicParsingTask
+from llmebench.datasets import ArabicParsingDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import ArabicParsingTask
 
 
 def config():
diff --git a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/segmentation_ChatGPT_ZeroShot.py b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/segmentation_ChatGPT_ZeroShot.py
index b0830844..d808ce15 100644
--- a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/segmentation_ChatGPT_ZeroShot.py
+++ b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/segmentation_ChatGPT_ZeroShot.py
@@ -1,9 +1,9 @@
 import os
 import re
 
-from arabic_llm_benchmark.datasets import ArabicSegmentationDataset
-from arabic_llm_benchmark.models import GPTModel, RandomGPTModel
-from arabic_llm_benchmark.tasks import ArabicSegmentationTask
+from llmebench.datasets import ArabicSegmentationDataset
+from llmebench.models import GPTModel, RandomGPTModel
+from llmebench.tasks import ArabicSegmentationTask
 
 
 def config():
diff --git a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/segmentation_GPT4_ZeroShot.py b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/segmentation_GPT4_ZeroShot.py
index d64163ba..e6a3fc8d 100644
--- a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/segmentation_GPT4_ZeroShot.py
+++ b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/segmentation_GPT4_ZeroShot.py
@@ -1,9 +1,9 @@
 import os
 import re
 
-from arabic_llm_benchmark.datasets import ArabicSegmentationDataset
-from arabic_llm_benchmark.models import GPTChatCompletionModel
-from arabic_llm_benchmark.tasks import ArabicSegmentationTask
+from llmebench.datasets import ArabicSegmentationDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import ArabicSegmentationTask
 
 
 def config():
diff --git a/arabic_llm_benchmark/__init__.py b/llmebench/__init__.py
similarity index 100%
rename from arabic_llm_benchmark/__init__.py
rename to llmebench/__init__.py
diff --git a/arabic_llm_benchmark/__main__.py b/llmebench/__main__.py
similarity index 100%
rename from arabic_llm_benchmark/__main__.py
rename to llmebench/__main__.py
diff --git a/arabic_llm_benchmark/benchmark.py b/llmebench/benchmark.py
similarity index 100%
rename from arabic_llm_benchmark/benchmark.py
rename to llmebench/benchmark.py
diff --git a/arabic_llm_benchmark/datasets/ANERcorp.py b/llmebench/datasets/ANERcorp.py
similarity index 98%
rename from arabic_llm_benchmark/datasets/ANERcorp.py
rename to llmebench/datasets/ANERcorp.py
index d0a77fbb..a9c07b28 100644
--- a/arabic_llm_benchmark/datasets/ANERcorp.py
+++ b/llmebench/datasets/ANERcorp.py
@@ -1,4 +1,4 @@
-from arabic_llm_benchmark.datasets.dataset_base import DatasetBase
+from llmebench.datasets.dataset_base import DatasetBase
 
 
 class ANERcorpDataset(DatasetBase):
diff --git a/arabic_llm_benchmark/datasets/ARCD.py b/llmebench/datasets/ARCD.py
similarity index 88%
rename from arabic_llm_benchmark/datasets/ARCD.py
rename to llmebench/datasets/ARCD.py
index 78db8772..25d2a00b 100644
--- a/arabic_llm_benchmark/datasets/ARCD.py
+++ b/llmebench/datasets/ARCD.py
@@ -1,6 +1,6 @@
 import json
 
-from arabic_llm_benchmark.datasets.SQuADBase import SQuADBase
+from llmebench.datasets.SQuADBase import SQuADBase
 
 
 class ARCDDataset(SQuADBase):
diff --git a/arabic_llm_benchmark/datasets/Adult.py b/llmebench/datasets/Adult.py
similarity index 95%
rename from arabic_llm_benchmark/datasets/Adult.py
rename to llmebench/datasets/Adult.py
index 7ca6c81f..521ed1db 100644
--- a/arabic_llm_benchmark/datasets/Adult.py
+++ b/llmebench/datasets/Adult.py
@@ -1,4 +1,4 @@
-from arabic_llm_benchmark.datasets.dataset_base import DatasetBase
+from llmebench.datasets.dataset_base import DatasetBase
 
 
 class AdultDataset(DatasetBase):
diff --git a/arabic_llm_benchmark/datasets/Aqmar.py b/llmebench/datasets/Aqmar.py
similarity index 98%
rename from arabic_llm_benchmark/datasets/Aqmar.py
rename to llmebench/datasets/Aqmar.py
index 0afd5e5a..0c1409f5 100644
--- a/arabic_llm_benchmark/datasets/Aqmar.py
+++ b/llmebench/datasets/Aqmar.py
@@ -1,6 +1,6 @@
 from pathlib import Path
 
-from arabic_llm_benchmark.datasets.dataset_base import DatasetBase
+from llmebench.datasets.dataset_base import DatasetBase
 
 
 class AqmarDataset(DatasetBase):
diff --git a/arabic_llm_benchmark/datasets/ArSASSentiment.py b/llmebench/datasets/ArSASSentiment.py
similarity index 92%
rename from arabic_llm_benchmark/datasets/ArSASSentiment.py
rename to llmebench/datasets/ArSASSentiment.py
index 4d162bfc..2a56c6fb 100644
--- a/arabic_llm_benchmark/datasets/ArSASSentiment.py
+++ b/llmebench/datasets/ArSASSentiment.py
@@ -1,4 +1,4 @@
-from arabic_llm_benchmark.datasets.dataset_base import DatasetBase
+from llmebench.datasets.dataset_base import DatasetBase
 
 
 class ArSASSentimentDataset(DatasetBase):
diff --git a/arabic_llm_benchmark/datasets/ArSarcasm.py b/llmebench/datasets/ArSarcasm.py
similarity index 95%
rename from arabic_llm_benchmark/datasets/ArSarcasm.py
rename to llmebench/datasets/ArSarcasm.py
index 5c9fab15..ff8da9a2 100644
--- a/arabic_llm_benchmark/datasets/ArSarcasm.py
+++ b/llmebench/datasets/ArSarcasm.py
@@ -1,6 +1,6 @@
 import csv
 
-from arabic_llm_benchmark.datasets.dataset_base import DatasetBase
+from llmebench.datasets.dataset_base import DatasetBase
 
 
 class ArSarcasmDataset(DatasetBase):
diff --git a/arabic_llm_benchmark/datasets/AraBench.py b/llmebench/datasets/AraBench.py
similarity index 96%
rename from arabic_llm_benchmark/datasets/AraBench.py
rename to llmebench/datasets/AraBench.py
index 6900db4a..83220e8f 100644
--- a/arabic_llm_benchmark/datasets/AraBench.py
+++ b/llmebench/datasets/AraBench.py
@@ -1,4 +1,4 @@
-from arabic_llm_benchmark.datasets.dataset_base import DatasetBase
+from llmebench.datasets.dataset_base import DatasetBase
 
 
 class AraBenchDataset(DatasetBase):
diff --git a/arabic_llm_benchmark/datasets/ArabGend.py b/llmebench/datasets/ArabGend.py
similarity index 93%
rename from arabic_llm_benchmark/datasets/ArabGend.py
rename to llmebench/datasets/ArabGend.py
index db576ce9..537e1a91 100644
--- a/arabic_llm_benchmark/datasets/ArabGend.py
+++ b/llmebench/datasets/ArabGend.py
@@ -1,4 +1,4 @@
-from arabic_llm_benchmark.datasets.dataset_base import DatasetBase
+from llmebench.datasets.dataset_base import DatasetBase
 
 
 class ArabGendDataset(DatasetBase):
diff --git a/arabic_llm_benchmark/datasets/ArabicDiacritization.py b/llmebench/datasets/ArabicDiacritization.py
similarity index 95%
rename from arabic_llm_benchmark/datasets/ArabicDiacritization.py
rename to llmebench/datasets/ArabicDiacritization.py
index 2d43b14c..0372b8fd 100644
--- a/arabic_llm_benchmark/datasets/ArabicDiacritization.py
+++ b/llmebench/datasets/ArabicDiacritization.py
@@ -1,4 +1,4 @@
-from arabic_llm_benchmark.datasets.dataset_base import DatasetBase
+from llmebench.datasets.dataset_base import DatasetBase
 
 
 class ArabicDiacritizationDataset(DatasetBase):
diff --git a/arabic_llm_benchmark/datasets/ArabicPOS.py b/llmebench/datasets/ArabicPOS.py
similarity index 95%
rename from arabic_llm_benchmark/datasets/ArabicPOS.py
rename to llmebench/datasets/ArabicPOS.py
index 1e141610..d7448cf3 100644
--- a/arabic_llm_benchmark/datasets/ArabicPOS.py
+++ b/llmebench/datasets/ArabicPOS.py
@@ -1,4 +1,4 @@
-from arabic_llm_benchmark.datasets.dataset_base import DatasetBase
+from llmebench.datasets.dataset_base import DatasetBase
 
 
 class ArabicPOSDataset(DatasetBase):
diff --git a/arabic_llm_benchmark/datasets/ArabicParsing.py b/llmebench/datasets/ArabicParsing.py
similarity index 96%
rename from arabic_llm_benchmark/datasets/ArabicParsing.py
rename to llmebench/datasets/ArabicParsing.py
index 66fe2f15..c5868e5c 100644
--- a/arabic_llm_benchmark/datasets/ArabicParsing.py
+++ b/llmebench/datasets/ArabicParsing.py
@@ -1,4 +1,4 @@
-from arabic_llm_benchmark.datasets.dataset_base import DatasetBase
+from llmebench.datasets.dataset_base import DatasetBase
 
 
 class ArabicParsingDataset(DatasetBase):
diff --git a/arabic_llm_benchmark/datasets/ArabicSegmentation.py b/llmebench/datasets/ArabicSegmentation.py
similarity index 96%
rename from arabic_llm_benchmark/datasets/ArabicSegmentation.py
rename to llmebench/datasets/ArabicSegmentation.py
index 76fa272d..e7c12394 100644
--- a/arabic_llm_benchmark/datasets/ArabicSegmentation.py
+++ b/llmebench/datasets/ArabicSegmentation.py
@@ -1,4 +1,4 @@
-from arabic_llm_benchmark.datasets.dataset_base import DatasetBase
+from llmebench.datasets.dataset_base import DatasetBase
 
 
 class ArabicSegmentationDataset(DatasetBase):
diff --git a/arabic_llm_benchmark/datasets/ArapTweet.py b/llmebench/datasets/ArapTweet.py
similarity index 97%
rename from arabic_llm_benchmark/datasets/ArapTweet.py
rename to llmebench/datasets/ArapTweet.py
index de73e42e..ad67986c 100644
--- a/arabic_llm_benchmark/datasets/ArapTweet.py
+++ b/llmebench/datasets/ArapTweet.py
@@ -1,4 +1,4 @@
-from arabic_llm_benchmark.datasets.dataset_base import DatasetBase
+from llmebench.datasets.dataset_base import DatasetBase
 
 
 class ArapTweetDataset(DatasetBase):
diff --git a/arabic_llm_benchmark/datasets/Attentionworthy.py b/llmebench/datasets/Attentionworthy.py
similarity index 95%
rename from arabic_llm_benchmark/datasets/Attentionworthy.py
rename to llmebench/datasets/Attentionworthy.py
index 1fdec46a..400d2404 100644
--- a/arabic_llm_benchmark/datasets/Attentionworthy.py
+++ b/llmebench/datasets/Attentionworthy.py
@@ -1,6 +1,6 @@
 import pandas as pd
 
-from arabic_llm_benchmark.datasets.dataset_base import DatasetBase
+from llmebench.datasets.dataset_base import DatasetBase
 
 
 class AttentionworthyDataset(DatasetBase):
diff --git a/arabic_llm_benchmark/datasets/BanglaSentiment.py b/llmebench/datasets/BanglaSentiment.py
similarity index 95%
rename from arabic_llm_benchmark/datasets/BanglaSentiment.py
rename to llmebench/datasets/BanglaSentiment.py
index 29a34479..09d5cca2 100644
--- a/arabic_llm_benchmark/datasets/BanglaSentiment.py
+++ b/llmebench/datasets/BanglaSentiment.py
@@ -1,4 +1,4 @@
-from arabic_llm_benchmark.datasets.dataset_base import DatasetBase
+from llmebench.datasets.dataset_base import DatasetBase
 
 
 class BanglaSentimentDataset(DatasetBase):
diff --git a/arabic_llm_benchmark/datasets/Checkworthiness.py b/llmebench/datasets/Checkworthiness.py
similarity index 96%
rename from arabic_llm_benchmark/datasets/Checkworthiness.py
rename to llmebench/datasets/Checkworthiness.py
index 90ed51fe..c9fdf837 100644
--- a/arabic_llm_benchmark/datasets/Checkworthiness.py
+++ b/llmebench/datasets/Checkworthiness.py
@@ -1,6 +1,6 @@
 import pandas as pd
 
-from arabic_llm_benchmark.datasets.dataset_base import DatasetBase
+from llmebench.datasets.dataset_base import DatasetBase
 
 
 class CheckworthinessDataset(DatasetBase):
diff --git a/arabic_llm_benchmark/datasets/Claim.py b/llmebench/datasets/Claim.py
similarity index 95%
rename from arabic_llm_benchmark/datasets/Claim.py
rename to llmebench/datasets/Claim.py
index 57745725..ddec7d30 100644
--- a/arabic_llm_benchmark/datasets/Claim.py
+++ b/llmebench/datasets/Claim.py
@@ -1,6 +1,6 @@
 import pandas as pd
 
-from arabic_llm_benchmark.datasets.dataset_base import DatasetBase
+from llmebench.datasets.dataset_base import DatasetBase
 
 
 class CovidClaimDataset(DatasetBase):
diff --git a/arabic_llm_benchmark/datasets/DialectADI.py b/llmebench/datasets/DialectADI.py
similarity index 92%
rename from arabic_llm_benchmark/datasets/DialectADI.py
rename to llmebench/datasets/DialectADI.py
index 466e306b..3b463770 100644
--- a/arabic_llm_benchmark/datasets/DialectADI.py
+++ b/llmebench/datasets/DialectADI.py
@@ -1,6 +1,6 @@
 import pandas as pd
 
-from arabic_llm_benchmark.datasets.dataset_base import DatasetBase
+from llmebench.datasets.dataset_base import DatasetBase
 
 
 class DialectADIDataset(DatasetBase):
diff --git a/arabic_llm_benchmark/datasets/Emotion.py b/llmebench/datasets/Emotion.py
similarity index 94%
rename from arabic_llm_benchmark/datasets/Emotion.py
rename to llmebench/datasets/Emotion.py
index ee656758..18e258a9 100644
--- a/arabic_llm_benchmark/datasets/Emotion.py
+++ b/llmebench/datasets/Emotion.py
@@ -1,4 +1,4 @@
-from arabic_llm_benchmark.datasets.dataset_base import DatasetBase
+from llmebench.datasets.dataset_base import DatasetBase
 
 
 class EmotionDataset(DatasetBase):
diff --git a/arabic_llm_benchmark/datasets/FactualityCOVID19.py b/llmebench/datasets/FactualityCOVID19.py
similarity index 97%
rename from arabic_llm_benchmark/datasets/FactualityCOVID19.py
rename to llmebench/datasets/FactualityCOVID19.py
index 3ddc5d6f..177e81a4 100644
--- a/arabic_llm_benchmark/datasets/FactualityCOVID19.py
+++ b/llmebench/datasets/FactualityCOVID19.py
@@ -1,6 +1,6 @@
 import pandas as pd
 
-from arabic_llm_benchmark.datasets.dataset_base import DatasetBase
+from llmebench.datasets.dataset_base import DatasetBase
 
 
 class FactualityCOVID19Dataset(DatasetBase):
diff --git a/arabic_llm_benchmark/datasets/FactualityKhouja20.py b/llmebench/datasets/FactualityKhouja20.py
similarity index 94%
rename from arabic_llm_benchmark/datasets/FactualityKhouja20.py
rename to llmebench/datasets/FactualityKhouja20.py
index ca8a31f2..d8205ba9 100644
--- a/arabic_llm_benchmark/datasets/FactualityKhouja20.py
+++ b/llmebench/datasets/FactualityKhouja20.py
@@ -1,6 +1,6 @@
 import pandas as pd
 
-from arabic_llm_benchmark.datasets.dataset_base import DatasetBase
+from llmebench.datasets.dataset_base import DatasetBase
 
 
 class FactualityKhouja20Dataset(DatasetBase):
diff --git a/arabic_llm_benchmark/datasets/FactualityUnifiedFC.py b/llmebench/datasets/FactualityUnifiedFC.py
similarity index 97%
rename from arabic_llm_benchmark/datasets/FactualityUnifiedFC.py
rename to llmebench/datasets/FactualityUnifiedFC.py
index cd78971a..a80f1cfb 100644
--- a/arabic_llm_benchmark/datasets/FactualityUnifiedFC.py
+++ b/llmebench/datasets/FactualityUnifiedFC.py
@@ -1,4 +1,4 @@
-from arabic_llm_benchmark.datasets.dataset_base import DatasetBase
+from llmebench.datasets.dataset_base import DatasetBase
 
 
 class FactualityUnifiedFCDataset(DatasetBase):
diff --git a/arabic_llm_benchmark/datasets/Harmful.py b/llmebench/datasets/Harmful.py
similarity index 95%
rename from arabic_llm_benchmark/datasets/Harmful.py
rename to llmebench/datasets/Harmful.py
index 28b49316..d44eba82 100644
--- a/arabic_llm_benchmark/datasets/Harmful.py
+++ b/llmebench/datasets/Harmful.py
@@ -1,6 +1,6 @@
 import pandas as pd
 
-from arabic_llm_benchmark.datasets.dataset_base import DatasetBase
+from llmebench.datasets.dataset_base import DatasetBase
 
 
 class CovidHarmfulDataset(DatasetBase):
diff --git a/arabic_llm_benchmark/datasets/HateSpeech.py b/llmebench/datasets/HateSpeech.py
similarity index 95%
rename from arabic_llm_benchmark/datasets/HateSpeech.py
rename to llmebench/datasets/HateSpeech.py
index 43246c64..2881c847 100644
--- a/arabic_llm_benchmark/datasets/HateSpeech.py
+++ b/llmebench/datasets/HateSpeech.py
@@ -1,4 +1,4 @@
-from arabic_llm_benchmark.datasets.dataset_base import DatasetBase
+from llmebench.datasets.dataset_base import DatasetBase
 
 
 class HateSpeechDataset(DatasetBase):
diff --git a/arabic_llm_benchmark/datasets/Lemmatization.py b/llmebench/datasets/Lemmatization.py
similarity index 95%
rename from arabic_llm_benchmark/datasets/Lemmatization.py
rename to llmebench/datasets/Lemmatization.py
index af4a37e0..67f72cd2 100644
--- a/arabic_llm_benchmark/datasets/Lemmatization.py
+++ b/llmebench/datasets/Lemmatization.py
@@ -1,4 +1,4 @@
-from arabic_llm_benchmark.datasets.dataset_base import DatasetBase
+from llmebench.datasets.dataset_base import DatasetBase
 
 
 class LemmatizationDataset(DatasetBase):
diff --git a/arabic_llm_benchmark/datasets/Location.py b/llmebench/datasets/Location.py
similarity index 94%
rename from arabic_llm_benchmark/datasets/Location.py
rename to llmebench/datasets/Location.py
index bb737435..363f91e3 100644
--- a/arabic_llm_benchmark/datasets/Location.py
+++ b/llmebench/datasets/Location.py
@@ -1,4 +1,4 @@
-from arabic_llm_benchmark.datasets.dataset_base import DatasetBase
+from llmebench.datasets.dataset_base import DatasetBase
 
 
 class LocationDataset(DatasetBase):
diff --git a/arabic_llm_benchmark/datasets/MGBWords.py b/llmebench/datasets/MGBWords.py
similarity index 96%
rename from arabic_llm_benchmark/datasets/MGBWords.py
rename to llmebench/datasets/MGBWords.py
index 307358af..ffdbc78e 100644
--- a/arabic_llm_benchmark/datasets/MGBWords.py
+++ b/llmebench/datasets/MGBWords.py
@@ -1,4 +1,4 @@
-from arabic_llm_benchmark.datasets.dataset_base import DatasetBase
+from llmebench.datasets.dataset_base import DatasetBase
 
 
 class MGBWordsDataset(DatasetBase):
diff --git a/arabic_llm_benchmark/datasets/MLQA.py b/llmebench/datasets/MLQA.py
similarity index 88%
rename from arabic_llm_benchmark/datasets/MLQA.py
rename to llmebench/datasets/MLQA.py
index 58d1fdd3..1362ecb3 100644
--- a/arabic_llm_benchmark/datasets/MLQA.py
+++ b/llmebench/datasets/MLQA.py
@@ -1,6 +1,6 @@
 import json
 
-from arabic_llm_benchmark.datasets.SQuADBase import SQuADBase
+from llmebench.datasets.SQuADBase import SQuADBase
 
 
 class MLQADataset(SQuADBase):
diff --git a/arabic_llm_benchmark/datasets/NameInfo.py b/llmebench/datasets/NameInfo.py
similarity index 93%
rename from arabic_llm_benchmark/datasets/NameInfo.py
rename to llmebench/datasets/NameInfo.py
index 0c399f18..17a7026b 100644
--- a/arabic_llm_benchmark/datasets/NameInfo.py
+++ b/llmebench/datasets/NameInfo.py
@@ -1,4 +1,4 @@
-from arabic_llm_benchmark.datasets.dataset_base import DatasetBase
+from llmebench.datasets.dataset_base import DatasetBase
 
 
 class NameInfoDataset(DatasetBase):
diff --git a/arabic_llm_benchmark/datasets/NewsCatASND.py b/llmebench/datasets/NewsCatASND.py
similarity index 95%
rename from arabic_llm_benchmark/datasets/NewsCatASND.py
rename to llmebench/datasets/NewsCatASND.py
index 069df59d..414a573d 100644
--- a/arabic_llm_benchmark/datasets/NewsCatASND.py
+++ b/llmebench/datasets/NewsCatASND.py
@@ -1,6 +1,6 @@
 import pandas as pd
 
-from arabic_llm_benchmark.datasets.dataset_base import DatasetBase
+from llmebench.datasets.dataset_base import DatasetBase
 
 
 class NewsCatASNDDataset(DatasetBase):
diff --git a/arabic_llm_benchmark/datasets/NewsCatAkhbarona.py b/llmebench/datasets/NewsCatAkhbarona.py
similarity index 95%
rename from arabic_llm_benchmark/datasets/NewsCatAkhbarona.py
rename to llmebench/datasets/NewsCatAkhbarona.py
index 5e6fa125..066cf448 100644
--- a/arabic_llm_benchmark/datasets/NewsCatAkhbarona.py
+++ b/llmebench/datasets/NewsCatAkhbarona.py
@@ -1,6 +1,6 @@
 import pandas as pd
 
-from arabic_llm_benchmark.datasets.dataset_base import DatasetBase
+from llmebench.datasets.dataset_base import DatasetBase
 
 
 class NewsCatAkhbaronaDataset(DatasetBase):
diff --git a/arabic_llm_benchmark/datasets/NewsCatAlArabiya.py b/llmebench/datasets/NewsCatAlArabiya.py
similarity index 95%
rename from arabic_llm_benchmark/datasets/NewsCatAlArabiya.py
rename to llmebench/datasets/NewsCatAlArabiya.py
index b17bf0cc..f95dae45 100644
--- a/arabic_llm_benchmark/datasets/NewsCatAlArabiya.py
+++ b/llmebench/datasets/NewsCatAlArabiya.py
@@ -1,6 +1,6 @@
 import pandas as pd
 
-from arabic_llm_benchmark.datasets.dataset_base import DatasetBase
+from llmebench.datasets.dataset_base import DatasetBase
 
 
 class NewsCatAlArabiyaDataset(DatasetBase):
diff --git a/arabic_llm_benchmark/datasets/NewsCatAlKhaleej.py b/llmebench/datasets/NewsCatAlKhaleej.py
similarity index 95%
rename from arabic_llm_benchmark/datasets/NewsCatAlKhaleej.py
rename to llmebench/datasets/NewsCatAlKhaleej.py
index 6e531798..34d10d58 100644
--- a/arabic_llm_benchmark/datasets/NewsCatAlKhaleej.py
+++ b/llmebench/datasets/NewsCatAlKhaleej.py
@@ -1,6 +1,6 @@
 import pandas as pd
 
-from arabic_llm_benchmark.datasets.dataset_base import DatasetBase
+from llmebench.datasets.dataset_base import DatasetBase
 
 
 class NewsCatAlKhaleejDataset(DatasetBase):
diff --git a/arabic_llm_benchmark/datasets/Offensive.py b/llmebench/datasets/Offensive.py
similarity index 94%
rename from arabic_llm_benchmark/datasets/Offensive.py
rename to llmebench/datasets/Offensive.py
index 3302a42e..19c8a38a 100644
--- a/arabic_llm_benchmark/datasets/Offensive.py
+++ b/llmebench/datasets/Offensive.py
@@ -1,4 +1,4 @@
-from arabic_llm_benchmark.datasets.dataset_base import DatasetBase
+from llmebench.datasets.dataset_base import DatasetBase
 
 
 class OffensiveDataset(DatasetBase):
diff --git a/arabic_llm_benchmark/datasets/Propaganda.py b/llmebench/datasets/Propaganda.py
similarity index 96%
rename from arabic_llm_benchmark/datasets/Propaganda.py
rename to llmebench/datasets/Propaganda.py
index 5c7dc8c6..5f88e441 100644
--- a/arabic_llm_benchmark/datasets/Propaganda.py
+++ b/llmebench/datasets/Propaganda.py
@@ -2,7 +2,7 @@
 
 from pathlib import Path
 
-from arabic_llm_benchmark.datasets.dataset_base import DatasetBase
+from llmebench.datasets.dataset_base import DatasetBase
 
 
 class PropagandaTweetDataset(DatasetBase):
diff --git a/arabic_llm_benchmark/datasets/PropagandaSemEval23.py b/llmebench/datasets/PropagandaSemEval23.py
similarity index 98%
rename from arabic_llm_benchmark/datasets/PropagandaSemEval23.py
rename to llmebench/datasets/PropagandaSemEval23.py
index 6708fb35..b655ec38 100644
--- a/arabic_llm_benchmark/datasets/PropagandaSemEval23.py
+++ b/llmebench/datasets/PropagandaSemEval23.py
@@ -2,7 +2,7 @@
 import os
 from pathlib import Path
 
-from arabic_llm_benchmark.datasets.dataset_base import DatasetBase
+from llmebench.datasets.dataset_base import DatasetBase
 
 
 class PropagandaSemEval23Dataset(DatasetBase):
diff --git a/arabic_llm_benchmark/datasets/QADI.py b/llmebench/datasets/QADI.py
similarity index 94%
rename from arabic_llm_benchmark/datasets/QADI.py
rename to llmebench/datasets/QADI.py
index 8e8e62f8..1a5c9dbb 100644
--- a/arabic_llm_benchmark/datasets/QADI.py
+++ b/llmebench/datasets/QADI.py
@@ -1,4 +1,4 @@
-from arabic_llm_benchmark.datasets.dataset_base import DatasetBase
+from llmebench.datasets.dataset_base import DatasetBase
 
 
 class QADIDataset(DatasetBase):
diff --git a/arabic_llm_benchmark/datasets/SQuADBase.py b/llmebench/datasets/SQuADBase.py
similarity index 95%
rename from arabic_llm_benchmark/datasets/SQuADBase.py
rename to llmebench/datasets/SQuADBase.py
index a77b5e59..5ff00d06 100644
--- a/arabic_llm_benchmark/datasets/SQuADBase.py
+++ b/llmebench/datasets/SQuADBase.py
@@ -1,6 +1,6 @@
 import json
 
-from arabic_llm_benchmark.datasets.dataset_base import DatasetBase
+from llmebench.datasets.dataset_base import DatasetBase
 
 
 class SQuADBase(DatasetBase):
diff --git a/arabic_llm_benchmark/datasets/STSArSemEval17Track1.py b/llmebench/datasets/STSArSemEval17Track1.py
similarity index 96%
rename from arabic_llm_benchmark/datasets/STSArSemEval17Track1.py
rename to llmebench/datasets/STSArSemEval17Track1.py
index 15fe31c9..b8fe0180 100644
--- a/arabic_llm_benchmark/datasets/STSArSemEval17Track1.py
+++ b/llmebench/datasets/STSArSemEval17Track1.py
@@ -1,4 +1,4 @@
-from arabic_llm_benchmark.datasets.dataset_base import DatasetBase
+from llmebench.datasets.dataset_base import DatasetBase
 
 
 class STSArSemEval17Track1Dataset(DatasetBase):
diff --git a/arabic_llm_benchmark/datasets/STSArSemEval17Track2.py b/llmebench/datasets/STSArSemEval17Track2.py
similarity index 96%
rename from arabic_llm_benchmark/datasets/STSArSemEval17Track2.py
rename to llmebench/datasets/STSArSemEval17Track2.py
index 7f4f2892..67a7178b 100644
--- a/arabic_llm_benchmark/datasets/STSArSemEval17Track2.py
+++ b/llmebench/datasets/STSArSemEval17Track2.py
@@ -1,4 +1,4 @@
-from arabic_llm_benchmark.datasets.dataset_base import DatasetBase
+from llmebench.datasets.dataset_base import DatasetBase
 
 
 class STSArSemEval17Track2Dataset(DatasetBase):
diff --git a/arabic_llm_benchmark/datasets/STSQ2Q.py b/llmebench/datasets/STSQ2Q.py
similarity index 95%
rename from arabic_llm_benchmark/datasets/STSQ2Q.py
rename to llmebench/datasets/STSQ2Q.py
index ae8ffe21..1b8c8c5d 100644
--- a/arabic_llm_benchmark/datasets/STSQ2Q.py
+++ b/llmebench/datasets/STSQ2Q.py
@@ -1,6 +1,6 @@
 import pandas as pd
 
-from arabic_llm_benchmark.datasets.dataset_base import DatasetBase
+from llmebench.datasets.dataset_base import DatasetBase
 
 
 class Q2QSimDataset(DatasetBase):
diff --git a/arabic_llm_benchmark/datasets/Spam.py b/llmebench/datasets/Spam.py
similarity index 94%
rename from arabic_llm_benchmark/datasets/Spam.py
rename to llmebench/datasets/Spam.py
index f73ea86c..6bef9d3f 100644
--- a/arabic_llm_benchmark/datasets/Spam.py
+++ b/llmebench/datasets/Spam.py
@@ -1,4 +1,4 @@
-from arabic_llm_benchmark.datasets.dataset_base import DatasetBase
+from llmebench.datasets.dataset_base import DatasetBase
 
 
 class SpamDataset(DatasetBase):
diff --git a/arabic_llm_benchmark/datasets/StanceKhouja20.py b/llmebench/datasets/StanceKhouja20.py
similarity index 94%
rename from arabic_llm_benchmark/datasets/StanceKhouja20.py
rename to llmebench/datasets/StanceKhouja20.py
index 13eff420..2ad2694b 100644
--- a/arabic_llm_benchmark/datasets/StanceKhouja20.py
+++ b/llmebench/datasets/StanceKhouja20.py
@@ -1,4 +1,4 @@
-from arabic_llm_benchmark.datasets.dataset_base import DatasetBase
+from llmebench.datasets.dataset_base import DatasetBase
 
 
 class StanceKhouja20Dataset(DatasetBase):
diff --git a/arabic_llm_benchmark/datasets/StanceUnifiedFC.py b/llmebench/datasets/StanceUnifiedFC.py
similarity index 97%
rename from arabic_llm_benchmark/datasets/StanceUnifiedFC.py
rename to llmebench/datasets/StanceUnifiedFC.py
index 9aaa52cd..0033ae0b 100644
--- a/arabic_llm_benchmark/datasets/StanceUnifiedFC.py
+++ b/llmebench/datasets/StanceUnifiedFC.py
@@ -3,7 +3,7 @@
 
 import pandas as pd
 
-from arabic_llm_benchmark.datasets.dataset_base import DatasetBase
+from llmebench.datasets.dataset_base import DatasetBase
 
 
 class StanceUnifiedFCDataset(DatasetBase):
diff --git a/arabic_llm_benchmark/datasets/Subjectivity.py b/llmebench/datasets/Subjectivity.py
similarity index 95%
rename from arabic_llm_benchmark/datasets/Subjectivity.py
rename to llmebench/datasets/Subjectivity.py
index 087574c4..313644ad 100644
--- a/arabic_llm_benchmark/datasets/Subjectivity.py
+++ b/llmebench/datasets/Subjectivity.py
@@ -1,6 +1,6 @@
 import pandas as pd
 
-from arabic_llm_benchmark.datasets.dataset_base import DatasetBase
+from llmebench.datasets.dataset_base import DatasetBase
 
 
 class SubjectivityDataset(DatasetBase):
diff --git a/arabic_llm_benchmark/datasets/TyDiQA.py b/llmebench/datasets/TyDiQA.py
similarity index 91%
rename from arabic_llm_benchmark/datasets/TyDiQA.py
rename to llmebench/datasets/TyDiQA.py
index 7ac3ecf8..27602722 100644
--- a/arabic_llm_benchmark/datasets/TyDiQA.py
+++ b/llmebench/datasets/TyDiQA.py
@@ -1,6 +1,6 @@
 import json
 
-from arabic_llm_benchmark.datasets.SQuADBase import SQuADBase
+from llmebench.datasets.SQuADBase import SQuADBase
 
 
 class TyDiQADataset(SQuADBase):
diff --git a/arabic_llm_benchmark/datasets/XNLI.py b/llmebench/datasets/XNLI.py
similarity index 96%
rename from arabic_llm_benchmark/datasets/XNLI.py
rename to llmebench/datasets/XNLI.py
index a19b72b5..3b8e0dcf 100644
--- a/arabic_llm_benchmark/datasets/XNLI.py
+++ b/llmebench/datasets/XNLI.py
@@ -1,6 +1,6 @@
 import pandas as pd
 
-from arabic_llm_benchmark.datasets.dataset_base import DatasetBase
+from llmebench.datasets.dataset_base import DatasetBase
 
 
 class XNLIDataset(DatasetBase):
diff --git a/arabic_llm_benchmark/datasets/XQuAD.py b/llmebench/datasets/XQuAD.py
similarity index 90%
rename from arabic_llm_benchmark/datasets/XQuAD.py
rename to llmebench/datasets/XQuAD.py
index d54a4fc2..c6626228 100644
--- a/arabic_llm_benchmark/datasets/XQuAD.py
+++ b/llmebench/datasets/XQuAD.py
@@ -1,6 +1,6 @@
 import json
 
-from arabic_llm_benchmark.datasets.SQuADBase import SQuADBase
+from llmebench.datasets.SQuADBase import SQuADBase
 
 
 class XQuADDataset(SQuADBase):
diff --git a/arabic_llm_benchmark/datasets/__init__.py b/llmebench/datasets/__init__.py
similarity index 100%
rename from arabic_llm_benchmark/datasets/__init__.py
rename to llmebench/datasets/__init__.py
diff --git a/arabic_llm_benchmark/datasets/dataset_base.py b/llmebench/datasets/dataset_base.py
similarity index 100%
rename from arabic_llm_benchmark/datasets/dataset_base.py
rename to llmebench/datasets/dataset_base.py
diff --git a/arabic_llm_benchmark/models/BLOOMPetal.py b/llmebench/models/BLOOMPetal.py
similarity index 97%
rename from arabic_llm_benchmark/models/BLOOMPetal.py
rename to llmebench/models/BLOOMPetal.py
index 9ba9503f..35bec918 100644
--- a/arabic_llm_benchmark/models/BLOOMPetal.py
+++ b/llmebench/models/BLOOMPetal.py
@@ -2,7 +2,7 @@
 
 from websockets.sync.client import connect
 
-from arabic_llm_benchmark.models.model_base import ModelBase
+from llmebench.models.model_base import ModelBase
 
 
 class BLOOMPetalFailure(Exception):
diff --git a/arabic_llm_benchmark/models/GPT.py b/llmebench/models/GPT.py
similarity index 98%
rename from arabic_llm_benchmark/models/GPT.py
rename to llmebench/models/GPT.py
index f1a77ba1..a34966bc 100644
--- a/arabic_llm_benchmark/models/GPT.py
+++ b/llmebench/models/GPT.py
@@ -1,6 +1,6 @@
 import openai
 
-from arabic_llm_benchmark.models.model_base import ModelBase
+from llmebench.models.model_base import ModelBase
 
 
 class GPTModel(ModelBase):
diff --git a/arabic_llm_benchmark/models/RandomGPT.py b/llmebench/models/RandomGPT.py
similarity index 94%
rename from arabic_llm_benchmark/models/RandomGPT.py
rename to llmebench/models/RandomGPT.py
index da9ee9b9..5ca725d5 100644
--- a/arabic_llm_benchmark/models/RandomGPT.py
+++ b/llmebench/models/RandomGPT.py
@@ -1,6 +1,6 @@
 import random
 
-from arabic_llm_benchmark.models.model_base import ModelBase
+from llmebench.models.model_base import ModelBase
 
 
 class GPTResponseMock(dict):
diff --git a/arabic_llm_benchmark/models/__init__.py b/llmebench/models/__init__.py
similarity index 100%
rename from arabic_llm_benchmark/models/__init__.py
rename to llmebench/models/__init__.py
diff --git a/arabic_llm_benchmark/models/model_base.py b/llmebench/models/model_base.py
similarity index 100%
rename from arabic_llm_benchmark/models/model_base.py
rename to llmebench/models/model_base.py
diff --git a/arabic_llm_benchmark/tasks/Adult.py b/llmebench/tasks/Adult.py
similarity index 88%
rename from arabic_llm_benchmark/tasks/Adult.py
rename to llmebench/tasks/Adult.py
index 000bd925..8c99960e 100644
--- a/arabic_llm_benchmark/tasks/Adult.py
+++ b/llmebench/tasks/Adult.py
@@ -1,6 +1,6 @@
 from sklearn.metrics import f1_score
 
-from arabic_llm_benchmark.tasks.task_base import TaskBase
+from llmebench.tasks.task_base import TaskBase
 
 
 class AdultTask(TaskBase):
diff --git a/arabic_llm_benchmark/tasks/ArabicDiacritization.py b/llmebench/tasks/ArabicDiacritization.py
similarity index 98%
rename from arabic_llm_benchmark/tasks/ArabicDiacritization.py
rename to llmebench/tasks/ArabicDiacritization.py
index 5ff83594..c7644bd1 100644
--- a/arabic_llm_benchmark/tasks/ArabicDiacritization.py
+++ b/llmebench/tasks/ArabicDiacritization.py
@@ -2,7 +2,7 @@
 
 from sklearn.metrics import f1_score
 
-from arabic_llm_benchmark.tasks.task_base import TaskBase
+from llmebench.tasks.task_base import TaskBase
 
 
 #
diff --git a/arabic_llm_benchmark/tasks/ArabicPOS.py b/llmebench/tasks/ArabicPOS.py
similarity index 94%
rename from arabic_llm_benchmark/tasks/ArabicPOS.py
rename to llmebench/tasks/ArabicPOS.py
index c9426413..922b6dc5 100644
--- a/arabic_llm_benchmark/tasks/ArabicPOS.py
+++ b/llmebench/tasks/ArabicPOS.py
@@ -2,7 +2,7 @@
 
 from sklearn.metrics import accuracy_score
 
-from arabic_llm_benchmark.tasks.task_base import TaskBase
+from llmebench.tasks.task_base import TaskBase
 
 
 class ArabicPOSTask(TaskBase):
diff --git a/arabic_llm_benchmark/tasks/ArabicParsing.py b/llmebench/tasks/ArabicParsing.py
similarity index 93%
rename from arabic_llm_benchmark/tasks/ArabicParsing.py
rename to llmebench/tasks/ArabicParsing.py
index 926c364f..25127850 100644
--- a/arabic_llm_benchmark/tasks/ArabicParsing.py
+++ b/llmebench/tasks/ArabicParsing.py
@@ -2,7 +2,7 @@
 
 from sklearn.metrics import f1_score
 
-from arabic_llm_benchmark.tasks.task_base import TaskBase
+from llmebench.tasks.task_base import TaskBase
 
 
 class ArabicParsingTask(TaskBase):
diff --git a/arabic_llm_benchmark/tasks/ArabicSegmentation.py b/llmebench/tasks/ArabicSegmentation.py
similarity index 94%
rename from arabic_llm_benchmark/tasks/ArabicSegmentation.py
rename to llmebench/tasks/ArabicSegmentation.py
index 86db520e..2c0f3deb 100644
--- a/arabic_llm_benchmark/tasks/ArabicSegmentation.py
+++ b/llmebench/tasks/ArabicSegmentation.py
@@ -2,7 +2,7 @@
 
 from sklearn.metrics import accuracy_score
 
-from arabic_llm_benchmark.tasks.task_base import TaskBase
+from llmebench.tasks.task_base import TaskBase
 
 
 class ArabicSegmentationTask(TaskBase):
diff --git a/arabic_llm_benchmark/tasks/Attentionworthy.py b/llmebench/tasks/Attentionworthy.py
similarity index 94%
rename from arabic_llm_benchmark/tasks/Attentionworthy.py
rename to llmebench/tasks/Attentionworthy.py
index 00dba977..7daa936f 100644
--- a/arabic_llm_benchmark/tasks/Attentionworthy.py
+++ b/llmebench/tasks/Attentionworthy.py
@@ -1,6 +1,6 @@
 from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
 
-from arabic_llm_benchmark.tasks.task_base import TaskBase
+from llmebench.tasks.task_base import TaskBase
 
 
 class AttentionworthyTask(TaskBase):
diff --git a/arabic_llm_benchmark/tasks/Checkworthiness.py b/llmebench/tasks/Checkworthiness.py
similarity index 94%
rename from arabic_llm_benchmark/tasks/Checkworthiness.py
rename to llmebench/tasks/Checkworthiness.py
index 88e763e7..62ec0663 100644
--- a/arabic_llm_benchmark/tasks/Checkworthiness.py
+++ b/llmebench/tasks/Checkworthiness.py
@@ -1,6 +1,6 @@
 from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
 
-from arabic_llm_benchmark.tasks.task_base import TaskBase
+from llmebench.tasks.task_base import TaskBase
 
 
 class CheckworthinessTask(TaskBase):
diff --git a/arabic_llm_benchmark/tasks/ClaimDetection.py b/llmebench/tasks/ClaimDetection.py
similarity index 89%
rename from arabic_llm_benchmark/tasks/ClaimDetection.py
rename to llmebench/tasks/ClaimDetection.py
index 7a4d5602..17f334d1 100644
--- a/arabic_llm_benchmark/tasks/ClaimDetection.py
+++ b/llmebench/tasks/ClaimDetection.py
@@ -1,6 +1,6 @@
 from sklearn.metrics import accuracy_score
 
-from arabic_llm_benchmark.tasks.task_base import TaskBase
+from llmebench.tasks.task_base import TaskBase
 
 
 class ClaimDetectionTask(TaskBase):
diff --git a/arabic_llm_benchmark/tasks/DemographyGender.py b/llmebench/tasks/DemographyGender.py
similarity index 88%
rename from arabic_llm_benchmark/tasks/DemographyGender.py
rename to llmebench/tasks/DemographyGender.py
index 3ef3c7b1..d8db7486 100644
--- a/arabic_llm_benchmark/tasks/DemographyGender.py
+++ b/llmebench/tasks/DemographyGender.py
@@ -1,6 +1,6 @@
 from sklearn.metrics import f1_score
 
-from arabic_llm_benchmark.tasks.task_base import TaskBase
+from llmebench.tasks.task_base import TaskBase
 
 
 class DemographyGenderTask(TaskBase):
diff --git a/arabic_llm_benchmark/tasks/DemographyLocation.py b/llmebench/tasks/DemographyLocation.py
similarity index 88%
rename from arabic_llm_benchmark/tasks/DemographyLocation.py
rename to llmebench/tasks/DemographyLocation.py
index b18a7173..ed0473a4 100644
--- a/arabic_llm_benchmark/tasks/DemographyLocation.py
+++ b/llmebench/tasks/DemographyLocation.py
@@ -1,6 +1,6 @@
 from sklearn.metrics import f1_score
 
-from arabic_llm_benchmark.tasks.task_base import TaskBase
+from llmebench.tasks.task_base import TaskBase
 
 
 class DemographyLocationTask(TaskBase):
diff --git a/arabic_llm_benchmark/tasks/DemographyNameInfo.py b/llmebench/tasks/DemographyNameInfo.py
similarity index 89%
rename from arabic_llm_benchmark/tasks/DemographyNameInfo.py
rename to llmebench/tasks/DemographyNameInfo.py
index b0aba3d3..dfc7ac73 100644
--- a/arabic_llm_benchmark/tasks/DemographyNameInfo.py
+++ b/llmebench/tasks/DemographyNameInfo.py
@@ -1,6 +1,6 @@
 from sklearn.metrics import f1_score
 
-from arabic_llm_benchmark.tasks.task_base import TaskBase
+from llmebench.tasks.task_base import TaskBase
 
 
 class DemographyNameInfoTask(TaskBase):
diff --git a/arabic_llm_benchmark/tasks/DialectID.py b/llmebench/tasks/DialectID.py
similarity index 93%
rename from arabic_llm_benchmark/tasks/DialectID.py
rename to llmebench/tasks/DialectID.py
index ac80d878..30f3c7cc 100644
--- a/arabic_llm_benchmark/tasks/DialectID.py
+++ b/llmebench/tasks/DialectID.py
@@ -1,6 +1,6 @@
 from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
 
-from arabic_llm_benchmark.tasks.task_base import TaskBase
+from llmebench.tasks.task_base import TaskBase
 
 
 class DialectIDTask(TaskBase):
diff --git a/arabic_llm_benchmark/tasks/Emotion.py b/llmebench/tasks/Emotion.py
similarity index 89%
rename from arabic_llm_benchmark/tasks/Emotion.py
rename to llmebench/tasks/Emotion.py
index 90494e58..af6a243c 100644
--- a/arabic_llm_benchmark/tasks/Emotion.py
+++ b/llmebench/tasks/Emotion.py
@@ -1,6 +1,6 @@
 from sklearn.metrics import jaccard_score
 
-from arabic_llm_benchmark.tasks.task_base import TaskBase
+from llmebench.tasks.task_base import TaskBase
 
 
 class EmotionTask(TaskBase):
diff --git a/arabic_llm_benchmark/tasks/FactualityCOVID19.py b/llmebench/tasks/FactualityCOVID19.py
similarity index 94%
rename from arabic_llm_benchmark/tasks/FactualityCOVID19.py
rename to llmebench/tasks/FactualityCOVID19.py
index d130e979..38f108c5 100644
--- a/arabic_llm_benchmark/tasks/FactualityCOVID19.py
+++ b/llmebench/tasks/FactualityCOVID19.py
@@ -1,6 +1,6 @@
 from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
 
-from arabic_llm_benchmark.tasks.task_base import TaskBase
+from llmebench.tasks.task_base import TaskBase
 
 
 class FactualityCOVID19Task(TaskBase):
diff --git a/arabic_llm_benchmark/tasks/FactualityKhouja20.py b/llmebench/tasks/FactualityKhouja20.py
similarity index 88%
rename from arabic_llm_benchmark/tasks/FactualityKhouja20.py
rename to llmebench/tasks/FactualityKhouja20.py
index 665f213a..079d2c68 100644
--- a/arabic_llm_benchmark/tasks/FactualityKhouja20.py
+++ b/llmebench/tasks/FactualityKhouja20.py
@@ -1,6 +1,6 @@
 from sklearn.metrics import f1_score
 
-from arabic_llm_benchmark.tasks.task_base import TaskBase
+from llmebench.tasks.task_base import TaskBase
 
 
 class FactualityKhouja20Task(TaskBase):
diff --git a/arabic_llm_benchmark/tasks/FactualityUnifiedFC.py b/llmebench/tasks/FactualityUnifiedFC.py
similarity index 94%
rename from arabic_llm_benchmark/tasks/FactualityUnifiedFC.py
rename to llmebench/tasks/FactualityUnifiedFC.py
index 61f94e81..d87c8fd4 100644
--- a/arabic_llm_benchmark/tasks/FactualityUnifiedFC.py
+++ b/llmebench/tasks/FactualityUnifiedFC.py
@@ -1,6 +1,6 @@
 from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
 
-from arabic_llm_benchmark.tasks.task_base import TaskBase
+from llmebench.tasks.task_base import TaskBase
 
 
 class FactualityUnifiedFCTask(TaskBase):
diff --git a/arabic_llm_benchmark/tasks/HarmfulDetection.py b/llmebench/tasks/HarmfulDetection.py
similarity index 90%
rename from arabic_llm_benchmark/tasks/HarmfulDetection.py
rename to llmebench/tasks/HarmfulDetection.py
index 2cb4731f..f3dffa4d 100644
--- a/arabic_llm_benchmark/tasks/HarmfulDetection.py
+++ b/llmebench/tasks/HarmfulDetection.py
@@ -1,6 +1,6 @@
 from sklearn.metrics import f1_score
 
-from arabic_llm_benchmark.tasks.task_base import TaskBase
+from llmebench.tasks.task_base import TaskBase
 
 
 class HarmfulDetectionTask(TaskBase):
diff --git a/arabic_llm_benchmark/tasks/HateSpeech.py b/llmebench/tasks/HateSpeech.py
similarity index 88%
rename from arabic_llm_benchmark/tasks/HateSpeech.py
rename to llmebench/tasks/HateSpeech.py
index 3b836227..b7b1c538 100644
--- a/arabic_llm_benchmark/tasks/HateSpeech.py
+++ b/llmebench/tasks/HateSpeech.py
@@ -1,6 +1,6 @@
 from sklearn.metrics import f1_score
 
-from arabic_llm_benchmark.tasks.task_base import TaskBase
+from llmebench.tasks.task_base import TaskBase
 
 
 class HateSpeechTask(TaskBase):
diff --git a/arabic_llm_benchmark/tasks/Lemmatization.py b/llmebench/tasks/Lemmatization.py
similarity index 92%
rename from arabic_llm_benchmark/tasks/Lemmatization.py
rename to llmebench/tasks/Lemmatization.py
index f32f022f..53c74566 100644
--- a/arabic_llm_benchmark/tasks/Lemmatization.py
+++ b/llmebench/tasks/Lemmatization.py
@@ -1,6 +1,6 @@
 from sklearn.metrics import accuracy_score, f1_score
 
-from arabic_llm_benchmark.tasks.task_base import TaskBase
+from llmebench.tasks.task_base import TaskBase
 
 
 class LemmatizationTask(TaskBase):
diff --git a/arabic_llm_benchmark/tasks/MachineTranslation.py b/llmebench/tasks/MachineTranslation.py
similarity index 87%
rename from arabic_llm_benchmark/tasks/MachineTranslation.py
rename to llmebench/tasks/MachineTranslation.py
index 59be8d7f..6282a737 100644
--- a/arabic_llm_benchmark/tasks/MachineTranslation.py
+++ b/llmebench/tasks/MachineTranslation.py
@@ -1,6 +1,6 @@
 from nltk.translate.bleu_score import corpus_bleu
 
-from arabic_llm_benchmark.tasks.task_base import TaskBase
+from llmebench.tasks.task_base import TaskBase
 
 
 class MachineTranslationTask(TaskBase):
diff --git a/arabic_llm_benchmark/tasks/NER.py b/llmebench/tasks/NER.py
similarity index 96%
rename from arabic_llm_benchmark/tasks/NER.py
rename to llmebench/tasks/NER.py
index 35304d1c..9b34410a 100644
--- a/arabic_llm_benchmark/tasks/NER.py
+++ b/llmebench/tasks/NER.py
@@ -1,6 +1,6 @@
 from sklearn.metrics import f1_score
 
-from arabic_llm_benchmark.tasks.task_base import TaskBase
+from llmebench.tasks.task_base import TaskBase
 
 
 class NERTask(TaskBase):
diff --git a/arabic_llm_benchmark/tasks/NewsCatASND.py b/llmebench/tasks/NewsCatASND.py
similarity index 94%
rename from arabic_llm_benchmark/tasks/NewsCatASND.py
rename to llmebench/tasks/NewsCatASND.py
index 3b636fd0..5ab6091e 100644
--- a/arabic_llm_benchmark/tasks/NewsCatASND.py
+++ b/llmebench/tasks/NewsCatASND.py
@@ -6,7 +6,7 @@
     recall_score,
 )
 
-from arabic_llm_benchmark.tasks.task_base import TaskBase
+from llmebench.tasks.task_base import TaskBase
 
 
 class NewsCatASNDTask(TaskBase):
diff --git a/arabic_llm_benchmark/tasks/NewsCatAkhbarona.py b/llmebench/tasks/NewsCatAkhbarona.py
similarity index 94%
rename from arabic_llm_benchmark/tasks/NewsCatAkhbarona.py
rename to llmebench/tasks/NewsCatAkhbarona.py
index a6b0f80f..c25bbfb9 100644
--- a/arabic_llm_benchmark/tasks/NewsCatAkhbarona.py
+++ b/llmebench/tasks/NewsCatAkhbarona.py
@@ -6,7 +6,7 @@
     recall_score,
 )
 
-from arabic_llm_benchmark.tasks.task_base import TaskBase
+from llmebench.tasks.task_base import TaskBase
 
 
 class NewsCatAkhbaronaTask(TaskBase):
diff --git a/arabic_llm_benchmark/tasks/NewsCatAlArabiya.py b/llmebench/tasks/NewsCatAlArabiya.py
similarity index 94%
rename from arabic_llm_benchmark/tasks/NewsCatAlArabiya.py
rename to llmebench/tasks/NewsCatAlArabiya.py
index b118f5bc..0ed8e697 100644
--- a/arabic_llm_benchmark/tasks/NewsCatAlArabiya.py
+++ b/llmebench/tasks/NewsCatAlArabiya.py
@@ -6,7 +6,7 @@
     recall_score,
 )
 
-from arabic_llm_benchmark.tasks.task_base import TaskBase
+from llmebench.tasks.task_base import TaskBase
 
 
 class NewsCatAlArabiyaTask(TaskBase):
diff --git a/arabic_llm_benchmark/tasks/NewsCatAlKhaleej.py b/llmebench/tasks/NewsCatAlKhaleej.py
similarity index 94%
rename from arabic_llm_benchmark/tasks/NewsCatAlKhaleej.py
rename to llmebench/tasks/NewsCatAlKhaleej.py
index a5e5d706..176a08ba 100644
--- a/arabic_llm_benchmark/tasks/NewsCatAlKhaleej.py
+++ b/llmebench/tasks/NewsCatAlKhaleej.py
@@ -6,7 +6,7 @@
     recall_score,
 )
 
-from arabic_llm_benchmark.tasks.task_base import TaskBase
+from llmebench.tasks.task_base import TaskBase
 
 
 class NewsCatAlKhaleejTask(TaskBase):
diff --git a/arabic_llm_benchmark/tasks/Offensive.py b/llmebench/tasks/Offensive.py
similarity index 88%
rename from arabic_llm_benchmark/tasks/Offensive.py
rename to llmebench/tasks/Offensive.py
index 2918ad2a..b2f37675 100644
--- a/arabic_llm_benchmark/tasks/Offensive.py
+++ b/llmebench/tasks/Offensive.py
@@ -1,6 +1,6 @@
 from sklearn.metrics import f1_score
 
-from arabic_llm_benchmark.tasks.task_base import TaskBase
+from llmebench.tasks.task_base import TaskBase
 
 
 class OffensiveTask(TaskBase):
diff --git a/arabic_llm_benchmark/tasks/PropagandaMultilabel.py b/llmebench/tasks/PropagandaMultilabel.py
similarity index 94%
rename from arabic_llm_benchmark/tasks/PropagandaMultilabel.py
rename to llmebench/tasks/PropagandaMultilabel.py
index fdbe77e3..c661a2a1 100644
--- a/arabic_llm_benchmark/tasks/PropagandaMultilabel.py
+++ b/llmebench/tasks/PropagandaMultilabel.py
@@ -3,7 +3,7 @@
 from sklearn import preprocessing
 from sklearn.metrics import f1_score
 
-from arabic_llm_benchmark.tasks.task_base import TaskBase
+from llmebench.tasks.task_base import TaskBase
 
 
 class PropagandaMultilabelTask(TaskBase):
diff --git a/arabic_llm_benchmark/tasks/PropagandaMultilabelSemEval23.py b/llmebench/tasks/PropagandaMultilabelSemEval23.py
similarity index 94%
rename from arabic_llm_benchmark/tasks/PropagandaMultilabelSemEval23.py
rename to llmebench/tasks/PropagandaMultilabelSemEval23.py
index 8d562f2e..5e9b6dcd 100644
--- a/arabic_llm_benchmark/tasks/PropagandaMultilabelSemEval23.py
+++ b/llmebench/tasks/PropagandaMultilabelSemEval23.py
@@ -3,7 +3,7 @@
 from sklearn import preprocessing
 from sklearn.metrics import f1_score
 
-from arabic_llm_benchmark.tasks.task_base import TaskBase
+from llmebench.tasks.task_base import TaskBase
 
 
 class PropagandaMultilabelSemEval23Task(TaskBase):
diff --git a/arabic_llm_benchmark/tasks/Q2QSimDetect.py b/llmebench/tasks/Q2QSimDetect.py
similarity index 88%
rename from arabic_llm_benchmark/tasks/Q2QSimDetect.py
rename to llmebench/tasks/Q2QSimDetect.py
index 7d916a13..841ed732 100644
--- a/arabic_llm_benchmark/tasks/Q2QSimDetect.py
+++ b/llmebench/tasks/Q2QSimDetect.py
@@ -1,6 +1,6 @@
 from sklearn.metrics import f1_score
 
-from arabic_llm_benchmark.tasks.task_base import TaskBase
+from llmebench.tasks.task_base import TaskBase
 
 
 class Q2QSimDetectionTask(TaskBase):
diff --git a/arabic_llm_benchmark/tasks/QA.py b/llmebench/tasks/QA.py
similarity index 97%
rename from arabic_llm_benchmark/tasks/QA.py
rename to llmebench/tasks/QA.py
index 9956582e..7b5b94c8 100644
--- a/arabic_llm_benchmark/tasks/QA.py
+++ b/llmebench/tasks/QA.py
@@ -3,7 +3,7 @@
 import sys
 from collections import Counter
 
-from arabic_llm_benchmark.tasks.task_base import TaskBase
+from llmebench.tasks.task_base import TaskBase
 
 
 class QATask(TaskBase):
diff --git a/arabic_llm_benchmark/tasks/STSTrack1.py b/llmebench/tasks/STSTrack1.py
similarity index 90%
rename from arabic_llm_benchmark/tasks/STSTrack1.py
rename to llmebench/tasks/STSTrack1.py
index 961ea526..20b9a0e0 100644
--- a/arabic_llm_benchmark/tasks/STSTrack1.py
+++ b/llmebench/tasks/STSTrack1.py
@@ -2,7 +2,7 @@
 
 import numpy as np
 
-from arabic_llm_benchmark.tasks.task_base import TaskBase
+from llmebench.tasks.task_base import TaskBase
 
 
 class STSTrack1Task(TaskBase):
diff --git a/arabic_llm_benchmark/tasks/STSTrack2.py b/llmebench/tasks/STSTrack2.py
similarity index 90%
rename from arabic_llm_benchmark/tasks/STSTrack2.py
rename to llmebench/tasks/STSTrack2.py
index ff1463a5..562af7a6 100644
--- a/arabic_llm_benchmark/tasks/STSTrack2.py
+++ b/llmebench/tasks/STSTrack2.py
@@ -2,7 +2,7 @@
 
 import numpy as np
 
-from arabic_llm_benchmark.tasks.task_base import TaskBase
+from llmebench.tasks.task_base import TaskBase
 
 
 class STSTrack2Task(TaskBase):
diff --git a/arabic_llm_benchmark/tasks/Sarcasm.py b/llmebench/tasks/Sarcasm.py
similarity index 89%
rename from arabic_llm_benchmark/tasks/Sarcasm.py
rename to llmebench/tasks/Sarcasm.py
index 454a6c2f..b6427ffb 100644
--- a/arabic_llm_benchmark/tasks/Sarcasm.py
+++ b/llmebench/tasks/Sarcasm.py
@@ -1,6 +1,6 @@
 from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
 
-from arabic_llm_benchmark.tasks.task_base import TaskBase
+from llmebench.tasks.task_base import TaskBase
 
 
 class SarcasmTask(TaskBase):
diff --git a/arabic_llm_benchmark/tasks/Sentiment.py b/llmebench/tasks/Sentiment.py
similarity index 94%
rename from arabic_llm_benchmark/tasks/Sentiment.py
rename to llmebench/tasks/Sentiment.py
index ae5beb89..5b35efe5 100644
--- a/arabic_llm_benchmark/tasks/Sentiment.py
+++ b/llmebench/tasks/Sentiment.py
@@ -1,6 +1,6 @@
 from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
 
-from arabic_llm_benchmark.tasks.task_base import TaskBase
+from llmebench.tasks.task_base import TaskBase
 
 
 class SentimentTask(TaskBase):
diff --git a/arabic_llm_benchmark/tasks/Spam.py b/llmebench/tasks/Spam.py
similarity index 88%
rename from arabic_llm_benchmark/tasks/Spam.py
rename to llmebench/tasks/Spam.py
index 9fe3c9eb..5ee9df4d 100644
--- a/arabic_llm_benchmark/tasks/Spam.py
+++ b/llmebench/tasks/Spam.py
@@ -1,6 +1,6 @@
 from sklearn.metrics import f1_score
 
-from arabic_llm_benchmark.tasks.task_base import TaskBase
+from llmebench.tasks.task_base import TaskBase
 
 
 class SpamTask(TaskBase):
diff --git a/arabic_llm_benchmark/tasks/StanceKhouja20.py b/llmebench/tasks/StanceKhouja20.py
similarity index 88%
rename from arabic_llm_benchmark/tasks/StanceKhouja20.py
rename to llmebench/tasks/StanceKhouja20.py
index dd9d9a6f..bbb5930c 100644
--- a/arabic_llm_benchmark/tasks/StanceKhouja20.py
+++ b/llmebench/tasks/StanceKhouja20.py
@@ -1,6 +1,6 @@
 from sklearn.metrics import f1_score
 
-from arabic_llm_benchmark.tasks.task_base import TaskBase
+from llmebench.tasks.task_base import TaskBase
 
 
 class StanceKhouja20Task(TaskBase):
diff --git a/arabic_llm_benchmark/tasks/StanceUnifiedFC.py b/llmebench/tasks/StanceUnifiedFC.py
similarity index 88%
rename from arabic_llm_benchmark/tasks/StanceUnifiedFC.py
rename to llmebench/tasks/StanceUnifiedFC.py
index dd5e8cd5..5ab04762 100644
--- a/arabic_llm_benchmark/tasks/StanceUnifiedFC.py
+++ b/llmebench/tasks/StanceUnifiedFC.py
@@ -1,6 +1,6 @@
 from sklearn.metrics import f1_score
 
-from arabic_llm_benchmark.tasks.task_base import TaskBase
+from llmebench.tasks.task_base import TaskBase
 
 
 class StanceUnifiedFCTask(TaskBase):
diff --git a/arabic_llm_benchmark/tasks/Subjectivity.py b/llmebench/tasks/Subjectivity.py
similarity index 95%
rename from arabic_llm_benchmark/tasks/Subjectivity.py
rename to llmebench/tasks/Subjectivity.py
index 9ba35380..96272969 100644
--- a/arabic_llm_benchmark/tasks/Subjectivity.py
+++ b/llmebench/tasks/Subjectivity.py
@@ -6,7 +6,7 @@
     recall_score,
 )
 
-from arabic_llm_benchmark.tasks.task_base import TaskBase
+from llmebench.tasks.task_base import TaskBase
 
 
 class SubjectivityTask(TaskBase):
diff --git a/arabic_llm_benchmark/tasks/XNLI.py b/llmebench/tasks/XNLI.py
similarity index 88%
rename from arabic_llm_benchmark/tasks/XNLI.py
rename to llmebench/tasks/XNLI.py
index 380ef0dc..6d2884d9 100644
--- a/arabic_llm_benchmark/tasks/XNLI.py
+++ b/llmebench/tasks/XNLI.py
@@ -1,6 +1,6 @@
 from sklearn.metrics import accuracy_score
 
-from arabic_llm_benchmark.tasks.task_base import TaskBase
+from llmebench.tasks.task_base import TaskBase
 
 
 class XNLITask(TaskBase):
diff --git a/arabic_llm_benchmark/tasks/__init__.py b/llmebench/tasks/__init__.py
similarity index 100%
rename from arabic_llm_benchmark/tasks/__init__.py
rename to llmebench/tasks/__init__.py
diff --git a/arabic_llm_benchmark/tasks/task_base.py b/llmebench/tasks/task_base.py
similarity index 100%
rename from arabic_llm_benchmark/tasks/task_base.py
rename to llmebench/tasks/task_base.py
diff --git a/arabic_llm_benchmark/utils.py b/llmebench/utils.py
similarity index 100%
rename from arabic_llm_benchmark/utils.py
rename to llmebench/utils.py
diff --git a/scripts/format_code.sh b/scripts/format_code.sh
index 8b1c17c8..11d9f636 100755
--- a/scripts/format_code.sh
+++ b/scripts/format_code.sh
@@ -3,12 +3,12 @@
 # exit when any command fails
 set -e
 
-if [[ ! -f setup.cfg ]] || [[ ! -d arabic_llm_benchmark ]] || [[ ! -d assets ]] || [[ ! -d scripts ]] || [[ ! -d tests ]]
+if [[ ! -f setup.cfg ]] || [[ ! -d llmebench ]] || [[ ! -d assets ]] || [[ ! -d scripts ]] || [[ ! -d tests ]]
 then
   echo "format_code.sh must be run from the root of the repository"
   exit 1
 fi
 
 ufmt format assets
-ufmt format arabic_llm_benchmark
+ufmt format llmebench
 ufmt format tests
diff --git a/scripts/run_tests.sh b/scripts/run_tests.sh
index e8e02785..a8e84718 100755
--- a/scripts/run_tests.sh
+++ b/scripts/run_tests.sh
@@ -3,7 +3,7 @@
 # exit when any command fails
 set -e
 
-if [[ ! -f setup.cfg ]] || [[ ! -d arabic_llm_benchmark ]] || [[ ! -d assets ]] || [[ ! -d scripts ]] || [[ ! -d tests ]]
+if [[ ! -f setup.cfg ]] || [[ ! -d llmebench ]] || [[ ! -d assets ]] || [[ ! -d scripts ]] || [[ ! -d tests ]]
 then
   echo "run_tests.sh must be run from the root of the repository"
   exit 1
diff --git a/setup.cfg b/setup.cfg
index 9a3e48d9..64f25410 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,9 +1,9 @@
 [metadata]
-name = arabic_llm_benchmark
-version = 0.0.1
+name = llmebench
+version = 0.1.0
 author = Fahim Dalvi
 author_email = faimaduddin@hbku.edu.qa
-description = Arabic Benchmark for LLMs
+description = Benchmarking for LLMs
 long_description = file: README.md
 long_description_content_type = text/markdown
 url = https://github.com/qcri
diff --git a/tests/datasets/test_exports.py b/tests/datasets/test_exports.py
index db82badb..b22935d3 100644
--- a/tests/datasets/test_exports.py
+++ b/tests/datasets/test_exports.py
@@ -4,16 +4,16 @@
 from glob import glob
 from pathlib import Path
 
-import arabic_llm_benchmark.datasets as datasets
-from arabic_llm_benchmark import utils
-from arabic_llm_benchmark.datasets.dataset_base import DatasetBase
+import llmebench.datasets as datasets
+from llmebench import utils
+from llmebench.datasets.dataset_base import DatasetBase
 
 
 class TestDatasetExports(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         # Search for all implemented datasets
-        framework_dir = Path("arabic_llm_benchmark")
+        framework_dir = Path("llmebench")
         cls.implemented_datasets = [
             dataset_path
             for dataset_path in glob(str(framework_dir / "datasets" / "*.py"))
diff --git a/tests/models/test_BLOOMPetal.py b/tests/models/test_BLOOMPetal.py
index 8795c7db..ad8783eb 100644
--- a/tests/models/test_BLOOMPetal.py
+++ b/tests/models/test_BLOOMPetal.py
@@ -4,8 +4,8 @@
 
 from unittest.mock import patch
 
-from arabic_llm_benchmark import Benchmark
-from arabic_llm_benchmark.models import BLOOMPetalModel
+from llmebench import Benchmark
+from llmebench.models import BLOOMPetalModel
 
 
 class TestAssetsForBLOOMPetalPrompts(unittest.TestCase):
diff --git a/tests/models/test_GPT.py b/tests/models/test_GPT.py
index 45cac1a5..a921bd27 100644
--- a/tests/models/test_GPT.py
+++ b/tests/models/test_GPT.py
@@ -4,8 +4,8 @@
 
 from unittest.mock import patch
 
-from arabic_llm_benchmark import Benchmark
-from arabic_llm_benchmark.models import GPTModel, RandomGPTModel
+from llmebench import Benchmark
+from llmebench.models import GPTModel, RandomGPTModel
 
 
 class TestAssetsForGPTPrompts(unittest.TestCase):
diff --git a/tests/models/test_GPTChatCompletion.py b/tests/models/test_GPTChatCompletion.py
index 5c3c220b..47404e69 100644
--- a/tests/models/test_GPTChatCompletion.py
+++ b/tests/models/test_GPTChatCompletion.py
@@ -4,8 +4,8 @@
 
 from unittest.mock import patch
 
-from arabic_llm_benchmark import Benchmark
-from arabic_llm_benchmark.models import GPTChatCompletionModel
+from llmebench import Benchmark
+from llmebench.models import GPTChatCompletionModel
 
 
 class TestAssetsForGPTChatCompletionPrompts(unittest.TestCase):
diff --git a/tests/models/test_exports.py b/tests/models/test_exports.py
index b43a0889..41ac1881 100644
--- a/tests/models/test_exports.py
+++ b/tests/models/test_exports.py
@@ -4,16 +4,16 @@
 from glob import glob
 from pathlib import Path
 
-import arabic_llm_benchmark.models as models
-from arabic_llm_benchmark import utils
-from arabic_llm_benchmark.models.model_base import ModelBase
+import llmebench.models as models
+from llmebench import utils
+from llmebench.models.model_base import ModelBase
 
 
 class TestDatasetExports(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         # Search for all implemented models
-        framework_dir = Path("arabic_llm_benchmark")
+        framework_dir = Path("llmebench")
         cls.implemented_models = [
             model_path
             for model_path in glob(str(framework_dir / "models" / "*.py"))
diff --git a/tests/tasks/test_evaluation.py b/tests/tasks/test_evaluation.py
index 603f003c..d7e687f6 100644
--- a/tests/tasks/test_evaluation.py
+++ b/tests/tasks/test_evaluation.py
@@ -4,7 +4,7 @@
 
 from unittest.mock import patch
 
-from arabic_llm_benchmark import Benchmark
+from llmebench import Benchmark
 
 
 class TestAssetsTaskEvaluation(unittest.TestCase):
diff --git a/tests/tasks/test_exports.py b/tests/tasks/test_exports.py
index 31e8a49f..08e686c8 100644
--- a/tests/tasks/test_exports.py
+++ b/tests/tasks/test_exports.py
@@ -4,16 +4,16 @@
 from glob import glob
 from pathlib import Path
 
-import arabic_llm_benchmark.tasks as tasks
-from arabic_llm_benchmark import Benchmark, utils
-from arabic_llm_benchmark.tasks.task_base import TaskBase
+import llmebench.tasks as tasks
+from llmebench import Benchmark, utils
+from llmebench.tasks.task_base import TaskBase
 
 
 class TestTaskExports(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         # Search for all implemented tasks
-        framework_dir = Path("arabic_llm_benchmark")
+        framework_dir = Path("llmebench")
         cls.implemented_tasks = [
             task_path
             for task_path in glob(str(framework_dir / "tasks" / "*.py"))
diff --git a/tests/test_benchmark_assets.py b/tests/test_benchmark_assets.py
index 3e417d34..da71ab7d 100644
--- a/tests/test_benchmark_assets.py
+++ b/tests/test_benchmark_assets.py
@@ -3,7 +3,7 @@
 import unittest
 from unittest.mock import patch
 
-from arabic_llm_benchmark import Benchmark
+from llmebench import Benchmark
 
 
 class TestBenchmarkAssets(unittest.TestCase):