From cfa450403d667258c3d16cb060d1f9fe79ff12fa Mon Sep 17 00:00:00 2001 From: Fahim Dalvi Date: Wed, 23 Aug 2023 10:09:24 +0300 Subject: [PATCH] Rename package to `llmebench` (#174) This commit renames the top-level package to `llmebench` to highlight the multilingual nature of the framework. All assets have been modified to use the new package name as well. --- .github/workflows/code-formatting.yml | 2 +- README.md | 26 +++++++++---------- .../MT/AraBench_Ara2Eng_BLOOMZ_ZeroShot.py | 6 ++--- .../MT/AraBench_Ara2Eng_ChatGPT4_ZeroShot.py | 6 ++--- .../MT/AraBench_Ara2Eng_ChatGPT_ZeroShot.py | 6 ++--- .../NER/MGBWords_ChatGPT_ZeroShot.py | 6 ++--- .../MGBWords_GPTChatCompletion_ZeroShot.py | 6 ++--- .../NER/NERANERcorp_ChatGPT_ZeroShot.py | 6 ++--- .../NERANERcorp_GPTChatCompletion_FewShot.py | 6 ++--- .../NERANERcorp_GPTChatCompletion_ZeroShot.py | 6 ++--- .../NER/NERAqmar_ChatGPT_ZeroShot.py | 6 ++--- .../NER/NERAqmar_GPTChatCompletion_FewShot.py | 6 ++--- .../NERAqmar_GPTChatCompletion_ZeroShot.py | 6 ++--- .../benchmark_v1/QA/ARCD_BLOOMZ_ZeroShot.py | 6 ++--- .../benchmark_v1/QA/ARCD_ChatGPT_ZeroShot.py | 6 ++--- .../QA/ARCD_GPTChatCompletion_FewShot.py | 6 ++--- .../QA/ARCD_GPTChatCompletion_ZeroShot.py | 6 ++--- .../benchmark_v1/QA/MLQA_BLOOMZ_ZeroShot.py | 6 ++--- .../benchmark_v1/QA/MLQA_ChatGPT_ZeroShot.py | 6 ++--- .../QA/MLQA_GPTChatCompletion_FewShot.py | 6 ++--- .../QA/MLQA_GPTChatCompletion_ZeroShot.py | 6 ++--- .../benchmark_v1/QA/TyDiQA_BLOOMZ_ZeroShot.py | 6 ++--- .../QA/TyDiQA_ChatGPT_ZeroShot.py | 6 ++--- .../QA/TyDiQA_GPTChatCompletion_FewShot.py | 6 ++--- .../QA/TydiQA_GPTChatCompletion_ZeroShot.py | 6 ++--- .../benchmark_v1/QA/XQuAD_BLOOMZ_ZeroShot.py | 6 ++--- .../benchmark_v1/QA/XQuAD_ChatGPT_ZeroShot.py | 6 ++--- .../QA/XQuAD_GPTChatCompletion_ZeroShot.py | 6 ++--- .../QA/XQuaD_GPTChatCompletion_FewShot.py | 6 ++--- .../STS/Q2QSim_BLOOMZ_ZeroShot.py | 6 ++--- .../STS/Q2QSim_ChatGPT_ZeroShot.py | 6 ++--- 
.../benchmark_v1/STS/Q2QSim_GPT4_FewShot.py | 6 ++--- .../benchmark_v1/STS/Q2QSim_GPT4_ZeroShot.py | 6 ++--- .../STS/STSTrack1_BLOOMZ_ZeroShot.py | 6 ++--- .../STS/STSTrack1_ChatGPT_ZeroShot.py | 6 ++--- .../STS/STSTrack1_GPT4_FewShot.py | 6 ++--- .../STS/STSTrack1_GPT4_ZeroShot.py | 6 ++--- .../STS/STSTrack2_BLOOMZ_ZeroShot.py | 6 ++--- .../STS/STSTrack2_ChatGPT_ZeroShot.py | 6 ++--- .../STS/STSTrack2_GPT4_FewShot.py | 6 ++--- .../STS/STSTrack2_GPT4_ZeroShot.py | 6 ++--- .../gender/GenderArabGend_BLOOMZ_ZeroShot.py | 6 ++--- .../gender/GenderArabGend_ChatGPT_ZeroShot.py | 6 ++--- ...nderArabGend_GPTChatCompletion_ZeroShot.py | 6 ++--- .../gender/GenderArapTweet_BLOOMZ_ZeroShot.py | 6 ++--- .../GenderArapTweet_ChatGPT_ZeroShot.py | 6 ++--- ...nderArapTweet_GPTChatCompletion_FewShot.py | 6 ++--- ...derArapTweet_GPTChatCompletion_ZeroShot.py | 6 ++--- .../location/Location_BLOOMZ_ZeroShot.py | 6 ++--- .../location/Location_ChatGPT_ZeroShot.py | 6 ++--- .../Location_GPTChatCompletion_FewShot.py | 6 ++--- .../Location_GPTChatCompletion_ZeroShot.py | 6 ++--- .../name_info/NameInfo_BLOOMZ_ZeroShot.py | 6 ++--- .../name_info/NameInfo_ChatGPT_ZeroShot.py | 6 ++--- .../NameInfo_GPTChatCompletion_FewShot.py | 6 ++--- .../NameInfo_GPTChatCompletion_ZeroShot.py | 6 ++--- .../Adult_BLOOMZ_ZeroShot.py | 6 ++--- .../Adult_ChatGPT_ZeroShot.py | 6 ++--- .../Adult_GPTChatCompletion_FewShot.py | 6 ++--- .../Adult_GPTChatCompletion_ZeroShot.py | 6 ++--- .../Attentionworthy_BLOOMZ_ZeroShot.py | 6 ++--- .../Attentionworthy_ChatGPT_ZeroShot.py | 6 ++--- ...tentionworthy_GPTChatCompletion_Fewshot.py | 6 ++--- ...entionworthy_GPTChatCompletion_ZeroShot.py | 6 ++--- .../COVClaimDetect_BLOOMZ_ZeroShot.py | 6 ++--- .../COVClaimDetect_CGPT35_ZeroShot.py | 6 ++--- .../COVClaimDetect_GPT4_FewShot.py | 6 ++--- .../COVHarmfulDetect_BLOOMZ_ZeroShot.py | 6 ++--- .../COVHarmfulDetect_CGPT35_ZeroShot.py | 6 ++--- .../COVHarmfulDetect_GPT4_FewShot.py | 6 ++--- .../Checkworthiness_BLOOMZ_BGZeroShot.py | 
6 ++--- .../Checkworthiness_BLOOMZ_ENZeroShot.py | 6 ++--- .../Checkworthiness_BLOOMZ_ESZeroShot.py | 6 ++--- .../Checkworthiness_BLOOMZ_NLZeroShot.py | 6 ++--- .../Checkworthiness_BLOOMZ_TRZeroShot.py | 6 ++--- .../Checkworthiness_BLOOMZ_ZeroShot.py | 6 ++--- .../Checkworthiness_ChatGPT_ZeroShot.py | 6 ++--- ...kworthiness_GPTChatCompletion_BGFewShot.py | 6 ++--- ...worthiness_GPTChatCompletion_BGZeroShot.py | 6 ++--- ...kworthiness_GPTChatCompletion_ENFewShot.py | 6 ++--- ...worthiness_GPTChatCompletion_ENZeroShot.py | 6 ++--- ...kworthiness_GPTChatCompletion_ESFewShot.py | 6 ++--- ...worthiness_GPTChatCompletion_ESZeroShot.py | 6 ++--- ...eckworthiness_GPTChatCompletion_FewShot.py | 6 ++--- ...kworthiness_GPTChatCompletion_NLFewShot.py | 6 ++--- ...worthiness_GPTChatCompletion_NLZeroShot.py | 6 ++--- ...kworthiness_GPTChatCompletion_TRFewShot.py | 6 ++--- ...worthiness_GPTChatCompletion_TRZeroShot.py | 6 ++--- ...ckworthiness_GPTChatCompletion_ZeroShot.py | 6 ++--- .../ClaimDetectCOVID19_CGPT35_ZeroShot.py | 6 ++--- ...etectCOVID19_GPTChatCompletion_ZeroShot.py | 6 ++--- .../FactualityCOVID19_BLOOMZ_ZeroShot.py | 6 ++--- ...ualityCOVID19_GPTChatCompletion_FewShot.py | 6 ++--- ...alityCOVID19_GPTChatCompletion_ZeroShot.py | 6 ++--- .../FactualityKhouja20_BLOOMZ_ZeroShot.py | 6 ++--- .../FactualityKhouja20_ChatGPT_ZeroShot.py | 6 ++--- .../FactualityKhouja20_GPT4_FewShot.py | 6 ++--- .../FactualityKhouja20_GPT4_ZeroShot.py | 6 ++--- .../FactualityUnifiedFC_BLOOMZ_ZeroShot.py | 6 ++--- .../FactualityUnifiedFC_GPT4_FewShot.py | 6 ++--- ...ityUnifiedFC_GPTChatCompletion_ZeroShot.py | 6 ++--- .../HarmfulDetectCOVID19_CGPT35_ZeroShot.py | 6 ++--- ...etectCOVID19_GPTChatCompletion_ZeroShot.py | 6 ++--- .../HateSpeech_ChatGPT_ZeroShot.py | 6 ++--- .../HateSpeech_GPTChatCompletion_FewShot.py | 6 ++--- .../Offensive_GPTChatCompletion_FewShot.py | 6 ++--- .../PropMultilabel_BLOOMZ_ENZeroShot.py | 6 ++--- .../PropMultilabel_BLOOMZ_FRZeroShot.py | 6 ++--- 
.../PropMultilabel_BLOOMZ_GEZeroShot.py | 6 ++--- .../PropMultilabel_BLOOMZ_ITZeroShot.py | 6 ++--- .../PropMultilabel_BLOOMZ_POZeroShot.py | 6 ++--- .../PropMultilabel_BLOOMZ_RUZeroShot.py | 6 ++--- .../PropMultilabel_BLOOMZ_ZeroShot.py | 6 ++--- .../PropMultilabel_CGPT35_ZeroShot.py | 6 ++--- .../PropMultilabel_GPT4_ENFewShot.py | 6 ++--- .../PropMultilabel_GPT4_ENZeroShot.py | 6 ++--- .../PropMultilabel_GPT4_FRFewShot.py | 6 ++--- .../PropMultilabel_GPT4_FRZeroShot.py | 6 ++--- .../PropMultilabel_GPT4_GEFewShot.py | 6 ++--- .../PropMultilabel_GPT4_GEZeroShot.py | 6 ++--- .../PropMultilabel_GPT4_ITFewShot.py | 6 ++--- .../PropMultilabel_GPT4_ITZeroShot.py | 6 ++--- .../PropMultilabel_GPT4_POFewShot.py | 6 ++--- .../PropMultilabel_GPT4_POZeroShot.py | 6 ++--- .../PropMultilabel_GPT4_RUFewShot.py | 6 ++--- .../PropMultilabel_GPT4_RUZeroShot.py | 6 ++--- .../PropMultilabel_GPT4_ZeroShot.py | 6 ++--- ...ropMultilabel_GPTChatCompletion_FewShot.py | 6 ++--- ...opMultilabel_GPTChatCompletion_ZeroShot.py | 6 ++--- .../Subjectivity_BLOOMZ_ZeroShot.py | 6 ++--- .../Subjectivity_ChatGPT_ZeroShot.py | 6 ++--- .../Subjectivity_GPTChatCompletion_FewShot.py | 6 ++--- ...Subjectivity_GPTChatCompletion_ZeroShot.py | 6 ++--- .../NewsCat_ASND_BLOOMZ_ZeroShot.py | 6 ++--- .../NewsCat_ASND_ChatGPT_ZeroShot.py | 6 ++--- .../NewsCat_ASND_GPTChatCompletion_FewShot.py | 6 ++--- ...NewsCat_ASND_GPTChatCompletion_ZeroShot.py | 6 ++--- .../NewsCat_Akhbarona_BLOOMZ_ZeroShot.py | 6 ++--- .../NewsCat_Akhbarona_ChatGPT_ZeroShot.py | 6 ++--- ...Cat_Akhbarona_GPTChatCompletion_FewShot.py | 6 ++--- ...at_Akhbarona_GPTChatCompletion_ZeroShot.py | 6 ++--- .../NewsCat_AlArabiya_BLOOMZ_ZeroShot.py | 6 ++--- .../NewsCat_AlArabiya_ChatGPT_ZeroShot.py | 6 ++--- ...Cat_AlArabiya_GPTChatCompletion_FewShot.py | 6 ++--- ...at_AlArabiya_GPTChatCompletion_ZeroShot.py | 6 ++--- .../NewsCat_AlKhaleej_BLOOMZ_ZeroShot.py | 6 ++--- .../NewsCat_AlKhaleej_ChatGPT_ZeroShot.py | 6 ++--- 
...Cat_AlKhaleej_GPTChatCompletion_FewShot.py | 6 ++--- ...at_AlKhaleej_GPTChatCompletion_ZeroShot.py | 6 ++--- .../sarcasm/ArSarcasm2_GPT3_Zeroshot.py | 6 ++--- .../sarcasm/ArSarcasm2_GPT4_FewShot.py | 6 ++--- .../sarcasm/ArSarcasm2_GPT4_Zeroshot.py | 6 ++--- .../sarcasm/ArSarcasm_BLOOMZ_Zeroshot.py | 6 ++--- .../sarcasm/ArSarcasm_GPT3_Zeroshot.py | 6 ++--- .../sarcasm/ArSarcasm_GPT4_Fewshot.py | 6 ++--- .../sarcasm/ArSarcasm_GPT4_Zeroshot.py | 6 ++--- .../semantics/XNLI_BLOOMZ_ZeroShot.py | 6 ++--- .../semantics/XNLI_CGPT4_FewShot.py | 6 ++--- .../semantics/XNLI_CGPT4_ZeroShot.py | 6 ++--- .../semantics/XNLI_ChatGPT_ZeroShot.py | 6 ++--- .../emotion/Emotion_BLOOMZ_ZeroShot.py | 6 ++--- .../emotion/Emotion_ChatGPT_ZeroShot.py | 6 ++--- .../Emotion_GPTChatCompletion_FewShot.py | 6 ++--- .../Emotion_GPTChatCompletion_ZeroShot.py | 6 ++--- .../offensive/Offensive_BLOOMZ_ZeroShot.py | 6 ++--- .../offensive/Offensive_ChatGPT_ZeroShot.py | 6 ++--- .../Offensive_GPTChatCompletion_ZeroShot.py | 6 ++--- .../ArSASSentiment_BLOOMZ_ZeroShot.py | 6 ++--- .../ArSASSentiment_ChatGPT_ZeroShot.py | 6 ++--- ...SASSentiment_GPTChatCompletion_ZeroShot.py | 6 ++--- .../BanglaSentiment_BLOOMZ_ZeroShot.py | 6 ++--- .../sentiment/BanglaSentiment_GPT4_FewShot.py | 6 ++--- .../BanglaSentiment_GPT4_ZeroShot.py | 6 ++--- .../sentiment/spam/Spam_BLOOMZ_ZeroShot.py | 6 ++--- .../sentiment/spam/Spam_ChatGPT_ZeroShot.py | 6 ++--- .../spam/Spam_GPTChatCompletion_ZeroShot.py | 6 ++--- .../StanceKhouja20_BLOOMZ_ZeroShot.py | 6 ++--- .../StanceKhouja20_ChatGPT_ZeroShot.py | 6 ++--- ...tanceKhouja20_GPTChatCompletion_FewShot.py | 6 ++--- ...anceKhouja20_GPTChatCompletion_ZeroShot.py | 6 ++--- .../StanceUnifiedFC_BLOOMZ_ZeroShot.py | 6 ++--- .../StanceUnifiedFC_ChatGPT_ZeroShot.py | 6 ++--- ...anceUnifiedFC_GPTChatCompletion_FewShot.py | 6 ++--- ...nceUnifiedFC_GPTChatCompletion_ZeroShot.py | 6 ++--- .../DialectADI_BLOOMZ_ZeroShot.py | 6 ++--- .../DialectADI_ChatGPT_ZeroShot.py | 6 ++--- 
.../DialectADI_GPTChatCompletion_FewShot.py | 6 ++--- .../DialectADI_GPTChatCompletion_ZeroShot.py | 6 ++--- .../DialectID_QADI_ChatGPT_ZeroShot.py | 6 ++--- ...alectID_QADI_GPTChatCompletion_ZeroShot.py | 6 ++--- .../POS_ChatGPT_ZeroShot.py | 6 ++--- .../POS_GPT4_ZeroShot.py | 6 ++--- .../POS_GPTChatCompletion_FewShot.py | 6 ++--- .../diacritization_ChatGPT_ZeroShot.py | 6 ++--- .../Lemmatization_ChatGPT_ZeroShot.py | 6 ++--- .../parsing_ChatGPT_ZeroShot.py | 6 ++--- .../parsing_GPT4_ZeroShot.py | 6 ++--- .../segmentation_ChatGPT_ZeroShot.py | 6 ++--- .../segmentation_GPT4_ZeroShot.py | 6 ++--- .../__init__.py | 0 .../__main__.py | 0 .../benchmark.py | 0 .../datasets/ANERcorp.py | 2 +- .../datasets/ARCD.py | 2 +- .../datasets/Adult.py | 2 +- .../datasets/Aqmar.py | 2 +- .../datasets/ArSASSentiment.py | 2 +- .../datasets/ArSarcasm.py | 2 +- .../datasets/AraBench.py | 2 +- .../datasets/ArabGend.py | 2 +- .../datasets/ArabicDiacritization.py | 2 +- .../datasets/ArabicPOS.py | 2 +- .../datasets/ArabicParsing.py | 2 +- .../datasets/ArabicSegmentation.py | 2 +- .../datasets/ArapTweet.py | 2 +- .../datasets/Attentionworthy.py | 2 +- .../datasets/BanglaSentiment.py | 2 +- .../datasets/Checkworthiness.py | 2 +- .../datasets/Claim.py | 2 +- .../datasets/DialectADI.py | 2 +- .../datasets/Emotion.py | 2 +- .../datasets/FactualityCOVID19.py | 2 +- .../datasets/FactualityKhouja20.py | 2 +- .../datasets/FactualityUnifiedFC.py | 2 +- .../datasets/Harmful.py | 2 +- .../datasets/HateSpeech.py | 2 +- .../datasets/Lemmatization.py | 2 +- .../datasets/Location.py | 2 +- .../datasets/MGBWords.py | 2 +- .../datasets/MLQA.py | 2 +- .../datasets/NameInfo.py | 2 +- .../datasets/NewsCatASND.py | 2 +- .../datasets/NewsCatAkhbarona.py | 2 +- .../datasets/NewsCatAlArabiya.py | 2 +- .../datasets/NewsCatAlKhaleej.py | 2 +- .../datasets/Offensive.py | 2 +- .../datasets/Propaganda.py | 2 +- .../datasets/PropagandaSemEval23.py | 2 +- .../datasets/QADI.py | 2 +- .../datasets/SQuADBase.py | 2 +- 
.../datasets/STSArSemEval17Track1.py | 2 +- .../datasets/STSArSemEval17Track2.py | 2 +- .../datasets/STSQ2Q.py | 2 +- .../datasets/Spam.py | 2 +- .../datasets/StanceKhouja20.py | 2 +- .../datasets/StanceUnifiedFC.py | 2 +- .../datasets/Subjectivity.py | 2 +- .../datasets/TyDiQA.py | 2 +- .../datasets/XNLI.py | 2 +- .../datasets/XQuAD.py | 2 +- .../datasets/__init__.py | 0 .../datasets/dataset_base.py | 0 .../models/BLOOMPetal.py | 2 +- .../models/GPT.py | 2 +- .../models/RandomGPT.py | 2 +- .../models/__init__.py | 0 .../models/model_base.py | 0 .../tasks/Adult.py | 2 +- .../tasks/ArabicDiacritization.py | 2 +- .../tasks/ArabicPOS.py | 2 +- .../tasks/ArabicParsing.py | 2 +- .../tasks/ArabicSegmentation.py | 2 +- .../tasks/Attentionworthy.py | 2 +- .../tasks/Checkworthiness.py | 2 +- .../tasks/ClaimDetection.py | 2 +- .../tasks/DemographyGender.py | 2 +- .../tasks/DemographyLocation.py | 2 +- .../tasks/DemographyNameInfo.py | 2 +- .../tasks/DialectID.py | 2 +- .../tasks/Emotion.py | 2 +- .../tasks/FactualityCOVID19.py | 2 +- .../tasks/FactualityKhouja20.py | 2 +- .../tasks/FactualityUnifiedFC.py | 2 +- .../tasks/HarmfulDetection.py | 2 +- .../tasks/HateSpeech.py | 2 +- .../tasks/Lemmatization.py | 2 +- .../tasks/MachineTranslation.py | 2 +- .../tasks/NER.py | 2 +- .../tasks/NewsCatASND.py | 2 +- .../tasks/NewsCatAkhbarona.py | 2 +- .../tasks/NewsCatAlArabiya.py | 2 +- .../tasks/NewsCatAlKhaleej.py | 2 +- .../tasks/Offensive.py | 2 +- .../tasks/PropagandaMultilabel.py | 2 +- .../tasks/PropagandaMultilabelSemEval23.py | 2 +- .../tasks/Q2QSimDetect.py | 2 +- .../tasks/QA.py | 2 +- .../tasks/STSTrack1.py | 2 +- .../tasks/STSTrack2.py | 2 +- .../tasks/Sarcasm.py | 2 +- .../tasks/Sentiment.py | 2 +- .../tasks/Spam.py | 2 +- .../tasks/StanceKhouja20.py | 2 +- .../tasks/StanceUnifiedFC.py | 2 +- .../tasks/Subjectivity.py | 2 +- .../tasks/XNLI.py | 2 +- .../tasks/__init__.py | 0 .../tasks/task_base.py | 0 {arabic_llm_benchmark => llmebench}/utils.py | 0 
scripts/format_code.sh | 4 +-- scripts/run_tests.sh | 2 +- setup.cfg | 6 ++--- tests/datasets/test_exports.py | 8 +++--- tests/models/test_BLOOMPetal.py | 4 +-- tests/models/test_GPT.py | 4 +-- tests/models/test_GPTChatCompletion.py | 4 +-- tests/models/test_exports.py | 8 +++--- tests/tasks/test_evaluation.py | 2 +- tests/tasks/test_exports.py | 8 +++--- tests/test_benchmark_assets.py | 2 +- 310 files changed, 721 insertions(+), 721 deletions(-) rename {arabic_llm_benchmark => llmebench}/__init__.py (100%) rename {arabic_llm_benchmark => llmebench}/__main__.py (100%) rename {arabic_llm_benchmark => llmebench}/benchmark.py (100%) rename {arabic_llm_benchmark => llmebench}/datasets/ANERcorp.py (98%) rename {arabic_llm_benchmark => llmebench}/datasets/ARCD.py (88%) rename {arabic_llm_benchmark => llmebench}/datasets/Adult.py (95%) rename {arabic_llm_benchmark => llmebench}/datasets/Aqmar.py (98%) rename {arabic_llm_benchmark => llmebench}/datasets/ArSASSentiment.py (92%) rename {arabic_llm_benchmark => llmebench}/datasets/ArSarcasm.py (95%) rename {arabic_llm_benchmark => llmebench}/datasets/AraBench.py (96%) rename {arabic_llm_benchmark => llmebench}/datasets/ArabGend.py (93%) rename {arabic_llm_benchmark => llmebench}/datasets/ArabicDiacritization.py (95%) rename {arabic_llm_benchmark => llmebench}/datasets/ArabicPOS.py (95%) rename {arabic_llm_benchmark => llmebench}/datasets/ArabicParsing.py (96%) rename {arabic_llm_benchmark => llmebench}/datasets/ArabicSegmentation.py (96%) rename {arabic_llm_benchmark => llmebench}/datasets/ArapTweet.py (97%) rename {arabic_llm_benchmark => llmebench}/datasets/Attentionworthy.py (95%) rename {arabic_llm_benchmark => llmebench}/datasets/BanglaSentiment.py (95%) rename {arabic_llm_benchmark => llmebench}/datasets/Checkworthiness.py (96%) rename {arabic_llm_benchmark => llmebench}/datasets/Claim.py (95%) rename {arabic_llm_benchmark => llmebench}/datasets/DialectADI.py (92%) rename {arabic_llm_benchmark => 
llmebench}/datasets/Emotion.py (94%) rename {arabic_llm_benchmark => llmebench}/datasets/FactualityCOVID19.py (97%) rename {arabic_llm_benchmark => llmebench}/datasets/FactualityKhouja20.py (94%) rename {arabic_llm_benchmark => llmebench}/datasets/FactualityUnifiedFC.py (97%) rename {arabic_llm_benchmark => llmebench}/datasets/Harmful.py (95%) rename {arabic_llm_benchmark => llmebench}/datasets/HateSpeech.py (95%) rename {arabic_llm_benchmark => llmebench}/datasets/Lemmatization.py (95%) rename {arabic_llm_benchmark => llmebench}/datasets/Location.py (94%) rename {arabic_llm_benchmark => llmebench}/datasets/MGBWords.py (96%) rename {arabic_llm_benchmark => llmebench}/datasets/MLQA.py (88%) rename {arabic_llm_benchmark => llmebench}/datasets/NameInfo.py (93%) rename {arabic_llm_benchmark => llmebench}/datasets/NewsCatASND.py (95%) rename {arabic_llm_benchmark => llmebench}/datasets/NewsCatAkhbarona.py (95%) rename {arabic_llm_benchmark => llmebench}/datasets/NewsCatAlArabiya.py (95%) rename {arabic_llm_benchmark => llmebench}/datasets/NewsCatAlKhaleej.py (95%) rename {arabic_llm_benchmark => llmebench}/datasets/Offensive.py (94%) rename {arabic_llm_benchmark => llmebench}/datasets/Propaganda.py (96%) rename {arabic_llm_benchmark => llmebench}/datasets/PropagandaSemEval23.py (98%) rename {arabic_llm_benchmark => llmebench}/datasets/QADI.py (94%) rename {arabic_llm_benchmark => llmebench}/datasets/SQuADBase.py (95%) rename {arabic_llm_benchmark => llmebench}/datasets/STSArSemEval17Track1.py (96%) rename {arabic_llm_benchmark => llmebench}/datasets/STSArSemEval17Track2.py (96%) rename {arabic_llm_benchmark => llmebench}/datasets/STSQ2Q.py (95%) rename {arabic_llm_benchmark => llmebench}/datasets/Spam.py (94%) rename {arabic_llm_benchmark => llmebench}/datasets/StanceKhouja20.py (94%) rename {arabic_llm_benchmark => llmebench}/datasets/StanceUnifiedFC.py (97%) rename {arabic_llm_benchmark => llmebench}/datasets/Subjectivity.py (95%) rename {arabic_llm_benchmark => 
llmebench}/datasets/TyDiQA.py (91%) rename {arabic_llm_benchmark => llmebench}/datasets/XNLI.py (96%) rename {arabic_llm_benchmark => llmebench}/datasets/XQuAD.py (90%) rename {arabic_llm_benchmark => llmebench}/datasets/__init__.py (100%) rename {arabic_llm_benchmark => llmebench}/datasets/dataset_base.py (100%) rename {arabic_llm_benchmark => llmebench}/models/BLOOMPetal.py (97%) rename {arabic_llm_benchmark => llmebench}/models/GPT.py (98%) rename {arabic_llm_benchmark => llmebench}/models/RandomGPT.py (94%) rename {arabic_llm_benchmark => llmebench}/models/__init__.py (100%) rename {arabic_llm_benchmark => llmebench}/models/model_base.py (100%) rename {arabic_llm_benchmark => llmebench}/tasks/Adult.py (88%) rename {arabic_llm_benchmark => llmebench}/tasks/ArabicDiacritization.py (98%) rename {arabic_llm_benchmark => llmebench}/tasks/ArabicPOS.py (94%) rename {arabic_llm_benchmark => llmebench}/tasks/ArabicParsing.py (93%) rename {arabic_llm_benchmark => llmebench}/tasks/ArabicSegmentation.py (94%) rename {arabic_llm_benchmark => llmebench}/tasks/Attentionworthy.py (94%) rename {arabic_llm_benchmark => llmebench}/tasks/Checkworthiness.py (94%) rename {arabic_llm_benchmark => llmebench}/tasks/ClaimDetection.py (89%) rename {arabic_llm_benchmark => llmebench}/tasks/DemographyGender.py (88%) rename {arabic_llm_benchmark => llmebench}/tasks/DemographyLocation.py (88%) rename {arabic_llm_benchmark => llmebench}/tasks/DemographyNameInfo.py (89%) rename {arabic_llm_benchmark => llmebench}/tasks/DialectID.py (93%) rename {arabic_llm_benchmark => llmebench}/tasks/Emotion.py (89%) rename {arabic_llm_benchmark => llmebench}/tasks/FactualityCOVID19.py (94%) rename {arabic_llm_benchmark => llmebench}/tasks/FactualityKhouja20.py (88%) rename {arabic_llm_benchmark => llmebench}/tasks/FactualityUnifiedFC.py (94%) rename {arabic_llm_benchmark => llmebench}/tasks/HarmfulDetection.py (90%) rename {arabic_llm_benchmark => llmebench}/tasks/HateSpeech.py (88%) rename 
{arabic_llm_benchmark => llmebench}/tasks/Lemmatization.py (92%) rename {arabic_llm_benchmark => llmebench}/tasks/MachineTranslation.py (87%) rename {arabic_llm_benchmark => llmebench}/tasks/NER.py (96%) rename {arabic_llm_benchmark => llmebench}/tasks/NewsCatASND.py (94%) rename {arabic_llm_benchmark => llmebench}/tasks/NewsCatAkhbarona.py (94%) rename {arabic_llm_benchmark => llmebench}/tasks/NewsCatAlArabiya.py (94%) rename {arabic_llm_benchmark => llmebench}/tasks/NewsCatAlKhaleej.py (94%) rename {arabic_llm_benchmark => llmebench}/tasks/Offensive.py (88%) rename {arabic_llm_benchmark => llmebench}/tasks/PropagandaMultilabel.py (94%) rename {arabic_llm_benchmark => llmebench}/tasks/PropagandaMultilabelSemEval23.py (94%) rename {arabic_llm_benchmark => llmebench}/tasks/Q2QSimDetect.py (88%) rename {arabic_llm_benchmark => llmebench}/tasks/QA.py (97%) rename {arabic_llm_benchmark => llmebench}/tasks/STSTrack1.py (90%) rename {arabic_llm_benchmark => llmebench}/tasks/STSTrack2.py (90%) rename {arabic_llm_benchmark => llmebench}/tasks/Sarcasm.py (89%) rename {arabic_llm_benchmark => llmebench}/tasks/Sentiment.py (94%) rename {arabic_llm_benchmark => llmebench}/tasks/Spam.py (88%) rename {arabic_llm_benchmark => llmebench}/tasks/StanceKhouja20.py (88%) rename {arabic_llm_benchmark => llmebench}/tasks/StanceUnifiedFC.py (88%) rename {arabic_llm_benchmark => llmebench}/tasks/Subjectivity.py (95%) rename {arabic_llm_benchmark => llmebench}/tasks/XNLI.py (88%) rename {arabic_llm_benchmark => llmebench}/tasks/__init__.py (100%) rename {arabic_llm_benchmark => llmebench}/tasks/task_base.py (100%) rename {arabic_llm_benchmark => llmebench}/utils.py (100%) diff --git a/.github/workflows/code-formatting.yml b/.github/workflows/code-formatting.yml index 18f89895..ee02756b 100644 --- a/.github/workflows/code-formatting.yml +++ b/.github/workflows/code-formatting.yml @@ -33,7 +33,7 @@ jobs: pip install '.[dev]' - name: Run ufmt check on framework code run: | - ufmt check 
arabic_llm_benchmark + ufmt check llmebench - name: Run ufmt check on test code run: | ufmt check tests diff --git a/README.md b/README.md index 74e5bb8a..a6f22d51 100644 --- a/README.md +++ b/README.md @@ -5,14 +5,14 @@ Clone this repository: ```bash -git clone https://github.com/qcri/Arabic_LLM_Benchmark.git -cd Arabic_LLM_Benchmark +git clone https://github.com/qcri/LLMeBench.git +cd LLMeBench ``` Create a virtual environment: ```bash -python -m venv .envs/arabic_llm_benchmark -source .envs/arabic_llm_benchmark/bin/activate +python -m venv .envs/llmebench +source .envs/llmebench/bin/activate ``` Install the dependencies and benchmarking package: @@ -21,7 +21,7 @@ pip install -e '.[dev,fewshot]' ``` ## Get the benchmark data -Download the benchmark from [here](https://neurox.qcri.org/projects/arabic_llm_benchmark/arabic_llm_benchmark_data.zip), and unzip it into the `Arabic_LLM_Benchmark` folder. After this process, there should be a `data` directory inside the top-level folder of the repository, with roughly the following contents: +Download the benchmark from [here](https://neurox.qcri.org/projects/llmebench/arabic_llm_benchmark_data.zip), and unzip it into the `LLMeBench` folder. After this process, there should be a `data` directory inside the top-level folder of the repository, with roughly the following contents: ```bash $ ls data/ @@ -39,7 +39,7 @@ speech A sample benchmark is available in `assets/benchmark_v1`. To run the benchmark, ```bash -python -m arabic_llm_benchmark +python -m llmebench ``` where `` can point to `assets/benchmark_v1` for example. The @@ -58,7 +58,7 @@ git checkout -b feat/sarcasm_task ``` ### Dataset -Check if the dataset used by your task already has an implementation in `arabic_llm_benchmark/datasets`. If not, implement a new dataset module (e.g. `arabic_llm_benchmark/datasets/SemEval23.py`), which implements a class (e.g. `SemEval23Dataset`) which subclasses `DatasetBase`. 
See an existing dataset module for inspiration. Each new dataset class requires implementing three functions: +Check if the dataset used by your task already has an implementation in `llmebench/datasets`. If not, implement a new dataset module (e.g. `llmebench/datasets/SemEval23.py`), which implements a class (e.g. `SemEval23Dataset`) which subclasses `DatasetBase`. See an existing dataset module for inspiration. Each new dataset class requires implementing three functions: ```python class NewDataset(DatasetBase): @@ -78,10 +78,10 @@ class NewDataset(DatasetBase): # "label": this will be used for evaluation ``` -Once the `Dataset` is implemented, export it in `arabic_llm_benchmark/datasets/__init__.py`. +Once the `Dataset` is implemented, export it in `llmebench/datasets/__init__.py`. ### Task -Check if the task you are adding to the benchmark already has an implementation in `arabic_llm_benchmark/tasks`. If not, implement a new dataset module (e.g. `arabic_llm_benchmark/tasks/Sarcasm.py`), which implements a class (e.g. `SarcasmTask`) which subclasses `TaskBase`. See an existing task module for inspiration. Each new task class requires implementing two functions: +Check if the task you are adding to the benchmark already has an implementation in `llmebench/tasks`. If not, implement a new task module (e.g. `llmebench/tasks/Sarcasm.py`), which implements a class (e.g. `SarcasmTask`) which subclasses `TaskBase`. See an existing task module for inspiration. Each new task class requires implementing two functions: ```python class NewTask(TaskBase): @@ -97,10 +97,10 @@ class NewTask(TaskBase): # post_process function ``` -Once the `Task` is implemented, export it in `arabic_llm_benchmark/tasks/__init__.py`. +Once the `Task` is implemented, export it in `llmebench/tasks/__init__.py`. ### Model -Next, check if the model you are trying to run the benchmark for has an implementation in `arabic_llm_benchmark/models`. If not, implement a new model module (e.g. 
`arabic_llm_benchmark/models/QARiB.py`), which implements a class (e.g. `QARiBModel`) which subclasses `ModelBase`. See an existing model module for inspiration. Each new model class requires implementing two functions: +Next, check if the model you are trying to run the benchmark for has an implementation in `llmebench/models`. If not, implement a new model module (e.g. `llmebench/models/QARiB.py`), which implements a class (e.g. `QARiBModel`) which subclasses `ModelBase`. See an existing model module for inspiration. Each new model class requires implementing two functions: ```python class NewModel(TaskBase): @@ -115,7 +115,7 @@ class NewModel(TaskBase): # run the actual model and return model outputs ``` -Once the `Model` is implemented, export it in `arabic_llm_benchmark/models/__init__.py`. +Once the `Model` is implemented, export it in `llmebench/models/__init__.py`. ### Benchmark Asset Now that the Dataset, Task and Model are defined, the framework expects a given benchmark asset (e.g. "ArabGender" dataset, "GenderClassification" task, "GPT" model and "ZeroShot" prompting setting) to have a `*.py` file with three functions: @@ -145,7 +145,7 @@ def post_process(response): The benchmarking module allows one to run a specific asset instead of the entire benchmark using the `--filter` option. It is also a good idea to use the `--limit` option to limit the tests to few (e.g. 5 samples). 
Sample command below: ```bash -python -m arabic_llm_benchmark --filter 'demography/gender/AraGend_ChatGPT_ZeroShot' --limit 5 --ignore_cache +python -m llmebench --filter 'demography/gender/AraGend_ChatGPT_ZeroShot' --limit 5 --ignore_cache ``` Make sure to also run `scripts/run_tests.sh` before submitting your code, and once you are ready, you can commit your changes locally and push them to a remote branch: diff --git a/assets/benchmark_v1/MT/AraBench_Ara2Eng_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/MT/AraBench_Ara2Eng_BLOOMZ_ZeroShot.py index 9c95d312..3db0e90f 100644 --- a/assets/benchmark_v1/MT/AraBench_Ara2Eng_BLOOMZ_ZeroShot.py +++ b/assets/benchmark_v1/MT/AraBench_Ara2Eng_BLOOMZ_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import AraBenchDataset -from arabic_llm_benchmark.models import BLOOMPetalModel -from arabic_llm_benchmark.tasks import MachineTranslationTask +from llmebench.datasets import AraBenchDataset +from llmebench.models import BLOOMPetalModel +from llmebench.tasks import MachineTranslationTask def config(): diff --git a/assets/benchmark_v1/MT/AraBench_Ara2Eng_ChatGPT4_ZeroShot.py b/assets/benchmark_v1/MT/AraBench_Ara2Eng_ChatGPT4_ZeroShot.py index 486bb51c..68db7bb7 100644 --- a/assets/benchmark_v1/MT/AraBench_Ara2Eng_ChatGPT4_ZeroShot.py +++ b/assets/benchmark_v1/MT/AraBench_Ara2Eng_ChatGPT4_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import AraBenchDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import MachineTranslationTask +from llmebench.datasets import AraBenchDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import MachineTranslationTask def config(): diff --git a/assets/benchmark_v1/MT/AraBench_Ara2Eng_ChatGPT_ZeroShot.py b/assets/benchmark_v1/MT/AraBench_Ara2Eng_ChatGPT_ZeroShot.py index 245c9b46..147859b7 100644 --- a/assets/benchmark_v1/MT/AraBench_Ara2Eng_ChatGPT_ZeroShot.py +++ 
b/assets/benchmark_v1/MT/AraBench_Ara2Eng_ChatGPT_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import AraBenchDataset -from arabic_llm_benchmark.models import GPTModel -from arabic_llm_benchmark.tasks import MachineTranslationTask +from llmebench.datasets import AraBenchDataset +from llmebench.models import GPTModel +from llmebench.tasks import MachineTranslationTask def config(): diff --git a/assets/benchmark_v1/NER/MGBWords_ChatGPT_ZeroShot.py b/assets/benchmark_v1/NER/MGBWords_ChatGPT_ZeroShot.py index 75be7f08..5541a741 100644 --- a/assets/benchmark_v1/NER/MGBWords_ChatGPT_ZeroShot.py +++ b/assets/benchmark_v1/NER/MGBWords_ChatGPT_ZeroShot.py @@ -1,9 +1,9 @@ import os import re -from arabic_llm_benchmark.datasets import MGBWordsDataset -from arabic_llm_benchmark.models import GPTModel -from arabic_llm_benchmark.tasks import NERTask +from llmebench.datasets import MGBWordsDataset +from llmebench.models import GPTModel +from llmebench.tasks import NERTask def config(): diff --git a/assets/benchmark_v1/NER/MGBWords_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/NER/MGBWords_GPTChatCompletion_ZeroShot.py index 42f047d9..d2abd099 100644 --- a/assets/benchmark_v1/NER/MGBWords_GPTChatCompletion_ZeroShot.py +++ b/assets/benchmark_v1/NER/MGBWords_GPTChatCompletion_ZeroShot.py @@ -1,9 +1,9 @@ import os import re -from arabic_llm_benchmark.datasets import MGBWordsDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import NERTask +from llmebench.datasets import MGBWordsDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import NERTask def config(): diff --git a/assets/benchmark_v1/NER/NERANERcorp_ChatGPT_ZeroShot.py b/assets/benchmark_v1/NER/NERANERcorp_ChatGPT_ZeroShot.py index 9b644df8..984d1044 100644 --- a/assets/benchmark_v1/NER/NERANERcorp_ChatGPT_ZeroShot.py +++ b/assets/benchmark_v1/NER/NERANERcorp_ChatGPT_ZeroShot.py @@ -1,9 +1,9 @@ import os 
import re -from arabic_llm_benchmark.datasets import ANERcorpDataset -from arabic_llm_benchmark.models import GPTModel, RandomGPTModel -from arabic_llm_benchmark.tasks import NERTask +from llmebench.datasets import ANERcorpDataset +from llmebench.models import GPTModel, RandomGPTModel +from llmebench.tasks import NERTask def config(): diff --git a/assets/benchmark_v1/NER/NERANERcorp_GPTChatCompletion_FewShot.py b/assets/benchmark_v1/NER/NERANERcorp_GPTChatCompletion_FewShot.py index 6a053e27..10710072 100644 --- a/assets/benchmark_v1/NER/NERANERcorp_GPTChatCompletion_FewShot.py +++ b/assets/benchmark_v1/NER/NERANERcorp_GPTChatCompletion_FewShot.py @@ -1,9 +1,9 @@ import os import re -from arabic_llm_benchmark.datasets import ANERcorpDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import NERTask +from llmebench.datasets import ANERcorpDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import NERTask def config(): diff --git a/assets/benchmark_v1/NER/NERANERcorp_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/NER/NERANERcorp_GPTChatCompletion_ZeroShot.py index e507445f..6091e611 100644 --- a/assets/benchmark_v1/NER/NERANERcorp_GPTChatCompletion_ZeroShot.py +++ b/assets/benchmark_v1/NER/NERANERcorp_GPTChatCompletion_ZeroShot.py @@ -1,9 +1,9 @@ import os import re -from arabic_llm_benchmark.datasets import ANERcorpDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import NERTask +from llmebench.datasets import ANERcorpDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import NERTask def config(): diff --git a/assets/benchmark_v1/NER/NERAqmar_ChatGPT_ZeroShot.py b/assets/benchmark_v1/NER/NERAqmar_ChatGPT_ZeroShot.py index 6b8f21f7..5f44fbea 100644 --- a/assets/benchmark_v1/NER/NERAqmar_ChatGPT_ZeroShot.py +++ b/assets/benchmark_v1/NER/NERAqmar_ChatGPT_ZeroShot.py @@ -1,9 +1,9 @@ import os 
import re -from arabic_llm_benchmark.datasets import AqmarDataset -from arabic_llm_benchmark.models import GPTModel, RandomGPTModel -from arabic_llm_benchmark.tasks import NERTask +from llmebench.datasets import AqmarDataset +from llmebench.models import GPTModel, RandomGPTModel +from llmebench.tasks import NERTask def config(): diff --git a/assets/benchmark_v1/NER/NERAqmar_GPTChatCompletion_FewShot.py b/assets/benchmark_v1/NER/NERAqmar_GPTChatCompletion_FewShot.py index a9865520..662d1230 100644 --- a/assets/benchmark_v1/NER/NERAqmar_GPTChatCompletion_FewShot.py +++ b/assets/benchmark_v1/NER/NERAqmar_GPTChatCompletion_FewShot.py @@ -1,9 +1,9 @@ import os import re -from arabic_llm_benchmark.datasets import AqmarDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import NERTask +from llmebench.datasets import AqmarDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import NERTask def config(): diff --git a/assets/benchmark_v1/NER/NERAqmar_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/NER/NERAqmar_GPTChatCompletion_ZeroShot.py index b42e55bf..05eb8401 100644 --- a/assets/benchmark_v1/NER/NERAqmar_GPTChatCompletion_ZeroShot.py +++ b/assets/benchmark_v1/NER/NERAqmar_GPTChatCompletion_ZeroShot.py @@ -1,9 +1,9 @@ import os import re -from arabic_llm_benchmark.datasets import AqmarDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import NERTask +from llmebench.datasets import AqmarDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import NERTask def config(): diff --git a/assets/benchmark_v1/QA/ARCD_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/QA/ARCD_BLOOMZ_ZeroShot.py index 340e6c07..9e3fe217 100644 --- a/assets/benchmark_v1/QA/ARCD_BLOOMZ_ZeroShot.py +++ b/assets/benchmark_v1/QA/ARCD_BLOOMZ_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import ARCDDataset -from 
arabic_llm_benchmark.models import BLOOMPetalModel -from arabic_llm_benchmark.tasks import QATask +from llmebench.datasets import ARCDDataset +from llmebench.models import BLOOMPetalModel +from llmebench.tasks import QATask def config(): diff --git a/assets/benchmark_v1/QA/ARCD_ChatGPT_ZeroShot.py b/assets/benchmark_v1/QA/ARCD_ChatGPT_ZeroShot.py index 35c59d67..44cefc77 100644 --- a/assets/benchmark_v1/QA/ARCD_ChatGPT_ZeroShot.py +++ b/assets/benchmark_v1/QA/ARCD_ChatGPT_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import ARCDDataset -from arabic_llm_benchmark.models import GPTModel, RandomGPTModel -from arabic_llm_benchmark.tasks import QATask +from llmebench.datasets import ARCDDataset +from llmebench.models import GPTModel, RandomGPTModel +from llmebench.tasks import QATask def config(): diff --git a/assets/benchmark_v1/QA/ARCD_GPTChatCompletion_FewShot.py b/assets/benchmark_v1/QA/ARCD_GPTChatCompletion_FewShot.py index 1a5c7bfa..91ec1a02 100644 --- a/assets/benchmark_v1/QA/ARCD_GPTChatCompletion_FewShot.py +++ b/assets/benchmark_v1/QA/ARCD_GPTChatCompletion_FewShot.py @@ -1,9 +1,9 @@ import os import random -from arabic_llm_benchmark.datasets import ARCDDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import QATask +from llmebench.datasets import ARCDDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import QATask random.seed(3333) diff --git a/assets/benchmark_v1/QA/ARCD_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/QA/ARCD_GPTChatCompletion_ZeroShot.py index 513f81c4..b0fe38da 100644 --- a/assets/benchmark_v1/QA/ARCD_GPTChatCompletion_ZeroShot.py +++ b/assets/benchmark_v1/QA/ARCD_GPTChatCompletion_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import ARCDDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import QATask +from llmebench.datasets import 
ARCDDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import QATask def config(): diff --git a/assets/benchmark_v1/QA/MLQA_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/QA/MLQA_BLOOMZ_ZeroShot.py index b06c9a5b..8e19a535 100644 --- a/assets/benchmark_v1/QA/MLQA_BLOOMZ_ZeroShot.py +++ b/assets/benchmark_v1/QA/MLQA_BLOOMZ_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import MLQADataset -from arabic_llm_benchmark.models import BLOOMPetalModel -from arabic_llm_benchmark.tasks import QATask +from llmebench.datasets import MLQADataset +from llmebench.models import BLOOMPetalModel +from llmebench.tasks import QATask def config(): diff --git a/assets/benchmark_v1/QA/MLQA_ChatGPT_ZeroShot.py b/assets/benchmark_v1/QA/MLQA_ChatGPT_ZeroShot.py index 9524c05c..ba57acbb 100644 --- a/assets/benchmark_v1/QA/MLQA_ChatGPT_ZeroShot.py +++ b/assets/benchmark_v1/QA/MLQA_ChatGPT_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import MLQADataset -from arabic_llm_benchmark.models import GPTModel, RandomGPTModel -from arabic_llm_benchmark.tasks import QATask +from llmebench.datasets import MLQADataset +from llmebench.models import GPTModel, RandomGPTModel +from llmebench.tasks import QATask def config(): diff --git a/assets/benchmark_v1/QA/MLQA_GPTChatCompletion_FewShot.py b/assets/benchmark_v1/QA/MLQA_GPTChatCompletion_FewShot.py index a4ab0d80..aa6bef95 100644 --- a/assets/benchmark_v1/QA/MLQA_GPTChatCompletion_FewShot.py +++ b/assets/benchmark_v1/QA/MLQA_GPTChatCompletion_FewShot.py @@ -1,9 +1,9 @@ import os import random -from arabic_llm_benchmark.datasets import MLQADataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import QATask +from llmebench.datasets import MLQADataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import QATask random.seed(3333) diff --git a/assets/benchmark_v1/QA/MLQA_GPTChatCompletion_ZeroShot.py 
b/assets/benchmark_v1/QA/MLQA_GPTChatCompletion_ZeroShot.py index 47030eba..31ac98de 100644 --- a/assets/benchmark_v1/QA/MLQA_GPTChatCompletion_ZeroShot.py +++ b/assets/benchmark_v1/QA/MLQA_GPTChatCompletion_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import MLQADataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import QATask +from llmebench.datasets import MLQADataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import QATask def config(): diff --git a/assets/benchmark_v1/QA/TyDiQA_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/QA/TyDiQA_BLOOMZ_ZeroShot.py index 6a3539c8..4ad3bc10 100644 --- a/assets/benchmark_v1/QA/TyDiQA_BLOOMZ_ZeroShot.py +++ b/assets/benchmark_v1/QA/TyDiQA_BLOOMZ_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import TyDiQADataset -from arabic_llm_benchmark.models import BLOOMPetalModel -from arabic_llm_benchmark.tasks import QATask +from llmebench.datasets import TyDiQADataset +from llmebench.models import BLOOMPetalModel +from llmebench.tasks import QATask def config(): diff --git a/assets/benchmark_v1/QA/TyDiQA_ChatGPT_ZeroShot.py b/assets/benchmark_v1/QA/TyDiQA_ChatGPT_ZeroShot.py index a98e1352..4e8a91e7 100644 --- a/assets/benchmark_v1/QA/TyDiQA_ChatGPT_ZeroShot.py +++ b/assets/benchmark_v1/QA/TyDiQA_ChatGPT_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import TyDiQADataset -from arabic_llm_benchmark.models import GPTModel, RandomGPTModel -from arabic_llm_benchmark.tasks import QATask +from llmebench.datasets import TyDiQADataset +from llmebench.models import GPTModel, RandomGPTModel +from llmebench.tasks import QATask def config(): diff --git a/assets/benchmark_v1/QA/TyDiQA_GPTChatCompletion_FewShot.py b/assets/benchmark_v1/QA/TyDiQA_GPTChatCompletion_FewShot.py index f4b83806..339fbe75 100644 --- a/assets/benchmark_v1/QA/TyDiQA_GPTChatCompletion_FewShot.py +++ 
b/assets/benchmark_v1/QA/TyDiQA_GPTChatCompletion_FewShot.py @@ -1,9 +1,9 @@ import os import random -from arabic_llm_benchmark.datasets import TyDiQADataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import QATask +from llmebench.datasets import TyDiQADataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import QATask random.seed(3333) diff --git a/assets/benchmark_v1/QA/TydiQA_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/QA/TydiQA_GPTChatCompletion_ZeroShot.py index c2105b5d..e5d80a12 100644 --- a/assets/benchmark_v1/QA/TydiQA_GPTChatCompletion_ZeroShot.py +++ b/assets/benchmark_v1/QA/TydiQA_GPTChatCompletion_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import TyDiQADataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import QATask +from llmebench.datasets import TyDiQADataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import QATask def config(): diff --git a/assets/benchmark_v1/QA/XQuAD_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/QA/XQuAD_BLOOMZ_ZeroShot.py index 7b8de231..ccc89880 100644 --- a/assets/benchmark_v1/QA/XQuAD_BLOOMZ_ZeroShot.py +++ b/assets/benchmark_v1/QA/XQuAD_BLOOMZ_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import XQuADDataset -from arabic_llm_benchmark.models import BLOOMPetalModel -from arabic_llm_benchmark.tasks import QATask +from llmebench.datasets import XQuADDataset +from llmebench.models import BLOOMPetalModel +from llmebench.tasks import QATask def config(): diff --git a/assets/benchmark_v1/QA/XQuAD_ChatGPT_ZeroShot.py b/assets/benchmark_v1/QA/XQuAD_ChatGPT_ZeroShot.py index afb0ba6f..9729931e 100644 --- a/assets/benchmark_v1/QA/XQuAD_ChatGPT_ZeroShot.py +++ b/assets/benchmark_v1/QA/XQuAD_ChatGPT_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import XQuADDataset -from 
arabic_llm_benchmark.models import GPTModel -from arabic_llm_benchmark.tasks import QATask +from llmebench.datasets import XQuADDataset +from llmebench.models import GPTModel +from llmebench.tasks import QATask def config(): diff --git a/assets/benchmark_v1/QA/XQuAD_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/QA/XQuAD_GPTChatCompletion_ZeroShot.py index caab5c27..86413e14 100644 --- a/assets/benchmark_v1/QA/XQuAD_GPTChatCompletion_ZeroShot.py +++ b/assets/benchmark_v1/QA/XQuAD_GPTChatCompletion_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import XQuADDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import QATask +from llmebench.datasets import XQuADDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import QATask def config(): diff --git a/assets/benchmark_v1/QA/XQuaD_GPTChatCompletion_FewShot.py b/assets/benchmark_v1/QA/XQuaD_GPTChatCompletion_FewShot.py index a1df0eb9..01a143c2 100644 --- a/assets/benchmark_v1/QA/XQuaD_GPTChatCompletion_FewShot.py +++ b/assets/benchmark_v1/QA/XQuaD_GPTChatCompletion_FewShot.py @@ -1,9 +1,9 @@ import os import random -from arabic_llm_benchmark.datasets import XQuADDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import QATask +from llmebench.datasets import XQuADDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import QATask random.seed(3333) diff --git a/assets/benchmark_v1/STS/Q2QSim_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/STS/Q2QSim_BLOOMZ_ZeroShot.py index d9aae05b..ea8ceca0 100644 --- a/assets/benchmark_v1/STS/Q2QSim_BLOOMZ_ZeroShot.py +++ b/assets/benchmark_v1/STS/Q2QSim_BLOOMZ_ZeroShot.py @@ -1,9 +1,9 @@ import os import re -from arabic_llm_benchmark.datasets import Q2QSimDataset -from arabic_llm_benchmark.models import BLOOMPetalModel -from arabic_llm_benchmark.tasks import Q2QSimDetectionTask +from 
llmebench.datasets import Q2QSimDataset +from llmebench.models import BLOOMPetalModel +from llmebench.tasks import Q2QSimDetectionTask def config(): diff --git a/assets/benchmark_v1/STS/Q2QSim_ChatGPT_ZeroShot.py b/assets/benchmark_v1/STS/Q2QSim_ChatGPT_ZeroShot.py index 839e94e3..c23e3f7e 100644 --- a/assets/benchmark_v1/STS/Q2QSim_ChatGPT_ZeroShot.py +++ b/assets/benchmark_v1/STS/Q2QSim_ChatGPT_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import Q2QSimDataset -from arabic_llm_benchmark.models import GPTModel -from arabic_llm_benchmark.tasks import Q2QSimDetectionTask +from llmebench.datasets import Q2QSimDataset +from llmebench.models import GPTModel +from llmebench.tasks import Q2QSimDetectionTask def config(): diff --git a/assets/benchmark_v1/STS/Q2QSim_GPT4_FewShot.py b/assets/benchmark_v1/STS/Q2QSim_GPT4_FewShot.py index 9112d402..791fd0e3 100644 --- a/assets/benchmark_v1/STS/Q2QSim_GPT4_FewShot.py +++ b/assets/benchmark_v1/STS/Q2QSim_GPT4_FewShot.py @@ -1,9 +1,9 @@ import os import re -from arabic_llm_benchmark.datasets import Q2QSimDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import Q2QSimDetectionTask +from llmebench.datasets import Q2QSimDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import Q2QSimDetectionTask def config(): diff --git a/assets/benchmark_v1/STS/Q2QSim_GPT4_ZeroShot.py b/assets/benchmark_v1/STS/Q2QSim_GPT4_ZeroShot.py index 926684fa..20ed5bb3 100644 --- a/assets/benchmark_v1/STS/Q2QSim_GPT4_ZeroShot.py +++ b/assets/benchmark_v1/STS/Q2QSim_GPT4_ZeroShot.py @@ -1,9 +1,9 @@ import os import re -from arabic_llm_benchmark.datasets import Q2QSimDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import Q2QSimDetectionTask +from llmebench.datasets import Q2QSimDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import Q2QSimDetectionTask def 
config(): diff --git a/assets/benchmark_v1/STS/STSTrack1_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/STS/STSTrack1_BLOOMZ_ZeroShot.py index 6fb16a88..9fd822f1 100644 --- a/assets/benchmark_v1/STS/STSTrack1_BLOOMZ_ZeroShot.py +++ b/assets/benchmark_v1/STS/STSTrack1_BLOOMZ_ZeroShot.py @@ -1,9 +1,9 @@ import os import re -from arabic_llm_benchmark.datasets import STSArSemEval17Track1Dataset -from arabic_llm_benchmark.models import BLOOMPetalModel -from arabic_llm_benchmark.tasks import STSTrack1Task +from llmebench.datasets import STSArSemEval17Track1Dataset +from llmebench.models import BLOOMPetalModel +from llmebench.tasks import STSTrack1Task def config(): diff --git a/assets/benchmark_v1/STS/STSTrack1_ChatGPT_ZeroShot.py b/assets/benchmark_v1/STS/STSTrack1_ChatGPT_ZeroShot.py index 1ec7f78a..1b99e05a 100644 --- a/assets/benchmark_v1/STS/STSTrack1_ChatGPT_ZeroShot.py +++ b/assets/benchmark_v1/STS/STSTrack1_ChatGPT_ZeroShot.py @@ -1,9 +1,9 @@ import os import re -from arabic_llm_benchmark.datasets import STSArSemEval17Track1Dataset -from arabic_llm_benchmark.models import GPTModel -from arabic_llm_benchmark.tasks import STSTrack1Task +from llmebench.datasets import STSArSemEval17Track1Dataset +from llmebench.models import GPTModel +from llmebench.tasks import STSTrack1Task def config(): diff --git a/assets/benchmark_v1/STS/STSTrack1_GPT4_FewShot.py b/assets/benchmark_v1/STS/STSTrack1_GPT4_FewShot.py index d1f645a6..aff4ddad 100644 --- a/assets/benchmark_v1/STS/STSTrack1_GPT4_FewShot.py +++ b/assets/benchmark_v1/STS/STSTrack1_GPT4_FewShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import STSArSemEval17Track1Dataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import STSTrack1Task +from llmebench.datasets import STSArSemEval17Track1Dataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import STSTrack1Task def config(): diff --git 
a/assets/benchmark_v1/STS/STSTrack1_GPT4_ZeroShot.py b/assets/benchmark_v1/STS/STSTrack1_GPT4_ZeroShot.py index 01c3a640..3f94a169 100644 --- a/assets/benchmark_v1/STS/STSTrack1_GPT4_ZeroShot.py +++ b/assets/benchmark_v1/STS/STSTrack1_GPT4_ZeroShot.py @@ -1,9 +1,9 @@ import os import re -from arabic_llm_benchmark.datasets import STSArSemEval17Track1Dataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import STSTrack1Task +from llmebench.datasets import STSArSemEval17Track1Dataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import STSTrack1Task def config(): diff --git a/assets/benchmark_v1/STS/STSTrack2_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/STS/STSTrack2_BLOOMZ_ZeroShot.py index e688befd..9b2e6efe 100644 --- a/assets/benchmark_v1/STS/STSTrack2_BLOOMZ_ZeroShot.py +++ b/assets/benchmark_v1/STS/STSTrack2_BLOOMZ_ZeroShot.py @@ -1,9 +1,9 @@ import os import re -from arabic_llm_benchmark.datasets import STSArSemEval17Track1Dataset -from arabic_llm_benchmark.models import BLOOMPetalModel -from arabic_llm_benchmark.tasks import STSTrack1Task +from llmebench.datasets import STSArSemEval17Track1Dataset +from llmebench.models import BLOOMPetalModel +from llmebench.tasks import STSTrack1Task def config(): diff --git a/assets/benchmark_v1/STS/STSTrack2_ChatGPT_ZeroShot.py b/assets/benchmark_v1/STS/STSTrack2_ChatGPT_ZeroShot.py index fb7a6776..da4dff44 100644 --- a/assets/benchmark_v1/STS/STSTrack2_ChatGPT_ZeroShot.py +++ b/assets/benchmark_v1/STS/STSTrack2_ChatGPT_ZeroShot.py @@ -1,9 +1,9 @@ import os import re -from arabic_llm_benchmark.datasets import STSArSemEval17Track2Dataset -from arabic_llm_benchmark.models import GPTModel, RandomGPTModel -from arabic_llm_benchmark.tasks import STSTrack2Task +from llmebench.datasets import STSArSemEval17Track2Dataset +from llmebench.models import GPTModel, RandomGPTModel +from llmebench.tasks import STSTrack2Task def config(): diff --git 
a/assets/benchmark_v1/STS/STSTrack2_GPT4_FewShot.py b/assets/benchmark_v1/STS/STSTrack2_GPT4_FewShot.py index c445c4c6..a4e9e840 100644 --- a/assets/benchmark_v1/STS/STSTrack2_GPT4_FewShot.py +++ b/assets/benchmark_v1/STS/STSTrack2_GPT4_FewShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import STSArSemEval17Track2Dataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import STSTrack2Task +from llmebench.datasets import STSArSemEval17Track2Dataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import STSTrack2Task def config(): diff --git a/assets/benchmark_v1/STS/STSTrack2_GPT4_ZeroShot.py b/assets/benchmark_v1/STS/STSTrack2_GPT4_ZeroShot.py index 9ce5a288..4be3334c 100644 --- a/assets/benchmark_v1/STS/STSTrack2_GPT4_ZeroShot.py +++ b/assets/benchmark_v1/STS/STSTrack2_GPT4_ZeroShot.py @@ -1,9 +1,9 @@ import os import re -from arabic_llm_benchmark.datasets import STSArSemEval17Track2Dataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import STSTrack2Task +from llmebench.datasets import STSArSemEval17Track2Dataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import STSTrack2Task def config(): diff --git a/assets/benchmark_v1/demography/gender/GenderArabGend_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/demography/gender/GenderArabGend_BLOOMZ_ZeroShot.py index 6877a162..915e1dc9 100644 --- a/assets/benchmark_v1/demography/gender/GenderArabGend_BLOOMZ_ZeroShot.py +++ b/assets/benchmark_v1/demography/gender/GenderArabGend_BLOOMZ_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import ArabGendDataset -from arabic_llm_benchmark.models import BLOOMPetalModel -from arabic_llm_benchmark.tasks import DemographyGenderTask +from llmebench.datasets import ArabGendDataset +from llmebench.models import BLOOMPetalModel +from llmebench.tasks import DemographyGenderTask def config(): 
diff --git a/assets/benchmark_v1/demography/gender/GenderArabGend_ChatGPT_ZeroShot.py b/assets/benchmark_v1/demography/gender/GenderArabGend_ChatGPT_ZeroShot.py index 938a7e1b..11a30148 100644 --- a/assets/benchmark_v1/demography/gender/GenderArabGend_ChatGPT_ZeroShot.py +++ b/assets/benchmark_v1/demography/gender/GenderArabGend_ChatGPT_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import ArabGendDataset -from arabic_llm_benchmark.models import GPTModel, RandomGPTModel -from arabic_llm_benchmark.tasks import DemographyGenderTask +from llmebench.datasets import ArabGendDataset +from llmebench.models import GPTModel, RandomGPTModel +from llmebench.tasks import DemographyGenderTask def config(): diff --git a/assets/benchmark_v1/demography/gender/GenderArabGend_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/demography/gender/GenderArabGend_GPTChatCompletion_ZeroShot.py index 8311a1f1..4cda2896 100644 --- a/assets/benchmark_v1/demography/gender/GenderArabGend_GPTChatCompletion_ZeroShot.py +++ b/assets/benchmark_v1/demography/gender/GenderArabGend_GPTChatCompletion_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import ArabGendDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import DemographyGenderTask +from llmebench.datasets import ArabGendDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import DemographyGenderTask def config(): diff --git a/assets/benchmark_v1/demography/gender/GenderArapTweet_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/demography/gender/GenderArapTweet_BLOOMZ_ZeroShot.py index 1f538c86..d9acb381 100644 --- a/assets/benchmark_v1/demography/gender/GenderArapTweet_BLOOMZ_ZeroShot.py +++ b/assets/benchmark_v1/demography/gender/GenderArapTweet_BLOOMZ_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import ArapTweetDataset -from arabic_llm_benchmark.models import BLOOMPetalModel -from 
arabic_llm_benchmark.tasks import DemographyGenderTask +from llmebench.datasets import ArapTweetDataset +from llmebench.models import BLOOMPetalModel +from llmebench.tasks import DemographyGenderTask def config(): diff --git a/assets/benchmark_v1/demography/gender/GenderArapTweet_ChatGPT_ZeroShot.py b/assets/benchmark_v1/demography/gender/GenderArapTweet_ChatGPT_ZeroShot.py index 5ad89801..5c5f0e4d 100644 --- a/assets/benchmark_v1/demography/gender/GenderArapTweet_ChatGPT_ZeroShot.py +++ b/assets/benchmark_v1/demography/gender/GenderArapTweet_ChatGPT_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import ArapTweetDataset -from arabic_llm_benchmark.models import GPTModel -from arabic_llm_benchmark.tasks import DemographyGenderTask +from llmebench.datasets import ArapTweetDataset +from llmebench.models import GPTModel +from llmebench.tasks import DemographyGenderTask def config(): diff --git a/assets/benchmark_v1/demography/gender/GenderArapTweet_GPTChatCompletion_FewShot.py b/assets/benchmark_v1/demography/gender/GenderArapTweet_GPTChatCompletion_FewShot.py index 73f0c875..587648a2 100644 --- a/assets/benchmark_v1/demography/gender/GenderArapTweet_GPTChatCompletion_FewShot.py +++ b/assets/benchmark_v1/demography/gender/GenderArapTweet_GPTChatCompletion_FewShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import ArapTweetDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import DemographyGenderTask +from llmebench.datasets import ArapTweetDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import DemographyGenderTask def config(): diff --git a/assets/benchmark_v1/demography/gender/GenderArapTweet_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/demography/gender/GenderArapTweet_GPTChatCompletion_ZeroShot.py index cd19deb7..f6e59513 100644 --- a/assets/benchmark_v1/demography/gender/GenderArapTweet_GPTChatCompletion_ZeroShot.py +++ 
b/assets/benchmark_v1/demography/gender/GenderArapTweet_GPTChatCompletion_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import ArapTweetDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import DemographyGenderTask +from llmebench.datasets import ArapTweetDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import DemographyGenderTask def config(): diff --git a/assets/benchmark_v1/demography/location/Location_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/demography/location/Location_BLOOMZ_ZeroShot.py index 685fcdf7..bf4db900 100644 --- a/assets/benchmark_v1/demography/location/Location_BLOOMZ_ZeroShot.py +++ b/assets/benchmark_v1/demography/location/Location_BLOOMZ_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import LocationDataset -from arabic_llm_benchmark.models import BLOOMPetalModel -from arabic_llm_benchmark.tasks import DemographyLocationTask +from llmebench.datasets import LocationDataset +from llmebench.models import BLOOMPetalModel +from llmebench.tasks import DemographyLocationTask def config(): diff --git a/assets/benchmark_v1/demography/location/Location_ChatGPT_ZeroShot.py b/assets/benchmark_v1/demography/location/Location_ChatGPT_ZeroShot.py index 29a98e99..50e28ef0 100644 --- a/assets/benchmark_v1/demography/location/Location_ChatGPT_ZeroShot.py +++ b/assets/benchmark_v1/demography/location/Location_ChatGPT_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import LocationDataset -from arabic_llm_benchmark.models import GPTModel, RandomGPTModel -from arabic_llm_benchmark.tasks import DemographyLocationTask +from llmebench.datasets import LocationDataset +from llmebench.models import GPTModel, RandomGPTModel +from llmebench.tasks import DemographyLocationTask def config(): diff --git a/assets/benchmark_v1/demography/location/Location_GPTChatCompletion_FewShot.py 
b/assets/benchmark_v1/demography/location/Location_GPTChatCompletion_FewShot.py index 81ba853e..e342ebd5 100644 --- a/assets/benchmark_v1/demography/location/Location_GPTChatCompletion_FewShot.py +++ b/assets/benchmark_v1/demography/location/Location_GPTChatCompletion_FewShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import LocationDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import DemographyLocationTask +from llmebench.datasets import LocationDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import DemographyLocationTask def config(): diff --git a/assets/benchmark_v1/demography/location/Location_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/demography/location/Location_GPTChatCompletion_ZeroShot.py index bb39abd1..42e5b344 100644 --- a/assets/benchmark_v1/demography/location/Location_GPTChatCompletion_ZeroShot.py +++ b/assets/benchmark_v1/demography/location/Location_GPTChatCompletion_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import LocationDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import DemographyLocationTask +from llmebench.datasets import LocationDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import DemographyLocationTask def config(): diff --git a/assets/benchmark_v1/demography/name_info/NameInfo_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/demography/name_info/NameInfo_BLOOMZ_ZeroShot.py index 9424f35a..c9aa0bf9 100644 --- a/assets/benchmark_v1/demography/name_info/NameInfo_BLOOMZ_ZeroShot.py +++ b/assets/benchmark_v1/demography/name_info/NameInfo_BLOOMZ_ZeroShot.py @@ -1,9 +1,9 @@ import os import re -from arabic_llm_benchmark.datasets import NameInfoDataset -from arabic_llm_benchmark.models import BLOOMPetalModel -from arabic_llm_benchmark.tasks import DemographyNameInfoTask +from llmebench.datasets import 
NameInfoDataset +from llmebench.models import BLOOMPetalModel +from llmebench.tasks import DemographyNameInfoTask def config(): diff --git a/assets/benchmark_v1/demography/name_info/NameInfo_ChatGPT_ZeroShot.py b/assets/benchmark_v1/demography/name_info/NameInfo_ChatGPT_ZeroShot.py index 181b5981..0e6bf76a 100644 --- a/assets/benchmark_v1/demography/name_info/NameInfo_ChatGPT_ZeroShot.py +++ b/assets/benchmark_v1/demography/name_info/NameInfo_ChatGPT_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import NameInfoDataset -from arabic_llm_benchmark.models import GPTModel, RandomGPTModel -from arabic_llm_benchmark.tasks import DemographyNameInfoTask +from llmebench.datasets import NameInfoDataset +from llmebench.models import GPTModel, RandomGPTModel +from llmebench.tasks import DemographyNameInfoTask def config(): diff --git a/assets/benchmark_v1/demography/name_info/NameInfo_GPTChatCompletion_FewShot.py b/assets/benchmark_v1/demography/name_info/NameInfo_GPTChatCompletion_FewShot.py index 70c89a13..7f7b5fee 100644 --- a/assets/benchmark_v1/demography/name_info/NameInfo_GPTChatCompletion_FewShot.py +++ b/assets/benchmark_v1/demography/name_info/NameInfo_GPTChatCompletion_FewShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import NameInfoDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import DemographyNameInfoTask +from llmebench.datasets import NameInfoDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import DemographyNameInfoTask def config(): diff --git a/assets/benchmark_v1/demography/name_info/NameInfo_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/demography/name_info/NameInfo_GPTChatCompletion_ZeroShot.py index 75f580f3..dda2109b 100644 --- a/assets/benchmark_v1/demography/name_info/NameInfo_GPTChatCompletion_ZeroShot.py +++ b/assets/benchmark_v1/demography/name_info/NameInfo_GPTChatCompletion_ZeroShot.py @@ -1,8 +1,8 @@ 
import os -from arabic_llm_benchmark.datasets import NameInfoDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import DemographyNameInfoTask +from llmebench.datasets import NameInfoDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import DemographyNameInfoTask def config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Adult_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Adult_BLOOMZ_ZeroShot.py index afcbdd81..70c7ffb5 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Adult_BLOOMZ_ZeroShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Adult_BLOOMZ_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import AdultDataset -from arabic_llm_benchmark.models import BLOOMPetalModel -from arabic_llm_benchmark.tasks import AdultTask +from llmebench.datasets import AdultDataset +from llmebench.models import BLOOMPetalModel +from llmebench.tasks import AdultTask def config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Adult_ChatGPT_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Adult_ChatGPT_ZeroShot.py index dbb3da49..9d1b05e8 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Adult_ChatGPT_ZeroShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Adult_ChatGPT_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import AdultDataset -from arabic_llm_benchmark.models import GPTModel, RandomGPTModel -from arabic_llm_benchmark.tasks import AdultTask +from llmebench.datasets import AdultDataset +from llmebench.models import GPTModel, RandomGPTModel +from llmebench.tasks import AdultTask def config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Adult_GPTChatCompletion_FewShot.py 
b/assets/benchmark_v1/factuality_disinformation_harmful_content/Adult_GPTChatCompletion_FewShot.py index 8b71f386..685bd33e 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Adult_GPTChatCompletion_FewShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Adult_GPTChatCompletion_FewShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import AdultDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import AdultTask +from llmebench.datasets import AdultDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import AdultTask def config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Adult_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Adult_GPTChatCompletion_ZeroShot.py index 6c2a6d83..e59830f3 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Adult_GPTChatCompletion_ZeroShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Adult_GPTChatCompletion_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import AdultDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import AdultTask +from llmebench.datasets import AdultDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import AdultTask def config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Attentionworthy_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Attentionworthy_BLOOMZ_ZeroShot.py index a8c29cb9..37f15dde 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Attentionworthy_BLOOMZ_ZeroShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Attentionworthy_BLOOMZ_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import 
AttentionworthyDataset -from arabic_llm_benchmark.models import BLOOMPetalModel -from arabic_llm_benchmark.tasks import AttentionworthyTask +from llmebench.datasets import AttentionworthyDataset +from llmebench.models import BLOOMPetalModel +from llmebench.tasks import AttentionworthyTask def config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Attentionworthy_ChatGPT_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Attentionworthy_ChatGPT_ZeroShot.py index d96ecb17..23d7f66c 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Attentionworthy_ChatGPT_ZeroShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Attentionworthy_ChatGPT_ZeroShot.py @@ -1,9 +1,9 @@ import os import re -from arabic_llm_benchmark.datasets import AttentionworthyDataset -from arabic_llm_benchmark.models import GPTModel, RandomGPTModel -from arabic_llm_benchmark.tasks import AttentionworthyTask +from llmebench.datasets import AttentionworthyDataset +from llmebench.models import GPTModel, RandomGPTModel +from llmebench.tasks import AttentionworthyTask def config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Attentionworthy_GPTChatCompletion_Fewshot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Attentionworthy_GPTChatCompletion_Fewshot.py index 75852f05..92c7a32b 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Attentionworthy_GPTChatCompletion_Fewshot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Attentionworthy_GPTChatCompletion_Fewshot.py @@ -2,9 +2,9 @@ import random import re -from arabic_llm_benchmark.datasets import AttentionworthyDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import AttentionworthyTask +from llmebench.datasets import AttentionworthyDataset +from llmebench.models import GPTChatCompletionModel +from 
llmebench.tasks import AttentionworthyTask random.seed(1333) diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Attentionworthy_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Attentionworthy_GPTChatCompletion_ZeroShot.py index 57018b43..69da7868 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Attentionworthy_GPTChatCompletion_ZeroShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Attentionworthy_GPTChatCompletion_ZeroShot.py @@ -2,9 +2,9 @@ import random import re -from arabic_llm_benchmark.datasets import AttentionworthyDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import AttentionworthyTask +from llmebench.datasets import AttentionworthyDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import AttentionworthyTask random.seed(1333) diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/COVClaimDetect_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/COVClaimDetect_BLOOMZ_ZeroShot.py index e23814b7..63be543d 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/COVClaimDetect_BLOOMZ_ZeroShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/COVClaimDetect_BLOOMZ_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import CovidClaimDataset -from arabic_llm_benchmark.models import BLOOMPetalModel -from arabic_llm_benchmark.tasks import ClaimDetectionTask +from llmebench.datasets import CovidClaimDataset +from llmebench.models import BLOOMPetalModel +from llmebench.tasks import ClaimDetectionTask def config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/COVClaimDetect_CGPT35_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/COVClaimDetect_CGPT35_ZeroShot.py index f71b72cf..f2107913 100644 
--- a/assets/benchmark_v1/factuality_disinformation_harmful_content/COVClaimDetect_CGPT35_ZeroShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/COVClaimDetect_CGPT35_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import CovidClaimDataset -from arabic_llm_benchmark.models import GPTModel -from arabic_llm_benchmark.tasks import ClaimDetectionTask +from llmebench.datasets import CovidClaimDataset +from llmebench.models import GPTModel +from llmebench.tasks import ClaimDetectionTask def config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/COVClaimDetect_GPT4_FewShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/COVClaimDetect_GPT4_FewShot.py index f9fda31f..8ed99f9a 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/COVClaimDetect_GPT4_FewShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/COVClaimDetect_GPT4_FewShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import CovidClaimDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import ClaimDetectionTask +from llmebench.datasets import CovidClaimDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import ClaimDetectionTask def config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/COVHarmfulDetect_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/COVHarmfulDetect_BLOOMZ_ZeroShot.py index 6f8dff3d..20cb07f7 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/COVHarmfulDetect_BLOOMZ_ZeroShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/COVHarmfulDetect_BLOOMZ_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import CovidHarmfulDataset -from arabic_llm_benchmark.models import BLOOMPetalModel -from arabic_llm_benchmark.tasks import 
HarmfulDetectionTask +from llmebench.datasets import CovidHarmfulDataset +from llmebench.models import BLOOMPetalModel +from llmebench.tasks import HarmfulDetectionTask def config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/COVHarmfulDetect_CGPT35_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/COVHarmfulDetect_CGPT35_ZeroShot.py index c1b6871d..fdcf21f5 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/COVHarmfulDetect_CGPT35_ZeroShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/COVHarmfulDetect_CGPT35_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import CovidHarmfulDataset -from arabic_llm_benchmark.models import GPTModel -from arabic_llm_benchmark.tasks import HarmfulDetectionTask +from llmebench.datasets import CovidHarmfulDataset +from llmebench.models import GPTModel +from llmebench.tasks import HarmfulDetectionTask def config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/COVHarmfulDetect_GPT4_FewShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/COVHarmfulDetect_GPT4_FewShot.py index 05f558c8..fe497189 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/COVHarmfulDetect_GPT4_FewShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/COVHarmfulDetect_GPT4_FewShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import CovidHarmfulDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import HarmfulDetectionTask +from llmebench.datasets import CovidHarmfulDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import HarmfulDetectionTask def config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_BGZeroShot.py 
b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_BGZeroShot.py index e94c24a8..10c2a3bf 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_BGZeroShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_BGZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import CheckworthinessDataset -from arabic_llm_benchmark.models import BLOOMPetalModel -from arabic_llm_benchmark.tasks import CheckworthinessTask +from llmebench.datasets import CheckworthinessDataset +from llmebench.models import BLOOMPetalModel +from llmebench.tasks import CheckworthinessTask def config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_ENZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_ENZeroShot.py index c3add3ff..5bf0e294 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_ENZeroShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_ENZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import CheckworthinessDataset -from arabic_llm_benchmark.models import BLOOMPetalModel -from arabic_llm_benchmark.tasks import CheckworthinessTask +from llmebench.datasets import CheckworthinessDataset +from llmebench.models import BLOOMPetalModel +from llmebench.tasks import CheckworthinessTask def config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_ESZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_ESZeroShot.py index f2242843..d8c1a989 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_ESZeroShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_ESZeroShot.py 
@@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import CheckworthinessDataset -from arabic_llm_benchmark.models import BLOOMPetalModel -from arabic_llm_benchmark.tasks import CheckworthinessTask +from llmebench.datasets import CheckworthinessDataset +from llmebench.models import BLOOMPetalModel +from llmebench.tasks import CheckworthinessTask def config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_NLZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_NLZeroShot.py index 82b72bce..06793d52 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_NLZeroShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_NLZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import CheckworthinessDataset -from arabic_llm_benchmark.models import BLOOMPetalModel -from arabic_llm_benchmark.tasks import CheckworthinessTask +from llmebench.datasets import CheckworthinessDataset +from llmebench.models import BLOOMPetalModel +from llmebench.tasks import CheckworthinessTask def config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_TRZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_TRZeroShot.py index c8b8650a..61338f04 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_TRZeroShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_TRZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import CheckworthinessDataset -from arabic_llm_benchmark.models import BLOOMPetalModel -from arabic_llm_benchmark.tasks import CheckworthinessTask +from llmebench.datasets import CheckworthinessDataset +from llmebench.models import BLOOMPetalModel +from llmebench.tasks import 
CheckworthinessTask def config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_ZeroShot.py index b6ee3933..7d7c512f 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_ZeroShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import CheckworthinessDataset -from arabic_llm_benchmark.models import BLOOMPetalModel -from arabic_llm_benchmark.tasks import CheckworthinessTask +from llmebench.datasets import CheckworthinessDataset +from llmebench.models import BLOOMPetalModel +from llmebench.tasks import CheckworthinessTask def config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_ChatGPT_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_ChatGPT_ZeroShot.py index 5ab232ea..5e60e8a5 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_ChatGPT_ZeroShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_ChatGPT_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import CheckworthinessDataset -from arabic_llm_benchmark.models import GPTModel, RandomGPTModel -from arabic_llm_benchmark.tasks import CheckworthinessTask +from llmebench.datasets import CheckworthinessDataset +from llmebench.models import GPTModel, RandomGPTModel +from llmebench.tasks import CheckworthinessTask def config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_BGFewShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_BGFewShot.py index 79607dbe..495d05d2 100644 --- 
a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_BGFewShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_BGFewShot.py @@ -1,9 +1,9 @@ import os import re -from arabic_llm_benchmark.datasets import CheckworthinessDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import CheckworthinessTask +from llmebench.datasets import CheckworthinessDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import CheckworthinessTask def config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_BGZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_BGZeroShot.py index 1fe1ba4b..2d118f14 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_BGZeroShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_BGZeroShot.py @@ -1,9 +1,9 @@ import os import re -from arabic_llm_benchmark.datasets import CheckworthinessDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import CheckworthinessTask +from llmebench.datasets import CheckworthinessDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import CheckworthinessTask def config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ENFewShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ENFewShot.py index 57c61e5a..d0b67175 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ENFewShot.py +++ 
b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ENFewShot.py @@ -1,9 +1,9 @@ import os import re -from arabic_llm_benchmark.datasets import CheckworthinessDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import CheckworthinessTask +from llmebench.datasets import CheckworthinessDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import CheckworthinessTask def config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ENZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ENZeroShot.py index 75800b2e..8cb18b32 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ENZeroShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ENZeroShot.py @@ -1,9 +1,9 @@ import os import re -from arabic_llm_benchmark.datasets import CheckworthinessDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import CheckworthinessTask +from llmebench.datasets import CheckworthinessDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import CheckworthinessTask def config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ESFewShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ESFewShot.py index 0d535e0b..e6099674 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ESFewShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ESFewShot.py @@ -1,9 +1,9 @@ import os import re -from arabic_llm_benchmark.datasets import 
CheckworthinessDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import CheckworthinessTask +from llmebench.datasets import CheckworthinessDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import CheckworthinessTask def config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ESZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ESZeroShot.py index eff5ead0..3ed072e6 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ESZeroShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ESZeroShot.py @@ -1,9 +1,9 @@ import os import re -from arabic_llm_benchmark.datasets import CheckworthinessDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import CheckworthinessTask +from llmebench.datasets import CheckworthinessDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import CheckworthinessTask def config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_FewShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_FewShot.py index 90a3adef..444fb4de 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_FewShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_FewShot.py @@ -2,9 +2,9 @@ import random import re -from arabic_llm_benchmark.datasets import CheckworthinessDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import CheckworthinessTask +from llmebench.datasets import CheckworthinessDataset +from 
llmebench.models import GPTChatCompletionModel +from llmebench.tasks import CheckworthinessTask random.seed(1333) diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_NLFewShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_NLFewShot.py index a79f6e33..7f241509 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_NLFewShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_NLFewShot.py @@ -1,9 +1,9 @@ import os import re -from arabic_llm_benchmark.datasets import CheckworthinessDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import CheckworthinessTask +from llmebench.datasets import CheckworthinessDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import CheckworthinessTask def config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_NLZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_NLZeroShot.py index 5b527b26..f52b350c 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_NLZeroShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_NLZeroShot.py @@ -1,9 +1,9 @@ import os import re -from arabic_llm_benchmark.datasets import CheckworthinessDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import CheckworthinessTask +from llmebench.datasets import CheckworthinessDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import CheckworthinessTask def config(): diff --git 
a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_TRFewShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_TRFewShot.py index decf403b..c1ae1b57 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_TRFewShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_TRFewShot.py @@ -1,9 +1,9 @@ import os import re -from arabic_llm_benchmark.datasets import CheckworthinessDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import CheckworthinessTask +from llmebench.datasets import CheckworthinessDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import CheckworthinessTask def config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_TRZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_TRZeroShot.py index f4e3fdb0..14ebc259 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_TRZeroShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_TRZeroShot.py @@ -1,9 +1,9 @@ import os import re -from arabic_llm_benchmark.datasets import CheckworthinessDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import CheckworthinessTask +from llmebench.datasets import CheckworthinessDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import CheckworthinessTask def config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ZeroShot.py 
b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ZeroShot.py index 71e0a98f..d90cb0ce 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ZeroShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ZeroShot.py @@ -2,9 +2,9 @@ import random import re -from arabic_llm_benchmark.datasets import CheckworthinessDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import CheckworthinessTask +from llmebench.datasets import CheckworthinessDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import CheckworthinessTask random.seed(1333) diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/ClaimDetectCOVID19_CGPT35_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/ClaimDetectCOVID19_CGPT35_ZeroShot.py index cdcf77fa..f8e2ba13 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/ClaimDetectCOVID19_CGPT35_ZeroShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/ClaimDetectCOVID19_CGPT35_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import CovidClaimDataset -from arabic_llm_benchmark.models import GPTModel -from arabic_llm_benchmark.tasks import ClaimDetectionTask +from llmebench.datasets import CovidClaimDataset +from llmebench.models import GPTModel +from llmebench.tasks import ClaimDetectionTask def config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/ClaimDetectCOVID19_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/ClaimDetectCOVID19_GPTChatCompletion_ZeroShot.py index eb83bac4..1e9c3c73 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/ClaimDetectCOVID19_GPTChatCompletion_ZeroShot.py +++ 
b/assets/benchmark_v1/factuality_disinformation_harmful_content/ClaimDetectCOVID19_GPTChatCompletion_ZeroShot.py @@ -2,9 +2,9 @@ import random import re -from arabic_llm_benchmark.datasets import CheckworthinessDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import CheckworthinessTask +from llmebench.datasets import CheckworthinessDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import CheckworthinessTask random.seed(1333) diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityCOVID19_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityCOVID19_BLOOMZ_ZeroShot.py index 66294b18..fa4dc0dd 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityCOVID19_BLOOMZ_ZeroShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityCOVID19_BLOOMZ_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import FactualityCOVID19Dataset -from arabic_llm_benchmark.models import BLOOMPetalModel -from arabic_llm_benchmark.tasks import FactualityCOVID19Task +from llmebench.datasets import FactualityCOVID19Dataset +from llmebench.models import BLOOMPetalModel +from llmebench.tasks import FactualityCOVID19Task def config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityCOVID19_GPTChatCompletion_FewShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityCOVID19_GPTChatCompletion_FewShot.py index 264d3207..e165251a 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityCOVID19_GPTChatCompletion_FewShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityCOVID19_GPTChatCompletion_FewShot.py @@ -2,9 +2,9 @@ import random import re -from arabic_llm_benchmark.datasets import FactualityCOVID19Dataset -from arabic_llm_benchmark.models import 
GPTChatCompletionModel -from arabic_llm_benchmark.tasks import FactualityCOVID19Task +from llmebench.datasets import FactualityCOVID19Dataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import FactualityCOVID19Task random.seed(1333) diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityCOVID19_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityCOVID19_GPTChatCompletion_ZeroShot.py index c11207c7..6b74eb7e 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityCOVID19_GPTChatCompletion_ZeroShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityCOVID19_GPTChatCompletion_ZeroShot.py @@ -2,9 +2,9 @@ import random import re -from arabic_llm_benchmark.datasets import FactualityCOVID19Dataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import FactualityCOVID19Task +from llmebench.datasets import FactualityCOVID19Dataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import FactualityCOVID19Task random.seed(1333) diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityKhouja20_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityKhouja20_BLOOMZ_ZeroShot.py index bc5b1389..7fd21752 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityKhouja20_BLOOMZ_ZeroShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityKhouja20_BLOOMZ_ZeroShot.py @@ -2,9 +2,9 @@ import random import re -from arabic_llm_benchmark.datasets import FactualityKhouja20Dataset -from arabic_llm_benchmark.models import BLOOMPetalModel -from arabic_llm_benchmark.tasks import FactualityKhouja20Task +from llmebench.datasets import FactualityKhouja20Dataset +from llmebench.models import BLOOMPetalModel +from llmebench.tasks import 
FactualityKhouja20Task random.seed(1333) diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityKhouja20_ChatGPT_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityKhouja20_ChatGPT_ZeroShot.py index a087781f..1dceb259 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityKhouja20_ChatGPT_ZeroShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityKhouja20_ChatGPT_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import FactualityKhouja20Dataset -from arabic_llm_benchmark.models import GPTModel, RandomGPTModel -from arabic_llm_benchmark.tasks import FactualityKhouja20Task +from llmebench.datasets import FactualityKhouja20Dataset +from llmebench.models import GPTModel, RandomGPTModel +from llmebench.tasks import FactualityKhouja20Task def config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityKhouja20_GPT4_FewShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityKhouja20_GPT4_FewShot.py index febb0e1c..82fb95ef 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityKhouja20_GPT4_FewShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityKhouja20_GPT4_FewShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import FactualityKhouja20Dataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import FactualityKhouja20Task +from llmebench.datasets import FactualityKhouja20Dataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import FactualityKhouja20Task def config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityKhouja20_GPT4_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityKhouja20_GPT4_ZeroShot.py index 8816f134..add924df 100644 --- 
a/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityKhouja20_GPT4_ZeroShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityKhouja20_GPT4_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import FactualityKhouja20Dataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import FactualityKhouja20Task +from llmebench.datasets import FactualityKhouja20Dataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import FactualityKhouja20Task def config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityUnifiedFC_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityUnifiedFC_BLOOMZ_ZeroShot.py index 606e401f..47d99663 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityUnifiedFC_BLOOMZ_ZeroShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityUnifiedFC_BLOOMZ_ZeroShot.py @@ -2,9 +2,9 @@ import random import re -from arabic_llm_benchmark.datasets import FactualityUnifiedFCDataset -from arabic_llm_benchmark.models import BLOOMPetalModel -from arabic_llm_benchmark.tasks import FactualityUnifiedFCTask +from llmebench.datasets import FactualityUnifiedFCDataset +from llmebench.models import BLOOMPetalModel +from llmebench.tasks import FactualityUnifiedFCTask random.seed(1333) diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityUnifiedFC_GPT4_FewShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityUnifiedFC_GPT4_FewShot.py index c5336d73..a85f1182 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityUnifiedFC_GPT4_FewShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityUnifiedFC_GPT4_FewShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import 
FactualityUnifiedFCDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import FactualityUnifiedFCTask +from llmebench.datasets import FactualityUnifiedFCDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import FactualityUnifiedFCTask def config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityUnifiedFC_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityUnifiedFC_GPTChatCompletion_ZeroShot.py index 4f592e02..0c1d21c2 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityUnifiedFC_GPTChatCompletion_ZeroShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/FactualityUnifiedFC_GPTChatCompletion_ZeroShot.py @@ -2,9 +2,9 @@ import random import re -from arabic_llm_benchmark.datasets import FactualityUnifiedFCDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import FactualityUnifiedFCTask +from llmebench.datasets import FactualityUnifiedFCDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import FactualityUnifiedFCTask random.seed(1333) diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/HarmfulDetectCOVID19_CGPT35_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/HarmfulDetectCOVID19_CGPT35_ZeroShot.py index 262ab383..8db951ae 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/HarmfulDetectCOVID19_CGPT35_ZeroShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/HarmfulDetectCOVID19_CGPT35_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import CovidHarmfulDataset -from arabic_llm_benchmark.models import GPTModel -from arabic_llm_benchmark.tasks import HarmfulDetectionTask +from llmebench.datasets import CovidHarmfulDataset +from 
llmebench.models import GPTModel +from llmebench.tasks import HarmfulDetectionTask def config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/HarmfulDetectCOVID19_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/HarmfulDetectCOVID19_GPTChatCompletion_ZeroShot.py index b814dc23..8b0813b0 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/HarmfulDetectCOVID19_GPTChatCompletion_ZeroShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/HarmfulDetectCOVID19_GPTChatCompletion_ZeroShot.py @@ -2,9 +2,9 @@ import random import re -from arabic_llm_benchmark.datasets import CheckworthinessDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import CheckworthinessTask +from llmebench.datasets import CheckworthinessDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import CheckworthinessTask random.seed(1333) diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/HateSpeech_ChatGPT_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/HateSpeech_ChatGPT_ZeroShot.py index f3ded397..2cdd6740 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/HateSpeech_ChatGPT_ZeroShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/HateSpeech_ChatGPT_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import HateSpeechDataset -from arabic_llm_benchmark.models import GPTModel, RandomGPTModel -from arabic_llm_benchmark.tasks import HateSpeechTask +from llmebench.datasets import HateSpeechDataset +from llmebench.models import GPTModel, RandomGPTModel +from llmebench.tasks import HateSpeechTask def config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/HateSpeech_GPTChatCompletion_FewShot.py 
b/assets/benchmark_v1/factuality_disinformation_harmful_content/HateSpeech_GPTChatCompletion_FewShot.py index 4c07b90e..28e883e8 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/HateSpeech_GPTChatCompletion_FewShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/HateSpeech_GPTChatCompletion_FewShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import HateSpeechDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import HateSpeechTask +from llmebench.datasets import HateSpeechDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import HateSpeechTask def config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Offensive_GPTChatCompletion_FewShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Offensive_GPTChatCompletion_FewShot.py index 1e76972f..044abf2e 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Offensive_GPTChatCompletion_FewShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Offensive_GPTChatCompletion_FewShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import OffensiveDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import OffensiveTask +from llmebench.datasets import OffensiveDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import OffensiveTask def config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_ENZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_ENZeroShot.py index ff0942f5..6fa517a7 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_ENZeroShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_ENZeroShot.py @@ 
-1,9 +1,9 @@ import os import re -from arabic_llm_benchmark.datasets import PropagandaSemEval23Dataset -from arabic_llm_benchmark.models import BLOOMPetalModel -from arabic_llm_benchmark.tasks import PropagandaMultilabelSemEval23Task +from llmebench.datasets import PropagandaSemEval23Dataset +from llmebench.models import BLOOMPetalModel +from llmebench.tasks import PropagandaMultilabelSemEval23Task def config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_FRZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_FRZeroShot.py index f4716374..cc58c362 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_FRZeroShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_FRZeroShot.py @@ -1,9 +1,9 @@ import os import re -from arabic_llm_benchmark.datasets import PropagandaSemEval23Dataset -from arabic_llm_benchmark.models import BLOOMPetalModel -from arabic_llm_benchmark.tasks import PropagandaMultilabelSemEval23Task +from llmebench.datasets import PropagandaSemEval23Dataset +from llmebench.models import BLOOMPetalModel +from llmebench.tasks import PropagandaMultilabelSemEval23Task def config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_GEZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_GEZeroShot.py index b57e14d9..7f38863d 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_GEZeroShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_GEZeroShot.py @@ -1,9 +1,9 @@ import os import re -from arabic_llm_benchmark.datasets import PropagandaSemEval23Dataset -from arabic_llm_benchmark.models import BLOOMPetalModel -from arabic_llm_benchmark.tasks import PropagandaMultilabelSemEval23Task +from llmebench.datasets 
import PropagandaSemEval23Dataset +from llmebench.models import BLOOMPetalModel +from llmebench.tasks import PropagandaMultilabelSemEval23Task def config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_ITZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_ITZeroShot.py index ec09905a..9103c03c 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_ITZeroShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_ITZeroShot.py @@ -1,9 +1,9 @@ import os import re -from arabic_llm_benchmark.datasets import PropagandaSemEval23Dataset -from arabic_llm_benchmark.models import BLOOMPetalModel -from arabic_llm_benchmark.tasks import PropagandaMultilabelSemEval23Task +from llmebench.datasets import PropagandaSemEval23Dataset +from llmebench.models import BLOOMPetalModel +from llmebench.tasks import PropagandaMultilabelSemEval23Task def config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_POZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_POZeroShot.py index e25f4b3b..4fca8321 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_POZeroShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_POZeroShot.py @@ -1,9 +1,9 @@ import os import re -from arabic_llm_benchmark.datasets import PropagandaSemEval23Dataset -from arabic_llm_benchmark.models import BLOOMPetalModel -from arabic_llm_benchmark.tasks import PropagandaMultilabelSemEval23Task +from llmebench.datasets import PropagandaSemEval23Dataset +from llmebench.models import BLOOMPetalModel +from llmebench.tasks import PropagandaMultilabelSemEval23Task def config(): diff --git 
a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_RUZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_RUZeroShot.py index 27acdbf8..9b3114e6 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_RUZeroShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_RUZeroShot.py @@ -1,9 +1,9 @@ import os import re -from arabic_llm_benchmark.datasets import PropagandaSemEval23Dataset -from arabic_llm_benchmark.models import BLOOMPetalModel -from arabic_llm_benchmark.tasks import PropagandaMultilabelSemEval23Task +from llmebench.datasets import PropagandaSemEval23Dataset +from llmebench.models import BLOOMPetalModel +from llmebench.tasks import PropagandaMultilabelSemEval23Task def config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_ZeroShot.py index 40d0e5d5..0247eaa6 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_ZeroShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_BLOOMZ_ZeroShot.py @@ -2,9 +2,9 @@ import random import re -from arabic_llm_benchmark.datasets import PropagandaTweetDataset -from arabic_llm_benchmark.models import BLOOMPetalModel -from arabic_llm_benchmark.tasks import PropagandaMultilabelTask +from llmebench.datasets import PropagandaTweetDataset +from llmebench.models import BLOOMPetalModel +from llmebench.tasks import PropagandaMultilabelTask random.seed(1333) diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_CGPT35_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_CGPT35_ZeroShot.py index a3052a44..b9ad7f81 100644 --- 
a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_CGPT35_ZeroShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_CGPT35_ZeroShot.py @@ -2,9 +2,9 @@ import regex as re -from arabic_llm_benchmark.datasets import PropagandaTweetDataset -from arabic_llm_benchmark.models import GPTModel -from arabic_llm_benchmark.tasks import PropagandaMultilabelTask +from llmebench.datasets import PropagandaTweetDataset +from llmebench.models import GPTModel +from llmebench.tasks import PropagandaMultilabelTask def config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_ENFewShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_ENFewShot.py index 00fe1d12..e7f33f39 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_ENFewShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_ENFewShot.py @@ -1,9 +1,9 @@ import os import re -from arabic_llm_benchmark.datasets import PropagandaSemEval23Dataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import PropagandaMultilabelSemEval23Task +from llmebench.datasets import PropagandaSemEval23Dataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import PropagandaMultilabelSemEval23Task def config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_ENZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_ENZeroShot.py index 8dba8ba8..590ca1ab 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_ENZeroShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_ENZeroShot.py @@ -2,9 +2,9 @@ import regex as re -from arabic_llm_benchmark.datasets import PropagandaSemEval23Dataset 
-from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import PropagandaMultilabelSemEval23Task +from llmebench.datasets import PropagandaSemEval23Dataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import PropagandaMultilabelSemEval23Task def config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_FRFewShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_FRFewShot.py index cd74909d..2c642c7f 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_FRFewShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_FRFewShot.py @@ -1,9 +1,9 @@ import os import re -from arabic_llm_benchmark.datasets import PropagandaSemEval23Dataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import PropagandaMultilabelSemEval23Task +from llmebench.datasets import PropagandaSemEval23Dataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import PropagandaMultilabelSemEval23Task def config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_FRZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_FRZeroShot.py index bd3baa14..6c7c4dc7 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_FRZeroShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_FRZeroShot.py @@ -2,9 +2,9 @@ import regex as re -from arabic_llm_benchmark.datasets import PropagandaSemEval23Dataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import PropagandaMultilabelSemEval23Task +from llmebench.datasets import PropagandaSemEval23Dataset +from llmebench.models import GPTChatCompletionModel 
+from llmebench.tasks import PropagandaMultilabelSemEval23Task def config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_GEFewShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_GEFewShot.py index edeb07ed..70523f8e 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_GEFewShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_GEFewShot.py @@ -1,9 +1,9 @@ import os import re -from arabic_llm_benchmark.datasets import PropagandaSemEval23Dataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import PropagandaMultilabelSemEval23Task +from llmebench.datasets import PropagandaSemEval23Dataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import PropagandaMultilabelSemEval23Task def config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_GEZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_GEZeroShot.py index 92e2779e..e7836e14 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_GEZeroShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_GEZeroShot.py @@ -2,9 +2,9 @@ import regex as re -from arabic_llm_benchmark.datasets import PropagandaSemEval23Dataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import PropagandaMultilabelSemEval23Task +from llmebench.datasets import PropagandaSemEval23Dataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import PropagandaMultilabelSemEval23Task def config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_ITFewShot.py 
b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_ITFewShot.py index 1a061438..2b782ca3 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_ITFewShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_ITFewShot.py @@ -1,9 +1,9 @@ import os import re -from arabic_llm_benchmark.datasets import PropagandaSemEval23Dataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import PropagandaMultilabelSemEval23Task +from llmebench.datasets import PropagandaSemEval23Dataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import PropagandaMultilabelSemEval23Task def config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_ITZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_ITZeroShot.py index 9c1aeda3..511ca715 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_ITZeroShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_ITZeroShot.py @@ -2,9 +2,9 @@ import regex as re -from arabic_llm_benchmark.datasets import PropagandaSemEval23Dataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import PropagandaMultilabelSemEval23Task +from llmebench.datasets import PropagandaSemEval23Dataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import PropagandaMultilabelSemEval23Task def config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_POFewShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_POFewShot.py index c716bbda..d9447aaa 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_POFewShot.py +++ 
b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_POFewShot.py @@ -1,9 +1,9 @@ import os import re -from arabic_llm_benchmark.datasets import PropagandaSemEval23Dataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import PropagandaMultilabelSemEval23Task +from llmebench.datasets import PropagandaSemEval23Dataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import PropagandaMultilabelSemEval23Task def config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_POZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_POZeroShot.py index d5699927..cc0a834b 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_POZeroShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_POZeroShot.py @@ -2,9 +2,9 @@ import regex as re -from arabic_llm_benchmark.datasets import PropagandaSemEval23Dataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import PropagandaMultilabelSemEval23Task +from llmebench.datasets import PropagandaSemEval23Dataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import PropagandaMultilabelSemEval23Task def config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_RUFewShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_RUFewShot.py index 75cdeac2..d4abcaac 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_RUFewShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_RUFewShot.py @@ -1,9 +1,9 @@ import os import re -from arabic_llm_benchmark.datasets import PropagandaSemEval23Dataset -from arabic_llm_benchmark.models import 
GPTChatCompletionModel -from arabic_llm_benchmark.tasks import PropagandaMultilabelSemEval23Task +from llmebench.datasets import PropagandaSemEval23Dataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import PropagandaMultilabelSemEval23Task def config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_RUZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_RUZeroShot.py index e1366663..e2557173 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_RUZeroShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_RUZeroShot.py @@ -2,9 +2,9 @@ import regex as re -from arabic_llm_benchmark.datasets import PropagandaSemEval23Dataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import PropagandaMultilabelSemEval23Task +from llmebench.datasets import PropagandaSemEval23Dataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import PropagandaMultilabelSemEval23Task def config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_ZeroShot.py index a34dc865..5311470f 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_ZeroShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPT4_ZeroShot.py @@ -2,9 +2,9 @@ import regex as re -from arabic_llm_benchmark.datasets import PropagandaTweetDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import PropagandaMultilabelTask +from llmebench.datasets import PropagandaTweetDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import PropagandaMultilabelTask def 
config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPTChatCompletion_FewShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPTChatCompletion_FewShot.py index 22d0c25c..0b1f913b 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPTChatCompletion_FewShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPTChatCompletion_FewShot.py @@ -2,9 +2,9 @@ import random import re -from arabic_llm_benchmark.datasets import PropagandaTweetDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import PropagandaMultilabelTask +from llmebench.datasets import PropagandaTweetDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import PropagandaMultilabelTask random.seed(1333) diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPTChatCompletion_ZeroShot.py index 9cd765d7..271385fe 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPTChatCompletion_ZeroShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/PropMultilabel_GPTChatCompletion_ZeroShot.py @@ -2,9 +2,9 @@ import random import re -from arabic_llm_benchmark.datasets import PropagandaTweetDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import PropagandaMultilabelTask +from llmebench.datasets import PropagandaTweetDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import PropagandaMultilabelTask random.seed(1333) diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Subjectivity_BLOOMZ_ZeroShot.py 
b/assets/benchmark_v1/factuality_disinformation_harmful_content/Subjectivity_BLOOMZ_ZeroShot.py index 9f95128f..63fa49b4 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Subjectivity_BLOOMZ_ZeroShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Subjectivity_BLOOMZ_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import SubjectivityDataset -from arabic_llm_benchmark.models import BLOOMPetalModel -from arabic_llm_benchmark.tasks import SubjectivityTask +from llmebench.datasets import SubjectivityDataset +from llmebench.models import BLOOMPetalModel +from llmebench.tasks import SubjectivityTask def config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Subjectivity_ChatGPT_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Subjectivity_ChatGPT_ZeroShot.py index 8aa0d8b8..ac41046e 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Subjectivity_ChatGPT_ZeroShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Subjectivity_ChatGPT_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import SubjectivityDataset -from arabic_llm_benchmark.models import GPTModel, RandomGPTModel -from arabic_llm_benchmark.tasks import SubjectivityTask +from llmebench.datasets import SubjectivityDataset +from llmebench.models import GPTModel, RandomGPTModel +from llmebench.tasks import SubjectivityTask def config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Subjectivity_GPTChatCompletion_FewShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Subjectivity_GPTChatCompletion_FewShot.py index 7e25e6de..93859b9f 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Subjectivity_GPTChatCompletion_FewShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Subjectivity_GPTChatCompletion_FewShot.py @@ -1,8 +1,8 @@ 
import os -from arabic_llm_benchmark.datasets import SubjectivityDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import SubjectivityTask +from llmebench.datasets import SubjectivityDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import SubjectivityTask def config(): diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Subjectivity_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Subjectivity_GPTChatCompletion_ZeroShot.py index 79cce313..95c46f73 100644 --- a/assets/benchmark_v1/factuality_disinformation_harmful_content/Subjectivity_GPTChatCompletion_ZeroShot.py +++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Subjectivity_GPTChatCompletion_ZeroShot.py @@ -2,9 +2,9 @@ import random import re -from arabic_llm_benchmark.datasets import SubjectivityDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import SubjectivityTask +from llmebench.datasets import SubjectivityDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import SubjectivityTask random.seed(1333) diff --git a/assets/benchmark_v1/news_categorization/NewsCat_ASND_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/news_categorization/NewsCat_ASND_BLOOMZ_ZeroShot.py index 0d55e1ef..48f34c4c 100644 --- a/assets/benchmark_v1/news_categorization/NewsCat_ASND_BLOOMZ_ZeroShot.py +++ b/assets/benchmark_v1/news_categorization/NewsCat_ASND_BLOOMZ_ZeroShot.py @@ -1,9 +1,9 @@ import os import random -from arabic_llm_benchmark.datasets import NewsCatASNDDataset -from arabic_llm_benchmark.models import BLOOMPetalModel -from arabic_llm_benchmark.tasks import NewsCatASNDTask +from llmebench.datasets import NewsCatASNDDataset +from llmebench.models import BLOOMPetalModel +from llmebench.tasks import NewsCatASNDTask random.seed(1333) diff --git 
a/assets/benchmark_v1/news_categorization/NewsCat_ASND_ChatGPT_ZeroShot.py b/assets/benchmark_v1/news_categorization/NewsCat_ASND_ChatGPT_ZeroShot.py index 14decb4a..b08edeec 100644 --- a/assets/benchmark_v1/news_categorization/NewsCat_ASND_ChatGPT_ZeroShot.py +++ b/assets/benchmark_v1/news_categorization/NewsCat_ASND_ChatGPT_ZeroShot.py @@ -1,9 +1,9 @@ import os import random -from arabic_llm_benchmark.datasets import NewsCatASNDDataset -from arabic_llm_benchmark.models import GPTModel, RandomGPTModel -from arabic_llm_benchmark.tasks import NewsCatASNDTask +from llmebench.datasets import NewsCatASNDDataset +from llmebench.models import GPTModel, RandomGPTModel +from llmebench.tasks import NewsCatASNDTask random.seed(1333) diff --git a/assets/benchmark_v1/news_categorization/NewsCat_ASND_GPTChatCompletion_FewShot.py b/assets/benchmark_v1/news_categorization/NewsCat_ASND_GPTChatCompletion_FewShot.py index df934a29..b1d0059e 100644 --- a/assets/benchmark_v1/news_categorization/NewsCat_ASND_GPTChatCompletion_FewShot.py +++ b/assets/benchmark_v1/news_categorization/NewsCat_ASND_GPTChatCompletion_FewShot.py @@ -1,9 +1,9 @@ import os import random -from arabic_llm_benchmark.datasets import NewsCatASNDDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import NewsCatASNDTask +from llmebench.datasets import NewsCatASNDDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import NewsCatASNDTask random.seed(1333) diff --git a/assets/benchmark_v1/news_categorization/NewsCat_ASND_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/news_categorization/NewsCat_ASND_GPTChatCompletion_ZeroShot.py index b2a6f30a..67c0c625 100644 --- a/assets/benchmark_v1/news_categorization/NewsCat_ASND_GPTChatCompletion_ZeroShot.py +++ b/assets/benchmark_v1/news_categorization/NewsCat_ASND_GPTChatCompletion_ZeroShot.py @@ -1,9 +1,9 @@ import os import random -from arabic_llm_benchmark.datasets import 
NewsCatASNDDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import NewsCatASNDTask +from llmebench.datasets import NewsCatASNDDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import NewsCatASNDTask random.seed(1333) diff --git a/assets/benchmark_v1/news_categorization/NewsCat_Akhbarona_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/news_categorization/NewsCat_Akhbarona_BLOOMZ_ZeroShot.py index 7d4b193b..a23188ff 100644 --- a/assets/benchmark_v1/news_categorization/NewsCat_Akhbarona_BLOOMZ_ZeroShot.py +++ b/assets/benchmark_v1/news_categorization/NewsCat_Akhbarona_BLOOMZ_ZeroShot.py @@ -1,9 +1,9 @@ import os import random -from arabic_llm_benchmark.datasets import NewsCatAkhbaronaDataset -from arabic_llm_benchmark.models import BLOOMPetalModel -from arabic_llm_benchmark.tasks import NewsCatAkhbaronaTask +from llmebench.datasets import NewsCatAkhbaronaDataset +from llmebench.models import BLOOMPetalModel +from llmebench.tasks import NewsCatAkhbaronaTask random.seed(1333) diff --git a/assets/benchmark_v1/news_categorization/NewsCat_Akhbarona_ChatGPT_ZeroShot.py b/assets/benchmark_v1/news_categorization/NewsCat_Akhbarona_ChatGPT_ZeroShot.py index 6fd6ea90..aa44e595 100644 --- a/assets/benchmark_v1/news_categorization/NewsCat_Akhbarona_ChatGPT_ZeroShot.py +++ b/assets/benchmark_v1/news_categorization/NewsCat_Akhbarona_ChatGPT_ZeroShot.py @@ -1,9 +1,9 @@ import os import random -from arabic_llm_benchmark.datasets import NewsCatAkhbaronaDataset -from arabic_llm_benchmark.models import GPTModel -from arabic_llm_benchmark.tasks import NewsCatAkhbaronaTask +from llmebench.datasets import NewsCatAkhbaronaDataset +from llmebench.models import GPTModel +from llmebench.tasks import NewsCatAkhbaronaTask random.seed(1333) diff --git a/assets/benchmark_v1/news_categorization/NewsCat_Akhbarona_GPTChatCompletion_FewShot.py 
b/assets/benchmark_v1/news_categorization/NewsCat_Akhbarona_GPTChatCompletion_FewShot.py index 76c0ad68..698e26f6 100644 --- a/assets/benchmark_v1/news_categorization/NewsCat_Akhbarona_GPTChatCompletion_FewShot.py +++ b/assets/benchmark_v1/news_categorization/NewsCat_Akhbarona_GPTChatCompletion_FewShot.py @@ -1,9 +1,9 @@ import os import random -from arabic_llm_benchmark.datasets import NewsCatAkhbaronaDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import NewsCatAkhbaronaTask +from llmebench.datasets import NewsCatAkhbaronaDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import NewsCatAkhbaronaTask random.seed(1333) diff --git a/assets/benchmark_v1/news_categorization/NewsCat_Akhbarona_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/news_categorization/NewsCat_Akhbarona_GPTChatCompletion_ZeroShot.py index 5ec5f3a9..c029118d 100644 --- a/assets/benchmark_v1/news_categorization/NewsCat_Akhbarona_GPTChatCompletion_ZeroShot.py +++ b/assets/benchmark_v1/news_categorization/NewsCat_Akhbarona_GPTChatCompletion_ZeroShot.py @@ -1,9 +1,9 @@ import os import random -from arabic_llm_benchmark.datasets import NewsCatAkhbaronaDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import NewsCatAkhbaronaTask +from llmebench.datasets import NewsCatAkhbaronaDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import NewsCatAkhbaronaTask random.seed(1333) diff --git a/assets/benchmark_v1/news_categorization/NewsCat_AlArabiya_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/news_categorization/NewsCat_AlArabiya_BLOOMZ_ZeroShot.py index 4a4f5613..1ce28d74 100644 --- a/assets/benchmark_v1/news_categorization/NewsCat_AlArabiya_BLOOMZ_ZeroShot.py +++ b/assets/benchmark_v1/news_categorization/NewsCat_AlArabiya_BLOOMZ_ZeroShot.py @@ -1,9 +1,9 @@ import os import random -from arabic_llm_benchmark.datasets import 
NewsCatAlArabiyaDataset -from arabic_llm_benchmark.models import BLOOMPetalModel -from arabic_llm_benchmark.tasks import NewsCatAlArabiyaTask +from llmebench.datasets import NewsCatAlArabiyaDataset +from llmebench.models import BLOOMPetalModel +from llmebench.tasks import NewsCatAlArabiyaTask random.seed(1333) diff --git a/assets/benchmark_v1/news_categorization/NewsCat_AlArabiya_ChatGPT_ZeroShot.py b/assets/benchmark_v1/news_categorization/NewsCat_AlArabiya_ChatGPT_ZeroShot.py index 70df073f..458761f2 100644 --- a/assets/benchmark_v1/news_categorization/NewsCat_AlArabiya_ChatGPT_ZeroShot.py +++ b/assets/benchmark_v1/news_categorization/NewsCat_AlArabiya_ChatGPT_ZeroShot.py @@ -1,9 +1,9 @@ import os import random -from arabic_llm_benchmark.datasets import NewsCatAlArabiyaDataset -from arabic_llm_benchmark.models import GPTModel -from arabic_llm_benchmark.tasks import NewsCatAlArabiyaTask +from llmebench.datasets import NewsCatAlArabiyaDataset +from llmebench.models import GPTModel +from llmebench.tasks import NewsCatAlArabiyaTask random.seed(1333) diff --git a/assets/benchmark_v1/news_categorization/NewsCat_AlArabiya_GPTChatCompletion_FewShot.py b/assets/benchmark_v1/news_categorization/NewsCat_AlArabiya_GPTChatCompletion_FewShot.py index 70fa21f6..6f407dd5 100644 --- a/assets/benchmark_v1/news_categorization/NewsCat_AlArabiya_GPTChatCompletion_FewShot.py +++ b/assets/benchmark_v1/news_categorization/NewsCat_AlArabiya_GPTChatCompletion_FewShot.py @@ -1,9 +1,9 @@ import os import random -from arabic_llm_benchmark.datasets import NewsCatAlArabiyaDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import NewsCatAlArabiyaTask +from llmebench.datasets import NewsCatAlArabiyaDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import NewsCatAlArabiyaTask random.seed(1333) diff --git a/assets/benchmark_v1/news_categorization/NewsCat_AlArabiya_GPTChatCompletion_ZeroShot.py 
b/assets/benchmark_v1/news_categorization/NewsCat_AlArabiya_GPTChatCompletion_ZeroShot.py index 6f3a8b2f..333352ec 100644 --- a/assets/benchmark_v1/news_categorization/NewsCat_AlArabiya_GPTChatCompletion_ZeroShot.py +++ b/assets/benchmark_v1/news_categorization/NewsCat_AlArabiya_GPTChatCompletion_ZeroShot.py @@ -1,9 +1,9 @@ import os import random -from arabic_llm_benchmark.datasets import NewsCatAlArabiyaDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import NewsCatAlArabiyaTask +from llmebench.datasets import NewsCatAlArabiyaDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import NewsCatAlArabiyaTask random.seed(1333) diff --git a/assets/benchmark_v1/news_categorization/NewsCat_AlKhaleej_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/news_categorization/NewsCat_AlKhaleej_BLOOMZ_ZeroShot.py index 4a7a5863..03516c58 100644 --- a/assets/benchmark_v1/news_categorization/NewsCat_AlKhaleej_BLOOMZ_ZeroShot.py +++ b/assets/benchmark_v1/news_categorization/NewsCat_AlKhaleej_BLOOMZ_ZeroShot.py @@ -1,9 +1,9 @@ import os import random -from arabic_llm_benchmark.datasets import NewsCatAlArabiyaDataset -from arabic_llm_benchmark.models import BLOOMPetalModel -from arabic_llm_benchmark.tasks import NewsCatAlArabiyaTask +from llmebench.datasets import NewsCatAlArabiyaDataset +from llmebench.models import BLOOMPetalModel +from llmebench.tasks import NewsCatAlArabiyaTask random.seed(1333) diff --git a/assets/benchmark_v1/news_categorization/NewsCat_AlKhaleej_ChatGPT_ZeroShot.py b/assets/benchmark_v1/news_categorization/NewsCat_AlKhaleej_ChatGPT_ZeroShot.py index 23d4b7a6..63ae2363 100644 --- a/assets/benchmark_v1/news_categorization/NewsCat_AlKhaleej_ChatGPT_ZeroShot.py +++ b/assets/benchmark_v1/news_categorization/NewsCat_AlKhaleej_ChatGPT_ZeroShot.py @@ -1,9 +1,9 @@ import os import random -from arabic_llm_benchmark.datasets import NewsCatAlKhaleejDataset -from arabic_llm_benchmark.models import 
GPTModel -from arabic_llm_benchmark.tasks import NewsCatAlKhaleejTask +from llmebench.datasets import NewsCatAlKhaleejDataset +from llmebench.models import GPTModel +from llmebench.tasks import NewsCatAlKhaleejTask random.seed(1333) diff --git a/assets/benchmark_v1/news_categorization/NewsCat_AlKhaleej_GPTChatCompletion_FewShot.py b/assets/benchmark_v1/news_categorization/NewsCat_AlKhaleej_GPTChatCompletion_FewShot.py index 672fd9fc..c95f9f36 100644 --- a/assets/benchmark_v1/news_categorization/NewsCat_AlKhaleej_GPTChatCompletion_FewShot.py +++ b/assets/benchmark_v1/news_categorization/NewsCat_AlKhaleej_GPTChatCompletion_FewShot.py @@ -1,9 +1,9 @@ import os import random -from arabic_llm_benchmark.datasets import NewsCatAlKhaleejDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import NewsCatAlKhaleejTask +from llmebench.datasets import NewsCatAlKhaleejDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import NewsCatAlKhaleejTask random.seed(1333) diff --git a/assets/benchmark_v1/news_categorization/NewsCat_AlKhaleej_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/news_categorization/NewsCat_AlKhaleej_GPTChatCompletion_ZeroShot.py index ee4db9e8..b7f37f16 100644 --- a/assets/benchmark_v1/news_categorization/NewsCat_AlKhaleej_GPTChatCompletion_ZeroShot.py +++ b/assets/benchmark_v1/news_categorization/NewsCat_AlKhaleej_GPTChatCompletion_ZeroShot.py @@ -1,9 +1,9 @@ import os import random -from arabic_llm_benchmark.datasets import NewsCatAlKhaleejDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import NewsCatAlKhaleejTask +from llmebench.datasets import NewsCatAlKhaleejDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import NewsCatAlKhaleejTask random.seed(1333) diff --git a/assets/benchmark_v1/sarcasm/ArSarcasm2_GPT3_Zeroshot.py b/assets/benchmark_v1/sarcasm/ArSarcasm2_GPT3_Zeroshot.py 
index f08a6396..45aceebb 100644 --- a/assets/benchmark_v1/sarcasm/ArSarcasm2_GPT3_Zeroshot.py +++ b/assets/benchmark_v1/sarcasm/ArSarcasm2_GPT3_Zeroshot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import ArSarcasmDataset -from arabic_llm_benchmark.models import GPTModel -from arabic_llm_benchmark.tasks import SarcasmTask +from llmebench.datasets import ArSarcasmDataset +from llmebench.models import GPTModel +from llmebench.tasks import SarcasmTask def config(): diff --git a/assets/benchmark_v1/sarcasm/ArSarcasm2_GPT4_FewShot.py b/assets/benchmark_v1/sarcasm/ArSarcasm2_GPT4_FewShot.py index 8005b477..7cbaab33 100644 --- a/assets/benchmark_v1/sarcasm/ArSarcasm2_GPT4_FewShot.py +++ b/assets/benchmark_v1/sarcasm/ArSarcasm2_GPT4_FewShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import ArSarcasmDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import SarcasmTask +from llmebench.datasets import ArSarcasmDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import SarcasmTask def config(): diff --git a/assets/benchmark_v1/sarcasm/ArSarcasm2_GPT4_Zeroshot.py b/assets/benchmark_v1/sarcasm/ArSarcasm2_GPT4_Zeroshot.py index 935aab4e..a408546b 100644 --- a/assets/benchmark_v1/sarcasm/ArSarcasm2_GPT4_Zeroshot.py +++ b/assets/benchmark_v1/sarcasm/ArSarcasm2_GPT4_Zeroshot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import ArSarcasmDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import SarcasmTask +from llmebench.datasets import ArSarcasmDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import SarcasmTask def config(): diff --git a/assets/benchmark_v1/sarcasm/ArSarcasm_BLOOMZ_Zeroshot.py b/assets/benchmark_v1/sarcasm/ArSarcasm_BLOOMZ_Zeroshot.py index 40d7d016..27e123b2 100644 --- a/assets/benchmark_v1/sarcasm/ArSarcasm_BLOOMZ_Zeroshot.py +++ 
b/assets/benchmark_v1/sarcasm/ArSarcasm_BLOOMZ_Zeroshot.py @@ -1,10 +1,10 @@ import os -from arabic_llm_benchmark.datasets import ArSarcasmDataset +from llmebench.datasets import ArSarcasmDataset -from arabic_llm_benchmark.models import BLOOMPetalModel +from llmebench.models import BLOOMPetalModel -from arabic_llm_benchmark.tasks import SarcasmTask +from llmebench.tasks import SarcasmTask def config(): diff --git a/assets/benchmark_v1/sarcasm/ArSarcasm_GPT3_Zeroshot.py b/assets/benchmark_v1/sarcasm/ArSarcasm_GPT3_Zeroshot.py index d4f2752f..781d36ea 100644 --- a/assets/benchmark_v1/sarcasm/ArSarcasm_GPT3_Zeroshot.py +++ b/assets/benchmark_v1/sarcasm/ArSarcasm_GPT3_Zeroshot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import ArSarcasmDataset -from arabic_llm_benchmark.models import GPTModel -from arabic_llm_benchmark.tasks import SarcasmTask +from llmebench.datasets import ArSarcasmDataset +from llmebench.models import GPTModel +from llmebench.tasks import SarcasmTask def config(): diff --git a/assets/benchmark_v1/sarcasm/ArSarcasm_GPT4_Fewshot.py b/assets/benchmark_v1/sarcasm/ArSarcasm_GPT4_Fewshot.py index 5ec5cf1e..9b984d8c 100644 --- a/assets/benchmark_v1/sarcasm/ArSarcasm_GPT4_Fewshot.py +++ b/assets/benchmark_v1/sarcasm/ArSarcasm_GPT4_Fewshot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import ArSarcasmDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import SarcasmTask +from llmebench.datasets import ArSarcasmDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import SarcasmTask def config(): diff --git a/assets/benchmark_v1/sarcasm/ArSarcasm_GPT4_Zeroshot.py b/assets/benchmark_v1/sarcasm/ArSarcasm_GPT4_Zeroshot.py index f8a01ad1..f95798c0 100644 --- a/assets/benchmark_v1/sarcasm/ArSarcasm_GPT4_Zeroshot.py +++ b/assets/benchmark_v1/sarcasm/ArSarcasm_GPT4_Zeroshot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import 
ArSarcasmDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import SarcasmTask +from llmebench.datasets import ArSarcasmDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import SarcasmTask def config(): diff --git a/assets/benchmark_v1/semantics/XNLI_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/semantics/XNLI_BLOOMZ_ZeroShot.py index c892056d..0c9a88ad 100644 --- a/assets/benchmark_v1/semantics/XNLI_BLOOMZ_ZeroShot.py +++ b/assets/benchmark_v1/semantics/XNLI_BLOOMZ_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import XNLIDataset -from arabic_llm_benchmark.models import BLOOMPetalModel -from arabic_llm_benchmark.tasks import XNLITask +from llmebench.datasets import XNLIDataset +from llmebench.models import BLOOMPetalModel +from llmebench.tasks import XNLITask def config(): diff --git a/assets/benchmark_v1/semantics/XNLI_CGPT4_FewShot.py b/assets/benchmark_v1/semantics/XNLI_CGPT4_FewShot.py index 556a5bc5..67a6b2a1 100644 --- a/assets/benchmark_v1/semantics/XNLI_CGPT4_FewShot.py +++ b/assets/benchmark_v1/semantics/XNLI_CGPT4_FewShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import XNLIDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import XNLITask +from llmebench.datasets import XNLIDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import XNLITask def config(): diff --git a/assets/benchmark_v1/semantics/XNLI_CGPT4_ZeroShot.py b/assets/benchmark_v1/semantics/XNLI_CGPT4_ZeroShot.py index 494a3281..f3aae913 100644 --- a/assets/benchmark_v1/semantics/XNLI_CGPT4_ZeroShot.py +++ b/assets/benchmark_v1/semantics/XNLI_CGPT4_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import XNLIDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import XNLITask +from llmebench.datasets 
import XNLIDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import XNLITask def config(): diff --git a/assets/benchmark_v1/semantics/XNLI_ChatGPT_ZeroShot.py b/assets/benchmark_v1/semantics/XNLI_ChatGPT_ZeroShot.py index 12956be9..1f5126d8 100644 --- a/assets/benchmark_v1/semantics/XNLI_ChatGPT_ZeroShot.py +++ b/assets/benchmark_v1/semantics/XNLI_ChatGPT_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import XNLIDataset -from arabic_llm_benchmark.models import GPTModel -from arabic_llm_benchmark.tasks import XNLITask +from llmebench.datasets import XNLIDataset +from llmebench.models import GPTModel +from llmebench.tasks import XNLITask def config(): diff --git a/assets/benchmark_v1/sentiment/emotion/Emotion_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/sentiment/emotion/Emotion_BLOOMZ_ZeroShot.py index 95bdf634..77fc394d 100644 --- a/assets/benchmark_v1/sentiment/emotion/Emotion_BLOOMZ_ZeroShot.py +++ b/assets/benchmark_v1/sentiment/emotion/Emotion_BLOOMZ_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import EmotionDataset -from arabic_llm_benchmark.models import BLOOMPetalModel -from arabic_llm_benchmark.tasks import EmotionTask +from llmebench.datasets import EmotionDataset +from llmebench.models import BLOOMPetalModel +from llmebench.tasks import EmotionTask def config(): diff --git a/assets/benchmark_v1/sentiment/emotion/Emotion_ChatGPT_ZeroShot.py b/assets/benchmark_v1/sentiment/emotion/Emotion_ChatGPT_ZeroShot.py index e48dccf8..aadd03d2 100644 --- a/assets/benchmark_v1/sentiment/emotion/Emotion_ChatGPT_ZeroShot.py +++ b/assets/benchmark_v1/sentiment/emotion/Emotion_ChatGPT_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import EmotionDataset -from arabic_llm_benchmark.models import GPTModel, RandomGPTModel -from arabic_llm_benchmark.tasks import EmotionTask +from llmebench.datasets import EmotionDataset +from llmebench.models import GPTModel, 
RandomGPTModel +from llmebench.tasks import EmotionTask def config(): diff --git a/assets/benchmark_v1/sentiment/emotion/Emotion_GPTChatCompletion_FewShot.py b/assets/benchmark_v1/sentiment/emotion/Emotion_GPTChatCompletion_FewShot.py index 98dba6ab..1f5b0e1d 100644 --- a/assets/benchmark_v1/sentiment/emotion/Emotion_GPTChatCompletion_FewShot.py +++ b/assets/benchmark_v1/sentiment/emotion/Emotion_GPTChatCompletion_FewShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import EmotionDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import EmotionTask +from llmebench.datasets import EmotionDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import EmotionTask def config(): diff --git a/assets/benchmark_v1/sentiment/emotion/Emotion_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/sentiment/emotion/Emotion_GPTChatCompletion_ZeroShot.py index a936f89f..89a18cd2 100644 --- a/assets/benchmark_v1/sentiment/emotion/Emotion_GPTChatCompletion_ZeroShot.py +++ b/assets/benchmark_v1/sentiment/emotion/Emotion_GPTChatCompletion_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import EmotionDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import EmotionTask +from llmebench.datasets import EmotionDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import EmotionTask def config(): diff --git a/assets/benchmark_v1/sentiment/offensive/Offensive_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/sentiment/offensive/Offensive_BLOOMZ_ZeroShot.py index b6ea7fa8..965bde7b 100644 --- a/assets/benchmark_v1/sentiment/offensive/Offensive_BLOOMZ_ZeroShot.py +++ b/assets/benchmark_v1/sentiment/offensive/Offensive_BLOOMZ_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import OffensiveDataset -from arabic_llm_benchmark.models import BLOOMPetalModel -from 
arabic_llm_benchmark.tasks import OffensiveTask +from llmebench.datasets import OffensiveDataset +from llmebench.models import BLOOMPetalModel +from llmebench.tasks import OffensiveTask def config(): diff --git a/assets/benchmark_v1/sentiment/offensive/Offensive_ChatGPT_ZeroShot.py b/assets/benchmark_v1/sentiment/offensive/Offensive_ChatGPT_ZeroShot.py index 2e044114..8752749f 100644 --- a/assets/benchmark_v1/sentiment/offensive/Offensive_ChatGPT_ZeroShot.py +++ b/assets/benchmark_v1/sentiment/offensive/Offensive_ChatGPT_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import OffensiveDataset -from arabic_llm_benchmark.models import GPTModel -from arabic_llm_benchmark.tasks import OffensiveTask +from llmebench.datasets import OffensiveDataset +from llmebench.models import GPTModel +from llmebench.tasks import OffensiveTask def config(): diff --git a/assets/benchmark_v1/sentiment/offensive/Offensive_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/sentiment/offensive/Offensive_GPTChatCompletion_ZeroShot.py index bbae54b8..d29b2d92 100644 --- a/assets/benchmark_v1/sentiment/offensive/Offensive_GPTChatCompletion_ZeroShot.py +++ b/assets/benchmark_v1/sentiment/offensive/Offensive_GPTChatCompletion_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import OffensiveDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import OffensiveTask +from llmebench.datasets import OffensiveDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import OffensiveTask def config(): diff --git a/assets/benchmark_v1/sentiment/sentiment/ArSASSentiment_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/sentiment/sentiment/ArSASSentiment_BLOOMZ_ZeroShot.py index 4f811ee2..57dd78e2 100644 --- a/assets/benchmark_v1/sentiment/sentiment/ArSASSentiment_BLOOMZ_ZeroShot.py +++ b/assets/benchmark_v1/sentiment/sentiment/ArSASSentiment_BLOOMZ_ZeroShot.py @@ -1,8 +1,8 @@ import os 
-from arabic_llm_benchmark.datasets import ArSASSentimentDataset -from arabic_llm_benchmark.models import BLOOMPetalModel -from arabic_llm_benchmark.tasks import SentimentTask +from llmebench.datasets import ArSASSentimentDataset +from llmebench.models import BLOOMPetalModel +from llmebench.tasks import SentimentTask def config(): diff --git a/assets/benchmark_v1/sentiment/sentiment/ArSASSentiment_ChatGPT_ZeroShot.py b/assets/benchmark_v1/sentiment/sentiment/ArSASSentiment_ChatGPT_ZeroShot.py index 507fde3a..7fac34f6 100644 --- a/assets/benchmark_v1/sentiment/sentiment/ArSASSentiment_ChatGPT_ZeroShot.py +++ b/assets/benchmark_v1/sentiment/sentiment/ArSASSentiment_ChatGPT_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import ArSASSentimentDataset -from arabic_llm_benchmark.models import GPTModel -from arabic_llm_benchmark.tasks import SentimentTask +from llmebench.datasets import ArSASSentimentDataset +from llmebench.models import GPTModel +from llmebench.tasks import SentimentTask def config(): diff --git a/assets/benchmark_v1/sentiment/sentiment/ArSASSentiment_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/sentiment/sentiment/ArSASSentiment_GPTChatCompletion_ZeroShot.py index 2756384d..b9220fb9 100644 --- a/assets/benchmark_v1/sentiment/sentiment/ArSASSentiment_GPTChatCompletion_ZeroShot.py +++ b/assets/benchmark_v1/sentiment/sentiment/ArSASSentiment_GPTChatCompletion_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import ArSASSentimentDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import SentimentTask +from llmebench.datasets import ArSASSentimentDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import SentimentTask def config(): diff --git a/assets/benchmark_v1/sentiment/sentiment/BanglaSentiment_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/sentiment/sentiment/BanglaSentiment_BLOOMZ_ZeroShot.py index 04b877d8..0a40eb64 
100644 --- a/assets/benchmark_v1/sentiment/sentiment/BanglaSentiment_BLOOMZ_ZeroShot.py +++ b/assets/benchmark_v1/sentiment/sentiment/BanglaSentiment_BLOOMZ_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import BanglaSentimentDataset -from arabic_llm_benchmark.models import BLOOMPetalModel -from arabic_llm_benchmark.tasks import SentimentTask +from llmebench.datasets import BanglaSentimentDataset +from llmebench.models import BLOOMPetalModel +from llmebench.tasks import SentimentTask def config(): diff --git a/assets/benchmark_v1/sentiment/sentiment/BanglaSentiment_GPT4_FewShot.py b/assets/benchmark_v1/sentiment/sentiment/BanglaSentiment_GPT4_FewShot.py index d6d028fc..cfbfea94 100644 --- a/assets/benchmark_v1/sentiment/sentiment/BanglaSentiment_GPT4_FewShot.py +++ b/assets/benchmark_v1/sentiment/sentiment/BanglaSentiment_GPT4_FewShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import BanglaSentimentDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import SentimentTask +from llmebench.datasets import BanglaSentimentDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import SentimentTask def config(): diff --git a/assets/benchmark_v1/sentiment/sentiment/BanglaSentiment_GPT4_ZeroShot.py b/assets/benchmark_v1/sentiment/sentiment/BanglaSentiment_GPT4_ZeroShot.py index 59e14216..78e0fa42 100644 --- a/assets/benchmark_v1/sentiment/sentiment/BanglaSentiment_GPT4_ZeroShot.py +++ b/assets/benchmark_v1/sentiment/sentiment/BanglaSentiment_GPT4_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import BanglaSentimentDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import SentimentTask +from llmebench.datasets import BanglaSentimentDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import SentimentTask def config(): diff --git 
a/assets/benchmark_v1/sentiment/spam/Spam_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/sentiment/spam/Spam_BLOOMZ_ZeroShot.py index abcea764..0d970b40 100644 --- a/assets/benchmark_v1/sentiment/spam/Spam_BLOOMZ_ZeroShot.py +++ b/assets/benchmark_v1/sentiment/spam/Spam_BLOOMZ_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import SpamDataset -from arabic_llm_benchmark.models import BLOOMPetalModel -from arabic_llm_benchmark.tasks import SpamTask +from llmebench.datasets import SpamDataset +from llmebench.models import BLOOMPetalModel +from llmebench.tasks import SpamTask def config(): diff --git a/assets/benchmark_v1/sentiment/spam/Spam_ChatGPT_ZeroShot.py b/assets/benchmark_v1/sentiment/spam/Spam_ChatGPT_ZeroShot.py index 2271d6a4..ba0e4b30 100644 --- a/assets/benchmark_v1/sentiment/spam/Spam_ChatGPT_ZeroShot.py +++ b/assets/benchmark_v1/sentiment/spam/Spam_ChatGPT_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import SpamDataset -from arabic_llm_benchmark.models import GPTModel, RandomGPTModel -from arabic_llm_benchmark.tasks import SpamTask +from llmebench.datasets import SpamDataset +from llmebench.models import GPTModel, RandomGPTModel +from llmebench.tasks import SpamTask def config(): diff --git a/assets/benchmark_v1/sentiment/spam/Spam_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/sentiment/spam/Spam_GPTChatCompletion_ZeroShot.py index 679e37a2..5c124c19 100644 --- a/assets/benchmark_v1/sentiment/spam/Spam_GPTChatCompletion_ZeroShot.py +++ b/assets/benchmark_v1/sentiment/spam/Spam_GPTChatCompletion_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import SpamDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import SpamTask +from llmebench.datasets import SpamDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import SpamTask def config(): diff --git 
a/assets/benchmark_v1/sentiment_emotion_others/StanceKhouja20_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/sentiment_emotion_others/StanceKhouja20_BLOOMZ_ZeroShot.py index c89e9638..76f29a4d 100644 --- a/assets/benchmark_v1/sentiment_emotion_others/StanceKhouja20_BLOOMZ_ZeroShot.py +++ b/assets/benchmark_v1/sentiment_emotion_others/StanceKhouja20_BLOOMZ_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import StanceKhouja20Dataset -from arabic_llm_benchmark.models import BLOOMPetalModel -from arabic_llm_benchmark.tasks import StanceKhouja20Task +from llmebench.datasets import StanceKhouja20Dataset +from llmebench.models import BLOOMPetalModel +from llmebench.tasks import StanceKhouja20Task def config(): diff --git a/assets/benchmark_v1/sentiment_emotion_others/StanceKhouja20_ChatGPT_ZeroShot.py b/assets/benchmark_v1/sentiment_emotion_others/StanceKhouja20_ChatGPT_ZeroShot.py index d4a7eef1..d4fbdb93 100644 --- a/assets/benchmark_v1/sentiment_emotion_others/StanceKhouja20_ChatGPT_ZeroShot.py +++ b/assets/benchmark_v1/sentiment_emotion_others/StanceKhouja20_ChatGPT_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import StanceKhouja20Dataset -from arabic_llm_benchmark.models import GPTModel, RandomGPTModel -from arabic_llm_benchmark.tasks import StanceKhouja20Task +from llmebench.datasets import StanceKhouja20Dataset +from llmebench.models import GPTModel, RandomGPTModel +from llmebench.tasks import StanceKhouja20Task def config(): diff --git a/assets/benchmark_v1/sentiment_emotion_others/StanceKhouja20_GPTChatCompletion_FewShot.py b/assets/benchmark_v1/sentiment_emotion_others/StanceKhouja20_GPTChatCompletion_FewShot.py index 76e7aa87..1caa14ef 100644 --- a/assets/benchmark_v1/sentiment_emotion_others/StanceKhouja20_GPTChatCompletion_FewShot.py +++ b/assets/benchmark_v1/sentiment_emotion_others/StanceKhouja20_GPTChatCompletion_FewShot.py @@ -1,9 +1,9 @@ import os import re -from arabic_llm_benchmark.datasets import 
StanceKhouja20Dataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import StanceKhouja20Task +from llmebench.datasets import StanceKhouja20Dataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import StanceKhouja20Task def config(): diff --git a/assets/benchmark_v1/sentiment_emotion_others/StanceKhouja20_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/sentiment_emotion_others/StanceKhouja20_GPTChatCompletion_ZeroShot.py index ad281ab5..3958ccb6 100644 --- a/assets/benchmark_v1/sentiment_emotion_others/StanceKhouja20_GPTChatCompletion_ZeroShot.py +++ b/assets/benchmark_v1/sentiment_emotion_others/StanceKhouja20_GPTChatCompletion_ZeroShot.py @@ -1,9 +1,9 @@ import os import re -from arabic_llm_benchmark.datasets import StanceKhouja20Dataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import StanceKhouja20Task +from llmebench.datasets import StanceKhouja20Dataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import StanceKhouja20Task def config(): diff --git a/assets/benchmark_v1/sentiment_emotion_others/StanceUnifiedFC_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/sentiment_emotion_others/StanceUnifiedFC_BLOOMZ_ZeroShot.py index d82969db..9ac0bb5a 100644 --- a/assets/benchmark_v1/sentiment_emotion_others/StanceUnifiedFC_BLOOMZ_ZeroShot.py +++ b/assets/benchmark_v1/sentiment_emotion_others/StanceUnifiedFC_BLOOMZ_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import StanceUnifiedFCDataset -from arabic_llm_benchmark.models import BLOOMPetalModel -from arabic_llm_benchmark.tasks import StanceUnifiedFCTask +from llmebench.datasets import StanceUnifiedFCDataset +from llmebench.models import BLOOMPetalModel +from llmebench.tasks import StanceUnifiedFCTask def config(): diff --git a/assets/benchmark_v1/sentiment_emotion_others/StanceUnifiedFC_ChatGPT_ZeroShot.py 
b/assets/benchmark_v1/sentiment_emotion_others/StanceUnifiedFC_ChatGPT_ZeroShot.py index 0a273f01..8c0f0568 100644 --- a/assets/benchmark_v1/sentiment_emotion_others/StanceUnifiedFC_ChatGPT_ZeroShot.py +++ b/assets/benchmark_v1/sentiment_emotion_others/StanceUnifiedFC_ChatGPT_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import StanceUnifiedFCDataset -from arabic_llm_benchmark.models import GPTModel, RandomGPTModel -from arabic_llm_benchmark.tasks import StanceUnifiedFCTask +from llmebench.datasets import StanceUnifiedFCDataset +from llmebench.models import GPTModel, RandomGPTModel +from llmebench.tasks import StanceUnifiedFCTask def config(): diff --git a/assets/benchmark_v1/sentiment_emotion_others/StanceUnifiedFC_GPTChatCompletion_FewShot.py b/assets/benchmark_v1/sentiment_emotion_others/StanceUnifiedFC_GPTChatCompletion_FewShot.py index 1fb02e37..3d27a3d5 100644 --- a/assets/benchmark_v1/sentiment_emotion_others/StanceUnifiedFC_GPTChatCompletion_FewShot.py +++ b/assets/benchmark_v1/sentiment_emotion_others/StanceUnifiedFC_GPTChatCompletion_FewShot.py @@ -1,9 +1,9 @@ import os import re -from arabic_llm_benchmark.datasets import StanceUnifiedFCDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import StanceUnifiedFCTask +from llmebench.datasets import StanceUnifiedFCDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import StanceUnifiedFCTask def config(): diff --git a/assets/benchmark_v1/sentiment_emotion_others/StanceUnifiedFC_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/sentiment_emotion_others/StanceUnifiedFC_GPTChatCompletion_ZeroShot.py index f06901a3..a435a768 100644 --- a/assets/benchmark_v1/sentiment_emotion_others/StanceUnifiedFC_GPTChatCompletion_ZeroShot.py +++ b/assets/benchmark_v1/sentiment_emotion_others/StanceUnifiedFC_GPTChatCompletion_ZeroShot.py @@ -1,9 +1,9 @@ import os import re -from arabic_llm_benchmark.datasets import 
StanceUnifiedFCDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import StanceUnifiedFCTask +from llmebench.datasets import StanceUnifiedFCDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import StanceUnifiedFCTask def config(): diff --git a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectADI_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectADI_BLOOMZ_ZeroShot.py index 602253da..1fbdfa10 100644 --- a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectADI_BLOOMZ_ZeroShot.py +++ b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectADI_BLOOMZ_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import DialectADIDataset -from arabic_llm_benchmark.models import BLOOMPetalModel -from arabic_llm_benchmark.tasks import DialectIDTask +from llmebench.datasets import DialectADIDataset +from llmebench.models import BLOOMPetalModel +from llmebench.tasks import DialectIDTask def config(): diff --git a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectADI_ChatGPT_ZeroShot.py b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectADI_ChatGPT_ZeroShot.py index f3ba850c..8e243727 100644 --- a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectADI_ChatGPT_ZeroShot.py +++ b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectADI_ChatGPT_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import DialectADIDataset -from arabic_llm_benchmark.models import GPTModel, RandomGPTModel -from arabic_llm_benchmark.tasks import DialectIDTask +from llmebench.datasets import DialectADIDataset +from llmebench.models import GPTModel, RandomGPTModel +from llmebench.tasks import DialectIDTask def config(): diff --git a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectADI_GPTChatCompletion_FewShot.py b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectADI_GPTChatCompletion_FewShot.py index 
705db4fa..ccfe05f2 100644 --- a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectADI_GPTChatCompletion_FewShot.py +++ b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectADI_GPTChatCompletion_FewShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import DialectADIDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel, RandomGPTModel -from arabic_llm_benchmark.tasks import DialectIDTask +from llmebench.datasets import DialectADIDataset +from llmebench.models import GPTChatCompletionModel, RandomGPTModel +from llmebench.tasks import DialectIDTask def config(): diff --git a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectADI_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectADI_GPTChatCompletion_ZeroShot.py index a4c12bb1..6cf4d539 100644 --- a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectADI_GPTChatCompletion_ZeroShot.py +++ b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectADI_GPTChatCompletion_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import DialectADIDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel, RandomGPTModel -from arabic_llm_benchmark.tasks import DialectIDTask +from llmebench.datasets import DialectADIDataset +from llmebench.models import GPTChatCompletionModel, RandomGPTModel +from llmebench.tasks import DialectIDTask def config(): diff --git a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectID_QADI_ChatGPT_ZeroShot.py b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectID_QADI_ChatGPT_ZeroShot.py index cc4b3138..dc87039e 100644 --- a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectID_QADI_ChatGPT_ZeroShot.py +++ b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectID_QADI_ChatGPT_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import QADIDataset -from arabic_llm_benchmark.models import GPTModel, RandomGPTModel -from 
arabic_llm_benchmark.tasks import DialectIDTask +from llmebench.datasets import QADIDataset +from llmebench.models import GPTModel, RandomGPTModel +from llmebench.tasks import DialectIDTask def config(): diff --git a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectID_QADI_GPTChatCompletion_ZeroShot.py b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectID_QADI_GPTChatCompletion_ZeroShot.py index 60071dff..44254560 100644 --- a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectID_QADI_GPTChatCompletion_ZeroShot.py +++ b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectID_QADI_GPTChatCompletion_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import QADIDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import DialectIDTask +from llmebench.datasets import QADIDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import DialectIDTask def config(): diff --git a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/POS_ChatGPT_ZeroShot.py b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/POS_ChatGPT_ZeroShot.py index 64eef7bd..69f4fe5c 100644 --- a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/POS_ChatGPT_ZeroShot.py +++ b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/POS_ChatGPT_ZeroShot.py @@ -1,9 +1,9 @@ import os import re -from arabic_llm_benchmark.datasets import ArabicPOSDataset -from arabic_llm_benchmark.models import GPTModel, RandomGPTModel -from arabic_llm_benchmark.tasks import ArabicPOSTask +from llmebench.datasets import ArabicPOSDataset +from llmebench.models import GPTModel, RandomGPTModel +from llmebench.tasks import ArabicPOSTask mapTags = { "UNK": "UNK", diff --git a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/POS_GPT4_ZeroShot.py b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/POS_GPT4_ZeroShot.py index 7934dec0..18db4d89 100644 --- a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/POS_GPT4_ZeroShot.py +++ 
b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/POS_GPT4_ZeroShot.py @@ -1,9 +1,9 @@ import os import re -from arabic_llm_benchmark.datasets import ArabicPOSDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import ArabicPOSTask +from llmebench.datasets import ArabicPOSDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import ArabicPOSTask mapTags = { "UNK": "UNK", diff --git a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/POS_GPTChatCompletion_FewShot.py b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/POS_GPTChatCompletion_FewShot.py index 761d13ae..70cf5815 100644 --- a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/POS_GPTChatCompletion_FewShot.py +++ b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/POS_GPTChatCompletion_FewShot.py @@ -1,9 +1,9 @@ import os import re -from arabic_llm_benchmark.datasets import ArabicPOSDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import ArabicPOSTask +from llmebench.datasets import ArabicPOSDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import ArabicPOSTask mapTags = { "UNK": "UNK", diff --git a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/diacritization_ChatGPT_ZeroShot.py b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/diacritization_ChatGPT_ZeroShot.py index cf8637a7..0634ab80 100644 --- a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/diacritization_ChatGPT_ZeroShot.py +++ b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/diacritization_ChatGPT_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import ArabicDiacritizationDataset -from arabic_llm_benchmark.models import GPTModel -from arabic_llm_benchmark.tasks import ArabicDiacritizationTask +from llmebench.datasets import ArabicDiacritizationDataset +from llmebench.models import GPTModel +from llmebench.tasks import ArabicDiacritizationTask def 
config(): diff --git a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/lemmatization/Lemmatization_ChatGPT_ZeroShot.py b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/lemmatization/Lemmatization_ChatGPT_ZeroShot.py index ceae7cc9..9950ccac 100644 --- a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/lemmatization/Lemmatization_ChatGPT_ZeroShot.py +++ b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/lemmatization/Lemmatization_ChatGPT_ZeroShot.py @@ -1,8 +1,8 @@ import os -from arabic_llm_benchmark.datasets import LemmatizationDataset -from arabic_llm_benchmark.models import GPTModel, RandomGPTModel -from arabic_llm_benchmark.tasks import LemmatizationTask +from llmebench.datasets import LemmatizationDataset +from llmebench.models import GPTModel, RandomGPTModel +from llmebench.tasks import LemmatizationTask def config(): diff --git a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/parsing_ChatGPT_ZeroShot.py b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/parsing_ChatGPT_ZeroShot.py index 6a093fb4..77a175b7 100644 --- a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/parsing_ChatGPT_ZeroShot.py +++ b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/parsing_ChatGPT_ZeroShot.py @@ -1,9 +1,9 @@ import os import re -from arabic_llm_benchmark.datasets import ArabicParsingDataset -from arabic_llm_benchmark.models import GPTModel, RandomGPTModel -from arabic_llm_benchmark.tasks import ArabicParsingTask +from llmebench.datasets import ArabicParsingDataset +from llmebench.models import GPTModel, RandomGPTModel +from llmebench.tasks import ArabicParsingTask def config(): diff --git a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/parsing_GPT4_ZeroShot.py b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/parsing_GPT4_ZeroShot.py index 9cdfb871..36e3d02f 100644 --- a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/parsing_GPT4_ZeroShot.py +++ b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/parsing_GPT4_ZeroShot.py @@ -1,9 +1,9 @@ import os import re -from 
arabic_llm_benchmark.datasets import ArabicParsingDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import ArabicParsingTask +from llmebench.datasets import ArabicParsingDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import ArabicParsingTask def config(): diff --git a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/segmentation_ChatGPT_ZeroShot.py b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/segmentation_ChatGPT_ZeroShot.py index b0830844..d808ce15 100644 --- a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/segmentation_ChatGPT_ZeroShot.py +++ b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/segmentation_ChatGPT_ZeroShot.py @@ -1,9 +1,9 @@ import os import re -from arabic_llm_benchmark.datasets import ArabicSegmentationDataset -from arabic_llm_benchmark.models import GPTModel, RandomGPTModel -from arabic_llm_benchmark.tasks import ArabicSegmentationTask +from llmebench.datasets import ArabicSegmentationDataset +from llmebench.models import GPTModel, RandomGPTModel +from llmebench.tasks import ArabicSegmentationTask def config(): diff --git a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/segmentation_GPT4_ZeroShot.py b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/segmentation_GPT4_ZeroShot.py index d64163ba..e6a3fc8d 100644 --- a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/segmentation_GPT4_ZeroShot.py +++ b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/segmentation_GPT4_ZeroShot.py @@ -1,9 +1,9 @@ import os import re -from arabic_llm_benchmark.datasets import ArabicSegmentationDataset -from arabic_llm_benchmark.models import GPTChatCompletionModel -from arabic_llm_benchmark.tasks import ArabicSegmentationTask +from llmebench.datasets import ArabicSegmentationDataset +from llmebench.models import GPTChatCompletionModel +from llmebench.tasks import ArabicSegmentationTask def config(): diff --git a/arabic_llm_benchmark/__init__.py 
b/llmebench/__init__.py similarity index 100% rename from arabic_llm_benchmark/__init__.py rename to llmebench/__init__.py diff --git a/arabic_llm_benchmark/__main__.py b/llmebench/__main__.py similarity index 100% rename from arabic_llm_benchmark/__main__.py rename to llmebench/__main__.py diff --git a/arabic_llm_benchmark/benchmark.py b/llmebench/benchmark.py similarity index 100% rename from arabic_llm_benchmark/benchmark.py rename to llmebench/benchmark.py diff --git a/arabic_llm_benchmark/datasets/ANERcorp.py b/llmebench/datasets/ANERcorp.py similarity index 98% rename from arabic_llm_benchmark/datasets/ANERcorp.py rename to llmebench/datasets/ANERcorp.py index d0a77fbb..a9c07b28 100644 --- a/arabic_llm_benchmark/datasets/ANERcorp.py +++ b/llmebench/datasets/ANERcorp.py @@ -1,4 +1,4 @@ -from arabic_llm_benchmark.datasets.dataset_base import DatasetBase +from llmebench.datasets.dataset_base import DatasetBase class ANERcorpDataset(DatasetBase): diff --git a/arabic_llm_benchmark/datasets/ARCD.py b/llmebench/datasets/ARCD.py similarity index 88% rename from arabic_llm_benchmark/datasets/ARCD.py rename to llmebench/datasets/ARCD.py index 78db8772..25d2a00b 100644 --- a/arabic_llm_benchmark/datasets/ARCD.py +++ b/llmebench/datasets/ARCD.py @@ -1,6 +1,6 @@ import json -from arabic_llm_benchmark.datasets.SQuADBase import SQuADBase +from llmebench.datasets.SQuADBase import SQuADBase class ARCDDataset(SQuADBase): diff --git a/arabic_llm_benchmark/datasets/Adult.py b/llmebench/datasets/Adult.py similarity index 95% rename from arabic_llm_benchmark/datasets/Adult.py rename to llmebench/datasets/Adult.py index 7ca6c81f..521ed1db 100644 --- a/arabic_llm_benchmark/datasets/Adult.py +++ b/llmebench/datasets/Adult.py @@ -1,4 +1,4 @@ -from arabic_llm_benchmark.datasets.dataset_base import DatasetBase +from llmebench.datasets.dataset_base import DatasetBase class AdultDataset(DatasetBase): diff --git a/arabic_llm_benchmark/datasets/Aqmar.py b/llmebench/datasets/Aqmar.py 
similarity index 98% rename from arabic_llm_benchmark/datasets/Aqmar.py rename to llmebench/datasets/Aqmar.py index 0afd5e5a..0c1409f5 100644 --- a/arabic_llm_benchmark/datasets/Aqmar.py +++ b/llmebench/datasets/Aqmar.py @@ -1,6 +1,6 @@ from pathlib import Path -from arabic_llm_benchmark.datasets.dataset_base import DatasetBase +from llmebench.datasets.dataset_base import DatasetBase class AqmarDataset(DatasetBase): diff --git a/arabic_llm_benchmark/datasets/ArSASSentiment.py b/llmebench/datasets/ArSASSentiment.py similarity index 92% rename from arabic_llm_benchmark/datasets/ArSASSentiment.py rename to llmebench/datasets/ArSASSentiment.py index 4d162bfc..2a56c6fb 100644 --- a/arabic_llm_benchmark/datasets/ArSASSentiment.py +++ b/llmebench/datasets/ArSASSentiment.py @@ -1,4 +1,4 @@ -from arabic_llm_benchmark.datasets.dataset_base import DatasetBase +from llmebench.datasets.dataset_base import DatasetBase class ArSASSentimentDataset(DatasetBase): diff --git a/arabic_llm_benchmark/datasets/ArSarcasm.py b/llmebench/datasets/ArSarcasm.py similarity index 95% rename from arabic_llm_benchmark/datasets/ArSarcasm.py rename to llmebench/datasets/ArSarcasm.py index 5c9fab15..ff8da9a2 100644 --- a/arabic_llm_benchmark/datasets/ArSarcasm.py +++ b/llmebench/datasets/ArSarcasm.py @@ -1,6 +1,6 @@ import csv -from arabic_llm_benchmark.datasets.dataset_base import DatasetBase +from llmebench.datasets.dataset_base import DatasetBase class ArSarcasmDataset(DatasetBase): diff --git a/arabic_llm_benchmark/datasets/AraBench.py b/llmebench/datasets/AraBench.py similarity index 96% rename from arabic_llm_benchmark/datasets/AraBench.py rename to llmebench/datasets/AraBench.py index 6900db4a..83220e8f 100644 --- a/arabic_llm_benchmark/datasets/AraBench.py +++ b/llmebench/datasets/AraBench.py @@ -1,4 +1,4 @@ -from arabic_llm_benchmark.datasets.dataset_base import DatasetBase +from llmebench.datasets.dataset_base import DatasetBase class AraBenchDataset(DatasetBase): diff --git 
a/arabic_llm_benchmark/datasets/ArabGend.py b/llmebench/datasets/ArabGend.py similarity index 93% rename from arabic_llm_benchmark/datasets/ArabGend.py rename to llmebench/datasets/ArabGend.py index db576ce9..537e1a91 100644 --- a/arabic_llm_benchmark/datasets/ArabGend.py +++ b/llmebench/datasets/ArabGend.py @@ -1,4 +1,4 @@ -from arabic_llm_benchmark.datasets.dataset_base import DatasetBase +from llmebench.datasets.dataset_base import DatasetBase class ArabGendDataset(DatasetBase): diff --git a/arabic_llm_benchmark/datasets/ArabicDiacritization.py b/llmebench/datasets/ArabicDiacritization.py similarity index 95% rename from arabic_llm_benchmark/datasets/ArabicDiacritization.py rename to llmebench/datasets/ArabicDiacritization.py index 2d43b14c..0372b8fd 100644 --- a/arabic_llm_benchmark/datasets/ArabicDiacritization.py +++ b/llmebench/datasets/ArabicDiacritization.py @@ -1,4 +1,4 @@ -from arabic_llm_benchmark.datasets.dataset_base import DatasetBase +from llmebench.datasets.dataset_base import DatasetBase class ArabicDiacritizationDataset(DatasetBase): diff --git a/arabic_llm_benchmark/datasets/ArabicPOS.py b/llmebench/datasets/ArabicPOS.py similarity index 95% rename from arabic_llm_benchmark/datasets/ArabicPOS.py rename to llmebench/datasets/ArabicPOS.py index 1e141610..d7448cf3 100644 --- a/arabic_llm_benchmark/datasets/ArabicPOS.py +++ b/llmebench/datasets/ArabicPOS.py @@ -1,4 +1,4 @@ -from arabic_llm_benchmark.datasets.dataset_base import DatasetBase +from llmebench.datasets.dataset_base import DatasetBase class ArabicPOSDataset(DatasetBase): diff --git a/arabic_llm_benchmark/datasets/ArabicParsing.py b/llmebench/datasets/ArabicParsing.py similarity index 96% rename from arabic_llm_benchmark/datasets/ArabicParsing.py rename to llmebench/datasets/ArabicParsing.py index 66fe2f15..c5868e5c 100644 --- a/arabic_llm_benchmark/datasets/ArabicParsing.py +++ b/llmebench/datasets/ArabicParsing.py @@ -1,4 +1,4 @@ -from arabic_llm_benchmark.datasets.dataset_base import 
DatasetBase +from llmebench.datasets.dataset_base import DatasetBase class ArabicParsingDataset(DatasetBase): diff --git a/arabic_llm_benchmark/datasets/ArabicSegmentation.py b/llmebench/datasets/ArabicSegmentation.py similarity index 96% rename from arabic_llm_benchmark/datasets/ArabicSegmentation.py rename to llmebench/datasets/ArabicSegmentation.py index 76fa272d..e7c12394 100644 --- a/arabic_llm_benchmark/datasets/ArabicSegmentation.py +++ b/llmebench/datasets/ArabicSegmentation.py @@ -1,4 +1,4 @@ -from arabic_llm_benchmark.datasets.dataset_base import DatasetBase +from llmebench.datasets.dataset_base import DatasetBase class ArabicSegmentationDataset(DatasetBase): diff --git a/arabic_llm_benchmark/datasets/ArapTweet.py b/llmebench/datasets/ArapTweet.py similarity index 97% rename from arabic_llm_benchmark/datasets/ArapTweet.py rename to llmebench/datasets/ArapTweet.py index de73e42e..ad67986c 100644 --- a/arabic_llm_benchmark/datasets/ArapTweet.py +++ b/llmebench/datasets/ArapTweet.py @@ -1,4 +1,4 @@ -from arabic_llm_benchmark.datasets.dataset_base import DatasetBase +from llmebench.datasets.dataset_base import DatasetBase class ArapTweetDataset(DatasetBase): diff --git a/arabic_llm_benchmark/datasets/Attentionworthy.py b/llmebench/datasets/Attentionworthy.py similarity index 95% rename from arabic_llm_benchmark/datasets/Attentionworthy.py rename to llmebench/datasets/Attentionworthy.py index 1fdec46a..400d2404 100644 --- a/arabic_llm_benchmark/datasets/Attentionworthy.py +++ b/llmebench/datasets/Attentionworthy.py @@ -1,6 +1,6 @@ import pandas as pd -from arabic_llm_benchmark.datasets.dataset_base import DatasetBase +from llmebench.datasets.dataset_base import DatasetBase class AttentionworthyDataset(DatasetBase): diff --git a/arabic_llm_benchmark/datasets/BanglaSentiment.py b/llmebench/datasets/BanglaSentiment.py similarity index 95% rename from arabic_llm_benchmark/datasets/BanglaSentiment.py rename to llmebench/datasets/BanglaSentiment.py index 
29a34479..09d5cca2 100644 --- a/arabic_llm_benchmark/datasets/BanglaSentiment.py +++ b/llmebench/datasets/BanglaSentiment.py @@ -1,4 +1,4 @@ -from arabic_llm_benchmark.datasets.dataset_base import DatasetBase +from llmebench.datasets.dataset_base import DatasetBase class BanglaSentimentDataset(DatasetBase): diff --git a/arabic_llm_benchmark/datasets/Checkworthiness.py b/llmebench/datasets/Checkworthiness.py similarity index 96% rename from arabic_llm_benchmark/datasets/Checkworthiness.py rename to llmebench/datasets/Checkworthiness.py index 90ed51fe..c9fdf837 100644 --- a/arabic_llm_benchmark/datasets/Checkworthiness.py +++ b/llmebench/datasets/Checkworthiness.py @@ -1,6 +1,6 @@ import pandas as pd -from arabic_llm_benchmark.datasets.dataset_base import DatasetBase +from llmebench.datasets.dataset_base import DatasetBase class CheckworthinessDataset(DatasetBase): diff --git a/arabic_llm_benchmark/datasets/Claim.py b/llmebench/datasets/Claim.py similarity index 95% rename from arabic_llm_benchmark/datasets/Claim.py rename to llmebench/datasets/Claim.py index 57745725..ddec7d30 100644 --- a/arabic_llm_benchmark/datasets/Claim.py +++ b/llmebench/datasets/Claim.py @@ -1,6 +1,6 @@ import pandas as pd -from arabic_llm_benchmark.datasets.dataset_base import DatasetBase +from llmebench.datasets.dataset_base import DatasetBase class CovidClaimDataset(DatasetBase): diff --git a/arabic_llm_benchmark/datasets/DialectADI.py b/llmebench/datasets/DialectADI.py similarity index 92% rename from arabic_llm_benchmark/datasets/DialectADI.py rename to llmebench/datasets/DialectADI.py index 466e306b..3b463770 100644 --- a/arabic_llm_benchmark/datasets/DialectADI.py +++ b/llmebench/datasets/DialectADI.py @@ -1,6 +1,6 @@ import pandas as pd -from arabic_llm_benchmark.datasets.dataset_base import DatasetBase +from llmebench.datasets.dataset_base import DatasetBase class DialectADIDataset(DatasetBase): diff --git a/arabic_llm_benchmark/datasets/Emotion.py b/llmebench/datasets/Emotion.py 
similarity index 94% rename from arabic_llm_benchmark/datasets/Emotion.py rename to llmebench/datasets/Emotion.py index ee656758..18e258a9 100644 --- a/arabic_llm_benchmark/datasets/Emotion.py +++ b/llmebench/datasets/Emotion.py @@ -1,4 +1,4 @@ -from arabic_llm_benchmark.datasets.dataset_base import DatasetBase +from llmebench.datasets.dataset_base import DatasetBase class EmotionDataset(DatasetBase): diff --git a/arabic_llm_benchmark/datasets/FactualityCOVID19.py b/llmebench/datasets/FactualityCOVID19.py similarity index 97% rename from arabic_llm_benchmark/datasets/FactualityCOVID19.py rename to llmebench/datasets/FactualityCOVID19.py index 3ddc5d6f..177e81a4 100644 --- a/arabic_llm_benchmark/datasets/FactualityCOVID19.py +++ b/llmebench/datasets/FactualityCOVID19.py @@ -1,6 +1,6 @@ import pandas as pd -from arabic_llm_benchmark.datasets.dataset_base import DatasetBase +from llmebench.datasets.dataset_base import DatasetBase class FactualityCOVID19Dataset(DatasetBase): diff --git a/arabic_llm_benchmark/datasets/FactualityKhouja20.py b/llmebench/datasets/FactualityKhouja20.py similarity index 94% rename from arabic_llm_benchmark/datasets/FactualityKhouja20.py rename to llmebench/datasets/FactualityKhouja20.py index ca8a31f2..d8205ba9 100644 --- a/arabic_llm_benchmark/datasets/FactualityKhouja20.py +++ b/llmebench/datasets/FactualityKhouja20.py @@ -1,6 +1,6 @@ import pandas as pd -from arabic_llm_benchmark.datasets.dataset_base import DatasetBase +from llmebench.datasets.dataset_base import DatasetBase class FactualityKhouja20Dataset(DatasetBase): diff --git a/arabic_llm_benchmark/datasets/FactualityUnifiedFC.py b/llmebench/datasets/FactualityUnifiedFC.py similarity index 97% rename from arabic_llm_benchmark/datasets/FactualityUnifiedFC.py rename to llmebench/datasets/FactualityUnifiedFC.py index cd78971a..a80f1cfb 100644 --- a/arabic_llm_benchmark/datasets/FactualityUnifiedFC.py +++ b/llmebench/datasets/FactualityUnifiedFC.py @@ -1,4 +1,4 @@ -from 
arabic_llm_benchmark.datasets.dataset_base import DatasetBase +from llmebench.datasets.dataset_base import DatasetBase class FactualityUnifiedFCDataset(DatasetBase): diff --git a/arabic_llm_benchmark/datasets/Harmful.py b/llmebench/datasets/Harmful.py similarity index 95% rename from arabic_llm_benchmark/datasets/Harmful.py rename to llmebench/datasets/Harmful.py index 28b49316..d44eba82 100644 --- a/arabic_llm_benchmark/datasets/Harmful.py +++ b/llmebench/datasets/Harmful.py @@ -1,6 +1,6 @@ import pandas as pd -from arabic_llm_benchmark.datasets.dataset_base import DatasetBase +from llmebench.datasets.dataset_base import DatasetBase class CovidHarmfulDataset(DatasetBase): diff --git a/arabic_llm_benchmark/datasets/HateSpeech.py b/llmebench/datasets/HateSpeech.py similarity index 95% rename from arabic_llm_benchmark/datasets/HateSpeech.py rename to llmebench/datasets/HateSpeech.py index 43246c64..2881c847 100644 --- a/arabic_llm_benchmark/datasets/HateSpeech.py +++ b/llmebench/datasets/HateSpeech.py @@ -1,4 +1,4 @@ -from arabic_llm_benchmark.datasets.dataset_base import DatasetBase +from llmebench.datasets.dataset_base import DatasetBase class HateSpeechDataset(DatasetBase): diff --git a/arabic_llm_benchmark/datasets/Lemmatization.py b/llmebench/datasets/Lemmatization.py similarity index 95% rename from arabic_llm_benchmark/datasets/Lemmatization.py rename to llmebench/datasets/Lemmatization.py index af4a37e0..67f72cd2 100644 --- a/arabic_llm_benchmark/datasets/Lemmatization.py +++ b/llmebench/datasets/Lemmatization.py @@ -1,4 +1,4 @@ -from arabic_llm_benchmark.datasets.dataset_base import DatasetBase +from llmebench.datasets.dataset_base import DatasetBase class LemmatizationDataset(DatasetBase): diff --git a/arabic_llm_benchmark/datasets/Location.py b/llmebench/datasets/Location.py similarity index 94% rename from arabic_llm_benchmark/datasets/Location.py rename to llmebench/datasets/Location.py index bb737435..363f91e3 100644 --- 
a/arabic_llm_benchmark/datasets/Location.py +++ b/llmebench/datasets/Location.py @@ -1,4 +1,4 @@ -from arabic_llm_benchmark.datasets.dataset_base import DatasetBase +from llmebench.datasets.dataset_base import DatasetBase class LocationDataset(DatasetBase): diff --git a/arabic_llm_benchmark/datasets/MGBWords.py b/llmebench/datasets/MGBWords.py similarity index 96% rename from arabic_llm_benchmark/datasets/MGBWords.py rename to llmebench/datasets/MGBWords.py index 307358af..ffdbc78e 100644 --- a/arabic_llm_benchmark/datasets/MGBWords.py +++ b/llmebench/datasets/MGBWords.py @@ -1,4 +1,4 @@ -from arabic_llm_benchmark.datasets.dataset_base import DatasetBase +from llmebench.datasets.dataset_base import DatasetBase class MGBWordsDataset(DatasetBase): diff --git a/arabic_llm_benchmark/datasets/MLQA.py b/llmebench/datasets/MLQA.py similarity index 88% rename from arabic_llm_benchmark/datasets/MLQA.py rename to llmebench/datasets/MLQA.py index 58d1fdd3..1362ecb3 100644 --- a/arabic_llm_benchmark/datasets/MLQA.py +++ b/llmebench/datasets/MLQA.py @@ -1,6 +1,6 @@ import json -from arabic_llm_benchmark.datasets.SQuADBase import SQuADBase +from llmebench.datasets.SQuADBase import SQuADBase class MLQADataset(SQuADBase): diff --git a/arabic_llm_benchmark/datasets/NameInfo.py b/llmebench/datasets/NameInfo.py similarity index 93% rename from arabic_llm_benchmark/datasets/NameInfo.py rename to llmebench/datasets/NameInfo.py index 0c399f18..17a7026b 100644 --- a/arabic_llm_benchmark/datasets/NameInfo.py +++ b/llmebench/datasets/NameInfo.py @@ -1,4 +1,4 @@ -from arabic_llm_benchmark.datasets.dataset_base import DatasetBase +from llmebench.datasets.dataset_base import DatasetBase class NameInfoDataset(DatasetBase): diff --git a/arabic_llm_benchmark/datasets/NewsCatASND.py b/llmebench/datasets/NewsCatASND.py similarity index 95% rename from arabic_llm_benchmark/datasets/NewsCatASND.py rename to llmebench/datasets/NewsCatASND.py index 069df59d..414a573d 100644 --- 
a/arabic_llm_benchmark/datasets/NewsCatASND.py +++ b/llmebench/datasets/NewsCatASND.py @@ -1,6 +1,6 @@ import pandas as pd -from arabic_llm_benchmark.datasets.dataset_base import DatasetBase +from llmebench.datasets.dataset_base import DatasetBase class NewsCatASNDDataset(DatasetBase): diff --git a/arabic_llm_benchmark/datasets/NewsCatAkhbarona.py b/llmebench/datasets/NewsCatAkhbarona.py similarity index 95% rename from arabic_llm_benchmark/datasets/NewsCatAkhbarona.py rename to llmebench/datasets/NewsCatAkhbarona.py index 5e6fa125..066cf448 100644 --- a/arabic_llm_benchmark/datasets/NewsCatAkhbarona.py +++ b/llmebench/datasets/NewsCatAkhbarona.py @@ -1,6 +1,6 @@ import pandas as pd -from arabic_llm_benchmark.datasets.dataset_base import DatasetBase +from llmebench.datasets.dataset_base import DatasetBase class NewsCatAkhbaronaDataset(DatasetBase): diff --git a/arabic_llm_benchmark/datasets/NewsCatAlArabiya.py b/llmebench/datasets/NewsCatAlArabiya.py similarity index 95% rename from arabic_llm_benchmark/datasets/NewsCatAlArabiya.py rename to llmebench/datasets/NewsCatAlArabiya.py index b17bf0cc..f95dae45 100644 --- a/arabic_llm_benchmark/datasets/NewsCatAlArabiya.py +++ b/llmebench/datasets/NewsCatAlArabiya.py @@ -1,6 +1,6 @@ import pandas as pd -from arabic_llm_benchmark.datasets.dataset_base import DatasetBase +from llmebench.datasets.dataset_base import DatasetBase class NewsCatAlArabiyaDataset(DatasetBase): diff --git a/arabic_llm_benchmark/datasets/NewsCatAlKhaleej.py b/llmebench/datasets/NewsCatAlKhaleej.py similarity index 95% rename from arabic_llm_benchmark/datasets/NewsCatAlKhaleej.py rename to llmebench/datasets/NewsCatAlKhaleej.py index 6e531798..34d10d58 100644 --- a/arabic_llm_benchmark/datasets/NewsCatAlKhaleej.py +++ b/llmebench/datasets/NewsCatAlKhaleej.py @@ -1,6 +1,6 @@ import pandas as pd -from arabic_llm_benchmark.datasets.dataset_base import DatasetBase +from llmebench.datasets.dataset_base import DatasetBase class 
NewsCatAlKhaleejDataset(DatasetBase): diff --git a/arabic_llm_benchmark/datasets/Offensive.py b/llmebench/datasets/Offensive.py similarity index 94% rename from arabic_llm_benchmark/datasets/Offensive.py rename to llmebench/datasets/Offensive.py index 3302a42e..19c8a38a 100644 --- a/arabic_llm_benchmark/datasets/Offensive.py +++ b/llmebench/datasets/Offensive.py @@ -1,4 +1,4 @@ -from arabic_llm_benchmark.datasets.dataset_base import DatasetBase +from llmebench.datasets.dataset_base import DatasetBase class OffensiveDataset(DatasetBase): diff --git a/arabic_llm_benchmark/datasets/Propaganda.py b/llmebench/datasets/Propaganda.py similarity index 96% rename from arabic_llm_benchmark/datasets/Propaganda.py rename to llmebench/datasets/Propaganda.py index 5c7dc8c6..5f88e441 100644 --- a/arabic_llm_benchmark/datasets/Propaganda.py +++ b/llmebench/datasets/Propaganda.py @@ -2,7 +2,7 @@ from pathlib import Path -from arabic_llm_benchmark.datasets.dataset_base import DatasetBase +from llmebench.datasets.dataset_base import DatasetBase class PropagandaTweetDataset(DatasetBase): diff --git a/arabic_llm_benchmark/datasets/PropagandaSemEval23.py b/llmebench/datasets/PropagandaSemEval23.py similarity index 98% rename from arabic_llm_benchmark/datasets/PropagandaSemEval23.py rename to llmebench/datasets/PropagandaSemEval23.py index 6708fb35..b655ec38 100644 --- a/arabic_llm_benchmark/datasets/PropagandaSemEval23.py +++ b/llmebench/datasets/PropagandaSemEval23.py @@ -2,7 +2,7 @@ import os from pathlib import Path -from arabic_llm_benchmark.datasets.dataset_base import DatasetBase +from llmebench.datasets.dataset_base import DatasetBase class PropagandaSemEval23Dataset(DatasetBase): diff --git a/arabic_llm_benchmark/datasets/QADI.py b/llmebench/datasets/QADI.py similarity index 94% rename from arabic_llm_benchmark/datasets/QADI.py rename to llmebench/datasets/QADI.py index 8e8e62f8..1a5c9dbb 100644 --- a/arabic_llm_benchmark/datasets/QADI.py +++ b/llmebench/datasets/QADI.py @@ -1,4 
+1,4 @@ -from arabic_llm_benchmark.datasets.dataset_base import DatasetBase +from llmebench.datasets.dataset_base import DatasetBase class QADIDataset(DatasetBase): diff --git a/arabic_llm_benchmark/datasets/SQuADBase.py b/llmebench/datasets/SQuADBase.py similarity index 95% rename from arabic_llm_benchmark/datasets/SQuADBase.py rename to llmebench/datasets/SQuADBase.py index a77b5e59..5ff00d06 100644 --- a/arabic_llm_benchmark/datasets/SQuADBase.py +++ b/llmebench/datasets/SQuADBase.py @@ -1,6 +1,6 @@ import json -from arabic_llm_benchmark.datasets.dataset_base import DatasetBase +from llmebench.datasets.dataset_base import DatasetBase class SQuADBase(DatasetBase): diff --git a/arabic_llm_benchmark/datasets/STSArSemEval17Track1.py b/llmebench/datasets/STSArSemEval17Track1.py similarity index 96% rename from arabic_llm_benchmark/datasets/STSArSemEval17Track1.py rename to llmebench/datasets/STSArSemEval17Track1.py index 15fe31c9..b8fe0180 100644 --- a/arabic_llm_benchmark/datasets/STSArSemEval17Track1.py +++ b/llmebench/datasets/STSArSemEval17Track1.py @@ -1,4 +1,4 @@ -from arabic_llm_benchmark.datasets.dataset_base import DatasetBase +from llmebench.datasets.dataset_base import DatasetBase class STSArSemEval17Track1Dataset(DatasetBase): diff --git a/arabic_llm_benchmark/datasets/STSArSemEval17Track2.py b/llmebench/datasets/STSArSemEval17Track2.py similarity index 96% rename from arabic_llm_benchmark/datasets/STSArSemEval17Track2.py rename to llmebench/datasets/STSArSemEval17Track2.py index 7f4f2892..67a7178b 100644 --- a/arabic_llm_benchmark/datasets/STSArSemEval17Track2.py +++ b/llmebench/datasets/STSArSemEval17Track2.py @@ -1,4 +1,4 @@ -from arabic_llm_benchmark.datasets.dataset_base import DatasetBase +from llmebench.datasets.dataset_base import DatasetBase class STSArSemEval17Track2Dataset(DatasetBase): diff --git a/arabic_llm_benchmark/datasets/STSQ2Q.py b/llmebench/datasets/STSQ2Q.py similarity index 95% rename from arabic_llm_benchmark/datasets/STSQ2Q.py 
rename to llmebench/datasets/STSQ2Q.py index ae8ffe21..1b8c8c5d 100644 --- a/arabic_llm_benchmark/datasets/STSQ2Q.py +++ b/llmebench/datasets/STSQ2Q.py @@ -1,6 +1,6 @@ import pandas as pd -from arabic_llm_benchmark.datasets.dataset_base import DatasetBase +from llmebench.datasets.dataset_base import DatasetBase class Q2QSimDataset(DatasetBase): diff --git a/arabic_llm_benchmark/datasets/Spam.py b/llmebench/datasets/Spam.py similarity index 94% rename from arabic_llm_benchmark/datasets/Spam.py rename to llmebench/datasets/Spam.py index f73ea86c..6bef9d3f 100644 --- a/arabic_llm_benchmark/datasets/Spam.py +++ b/llmebench/datasets/Spam.py @@ -1,4 +1,4 @@ -from arabic_llm_benchmark.datasets.dataset_base import DatasetBase +from llmebench.datasets.dataset_base import DatasetBase class SpamDataset(DatasetBase): diff --git a/arabic_llm_benchmark/datasets/StanceKhouja20.py b/llmebench/datasets/StanceKhouja20.py similarity index 94% rename from arabic_llm_benchmark/datasets/StanceKhouja20.py rename to llmebench/datasets/StanceKhouja20.py index 13eff420..2ad2694b 100644 --- a/arabic_llm_benchmark/datasets/StanceKhouja20.py +++ b/llmebench/datasets/StanceKhouja20.py @@ -1,4 +1,4 @@ -from arabic_llm_benchmark.datasets.dataset_base import DatasetBase +from llmebench.datasets.dataset_base import DatasetBase class StanceKhouja20Dataset(DatasetBase): diff --git a/arabic_llm_benchmark/datasets/StanceUnifiedFC.py b/llmebench/datasets/StanceUnifiedFC.py similarity index 97% rename from arabic_llm_benchmark/datasets/StanceUnifiedFC.py rename to llmebench/datasets/StanceUnifiedFC.py index 9aaa52cd..0033ae0b 100644 --- a/arabic_llm_benchmark/datasets/StanceUnifiedFC.py +++ b/llmebench/datasets/StanceUnifiedFC.py @@ -3,7 +3,7 @@ import pandas as pd -from arabic_llm_benchmark.datasets.dataset_base import DatasetBase +from llmebench.datasets.dataset_base import DatasetBase class StanceUnifiedFCDataset(DatasetBase): diff --git a/arabic_llm_benchmark/datasets/Subjectivity.py 
b/llmebench/datasets/Subjectivity.py similarity index 95% rename from arabic_llm_benchmark/datasets/Subjectivity.py rename to llmebench/datasets/Subjectivity.py index 087574c4..313644ad 100644 --- a/arabic_llm_benchmark/datasets/Subjectivity.py +++ b/llmebench/datasets/Subjectivity.py @@ -1,6 +1,6 @@ import pandas as pd -from arabic_llm_benchmark.datasets.dataset_base import DatasetBase +from llmebench.datasets.dataset_base import DatasetBase class SubjectivityDataset(DatasetBase): diff --git a/arabic_llm_benchmark/datasets/TyDiQA.py b/llmebench/datasets/TyDiQA.py similarity index 91% rename from arabic_llm_benchmark/datasets/TyDiQA.py rename to llmebench/datasets/TyDiQA.py index 7ac3ecf8..27602722 100644 --- a/arabic_llm_benchmark/datasets/TyDiQA.py +++ b/llmebench/datasets/TyDiQA.py @@ -1,6 +1,6 @@ import json -from arabic_llm_benchmark.datasets.SQuADBase import SQuADBase +from llmebench.datasets.SQuADBase import SQuADBase class TyDiQADataset(SQuADBase): diff --git a/arabic_llm_benchmark/datasets/XNLI.py b/llmebench/datasets/XNLI.py similarity index 96% rename from arabic_llm_benchmark/datasets/XNLI.py rename to llmebench/datasets/XNLI.py index a19b72b5..3b8e0dcf 100644 --- a/arabic_llm_benchmark/datasets/XNLI.py +++ b/llmebench/datasets/XNLI.py @@ -1,6 +1,6 @@ import pandas as pd -from arabic_llm_benchmark.datasets.dataset_base import DatasetBase +from llmebench.datasets.dataset_base import DatasetBase class XNLIDataset(DatasetBase): diff --git a/arabic_llm_benchmark/datasets/XQuAD.py b/llmebench/datasets/XQuAD.py similarity index 90% rename from arabic_llm_benchmark/datasets/XQuAD.py rename to llmebench/datasets/XQuAD.py index d54a4fc2..c6626228 100644 --- a/arabic_llm_benchmark/datasets/XQuAD.py +++ b/llmebench/datasets/XQuAD.py @@ -1,6 +1,6 @@ import json -from arabic_llm_benchmark.datasets.SQuADBase import SQuADBase +from llmebench.datasets.SQuADBase import SQuADBase class XQuADDataset(SQuADBase): diff --git a/arabic_llm_benchmark/datasets/__init__.py 
b/llmebench/datasets/__init__.py similarity index 100% rename from arabic_llm_benchmark/datasets/__init__.py rename to llmebench/datasets/__init__.py diff --git a/arabic_llm_benchmark/datasets/dataset_base.py b/llmebench/datasets/dataset_base.py similarity index 100% rename from arabic_llm_benchmark/datasets/dataset_base.py rename to llmebench/datasets/dataset_base.py diff --git a/arabic_llm_benchmark/models/BLOOMPetal.py b/llmebench/models/BLOOMPetal.py similarity index 97% rename from arabic_llm_benchmark/models/BLOOMPetal.py rename to llmebench/models/BLOOMPetal.py index 9ba9503f..35bec918 100644 --- a/arabic_llm_benchmark/models/BLOOMPetal.py +++ b/llmebench/models/BLOOMPetal.py @@ -2,7 +2,7 @@ from websockets.sync.client import connect -from arabic_llm_benchmark.models.model_base import ModelBase +from llmebench.models.model_base import ModelBase class BLOOMPetalFailure(Exception): diff --git a/arabic_llm_benchmark/models/GPT.py b/llmebench/models/GPT.py similarity index 98% rename from arabic_llm_benchmark/models/GPT.py rename to llmebench/models/GPT.py index f1a77ba1..a34966bc 100644 --- a/arabic_llm_benchmark/models/GPT.py +++ b/llmebench/models/GPT.py @@ -1,6 +1,6 @@ import openai -from arabic_llm_benchmark.models.model_base import ModelBase +from llmebench.models.model_base import ModelBase class GPTModel(ModelBase): diff --git a/arabic_llm_benchmark/models/RandomGPT.py b/llmebench/models/RandomGPT.py similarity index 94% rename from arabic_llm_benchmark/models/RandomGPT.py rename to llmebench/models/RandomGPT.py index da9ee9b9..5ca725d5 100644 --- a/arabic_llm_benchmark/models/RandomGPT.py +++ b/llmebench/models/RandomGPT.py @@ -1,6 +1,6 @@ import random -from arabic_llm_benchmark.models.model_base import ModelBase +from llmebench.models.model_base import ModelBase class GPTResponseMock(dict): diff --git a/arabic_llm_benchmark/models/__init__.py b/llmebench/models/__init__.py similarity index 100% rename from arabic_llm_benchmark/models/__init__.py 
rename to llmebench/models/__init__.py diff --git a/arabic_llm_benchmark/models/model_base.py b/llmebench/models/model_base.py similarity index 100% rename from arabic_llm_benchmark/models/model_base.py rename to llmebench/models/model_base.py diff --git a/arabic_llm_benchmark/tasks/Adult.py b/llmebench/tasks/Adult.py similarity index 88% rename from arabic_llm_benchmark/tasks/Adult.py rename to llmebench/tasks/Adult.py index 000bd925..8c99960e 100644 --- a/arabic_llm_benchmark/tasks/Adult.py +++ b/llmebench/tasks/Adult.py @@ -1,6 +1,6 @@ from sklearn.metrics import f1_score -from arabic_llm_benchmark.tasks.task_base import TaskBase +from llmebench.tasks.task_base import TaskBase class AdultTask(TaskBase): diff --git a/arabic_llm_benchmark/tasks/ArabicDiacritization.py b/llmebench/tasks/ArabicDiacritization.py similarity index 98% rename from arabic_llm_benchmark/tasks/ArabicDiacritization.py rename to llmebench/tasks/ArabicDiacritization.py index 5ff83594..c7644bd1 100644 --- a/arabic_llm_benchmark/tasks/ArabicDiacritization.py +++ b/llmebench/tasks/ArabicDiacritization.py @@ -2,7 +2,7 @@ from sklearn.metrics import f1_score -from arabic_llm_benchmark.tasks.task_base import TaskBase +from llmebench.tasks.task_base import TaskBase # diff --git a/arabic_llm_benchmark/tasks/ArabicPOS.py b/llmebench/tasks/ArabicPOS.py similarity index 94% rename from arabic_llm_benchmark/tasks/ArabicPOS.py rename to llmebench/tasks/ArabicPOS.py index c9426413..922b6dc5 100644 --- a/arabic_llm_benchmark/tasks/ArabicPOS.py +++ b/llmebench/tasks/ArabicPOS.py @@ -2,7 +2,7 @@ from sklearn.metrics import accuracy_score -from arabic_llm_benchmark.tasks.task_base import TaskBase +from llmebench.tasks.task_base import TaskBase class ArabicPOSTask(TaskBase): diff --git a/arabic_llm_benchmark/tasks/ArabicParsing.py b/llmebench/tasks/ArabicParsing.py similarity index 93% rename from arabic_llm_benchmark/tasks/ArabicParsing.py rename to llmebench/tasks/ArabicParsing.py index 926c364f..25127850 
100644 --- a/arabic_llm_benchmark/tasks/ArabicParsing.py +++ b/llmebench/tasks/ArabicParsing.py @@ -2,7 +2,7 @@ from sklearn.metrics import f1_score -from arabic_llm_benchmark.tasks.task_base import TaskBase +from llmebench.tasks.task_base import TaskBase class ArabicParsingTask(TaskBase): diff --git a/arabic_llm_benchmark/tasks/ArabicSegmentation.py b/llmebench/tasks/ArabicSegmentation.py similarity index 94% rename from arabic_llm_benchmark/tasks/ArabicSegmentation.py rename to llmebench/tasks/ArabicSegmentation.py index 86db520e..2c0f3deb 100644 --- a/arabic_llm_benchmark/tasks/ArabicSegmentation.py +++ b/llmebench/tasks/ArabicSegmentation.py @@ -2,7 +2,7 @@ from sklearn.metrics import accuracy_score -from arabic_llm_benchmark.tasks.task_base import TaskBase +from llmebench.tasks.task_base import TaskBase class ArabicSegmentationTask(TaskBase): diff --git a/arabic_llm_benchmark/tasks/Attentionworthy.py b/llmebench/tasks/Attentionworthy.py similarity index 94% rename from arabic_llm_benchmark/tasks/Attentionworthy.py rename to llmebench/tasks/Attentionworthy.py index 00dba977..7daa936f 100644 --- a/arabic_llm_benchmark/tasks/Attentionworthy.py +++ b/llmebench/tasks/Attentionworthy.py @@ -1,6 +1,6 @@ from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score -from arabic_llm_benchmark.tasks.task_base import TaskBase +from llmebench.tasks.task_base import TaskBase class AttentionworthyTask(TaskBase): diff --git a/arabic_llm_benchmark/tasks/Checkworthiness.py b/llmebench/tasks/Checkworthiness.py similarity index 94% rename from arabic_llm_benchmark/tasks/Checkworthiness.py rename to llmebench/tasks/Checkworthiness.py index 88e763e7..62ec0663 100644 --- a/arabic_llm_benchmark/tasks/Checkworthiness.py +++ b/llmebench/tasks/Checkworthiness.py @@ -1,6 +1,6 @@ from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score -from arabic_llm_benchmark.tasks.task_base import TaskBase +from llmebench.tasks.task_base import 
TaskBase class CheckworthinessTask(TaskBase): diff --git a/arabic_llm_benchmark/tasks/ClaimDetection.py b/llmebench/tasks/ClaimDetection.py similarity index 89% rename from arabic_llm_benchmark/tasks/ClaimDetection.py rename to llmebench/tasks/ClaimDetection.py index 7a4d5602..17f334d1 100644 --- a/arabic_llm_benchmark/tasks/ClaimDetection.py +++ b/llmebench/tasks/ClaimDetection.py @@ -1,6 +1,6 @@ from sklearn.metrics import accuracy_score -from arabic_llm_benchmark.tasks.task_base import TaskBase +from llmebench.tasks.task_base import TaskBase class ClaimDetectionTask(TaskBase): diff --git a/arabic_llm_benchmark/tasks/DemographyGender.py b/llmebench/tasks/DemographyGender.py similarity index 88% rename from arabic_llm_benchmark/tasks/DemographyGender.py rename to llmebench/tasks/DemographyGender.py index 3ef3c7b1..d8db7486 100644 --- a/arabic_llm_benchmark/tasks/DemographyGender.py +++ b/llmebench/tasks/DemographyGender.py @@ -1,6 +1,6 @@ from sklearn.metrics import f1_score -from arabic_llm_benchmark.tasks.task_base import TaskBase +from llmebench.tasks.task_base import TaskBase class DemographyGenderTask(TaskBase): diff --git a/arabic_llm_benchmark/tasks/DemographyLocation.py b/llmebench/tasks/DemographyLocation.py similarity index 88% rename from arabic_llm_benchmark/tasks/DemographyLocation.py rename to llmebench/tasks/DemographyLocation.py index b18a7173..ed0473a4 100644 --- a/arabic_llm_benchmark/tasks/DemographyLocation.py +++ b/llmebench/tasks/DemographyLocation.py @@ -1,6 +1,6 @@ from sklearn.metrics import f1_score -from arabic_llm_benchmark.tasks.task_base import TaskBase +from llmebench.tasks.task_base import TaskBase class DemographyLocationTask(TaskBase): diff --git a/arabic_llm_benchmark/tasks/DemographyNameInfo.py b/llmebench/tasks/DemographyNameInfo.py similarity index 89% rename from arabic_llm_benchmark/tasks/DemographyNameInfo.py rename to llmebench/tasks/DemographyNameInfo.py index b0aba3d3..dfc7ac73 100644 --- 
a/arabic_llm_benchmark/tasks/DemographyNameInfo.py +++ b/llmebench/tasks/DemographyNameInfo.py @@ -1,6 +1,6 @@ from sklearn.metrics import f1_score -from arabic_llm_benchmark.tasks.task_base import TaskBase +from llmebench.tasks.task_base import TaskBase class DemographyNameInfoTask(TaskBase): diff --git a/arabic_llm_benchmark/tasks/DialectID.py b/llmebench/tasks/DialectID.py similarity index 93% rename from arabic_llm_benchmark/tasks/DialectID.py rename to llmebench/tasks/DialectID.py index ac80d878..30f3c7cc 100644 --- a/arabic_llm_benchmark/tasks/DialectID.py +++ b/llmebench/tasks/DialectID.py @@ -1,6 +1,6 @@ from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score -from arabic_llm_benchmark.tasks.task_base import TaskBase +from llmebench.tasks.task_base import TaskBase class DialectIDTask(TaskBase): diff --git a/arabic_llm_benchmark/tasks/Emotion.py b/llmebench/tasks/Emotion.py similarity index 89% rename from arabic_llm_benchmark/tasks/Emotion.py rename to llmebench/tasks/Emotion.py index 90494e58..af6a243c 100644 --- a/arabic_llm_benchmark/tasks/Emotion.py +++ b/llmebench/tasks/Emotion.py @@ -1,6 +1,6 @@ from sklearn.metrics import jaccard_score -from arabic_llm_benchmark.tasks.task_base import TaskBase +from llmebench.tasks.task_base import TaskBase class EmotionTask(TaskBase): diff --git a/arabic_llm_benchmark/tasks/FactualityCOVID19.py b/llmebench/tasks/FactualityCOVID19.py similarity index 94% rename from arabic_llm_benchmark/tasks/FactualityCOVID19.py rename to llmebench/tasks/FactualityCOVID19.py index d130e979..38f108c5 100644 --- a/arabic_llm_benchmark/tasks/FactualityCOVID19.py +++ b/llmebench/tasks/FactualityCOVID19.py @@ -1,6 +1,6 @@ from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score -from arabic_llm_benchmark.tasks.task_base import TaskBase +from llmebench.tasks.task_base import TaskBase class FactualityCOVID19Task(TaskBase): diff --git a/arabic_llm_benchmark/tasks/FactualityKhouja20.py 
b/llmebench/tasks/FactualityKhouja20.py similarity index 88% rename from arabic_llm_benchmark/tasks/FactualityKhouja20.py rename to llmebench/tasks/FactualityKhouja20.py index 665f213a..079d2c68 100644 --- a/arabic_llm_benchmark/tasks/FactualityKhouja20.py +++ b/llmebench/tasks/FactualityKhouja20.py @@ -1,6 +1,6 @@ from sklearn.metrics import f1_score -from arabic_llm_benchmark.tasks.task_base import TaskBase +from llmebench.tasks.task_base import TaskBase class FactualityKhouja20Task(TaskBase): diff --git a/arabic_llm_benchmark/tasks/FactualityUnifiedFC.py b/llmebench/tasks/FactualityUnifiedFC.py similarity index 94% rename from arabic_llm_benchmark/tasks/FactualityUnifiedFC.py rename to llmebench/tasks/FactualityUnifiedFC.py index 61f94e81..d87c8fd4 100644 --- a/arabic_llm_benchmark/tasks/FactualityUnifiedFC.py +++ b/llmebench/tasks/FactualityUnifiedFC.py @@ -1,6 +1,6 @@ from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score -from arabic_llm_benchmark.tasks.task_base import TaskBase +from llmebench.tasks.task_base import TaskBase class FactualityUnifiedFCTask(TaskBase): diff --git a/arabic_llm_benchmark/tasks/HarmfulDetection.py b/llmebench/tasks/HarmfulDetection.py similarity index 90% rename from arabic_llm_benchmark/tasks/HarmfulDetection.py rename to llmebench/tasks/HarmfulDetection.py index 2cb4731f..f3dffa4d 100644 --- a/arabic_llm_benchmark/tasks/HarmfulDetection.py +++ b/llmebench/tasks/HarmfulDetection.py @@ -1,6 +1,6 @@ from sklearn.metrics import f1_score -from arabic_llm_benchmark.tasks.task_base import TaskBase +from llmebench.tasks.task_base import TaskBase class HarmfulDetectionTask(TaskBase): diff --git a/arabic_llm_benchmark/tasks/HateSpeech.py b/llmebench/tasks/HateSpeech.py similarity index 88% rename from arabic_llm_benchmark/tasks/HateSpeech.py rename to llmebench/tasks/HateSpeech.py index 3b836227..b7b1c538 100644 --- a/arabic_llm_benchmark/tasks/HateSpeech.py +++ b/llmebench/tasks/HateSpeech.py @@ -1,6 +1,6 @@ 
from sklearn.metrics import f1_score -from arabic_llm_benchmark.tasks.task_base import TaskBase +from llmebench.tasks.task_base import TaskBase class HateSpeechTask(TaskBase): diff --git a/arabic_llm_benchmark/tasks/Lemmatization.py b/llmebench/tasks/Lemmatization.py similarity index 92% rename from arabic_llm_benchmark/tasks/Lemmatization.py rename to llmebench/tasks/Lemmatization.py index f32f022f..53c74566 100644 --- a/arabic_llm_benchmark/tasks/Lemmatization.py +++ b/llmebench/tasks/Lemmatization.py @@ -1,6 +1,6 @@ from sklearn.metrics import accuracy_score, f1_score -from arabic_llm_benchmark.tasks.task_base import TaskBase +from llmebench.tasks.task_base import TaskBase class LemmatizationTask(TaskBase): diff --git a/arabic_llm_benchmark/tasks/MachineTranslation.py b/llmebench/tasks/MachineTranslation.py similarity index 87% rename from arabic_llm_benchmark/tasks/MachineTranslation.py rename to llmebench/tasks/MachineTranslation.py index 59be8d7f..6282a737 100644 --- a/arabic_llm_benchmark/tasks/MachineTranslation.py +++ b/llmebench/tasks/MachineTranslation.py @@ -1,6 +1,6 @@ from nltk.translate.bleu_score import corpus_bleu -from arabic_llm_benchmark.tasks.task_base import TaskBase +from llmebench.tasks.task_base import TaskBase class MachineTranslationTask(TaskBase): diff --git a/arabic_llm_benchmark/tasks/NER.py b/llmebench/tasks/NER.py similarity index 96% rename from arabic_llm_benchmark/tasks/NER.py rename to llmebench/tasks/NER.py index 35304d1c..9b34410a 100644 --- a/arabic_llm_benchmark/tasks/NER.py +++ b/llmebench/tasks/NER.py @@ -1,6 +1,6 @@ from sklearn.metrics import f1_score -from arabic_llm_benchmark.tasks.task_base import TaskBase +from llmebench.tasks.task_base import TaskBase class NERTask(TaskBase): diff --git a/arabic_llm_benchmark/tasks/NewsCatASND.py b/llmebench/tasks/NewsCatASND.py similarity index 94% rename from arabic_llm_benchmark/tasks/NewsCatASND.py rename to llmebench/tasks/NewsCatASND.py index 3b636fd0..5ab6091e 100644 --- 
a/arabic_llm_benchmark/tasks/NewsCatASND.py +++ b/llmebench/tasks/NewsCatASND.py @@ -6,7 +6,7 @@ recall_score, ) -from arabic_llm_benchmark.tasks.task_base import TaskBase +from llmebench.tasks.task_base import TaskBase class NewsCatASNDTask(TaskBase): diff --git a/arabic_llm_benchmark/tasks/NewsCatAkhbarona.py b/llmebench/tasks/NewsCatAkhbarona.py similarity index 94% rename from arabic_llm_benchmark/tasks/NewsCatAkhbarona.py rename to llmebench/tasks/NewsCatAkhbarona.py index a6b0f80f..c25bbfb9 100644 --- a/arabic_llm_benchmark/tasks/NewsCatAkhbarona.py +++ b/llmebench/tasks/NewsCatAkhbarona.py @@ -6,7 +6,7 @@ recall_score, ) -from arabic_llm_benchmark.tasks.task_base import TaskBase +from llmebench.tasks.task_base import TaskBase class NewsCatAkhbaronaTask(TaskBase): diff --git a/arabic_llm_benchmark/tasks/NewsCatAlArabiya.py b/llmebench/tasks/NewsCatAlArabiya.py similarity index 94% rename from arabic_llm_benchmark/tasks/NewsCatAlArabiya.py rename to llmebench/tasks/NewsCatAlArabiya.py index b118f5bc..0ed8e697 100644 --- a/arabic_llm_benchmark/tasks/NewsCatAlArabiya.py +++ b/llmebench/tasks/NewsCatAlArabiya.py @@ -6,7 +6,7 @@ recall_score, ) -from arabic_llm_benchmark.tasks.task_base import TaskBase +from llmebench.tasks.task_base import TaskBase class NewsCatAlArabiyaTask(TaskBase): diff --git a/arabic_llm_benchmark/tasks/NewsCatAlKhaleej.py b/llmebench/tasks/NewsCatAlKhaleej.py similarity index 94% rename from arabic_llm_benchmark/tasks/NewsCatAlKhaleej.py rename to llmebench/tasks/NewsCatAlKhaleej.py index a5e5d706..176a08ba 100644 --- a/arabic_llm_benchmark/tasks/NewsCatAlKhaleej.py +++ b/llmebench/tasks/NewsCatAlKhaleej.py @@ -6,7 +6,7 @@ recall_score, ) -from arabic_llm_benchmark.tasks.task_base import TaskBase +from llmebench.tasks.task_base import TaskBase class NewsCatAlKhaleejTask(TaskBase): diff --git a/arabic_llm_benchmark/tasks/Offensive.py b/llmebench/tasks/Offensive.py similarity index 88% rename from arabic_llm_benchmark/tasks/Offensive.py 
rename to llmebench/tasks/Offensive.py index 2918ad2a..b2f37675 100644 --- a/arabic_llm_benchmark/tasks/Offensive.py +++ b/llmebench/tasks/Offensive.py @@ -1,6 +1,6 @@ from sklearn.metrics import f1_score -from arabic_llm_benchmark.tasks.task_base import TaskBase +from llmebench.tasks.task_base import TaskBase class OffensiveTask(TaskBase): diff --git a/arabic_llm_benchmark/tasks/PropagandaMultilabel.py b/llmebench/tasks/PropagandaMultilabel.py similarity index 94% rename from arabic_llm_benchmark/tasks/PropagandaMultilabel.py rename to llmebench/tasks/PropagandaMultilabel.py index fdbe77e3..c661a2a1 100644 --- a/arabic_llm_benchmark/tasks/PropagandaMultilabel.py +++ b/llmebench/tasks/PropagandaMultilabel.py @@ -3,7 +3,7 @@ from sklearn import preprocessing from sklearn.metrics import f1_score -from arabic_llm_benchmark.tasks.task_base import TaskBase +from llmebench.tasks.task_base import TaskBase class PropagandaMultilabelTask(TaskBase): diff --git a/arabic_llm_benchmark/tasks/PropagandaMultilabelSemEval23.py b/llmebench/tasks/PropagandaMultilabelSemEval23.py similarity index 94% rename from arabic_llm_benchmark/tasks/PropagandaMultilabelSemEval23.py rename to llmebench/tasks/PropagandaMultilabelSemEval23.py index 8d562f2e..5e9b6dcd 100644 --- a/arabic_llm_benchmark/tasks/PropagandaMultilabelSemEval23.py +++ b/llmebench/tasks/PropagandaMultilabelSemEval23.py @@ -3,7 +3,7 @@ from sklearn import preprocessing from sklearn.metrics import f1_score -from arabic_llm_benchmark.tasks.task_base import TaskBase +from llmebench.tasks.task_base import TaskBase class PropagandaMultilabelSemEval23Task(TaskBase): diff --git a/arabic_llm_benchmark/tasks/Q2QSimDetect.py b/llmebench/tasks/Q2QSimDetect.py similarity index 88% rename from arabic_llm_benchmark/tasks/Q2QSimDetect.py rename to llmebench/tasks/Q2QSimDetect.py index 7d916a13..841ed732 100644 --- a/arabic_llm_benchmark/tasks/Q2QSimDetect.py +++ b/llmebench/tasks/Q2QSimDetect.py @@ -1,6 +1,6 @@ from sklearn.metrics import 
f1_score -from arabic_llm_benchmark.tasks.task_base import TaskBase +from llmebench.tasks.task_base import TaskBase class Q2QSimDetectionTask(TaskBase): diff --git a/arabic_llm_benchmark/tasks/QA.py b/llmebench/tasks/QA.py similarity index 97% rename from arabic_llm_benchmark/tasks/QA.py rename to llmebench/tasks/QA.py index 9956582e..7b5b94c8 100644 --- a/arabic_llm_benchmark/tasks/QA.py +++ b/llmebench/tasks/QA.py @@ -3,7 +3,7 @@ import sys from collections import Counter -from arabic_llm_benchmark.tasks.task_base import TaskBase +from llmebench.tasks.task_base import TaskBase class QATask(TaskBase): diff --git a/arabic_llm_benchmark/tasks/STSTrack1.py b/llmebench/tasks/STSTrack1.py similarity index 90% rename from arabic_llm_benchmark/tasks/STSTrack1.py rename to llmebench/tasks/STSTrack1.py index 961ea526..20b9a0e0 100644 --- a/arabic_llm_benchmark/tasks/STSTrack1.py +++ b/llmebench/tasks/STSTrack1.py @@ -2,7 +2,7 @@ import numpy as np -from arabic_llm_benchmark.tasks.task_base import TaskBase +from llmebench.tasks.task_base import TaskBase class STSTrack1Task(TaskBase): diff --git a/arabic_llm_benchmark/tasks/STSTrack2.py b/llmebench/tasks/STSTrack2.py similarity index 90% rename from arabic_llm_benchmark/tasks/STSTrack2.py rename to llmebench/tasks/STSTrack2.py index ff1463a5..562af7a6 100644 --- a/arabic_llm_benchmark/tasks/STSTrack2.py +++ b/llmebench/tasks/STSTrack2.py @@ -2,7 +2,7 @@ import numpy as np -from arabic_llm_benchmark.tasks.task_base import TaskBase +from llmebench.tasks.task_base import TaskBase class STSTrack2Task(TaskBase): diff --git a/arabic_llm_benchmark/tasks/Sarcasm.py b/llmebench/tasks/Sarcasm.py similarity index 89% rename from arabic_llm_benchmark/tasks/Sarcasm.py rename to llmebench/tasks/Sarcasm.py index 454a6c2f..b6427ffb 100644 --- a/arabic_llm_benchmark/tasks/Sarcasm.py +++ b/llmebench/tasks/Sarcasm.py @@ -1,6 +1,6 @@ from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score -from 
arabic_llm_benchmark.tasks.task_base import TaskBase +from llmebench.tasks.task_base import TaskBase class SarcasmTask(TaskBase): diff --git a/arabic_llm_benchmark/tasks/Sentiment.py b/llmebench/tasks/Sentiment.py similarity index 94% rename from arabic_llm_benchmark/tasks/Sentiment.py rename to llmebench/tasks/Sentiment.py index ae5beb89..5b35efe5 100644 --- a/arabic_llm_benchmark/tasks/Sentiment.py +++ b/llmebench/tasks/Sentiment.py @@ -1,6 +1,6 @@ from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score -from arabic_llm_benchmark.tasks.task_base import TaskBase +from llmebench.tasks.task_base import TaskBase class SentimentTask(TaskBase): diff --git a/arabic_llm_benchmark/tasks/Spam.py b/llmebench/tasks/Spam.py similarity index 88% rename from arabic_llm_benchmark/tasks/Spam.py rename to llmebench/tasks/Spam.py index 9fe3c9eb..5ee9df4d 100644 --- a/arabic_llm_benchmark/tasks/Spam.py +++ b/llmebench/tasks/Spam.py @@ -1,6 +1,6 @@ from sklearn.metrics import f1_score -from arabic_llm_benchmark.tasks.task_base import TaskBase +from llmebench.tasks.task_base import TaskBase class SpamTask(TaskBase): diff --git a/arabic_llm_benchmark/tasks/StanceKhouja20.py b/llmebench/tasks/StanceKhouja20.py similarity index 88% rename from arabic_llm_benchmark/tasks/StanceKhouja20.py rename to llmebench/tasks/StanceKhouja20.py index dd9d9a6f..bbb5930c 100644 --- a/arabic_llm_benchmark/tasks/StanceKhouja20.py +++ b/llmebench/tasks/StanceKhouja20.py @@ -1,6 +1,6 @@ from sklearn.metrics import f1_score -from arabic_llm_benchmark.tasks.task_base import TaskBase +from llmebench.tasks.task_base import TaskBase class StanceKhouja20Task(TaskBase): diff --git a/arabic_llm_benchmark/tasks/StanceUnifiedFC.py b/llmebench/tasks/StanceUnifiedFC.py similarity index 88% rename from arabic_llm_benchmark/tasks/StanceUnifiedFC.py rename to llmebench/tasks/StanceUnifiedFC.py index dd5e8cd5..5ab04762 100644 --- a/arabic_llm_benchmark/tasks/StanceUnifiedFC.py +++ 
b/llmebench/tasks/StanceUnifiedFC.py @@ -1,6 +1,6 @@ from sklearn.metrics import f1_score -from arabic_llm_benchmark.tasks.task_base import TaskBase +from llmebench.tasks.task_base import TaskBase class StanceUnifiedFCTask(TaskBase): diff --git a/arabic_llm_benchmark/tasks/Subjectivity.py b/llmebench/tasks/Subjectivity.py similarity index 95% rename from arabic_llm_benchmark/tasks/Subjectivity.py rename to llmebench/tasks/Subjectivity.py index 9ba35380..96272969 100644 --- a/arabic_llm_benchmark/tasks/Subjectivity.py +++ b/llmebench/tasks/Subjectivity.py @@ -6,7 +6,7 @@ recall_score, ) -from arabic_llm_benchmark.tasks.task_base import TaskBase +from llmebench.tasks.task_base import TaskBase class SubjectivityTask(TaskBase): diff --git a/arabic_llm_benchmark/tasks/XNLI.py b/llmebench/tasks/XNLI.py similarity index 88% rename from arabic_llm_benchmark/tasks/XNLI.py rename to llmebench/tasks/XNLI.py index 380ef0dc..6d2884d9 100644 --- a/arabic_llm_benchmark/tasks/XNLI.py +++ b/llmebench/tasks/XNLI.py @@ -1,6 +1,6 @@ from sklearn.metrics import accuracy_score -from arabic_llm_benchmark.tasks.task_base import TaskBase +from llmebench.tasks.task_base import TaskBase class XNLITask(TaskBase): diff --git a/arabic_llm_benchmark/tasks/__init__.py b/llmebench/tasks/__init__.py similarity index 100% rename from arabic_llm_benchmark/tasks/__init__.py rename to llmebench/tasks/__init__.py diff --git a/arabic_llm_benchmark/tasks/task_base.py b/llmebench/tasks/task_base.py similarity index 100% rename from arabic_llm_benchmark/tasks/task_base.py rename to llmebench/tasks/task_base.py diff --git a/arabic_llm_benchmark/utils.py b/llmebench/utils.py similarity index 100% rename from arabic_llm_benchmark/utils.py rename to llmebench/utils.py diff --git a/scripts/format_code.sh b/scripts/format_code.sh index 8b1c17c8..11d9f636 100755 --- a/scripts/format_code.sh +++ b/scripts/format_code.sh @@ -3,12 +3,12 @@ # exit when any command fails set -e -if [[ ! -f setup.cfg ]] || [[ ! 
-d arabic_llm_benchmark ]] || [[ ! -d assets ]] || [[ ! -d scripts ]] || [[ ! -d tests ]] +if [[ ! -f setup.cfg ]] || [[ ! -d llmebench ]] || [[ ! -d assets ]] || [[ ! -d scripts ]] || [[ ! -d tests ]] then echo "format_code.sh must be run from the root of the repository" exit 1 fi ufmt format assets -ufmt format arabic_llm_benchmark +ufmt format llmebench ufmt format tests diff --git a/scripts/run_tests.sh b/scripts/run_tests.sh index e8e02785..a8e84718 100755 --- a/scripts/run_tests.sh +++ b/scripts/run_tests.sh @@ -3,7 +3,7 @@ # exit when any command fails set -e -if [[ ! -f setup.cfg ]] || [[ ! -d arabic_llm_benchmark ]] || [[ ! -d assets ]] || [[ ! -d scripts ]] || [[ ! -d tests ]] +if [[ ! -f setup.cfg ]] || [[ ! -d llmebench ]] || [[ ! -d assets ]] || [[ ! -d scripts ]] || [[ ! -d tests ]] then echo "run_tests.sh must be run from the root of the repository" exit 1 diff --git a/setup.cfg b/setup.cfg index 9a3e48d9..64f25410 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,9 +1,9 @@ [metadata] -name = arabic_llm_benchmark -version = 0.0.1 +name = llmebench +version = 0.1.0 author = Fahim Dalvi author_email = faimaduddin@hbku.edu.qa -description = Arabic Benchmark for LLMs +description = Benchmarking for LLMs long_description = file: README.md long_description_content_type = text/markdown url = https://github.com/qcri diff --git a/tests/datasets/test_exports.py b/tests/datasets/test_exports.py index db82badb..b22935d3 100644 --- a/tests/datasets/test_exports.py +++ b/tests/datasets/test_exports.py @@ -4,16 +4,16 @@ from glob import glob from pathlib import Path -import arabic_llm_benchmark.datasets as datasets -from arabic_llm_benchmark import utils -from arabic_llm_benchmark.datasets.dataset_base import DatasetBase +import llmebench.datasets as datasets +from llmebench import utils +from llmebench.datasets.dataset_base import DatasetBase class TestDatasetExports(unittest.TestCase): @classmethod def setUpClass(cls): # Search for all implemented datasets - 
framework_dir = Path("arabic_llm_benchmark") + framework_dir = Path("llmebench") cls.implemented_datasets = [ dataset_path for dataset_path in glob(str(framework_dir / "datasets" / "*.py")) diff --git a/tests/models/test_BLOOMPetal.py b/tests/models/test_BLOOMPetal.py index 8795c7db..ad8783eb 100644 --- a/tests/models/test_BLOOMPetal.py +++ b/tests/models/test_BLOOMPetal.py @@ -4,8 +4,8 @@ from unittest.mock import patch -from arabic_llm_benchmark import Benchmark -from arabic_llm_benchmark.models import BLOOMPetalModel +from llmebench import Benchmark +from llmebench.models import BLOOMPetalModel class TestAssetsForBLOOMPetalPrompts(unittest.TestCase): diff --git a/tests/models/test_GPT.py b/tests/models/test_GPT.py index 45cac1a5..a921bd27 100644 --- a/tests/models/test_GPT.py +++ b/tests/models/test_GPT.py @@ -4,8 +4,8 @@ from unittest.mock import patch -from arabic_llm_benchmark import Benchmark -from arabic_llm_benchmark.models import GPTModel, RandomGPTModel +from llmebench import Benchmark +from llmebench.models import GPTModel, RandomGPTModel class TestAssetsForGPTPrompts(unittest.TestCase): diff --git a/tests/models/test_GPTChatCompletion.py b/tests/models/test_GPTChatCompletion.py index 5c3c220b..47404e69 100644 --- a/tests/models/test_GPTChatCompletion.py +++ b/tests/models/test_GPTChatCompletion.py @@ -4,8 +4,8 @@ from unittest.mock import patch -from arabic_llm_benchmark import Benchmark -from arabic_llm_benchmark.models import GPTChatCompletionModel +from llmebench import Benchmark +from llmebench.models import GPTChatCompletionModel class TestAssetsForGPTChatCompletionPrompts(unittest.TestCase): diff --git a/tests/models/test_exports.py b/tests/models/test_exports.py index b43a0889..41ac1881 100644 --- a/tests/models/test_exports.py +++ b/tests/models/test_exports.py @@ -4,16 +4,16 @@ from glob import glob from pathlib import Path -import arabic_llm_benchmark.models as models -from arabic_llm_benchmark import utils -from 
arabic_llm_benchmark.models.model_base import ModelBase +import llmebench.models as models +from llmebench import utils +from llmebench.models.model_base import ModelBase class TestDatasetExports(unittest.TestCase): @classmethod def setUpClass(cls): # Search for all implemented models - framework_dir = Path("arabic_llm_benchmark") + framework_dir = Path("llmebench") cls.implemented_models = [ model_path for model_path in glob(str(framework_dir / "models" / "*.py")) diff --git a/tests/tasks/test_evaluation.py b/tests/tasks/test_evaluation.py index 603f003c..d7e687f6 100644 --- a/tests/tasks/test_evaluation.py +++ b/tests/tasks/test_evaluation.py @@ -4,7 +4,7 @@ from unittest.mock import patch -from arabic_llm_benchmark import Benchmark +from llmebench import Benchmark class TestAssetsTaskEvaluation(unittest.TestCase): diff --git a/tests/tasks/test_exports.py b/tests/tasks/test_exports.py index 31e8a49f..08e686c8 100644 --- a/tests/tasks/test_exports.py +++ b/tests/tasks/test_exports.py @@ -4,16 +4,16 @@ from glob import glob from pathlib import Path -import arabic_llm_benchmark.tasks as tasks -from arabic_llm_benchmark import Benchmark, utils -from arabic_llm_benchmark.tasks.task_base import TaskBase +import llmebench.tasks as tasks +from llmebench import Benchmark, utils +from llmebench.tasks.task_base import TaskBase class TestTaskExports(unittest.TestCase): @classmethod def setUpClass(cls): # Search for all implemented tasks - framework_dir = Path("arabic_llm_benchmark") + framework_dir = Path("llmebench") cls.implemented_tasks = [ task_path for task_path in glob(str(framework_dir / "tasks" / "*.py")) diff --git a/tests/test_benchmark_assets.py b/tests/test_benchmark_assets.py index 3e417d34..da71ab7d 100644 --- a/tests/test_benchmark_assets.py +++ b/tests/test_benchmark_assets.py @@ -3,7 +3,7 @@ import unittest from unittest.mock import patch -from arabic_llm_benchmark import Benchmark +from llmebench import Benchmark class TestBenchmarkAssets(unittest.TestCase):